chore(deploy): update deployment configs and TUI for scheduler

Update deployment and CLI tooling:
- TUI models (jobs, state) with scheduler data
- TUI store with scheduler endpoints
- TUI config with scheduler settings
- Deployment Makefile with scheduler targets
- Deploy script with scheduler registration
- Docker Compose files with scheduler services
- Remove obsolete Dockerfiles (api-server, full-prod, test)
- Update remaining Dockerfiles with scheduler integration
This commit is contained in:
Jeremie Fraeys 2026-02-26 12:08:31 -05:00
parent 4cdb68907e
commit c459285cab
No known key found for this signature in database
18 changed files with 357 additions and 620 deletions

View file

@ -1,75 +0,0 @@
# Multi-stage build for ML Experiment Manager
FROM golang:1.25-alpine AS go-builder
# Install build dependencies
# NOTE(review): podman and redis inside a *build* stage is unusual -- confirm
# `make build` really needs them; if not, dropping them shrinks the layer.
RUN apk add --no-cache git make podman redis
# Set working directory
WORKDIR /app
# Copy go mod files first so the module-download layer is cached
# independently of source changes
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . .
# Build Go binaries
RUN make build
# Zig CLI stage
FROM alpine:3.19 AS zig-builder
# Install dependencies for fetching and unpacking the Zig toolchain
RUN apk add --no-cache curl xz
# Fail the build if any command in the download pipe fails, not just tar
SHELL ["/bin/ash", "-o", "pipefail", "-c"]
# Zig toolchain version and target architecture are parameterized so other
# hosts can build with e.g. --build-arg ZIG_ARCH=x86_64 (defaults unchanged).
ARG ZIG_VERSION=0.15.2
ARG ZIG_ARCH=aarch64
# Install Zig; curl -f makes an HTTP error abort the build instead of piping
# an error page into tar
RUN curl -fL "https://ziglang.org/download/${ZIG_VERSION}/zig-linux-${ZIG_ARCH}-${ZIG_VERSION}.tar.xz" | tar -xJ -C /opt
ENV PATH="/opt/zig-linux-${ZIG_ARCH}-${ZIG_VERSION}:${PATH}"
# Copy CLI source
COPY cli/ /app/cli/
# Build Zig CLI
WORKDIR /app/cli
RUN zig build cross
# Final stage
FROM alpine:3.19
# Install runtime dependencies
RUN apk add --no-cache ca-certificates rsync openssh-client redis
# Create app user with a fixed UID/GID so volume ownership is stable
RUN addgroup -g 1001 -S appgroup && \
    adduser -u 1001 -S appuser -G appgroup
# Set working directory
WORKDIR /app
# Copy binaries from builders
COPY --from=go-builder /app/bin/ /usr/local/bin/
COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/
# Copy configs
COPY --from=go-builder /app/configs/ /app/configs/
# Create data/log directories and hand them to the non-root runtime user
RUN mkdir -p /data/experiments /data/datasets /data/snapshots /home/appuser/.ml && \
    mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
    chown -R appuser:appgroup /data /app /home/appuser
# Switch to app user
USER appuser
# Expose ports
EXPOSE 9101
# Health check (busybox wget; --no-check-certificate because the server's
# cert is expected to be self-signed)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD wget --no-verbose --tries=1 --no-check-certificate --spider https://localhost:9101/health || exit 1
# Default command
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -1,76 +0,0 @@
# Full Production Dockerfile with Podman and SSH
FROM golang:1.25-alpine AS builder
# Install build dependencies
RUN apk add --no-cache git make
# Set working directory
WORKDIR /app
# Copy go mod files first so the module-download layer is cached
# independently of source changes
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . .
# Build Go binaries
RUN go build -o bin/api-server cmd/api-server/main.go && \
    go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
# Final stage with Podman
FROM alpine:3.19
# Install runtime dependencies including Podman and SSH
RUN apk add --no-cache ca-certificates redis openssl curl podman openssh
# Create app user with a fixed UID/GID so volume ownership is stable
RUN addgroup -g 1001 -S appgroup && \
    adduser -u 1001 -S appuser -G appgroup
# Set working directory
WORKDIR /app
# Copy binaries from builder
COPY --from=builder /app/bin/ /usr/local/bin/
# Copy configs
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories and hand them to the runtime user
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs && \
    mkdir -p /data/active/datasets /data/active/snapshots && \
    mkdir -p /logs && \
    chown -R appuser:appgroup /app /data /logs
# Generate SSL certificates.
# NOTE(review): the private key is baked into an image layer -- anyone with
# the image can read it; acceptable only for self-signed homelab certs.
# The final chown is required: this RUN executes as root AFTER the recursive
# chown above, so without it key.pem would be root-owned mode 600 and
# unreadable by the non-root appuser that runs the server.
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
    -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
    chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem && \
    chown appuser:appgroup /app/ssl/key.pem /app/ssl/cert.pem
# Generate SSH keys for container communication (same ownership fix: keys are
# generated as root after the recursive chown, so hand them to appuser)
RUN ssh-keygen -t rsa -b 2048 -f /app/ssh/id_rsa -N "" && \
    cp /app/ssh/id_rsa.pub /app/ssh/authorized_keys && \
    chmod 600 /app/ssh/id_rsa && \
    chmod 644 /app/ssh/id_rsa.pub /app/ssh/authorized_keys && \
    chown -R appuser:appgroup /app/ssh
# Configure SSH daemon
# NOTE(review): PermitRootLogin yes alongside pubkey-only auth is a wide
# opening for a "production" image -- confirm root SSH access is required.
RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
    echo "PasswordAuthentication no" >> /etc/ssh/sshd_config && \
    echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config && \
    echo "AuthorizedKeysFile /app/ssh/authorized_keys" >> /etc/ssh/sshd_config
# Switch to app user
USER appuser
# Expose ports
EXPOSE 9101 22
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -1,149 +0,0 @@
# Homelab Secure Production Dockerfile
FROM golang:1.25-alpine AS builder
# Install build dependencies
RUN apk add --no-cache git make
# Set working directory
WORKDIR /app
# Copy go mod files first so the module-download layer is cached
# independently of source changes
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . .
# Build Go binaries
RUN go build -o bin/api-server cmd/api-server/main.go && \
    go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
# Final stage with security hardening
FROM alpine:3.19
# Install security packages and runtime dependencies
RUN apk add --no-cache \
    ca-certificates \
    redis \
    openssl \
    curl \
    podman \
    openssh \
    sudo \
    fail2ban \
    logrotate \
    && rm -rf /var/cache/apk/*
# Create app user (no login shell) and worker user (interactive SSH user).
# NOTE(review): the worker password is baked into an image layer and visible
# in `docker history` -- move it to a runtime secret or build secret mount.
RUN addgroup -g 1001 -S appgroup && \
    adduser -u 1001 -S appuser -G appgroup -s /sbin/nologin && \
    addgroup -g 1002 -S workergroup && \
    adduser -u 1002 -S worker -G workergroup -s /bin/sh && \
    echo "worker:HomelabWorker2024!" | chpasswd && \
    mkdir -p /home/worker/.ssh && \
    chown -R worker:workergroup /home/worker
# Set working directory
WORKDIR /app
# Copy binaries from builder
COPY --from=builder /app/bin/ /usr/local/bin/
# Copy configs
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories with proper permissions
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \
    mkdir -p /data/active/datasets /data/active/snapshots && \
    mkdir -p /logs && \
    chown -R appuser:appgroup /app /data /logs && \
    chmod 750 /app/data/experiments /app/logs
# Generate SSL certificates with stronger crypto.
# The trailing chown is required: this RUN executes as root AFTER the
# recursive chown above, so without it key.pem would be root-owned mode 600
# and unreadable by the non-root appuser that serves TLS.
RUN openssl req -x509 -newkey rsa:4096 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
    -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
    chmod 600 /app/ssl/key.pem && \
    chmod 644 /app/ssl/cert.pem && \
    chown appuser:appgroup /app/ssl/key.pem /app/ssl/cert.pem
# Generate SSH keys with stronger crypto (owned by the worker login user)
RUN ssh-keygen -t rsa -b 4096 -f /home/worker/.ssh/id_rsa -N "" && \
    cp /home/worker/.ssh/id_rsa.pub /home/worker/.ssh/authorized_keys && \
    chmod 700 /home/worker/.ssh && \
    chmod 600 /home/worker/.ssh/id_rsa && \
    chmod 644 /home/worker/.ssh/id_rsa.pub /home/worker/.ssh/authorized_keys && \
    chown -R worker:workergroup /home/worker/.ssh
# Configure SSH with security hardening (non-standard port, worker-only
# access, modern cipher/KEX/MAC allowlists)
RUN echo "Port 2222" >> /etc/ssh/sshd_config && \
    echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \
    echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \
    echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config && \
    echo "AuthorizedKeysFile %h/.ssh/authorized_keys" >> /etc/ssh/sshd_config && \
    echo "AllowUsers worker" >> /etc/ssh/sshd_config && \
    echo "MaxAuthTries 3" >> /etc/ssh/sshd_config && \
    echo "ClientAliveInterval 300" >> /etc/ssh/sshd_config && \
    echo "ClientAliveCountMax 2" >> /etc/ssh/sshd_config && \
    echo "X11Forwarding no" >> /etc/ssh/sshd_config && \
    echo "AllowTcpForwarding no" >> /etc/ssh/sshd_config && \
    echo "Banner /etc/ssh/banner" >> /etc/ssh/sshd_config && \
    echo "Protocol 2" >> /etc/ssh/sshd_config && \
    echo "Ciphers chacha20-poly1305@openssh.com,aes256-gcm@openssh.com,aes128-gcm@openssh.com" >> /etc/ssh/sshd_config && \
    echo "MACs hmac-sha2-256-etm@openssh.com,hmac-sha2-512-etm@openssh.com,hmac-sha2-256,hmac-sha2-512" >> /etc/ssh/sshd_config && \
    echo "KexAlgorithms curve25519-sha256@libssh.org,diffie-hellman-group16-sha512" >> /etc/ssh/sshd_config
# Create SSH banner
RUN echo "=================================================" > /etc/ssh/banner && \
    echo " ML Experiments Homelab Server" >> /etc/ssh/banner && \
    echo " Unauthorized access is prohibited" >> /etc/ssh/banner && \
    echo " All connections are monitored and logged" >> /etc/ssh/banner && \
    echo "=================================================" >> /etc/ssh/banner
# Generate SSH host keys
RUN ssh-keygen -A
# Configure fail2ban for SSH protection (watches the non-standard port 2222)
RUN echo "[DEFAULT]" > /etc/fail2ban/jail.local && \
    echo "bantime = 3600" >> /etc/fail2ban/jail.local && \
    echo "findtime = 600" >> /etc/fail2ban/jail.local && \
    echo "maxretry = 3" >> /etc/fail2ban/jail.local && \
    echo "" >> /etc/fail2ban/jail.local && \
    echo "[sshd]" >> /etc/fail2ban/jail.local && \
    echo "enabled = true" >> /etc/fail2ban/jail.local && \
    echo "port = 2222" >> /etc/fail2ban/jail.local && \
    echo "filter = sshd" >> /etc/fail2ban/jail.local && \
    echo "logpath = /var/log/messages" >> /etc/fail2ban/jail.local
# Configure sudo with restricted access.
# NOTE(review): /app is owned by appuser (chown -R above), so letting appuser
# sudo /app/start-security.sh means appuser can edit that script and run
# arbitrary commands as root -- move the script to a root-owned path.
RUN echo "appuser ALL=(ALL) NOPASSWD: /app/start-security.sh" >> /etc/sudoers && \
    echo "appuser ALL=(ALL) NOPASSWD: /usr/sbin/sshd" >> /etc/sudoers && \
    echo "appuser ALL=(ALL) NOPASSWD: /usr/bin/ssh-keygen" >> /etc/sudoers && \
    echo "worker ALL=(ALL) NOPASSWD: /usr/bin/podman" >> /etc/sudoers && \
    echo "Defaults:appuser !requiretty" >> /etc/sudoers && \
    echo "Defaults:worker !requiretty" >> /etc/sudoers && \
    echo "Defaults:appuser !lecture" >> /etc/sudoers && \
    echo "Defaults:worker !lecture" >> /etc/sudoers
# Security hardening - remove setuid binaries except sudo
RUN find / -perm /4000 -type f -not -path "/usr/bin/sudo" -exec chmod 755 {} \; 2>/dev/null || true
# Create startup script for security services (regenerates host keys, then
# runs sshd in the foreground on port 2222)
RUN echo "#!/bin/sh" > /app/start-security.sh && \
    echo "ssh-keygen -A" >> /app/start-security.sh && \
    echo "/usr/sbin/sshd -D -p 2222" >> /app/start-security.sh && \
    echo "# End of security services" >> /app/start-security.sh && \
    chmod 755 /app/start-security.sh
# Switch to app user for application
USER appuser
# Expose ports
EXPOSE 9101 2222
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
    CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -16,9 +16,9 @@ RUN go mod download
# Copy source code
COPY . .
# Build Go binaries with CGO enabled for SQLite
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
CGO_ENABLED=1 go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
# Build Go binaries (native libs not used in Docker since NVML unavailable in Alpine)
RUN CGO_ENABLED=1 go build -o bin/api-server ./cmd/api-server/main.go && \
CGO_ENABLED=1 go build -o bin/worker ./cmd/worker
# Final stage with Podman and secure SSH
FROM alpine:3.19

View file

@ -18,12 +18,13 @@ COPY . .
# Copy and build native C++ libraries (without NVML for non-GPU systems)
COPY native/ ./native/
ENV FETCHML_DOCKER_BUILD=1
RUN rm -rf native/build && cd native && mkdir -p build && cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DFETCHML_DOCKER_BUILD=1 -DBUILD_NVML_GPU=OFF && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_NVML_GPU=OFF && \
make -j$(nproc)
# Build Go binaries (native libs not used in Docker since NVML unavailable in Alpine)
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
RUN CGO_ENABLED=1 go build -o bin/api-server ./cmd/api-server/main.go && \
CGO_ENABLED=1 go build -o bin/worker ./cmd/worker
# Final stage

View file

@ -1,62 +0,0 @@
# Test Dockerfile - Go components only
FROM golang:1.25-alpine AS builder
# Install build dependencies (gcc/musl-dev required for CGO)
RUN apk add --no-cache git gcc musl-dev
# Set working directory
WORKDIR /app
# Copy go mod files first so the module-download layer is cached
# independently of source changes
COPY go.mod go.sum ./
# Download dependencies
RUN go mod download
# Copy source code
COPY . .
# Build only Go binaries (skip Zig)
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
    go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go && \
    go build -o bin/tui ./cmd/tui
# Final stage
FROM alpine:3.19
# Install runtime dependencies
RUN apk add --no-cache ca-certificates curl openssl
# Create app user with a fixed UID/GID so volume ownership is stable
RUN addgroup -g 1001 -S appgroup && \
    adduser -u 1001 -S appuser -G appgroup
# Set working directory
WORKDIR /app
# Copy binaries from builder
COPY --from=builder /app/bin/ /usr/local/bin/
# Copy configs
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
    mkdir -p /data/experiments /data/datasets /data/snapshots
# Generate SSL certificates for container use (self-signed test cert)
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
    -subj "/C=US/ST=Test/L=Local/O=FetchML/OU=Tests/CN=localhost" && \
    chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Ensure app user can write to data/logs and read TLS material.
# This chown deliberately runs AFTER key generation so the mode-600 key
# ends up readable by appuser.
RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs /data
# Switch to app user
USER appuser
# Expose ports
EXPOSE 9101
# Health check (added for parity with the other service images; curl is
# already installed above, -k because the cert is self-signed)
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD curl -k -f https://localhost:9101/health || exit 1
# Default command
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -14,22 +14,20 @@ import (
// CLIConfig represents the TOML config structure used by the CLI
type CLIConfig struct {
WorkerHost string `toml:"worker_host"`
WorkerUser string `toml:"worker_user"`
WorkerBase string `toml:"worker_base"`
WorkerPort int `toml:"worker_port"`
APIKey string `toml:"api_key"`
// User context (filled after authentication)
CurrentUser *UserContext `toml:"-"`
WorkerHost string `toml:"worker_host"`
WorkerUser string `toml:"worker_user"`
WorkerBase string `toml:"worker_base"`
APIKey string `toml:"api_key"`
WorkerPort int `toml:"worker_port"`
}
// UserContext represents the authenticated user information
type UserContext struct {
Name string `json:"name"`
Admin bool `json:"admin"`
Roles []string `json:"roles"`
Permissions map[string]bool `json:"permissions"`
Name string `json:"name"`
Roles []string `json:"roles"`
Admin bool `json:"admin"`
}
// LoadCLIConfig loads the CLI's TOML configuration from the provided path.

View file

@ -12,39 +12,31 @@ import (
// Config holds TUI configuration
type Config struct {
Host string `toml:"host"`
User string `toml:"user"`
SSHKey string `toml:"ssh_key"`
Port int `toml:"port"`
BasePath string `toml:"base_path"`
Mode string `toml:"mode"` // "dev" or "prod"
WrapperScript string `toml:"wrapper_script"`
TrainScript string `toml:"train_script"`
RedisAddr string `toml:"redis_addr"`
RedisPassword string `toml:"redis_password"`
RedisDB int `toml:"redis_db"`
KnownHosts string `toml:"known_hosts"`
ServerURL string `toml:"server_url"` // WebSocket server URL (e.g., ws://localhost:8080)
// Local mode configuration
DBPath string `toml:"db_path"` // Path to SQLite database (local mode)
ForceLocal bool `toml:"force_local"` // Force local-only mode
ProjectRoot string `toml:"project_root"` // Project root for local mode
// Experiment configuration
Experiment struct {
Name string `toml:"name"`
Entrypoint string `toml:"entrypoint"`
} `toml:"experiment"`
// Authentication
Auth auth.Config `toml:"auth"`
// Podman settings
PodmanImage string `toml:"podman_image"`
ContainerWorkspace string `toml:"container_workspace"`
ContainerResults string `toml:"container_results"`
GPUDevices []string `toml:"gpu_devices"`
ProjectRoot string `toml:"project_root"`
ServerURL string `toml:"server_url"`
ContainerResults string `toml:"container_results"`
BasePath string `toml:"base_path"`
Mode string `toml:"mode"`
WrapperScript string `toml:"wrapper_script"`
TrainScript string `toml:"train_script"`
RedisAddr string `toml:"redis_addr"`
RedisPassword string `toml:"redis_password"`
ContainerWorkspace string `toml:"container_workspace"`
SSHKey string `toml:"ssh_key"`
DBPath string `toml:"db_path"`
KnownHosts string `toml:"known_hosts"`
PodmanImage string `toml:"podman_image"`
Host string `toml:"host"`
User string `toml:"user"`
Auth auth.Config `toml:"auth"`
GPUDevices []string `toml:"gpu_devices"`
RedisDB int `toml:"redis_db"`
Port int `toml:"port"`
ForceLocal bool `toml:"force_local"`
}
// LoadConfig loads configuration from a TOML file

View file

@ -21,21 +21,19 @@ const (
// Job represents a job in the TUI
type Job struct {
Name string
Status JobStatus
TaskID string
Priority int64
// Narrative fields for research context
OutcomeStatus string
Status JobStatus
TaskID string
Hypothesis string
Context string
Intent string
ExpectedOutcome string
ActualOutcome string
OutcomeStatus string // validated, invalidated, inconclusive
// GPU allocation tracking
GPUDeviceID int // -1 if not assigned
GPUUtilization int // 0-100%
GPUMemoryUsed int64 // MB
Name string
Priority int64
GPUDeviceID int
GPUUtilization int
GPUMemoryUsed int64
}
// Title returns the job title for display

View file

@ -48,50 +48,50 @@ const (
// DatasetInfo represents dataset information in the TUI
type DatasetInfo struct {
Name string `json:"name"`
SizeBytes int64 `json:"size_bytes"`
Location string `json:"location"`
LastAccess time.Time `json:"last_access"`
Name string `json:"name"`
Location string `json:"location"`
SizeBytes int64 `json:"size_bytes"`
}
// State holds the application state
type State struct {
Jobs []Job
JobList list.Model
LastRefresh time.Time
LastGPUUpdate time.Time
LastFrameTime time.Time
JobStats map[JobStatus]int
Status string
APIKey string
ErrorMsg string
Keys KeyMap
QueuedTasks []*Task
Datasets []DatasetInfo
JobList list.Model
Jobs []Job
Input textinput.Model
APIKeyInput textinput.Model
GpuView viewport.Model
ContainerView viewport.Model
QueueView viewport.Model
LogsView viewport.Model
ConfigView viewport.Model
ExperimentHistoryView viewport.Model
TeamView viewport.Model
SettingsView viewport.Model
DatasetView viewport.Model
ExperimentsView viewport.Model
NarrativeView viewport.Model
TeamView viewport.Model
ExperimentHistoryView viewport.Model
ConfigView viewport.Model
LogsView viewport.Model
SelectedJob Job
Input textinput.Model
APIKeyInput textinput.Model
Status string
ErrorMsg string
InputMode bool
Width int
Height int
ShowHelp bool
ContainerView viewport.Model
Spinner spinner.Model
SelectedJob Job
ActiveView ViewMode
LastRefresh time.Time
LastFrameTime time.Time
RefreshRate float64 // measured in ms
RefreshRate float64
FrameCount int
LastGPUUpdate time.Time
IsLoading bool
JobStats map[JobStatus]int
APIKey string
Height int
Width int
SettingsIndex int
Keys KeyMap
ShowHelp bool
IsLoading bool
InputMode bool
}
// InitialState creates the initial application state

View file

@ -18,13 +18,13 @@ type Store struct {
// RunInfo represents a local run from SQLite
type RunInfo struct {
EndTime *string
PID *int64
RunID string
ExperimentID string
Name string
Status string
StartTime string
EndTime *string
PID *int64
Synced bool
}

View file

@ -1,74 +1,200 @@
# Docker Compose Deployment Management
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean
.PHONY: help dev-up dev-down dev-logs dev-restart staging-up staging-down staging-logs staging-restart staging-status homelab-secure-up homelab-secure-down prod-up prod-down prod-logs prod-restart prod-status status clean rollback security-mode check-audit-sink health-check security-scan
# Default target
help: ## Show this help message
@echo "Available commands:"
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-25s\033[0m %s\n", $$1, $$2}'
# Development environment
dev-up: ## Start development environment
@echo "Starting development environment..."
docker-compose -f deployments/docker-compose.dev.yml up -d
docker-compose -f docker-compose.dev.yml up -d
@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"
dev-down: ## Stop development environment
@echo "Stopping development environment..."
docker-compose -f deployments/docker-compose.dev.yml down
docker-compose -f docker-compose.dev.yml down
dev-logs: ## Show development logs
docker-compose -f deployments/docker-compose.dev.yml logs -f
docker-compose -f docker-compose.dev.yml logs -f
dev-restart: ## Restart development environment
@echo "Restarting development environment..."
docker-compose -f deployments/docker-compose.dev.yml restart
docker-compose -f docker-compose.dev.yml restart
# Staging environment
staging-up: ## Start staging environment
@echo "Starting staging environment..."
@if [ ! -f .env.staging ]; then \
echo "Creating staging environment file..."; \
echo "DATA_DIR=./data/staging" > .env.staging; \
echo "LOG_LEVEL=info" >> .env.staging; \
echo "COMPLIANCE_MODE=standard" >> .env.staging; \
fi
docker-compose -f docker-compose.staging.yml up -d
@echo "Staging services: Caddy (9080/9443), Redis (6380), API (9102), MinIO (9002/9003)"
staging-down: ## Stop staging environment
@echo "Stopping staging environment..."
docker-compose -f docker-compose.staging.yml down
staging-logs: ## Show staging logs
docker-compose -f docker-compose.staging.yml logs -f
staging-restart: ## Restart staging environment
@echo "Restarting staging environment..."
docker-compose -f docker-compose.staging.yml restart
staging-status: ## Show staging status
docker-compose -f docker-compose.staging.yml ps
# Homelab environment
homelab-secure-up: ## Start secure homelab environment
@echo "Starting secure homelab environment..."
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
docker-compose -f docker-compose.homelab-secure.yml up -d
homelab-secure-down: ## Stop secure homelab environment
@echo "Stopping secure homelab environment..."
docker-compose -f deployments/docker-compose.homelab-secure.yml down
docker-compose -f docker-compose.homelab-secure.yml down
# Production environment
prod-up: ## Start production environment
@echo "Starting production environment..."
docker-compose -f deployments/docker-compose.prod.yml up -d
@echo "⚠ WARNING: This is production! Ensure you have proper backups."
@read -p "Continue? [y/N] " confirm && [ "$$confirm" = "y" ] || exit 1
docker-compose -f docker-compose.prod.yml up -d
prod-down: ## Stop production environment
@echo "Stopping production environment..."
docker-compose -f deployments/docker-compose.prod.yml down
docker-compose -f docker-compose.prod.yml down
prod-logs: ## Show production logs
docker-compose -f docker-compose.prod.yml logs -f
prod-restart: ## Restart production environment
@echo "Restarting production environment..."
@read -p "Restart production? [y/N] " confirm && [ "$$confirm" = "y" ] || exit 1
docker-compose -f docker-compose.prod.yml restart
prod-status: ## Show production status
docker-compose -f docker-compose.prod.yml ps
# Utility commands
status: ## Show status of all environments
@echo "=== Development Status ==="
@if [ -f deployments/docker-compose.dev.yml ]; then \
docker-compose -f deployments/docker-compose.dev.yml ps; \
@if [ -f docker-compose.dev.yml ]; then \
docker-compose -f docker-compose.dev.yml ps 2>/dev/null || echo "Not running"; \
fi
@echo ""
@echo "=== Staging Status ==="
@if [ -f docker-compose.staging.yml ]; then \
docker-compose -f docker-compose.staging.yml ps 2>/dev/null || echo "Not running"; \
fi
@echo ""
@echo "=== Homelab Secure Status ==="
@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
@if [ -f docker-compose.homelab-secure.yml ]; then \
docker-compose -f docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
fi
@echo ""
@echo "=== Production Status ==="
@if [ -f deployments/docker-compose.prod.yml ]; then \
docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
@if [ -f docker-compose.prod.yml ]; then \
docker-compose -f docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
fi
clean: ## Clean up all containers and volumes
@echo "Cleaning up all Docker resources..."
@echo "This will remove all containers and volumes. Continue? [y/N]"
@read -r confirm && [ "$$confirm" = "y" ] || exit 1
docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
docker-compose -f docker-compose.dev.yml down -v 2>/dev/null || true
docker-compose -f docker-compose.staging.yml down -v 2>/dev/null || true
docker-compose -f docker-compose.homelab-secure.yml down -v 2>/dev/null || true
docker-compose -f docker-compose.prod.yml down -v 2>/dev/null || true
docker system prune -f
@echo "Cleanup complete."
# Security mode targets
security-mode-dev: ## Run worker in dev security mode
@echo "Running with dev security mode (relaxed validation)..."
COMPLIANCE_MODE=dev docker-compose -f docker-compose.dev.yml up -d worker
security-mode-standard: ## Run worker in standard security mode
@echo "Running with standard security mode..."
COMPLIANCE_MODE=standard docker-compose -f docker-compose.dev.yml up -d worker
security-mode-hipaa: ## Run worker in HIPAA security mode
@echo "Running with HIPAA security mode (strict compliance)..."
@echo "✓ Network mode: none"
@echo "✓ Seccomp profile: default-hardened"
@echo "✓ No new privileges: enforced"
@echo "✓ Audit sink: required"
@read -p "Confirm HIPAA mode deployment? [y/N] " confirm && [ "$$confirm" = "y" ] || exit 1
COMPLIANCE_MODE=hipaa docker-compose -f docker-compose.dev.yml up -d worker
# Rollback targets
rollback-staging: ## Rollback staging deployment
@echo "Rolling back staging deployment..."
@echo "⚠ This rolls back the image only - queue state and audit log are NOT rolled back"
@read -p "Continue with rollback? [y/N] " confirm && [ "$$confirm" = "y" ] || exit 1
docker-compose -f docker-compose.staging.yml down
@if [ -f .staging-deployment.log ]; then \
PREVIOUS_TAG=$$(tail -2 .staging-deployment.log | head -1 | grep -o 'tag=[^ ]*' | cut -d'=' -f2 || echo "latest"); \
echo "Previous tag: $$PREVIOUS_TAG"; \
docker-compose -f docker-compose.staging.yml up -d; \
fi
@echo "$$(date -Iseconds) | rollback | staging | actor=$$(whoami)" >> .staging-audit.log
rollback-prod: ## Rollback production deployment
@echo "Rolling back production deployment..."
@echo "⚠ CRITICAL: This rolls back the image only"
@echo "⚠ Queue state is NOT rolled back"
@echo "⚠ Audit log chain is NOT rolled back (must never break chain)"
@echo "⚠ New artifacts remain in storage"
@read -p "CONFIRM PRODUCTION ROLLBACK? [yes/N] " confirm && [ "$$confirm" = "yes" ] || exit 1
docker-compose -f docker-compose.prod.yml down
@if [ -f .prod-audit.log ]; then \
PREVIOUS_SHA=$$(tail -2 .prod-audit.log | head -1 | grep -o 'sha-[a-f0-9]*' || echo "previous"); \
echo "Rolling back to: $$PREVIOUS_SHA"; \
docker-compose -f docker-compose.prod.yml up -d; \
fi
@echo "$$(date -Iseconds) | rollback | prod | actor=$$(whoami)" >> .prod-audit.log
@echo "Rollback complete. Verify health: make prod-status"
check-audit-sink: ## Check audit sink reachability
@echo "Checking audit sink..."
@if [ -f ../scripts/check-audit-sink.sh ]; then \
../scripts/check-audit-sink.sh --env staging; \
else \
echo "Audit sink check script not found"; \
fi
health-check: ## Run health checks on all environments
@echo "=== Health Checks ==="
@echo "Development (localhost:9101):"
@curl -fsS http://localhost:9101/health 2>/dev/null && echo "✓ Healthy" || echo "✗ Not responding"
@echo ""
@echo "Staging (localhost:9102):"
@curl -fsS http://localhost:9102/health 2>/dev/null && echo "✓ Healthy" || echo "✗ Not responding"
@echo ""
@echo "Production (localhost:9101):"
@curl -fsS http://localhost:9101/health 2>/dev/null && echo "✓ Healthy" || echo "✗ Not responding"
security-scan: ## Run security scanners locally
@echo "Running security scanners..."
@if command -v gosec >/dev/null 2>&1; then \
echo "Running gosec..."; \
cd .. && gosec ./... 2>/dev/null || echo "gosec found issues"; \
else \
echo "gosec not installed - skipping"; \
fi
@if command -v nancy >/dev/null 2>&1; then \
echo "Running nancy..."; \
cd .. && go list -json -deps ./... 2>/dev/null | nancy sleuth 2>/dev/null || echo "nancy found issues"; \
else \
echo "nancy not installed - skipping"; \
fi
# Quick aliases
up: dev-up ## Alias for dev-up
down: dev-down ## Alias for dev-down

View file

@ -37,6 +37,7 @@ show_usage() {
echo ""
echo "Environments:"
echo " dev Development environment"
echo " staging Staging environment (pre-production)"
echo " secure Secure homelab environment"
echo " prod Production environment"
echo ""
@ -46,11 +47,17 @@ show_usage() {
echo " restart Restart services"
echo " logs Show logs"
echo " status Show status"
echo " rollback Rollback to previous deployment (image only)"
echo " health-check Check service health and compliance mode"
echo " check-audit-sink Verify audit sink reachability"
echo ""
echo "Examples:"
echo " $0 dev up # Start development environment"
echo " $0 prod down # Stop production environment"
echo " $0 secure logs # Show secure environment logs"
echo " $0 dev up # Start development environment"
echo " $0 staging up # Start staging environment"
echo " $0 prod down # Stop production environment"
echo " $0 staging rollback # Rollback staging deployment"
echo " $0 prod health-check # Check production health"
echo " $0 prod check-audit-sink # Verify audit sink before deploy"
}
# Function to check if docker-compose file exists
@ -62,6 +69,9 @@ check_compose_file() {
"dev")
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.dev.yml"
;;
"staging")
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.staging.yml"
;;
"secure")
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.homelab-secure.yml"
;;
@ -154,6 +164,71 @@ main() {
print_status "Status of $environment environment:"
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" ps
;;
"rollback")
print_warning "Rolling back $environment environment..."
print_warning "⚠ This rolls back the image only - queue state and audit log are NOT rolled back"
if [ "$environment" = "prod" ]; then
print_warning "⚠ CRITICAL: Production rollback"
print_warning "⚠ Queue state is NOT rolled back"
print_warning "⚠ Audit log chain is NOT rolled back (must never break chain)"
read -p "CONFIRM PRODUCTION ROLLBACK? [yes/N] " confirm
if [ "$confirm" != "yes" ]; then
print_error "Rollback cancelled"
exit 1
fi
fi
# Get previous deployment info
LOG_FILE="${FETCHML_REPO_ROOT}/deployments/.${environment}-audit.log"
if [ -f "$LOG_FILE" ]; then
PREVIOUS_SHA=$(tail -2 "$LOG_FILE" | head -1 | grep -o 'sha-[a-f0-9]*' || echo "")
if [ -n "$PREVIOUS_SHA" ]; then
print_status "Rolling back to: $PREVIOUS_SHA"
fi
fi
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" down
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" up -d
# Write rollback entry to audit log
echo "$(date -Iseconds) | rollback | $environment | actor=$(whoami)" >> "$LOG_FILE" 2>/dev/null || true
print_success "$environment rollback complete!"
print_status "Verify health with: $0 $environment health-check"
;;
"health-check"|"health")
print_status "Health check for $environment environment..."
# Determine port based on environment
case $environment in
dev) PORT=9101 ;;
staging) PORT=9102 ;;
prod) PORT=9101 ;;
*) PORT=9101 ;;
esac
# Check API health
if curl -fsS "http://localhost:${PORT}/health" > /dev/null 2>&1; then
print_success "API is healthy (port $PORT)"
# Check compliance_mode
COMPLIANCE_MODE=$(curl -fsS "http://localhost:${PORT}/health" 2>/dev/null | grep -o '"compliance_mode":"[^"]*"' | cut -d'"' -f4 || echo "unknown")
print_status "Compliance mode: $COMPLIANCE_MODE"
else
print_error "API health check failed (port $PORT)"
exit 1
fi
;;
"check-audit-sink")
print_status "Checking audit sink for $environment..."
if [ -f "${FETCHML_REPO_ROOT}/scripts/check-audit-sink.sh" ]; then
"${FETCHML_REPO_ROOT}/scripts/check-audit-sink.sh" --env "$environment"
else
print_warning "Audit sink check script not found"
fi
;;
*)
print_error "Unknown action: $action"
show_usage

View file

@ -1,6 +1,6 @@
---
# Homelab Docker Compose with Centralized Monitoring
# Includes: API, Redis, Prometheus, Grafana, Loki
# Development Docker Compose
# Includes: API, Redis, MinIO, Worker, Caddy
services:
caddy:
image: caddy:2-alpine
@ -11,8 +11,8 @@ services:
- "8443:443"
volumes:
- ./deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/caddy/data:/data
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/caddy/config:/config
- ${DATA_DIR:-./data/smoke}/caddy/data:/data
- ${DATA_DIR:-./data/smoke}/caddy/config:/config
depends_on:
api-server:
condition: service_healthy
@ -42,12 +42,12 @@ services:
expose:
- "9101" # API and health endpoints (internal; external access via Caddy)
volumes:
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/logs
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/experiments:/data/experiments
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/active:/data/active
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
- ./configs/api/dev.yaml:/app/configs/api/dev.yaml
- ./ssl:/app/ssl
- ${DATA_DIR:-./data/smoke}/logs:/logs
- ${DATA_DIR:-./data/smoke}/experiments:/data/experiments
- ${DATA_DIR:-./data/smoke}/active:/data/active
- ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated
- ${DATA_DIR:-./data/smoke}/configs:/app/configs:ro
- ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro
depends_on:
- redis
restart: unless-stopped
@ -62,67 +62,41 @@ services:
retries: 3
start_period: 40s
labels:
logging: "promtail"
job: "api-server"
# MinIO for local development (single-node filesystem backend)
minio:
image: minio/minio:latest
container_name: ml-experiments-minio
container_name: ml-dev-minio
ports:
- "9000:9000"
- "9001:9001"
volumes:
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/minio:/data
- ${DATA_DIR:-./data/smoke}/minio:/data
environment:
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin123
- MINIO_BROWSER=on
command: ["server", "/data", "--console-address", ":9001"]
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 10s
test: ["CMD", "curl", "-fsS", "http://localhost:9000/minio/health/live"]
interval: 5s
timeout: 5s
retries: 10
retries: 5
restart: unless-stopped
# Initialize minio bucket (runs once)
minio-init:
image: alpine:3.19
container_name: ml-experiments-minio-init
image: minio/mc:latest
container_name: ml-dev-minio-init
depends_on:
minio:
condition: service_healthy
entrypoint: ["/bin/sh", "-c"]
command:
- |
set -eu
apk add --no-cache ca-certificates curl tar gzip
ARCH=$$(uname -m)
MC_ARCH=amd64
if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
MC_ARCH=arm64
fi
curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
chmod +x /usr/local/bin/mc
i=0
while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
i=$$((i+1))
if [ $$i -ge 30 ]; then
echo "minio not ready after 30 attempts" >&2
exit 1
fi
echo "waiting for minio... ($$i/30)"
sleep 1
done
# Skip if bucket already exists
if mc ls local/fetchml-snapshots 2>/dev/null; then
echo "Bucket fetchml-snapshots already exists, skipping init"
exit 0
fi
mc mb -p local/fetchml-snapshots || true
mkdir -p /tmp/snapshots/snap-1
echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
mc alias set local http://minio:9000 minioadmin minioadmin123 || exit 1
mc mb -p local/fetchml-snapshots 2>/dev/null || echo "Bucket exists"
echo "MinIO initialized"
restart: "no"
worker:
build:
@ -133,11 +107,12 @@ services:
ports:
- "8888:8888"
volumes:
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/logs
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/active:/data/active
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/experiments:/data/experiments
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
- ./configs/workers/docker-dev.yaml:/app/configs/worker.yaml
- ${DATA_DIR:-./data/smoke}/logs:/logs
- ${DATA_DIR:-./data/smoke}/active:/data/active
- ${DATA_DIR:-./data/smoke}/experiments:/data/experiments
- ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated
- ${DATA_DIR:-./data/smoke}/configs/worker/docker-dev.yaml:/app/configs/worker.yaml:ro
- ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro
- /sys/fs/cgroup:/sys/fs/cgroup:rw
depends_on:
redis:
@ -158,71 +133,6 @@ services:
# Native libs enabled via build tag: -tags native_libs
privileged: true
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
# # Prometheus - Metrics collection
# prometheus:
# image: prom/prometheus:latest
# container_name: ml-experiments-prometheus
# ports:
# - "9090:9090"
# volumes:
# - ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
# - prometheus_data:/prometheus
# command:
# - '--config.file=/etc/prometheus/prometheus.yml'
# - '--storage.tsdb.path=/prometheus'
# - '--web.console.libraries=/etc/prometheus/console_libraries'
# - '--web.console.templates=/etc/prometheus/consoles'
# - '--web.enable-lifecycle'
# restart: unless-stopped
#
# # Grafana - Visualization
# grafana:
# image: grafana/grafana:latest
# container_name: ml-experiments-grafana
# ports:
# - "3000:3000"
# volumes:
# - grafana_data:/var/lib/grafana
# - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
# - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
# environment:
# - GF_SECURITY_ADMIN_PASSWORD=admin123
# - GF_USERS_ALLOW_SIGN_UP=false
# restart: unless-stopped
# depends_on:
# - prometheus
# - loki
#
# # Loki - Log aggregation
# loki:
# image: grafana/loki:latest
# container_name: ml-experiments-loki
# ports:
# - "3100:3100"
# volumes:
# - ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
# - loki_data:/loki
# command: -config.file=/etc/loki/local-config.yaml
# restart: unless-stopped
# Promtail - Log collector
promtail:
image: grafana/promtail:latest
container_name: ml-experiments-promtail
volumes:
- ${SMOKE_TEST_DATA_DIR:-./monitoring}/promtail-config.yml:/etc/promtail/config.yml
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/var/log/app
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock
command: -config.file=/etc/promtail/config.yml
restart: unless-stopped
# depends_on:
# - loki
volumes:
redis_data:
driver: local
prometheus_data:
driver: local
grafana_data:
driver: local
loki_data:
driver: local

View file

@ -14,8 +14,8 @@ services:
- ${HOMELAB_DATA_DIR:-./data/homelab}/experiments:/data/experiments
- ${HOMELAB_DATA_DIR:-./data/homelab}/active:/data/active
- ${HOMELAB_DATA_DIR:-./data/homelab}/logs:/logs
- ./ssl:/app/ssl:ro
- ./configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
- ${HOMELAB_DATA_DIR:-./data/homelab}/ssl:/app/ssl:ro
- ${HOMELAB_DATA_DIR:-./data/homelab}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
- ${FETCHML_REPO_ROOT:-..}/.env.secure:/app/.env.secure:ro
depends_on:
redis:
@ -32,7 +32,6 @@ services:
retries: 3
start_period: 40s
labels:
logging: "promtail"
job: "api-server"
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
networks:
@ -52,28 +51,27 @@ services:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
command: ["server", "/data", "--console-address", ":9001"]
healthcheck:
test: ["CMD", "curl", "-fsS", "http://localhost:9000/minio/health/live"]
interval: 5s
timeout: 5s
retries: 5
restart: unless-stopped
networks:
- ml-backend-network
minio-init:
image: alpine:3.19
image: minio/mc:latest
container_name: ml-experiments-minio-init
depends_on:
- minio
minio:
condition: service_healthy
entrypoint: ["/bin/sh", "-c"]
command:
- |
apk add --no-cache ca-certificates curl >/dev/null
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
# Skip if bucket already exists
if mc ls local/fetchml-snapshots 2>/dev/null; then
echo "Bucket fetchml-snapshots already exists, skipping init"
exit 0
fi
mc mb -p local/fetchml-snapshots || true
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123} || exit 1
mc mb -p local/fetchml-snapshots 2>/dev/null || echo "Bucket exists"
echo "MinIO initialized"
restart: "no"
networks:
- ml-backend-network
@ -87,14 +85,14 @@ services:
- ${HOMELAB_DATA_DIR:-./data/homelab}/experiments:/app/data/experiments
- ${HOMELAB_DATA_DIR:-./data/homelab}/active:/data/active
- ${HOMELAB_DATA_DIR:-./data/homelab}/logs:/logs
- ./configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
- ${HOMELAB_DATA_DIR:-./data/homelab}/configs/worker/homelab-secure.yaml:/app/configs/worker.yaml:ro
depends_on:
redis:
condition: service_healthy
api-server:
condition: service_healthy
minio-init:
condition: service_started
condition: service_completed_successfully
restart: unless-stopped
environment:
- LOG_LEVEL=info
@ -115,7 +113,7 @@ services:
- "443:443"
volumes:
- ./deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
- ./ssl:/etc/caddy/ssl:ro
- ${HOMELAB_DATA_DIR:-./data/homelab}/ssl:/etc/caddy/ssl:ro
- ${HOMELAB_DATA_DIR:-./data/homelab}/caddy/data:/data
- ${HOMELAB_DATA_DIR:-./data/homelab}/caddy/config:/config
environment:
@ -135,7 +133,7 @@ services:
- "127.0.0.1:6379:6379" # Bind to localhost only
volumes:
- ${HOMELAB_DATA_DIR:-./data/homelab}/redis:/data
- ./redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
- ${HOMELAB_DATA_DIR:-./data/homelab}/configs/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
restart: unless-stopped
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
healthcheck:

View file

@ -7,11 +7,11 @@ services:
ports:
- "9101:9101"
volumes:
- ${LOCAL_DATA_DIR:-../data/dev}/logs:/logs
- ${LOCAL_DATA_DIR:-../data/dev}/experiments:/data/experiments
- ${LOCAL_DATA_DIR:-../data/dev}/active:/data/active
- ${LOCAL_DATA_DIR:-../data/dev}/workspaces:/data/active/workspaces:delegated
- ../configs/api/dev.yaml:/app/configs/api/dev.yaml
- ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs
- ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments
- ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active
- ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
- ${LOCAL_DATA_DIR:-./data/dev}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
environment:
- LOG_LEVEL=info
depends_on:
@ -30,11 +30,12 @@ services:
ports:
- "8888:8888"
volumes:
- ${LOCAL_DATA_DIR:-../data/dev}/logs:/logs
- ${LOCAL_DATA_DIR:-../data/dev}/active:/data/active
- ${LOCAL_DATA_DIR:-../data/dev}/experiments:/data/experiments
- ${LOCAL_DATA_DIR:-../data/dev}/workspaces:/data/active/workspaces:delegated
- ../configs/workers/docker-dev.yaml:/app/configs/worker.yaml
- ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs
- ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active
- ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments
- ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
- ${LOCAL_DATA_DIR:-./data/dev}/snapshots:/data/snapshots
- ${LOCAL_DATA_DIR:-./data/dev}/configs/worker/docker-dev.yaml:/app/configs/worker.yaml:ro
- /sys/fs/cgroup:/sys/fs/cgroup:rw
environment:
- LOG_LEVEL=info

View file

@ -45,7 +45,7 @@ services:
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/experiments:/data/experiments
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/active:/data/active
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/logs:/logs
- ./configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
environment:
- LOG_LEVEL=info
@ -67,7 +67,7 @@ services:
- PASSWORD_ACCESS=false
volumes:
- ./deployments/test_keys:/tmp:ro
- ${FETCHML_REPO_ROOT:-..}/bin/tui-linux:/usr/local/bin/tui:ro
- ./bin/tui:/usr/local/bin/tui:ro
- ./deployments/tui-test-config.toml:/config/.ml/config.toml:ro
ports:
- "2222:2222"

View file

@ -28,7 +28,7 @@ services:
- ${PROD_DATA_DIR:-./data/prod}/experiments:/app/data/experiments
- ${PROD_DATA_DIR:-./data/prod}/active:/data/active
- ${PROD_DATA_DIR:-./data/prod}/logs:/logs
- ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml
- ${PROD_DATA_DIR:-./data/prod}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro
depends_on:
redis:
condition: service_healthy
@ -62,7 +62,7 @@ services:
- ${PROD_DATA_DIR:-./data/prod}/experiments:/app/data/experiments
- ${PROD_DATA_DIR:-./data/prod}/active:/data/active
- ${PROD_DATA_DIR:-./data/prod}/logs:/logs
- ./configs/workers/docker-prod.yaml:/app/configs/worker.yaml
- ${PROD_DATA_DIR:-./data/prod}/configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro
depends_on:
redis:
condition: service_healthy