diff --git a/build/docker/api-server.Dockerfile b/build/docker/api-server.Dockerfile deleted file mode 100644 index 6933c52..0000000 --- a/build/docker/api-server.Dockerfile +++ /dev/null @@ -1,75 +0,0 @@ -# Multi-stage build for ML Experiment Manager -FROM golang:1.25-alpine AS go-builder - -# Install dependencies -RUN apk add --no-cache git make podman redis - -# Set working directory -WORKDIR /app - -# Copy go mod files -COPY go.mod go.sum ./ - -# Download dependencies -RUN go mod download - -# Copy source code -COPY . . - -# Build Go binaries -RUN make build - -# Zig CLI stage -FROM alpine:3.19 AS zig-builder - -# Install dependencies -RUN apk add --no-cache curl xz - -# Install Zig -RUN curl -L https://ziglang.org/download/0.15.2/zig-linux-aarch64-0.15.2.tar.xz | tar -xJ -C /opt -ENV PATH="/opt/zig-linux-aarch64-0.15.2:${PATH}" - -# Copy CLI source -COPY cli/ /app/cli/ - -# Build Zig CLI -WORKDIR /app/cli -RUN zig build cross - -# Final stage -FROM alpine:3.19 - -# Install runtime dependencies -RUN apk add --no-cache ca-certificates rsync openssh-client redis - -# Create app user -RUN addgroup -g 1001 -S appgroup && \ - adduser -u 1001 -S appuser -G appgroup - -# Set working directory -WORKDIR /app - -# Copy binaries from builders -COPY --from=go-builder /app/bin/ /usr/local/bin/ -COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/ - -# Copy configs -COPY --from=go-builder /app/configs/ /app/configs/ - -# Create directories -RUN mkdir -p /data/experiments /data/datasets /data/snapshots /home/appuser/.ml && \ - mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \ - chown -R appuser:appgroup /data /app /home/appuser - -# Switch to app user -USER appuser - -# Expose ports -EXPOSE 9101 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD wget --no-verbose --tries=1 --no-check-certificate --spider https://localhost:9101/health || exit 1 - -# Default command -CMD 
["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"] diff --git a/build/docker/full-prod.Dockerfile b/build/docker/full-prod.Dockerfile deleted file mode 100644 index 3968d0b..0000000 --- a/build/docker/full-prod.Dockerfile +++ /dev/null @@ -1,76 +0,0 @@ -# Full Production Dockerfile with Podman and SSH -FROM golang:1.25-alpine AS builder - -# Install dependencies -RUN apk add --no-cache git make - -# Set working directory -WORKDIR /app - -# Copy go mod files -COPY go.mod go.sum ./ - -# Download dependencies -RUN go mod download - -# Copy source code -COPY . . - -# Build Go binaries -RUN go build -o bin/api-server cmd/api-server/main.go && \ - go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go - -# Final stage with Podman -FROM alpine:3.19 - -# Install runtime dependencies including Podman and SSH -RUN apk add --no-cache ca-certificates redis openssl curl podman openssh - -# Create app user -RUN addgroup -g 1001 -S appgroup && \ - adduser -u 1001 -S appuser -G appgroup - -# Set working directory -WORKDIR /app - -# Copy binaries from builder -COPY --from=builder /app/bin/ /usr/local/bin/ - -# Copy configs -COPY --from=builder /app/configs/ /app/configs/ - -# Create necessary directories -RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs && \ - mkdir -p /data/active/datasets /data/active/snapshots && \ - mkdir -p /logs && \ - chown -R appuser:appgroup /app /data /logs - -# Generate SSL certificates -RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \ - -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \ - chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem - -# Generate SSH keys for container communication -RUN ssh-keygen -t rsa -b 2048 -f /app/ssh/id_rsa -N "" && \ - cp /app/ssh/id_rsa.pub /app/ssh/authorized_keys && \ - chmod 600 /app/ssh/id_rsa && \ - chmod 644 
/app/ssh/id_rsa.pub /app/ssh/authorized_keys - -# Configure SSH daemon -RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \ - echo "PasswordAuthentication no" >> /etc/ssh/sshd_config && \ - echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config && \ - echo "AuthorizedKeysFile /app/ssh/authorized_keys" >> /etc/ssh/sshd_config - -# Switch to app user -USER appuser - -# Expose ports -EXPOSE 9101 22 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ - CMD curl -k -f https://localhost:9101/health || exit 1 - -# Default command for API server -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"] diff --git a/build/docker/homelab-secure.Dockerfile b/build/docker/homelab-secure.Dockerfile deleted file mode 100644 index fdda694..0000000 --- a/build/docker/homelab-secure.Dockerfile +++ /dev/null @@ -1,149 +0,0 @@ -# Homelab Secure Production Dockerfile -FROM golang:1.25-alpine AS builder - -# Install dependencies -RUN apk add --no-cache git make - -# Set working directory -WORKDIR /app - -# Copy go mod files -COPY go.mod go.sum ./ - -# Download dependencies -RUN go mod download - -# Copy source code -COPY . . - -# Build Go binaries -RUN go build -o bin/api-server cmd/api-server/main.go && \ - go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go - -# Final stage with security hardening -FROM alpine:3.19 - -# Install security packages and runtime dependencies -RUN apk add --no-cache \ - ca-certificates \ - redis \ - openssl \ - curl \ - podman \ - openssh \ - sudo \ - fail2ban \ - logrotate \ - && rm -rf /var/cache/apk/* - -# Create app user and worker user with no shell by default -RUN addgroup -g 1001 -S appgroup && \ - adduser -u 1001 -S appuser -G appgroup -s /sbin/nologin && \ - addgroup -g 1002 -S workergroup && \ - adduser -u 1002 -S worker -G workergroup -s /bin/sh && \ - echo "worker:HomelabWorker2024!" 
| chpasswd && \ - mkdir -p /home/worker/.ssh && \ - chown -R worker:workergroup /home/worker - -# Set working directory -WORKDIR /app - -# Copy binaries from builder -COPY --from=builder /app/bin/ /usr/local/bin/ - -# Copy configs -COPY --from=builder /app/configs/ /app/configs/ - -# Create necessary directories with proper permissions -RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \ - mkdir -p /data/active/datasets /data/active/snapshots && \ - mkdir -p /logs && \ - chown -R appuser:appgroup /app /data /logs && \ - chmod 750 /app/data/experiments /app/logs - -# Generate SSL certificates with stronger crypto -RUN openssl req -x509 -newkey rsa:4096 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \ - -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \ - chmod 600 /app/ssl/key.pem && \ - chmod 644 /app/ssl/cert.pem - -# Generate SSH keys with stronger crypto -RUN ssh-keygen -t rsa -b 4096 -f /home/worker/.ssh/id_rsa -N "" && \ - cp /home/worker/.ssh/id_rsa.pub /home/worker/.ssh/authorized_keys && \ - chmod 700 /home/worker/.ssh && \ - chmod 600 /home/worker/.ssh/id_rsa && \ - chmod 644 /home/worker/.ssh/id_rsa.pub /home/worker/.ssh/authorized_keys && \ - chown -R worker:workergroup /home/worker/.ssh - -# Configure SSH with security hardening -RUN echo "Port 2222" >> /etc/ssh/sshd_config && \ - echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \ - echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ - echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config && \ - echo "AuthorizedKeysFile %h/.ssh/authorized_keys" >> /etc/ssh/sshd_config && \ - echo "AllowUsers worker" >> /etc/ssh/sshd_config && \ - echo "MaxAuthTries 3" >> /etc/ssh/sshd_config && \ - echo "ClientAliveInterval 300" >> /etc/ssh/sshd_config && \ - echo "ClientAliveCountMax 2" >> /etc/ssh/sshd_config && \ - echo "X11Forwarding no" >> /etc/ssh/sshd_config && \ - echo "AllowTcpForwarding no" >> 
/etc/ssh/sshd_config && \ - echo "Banner /etc/ssh/banner" >> /etc/ssh/sshd_config && \ - echo "Protocol 2" >> /etc/ssh/sshd_config && \ - echo "Ciphers chacha20-poly1305@openssh.com,aes256-gcm@openssh.com,aes128-gcm@openssh.com" >> /etc/ssh/sshd_config && \ - echo "MACs hmac-sha2-256-etm@openssh.com,hmac-sha2-512-etm@openssh.com,hmac-sha2-256,hmac-sha2-512" >> /etc/ssh/sshd_config && \ - echo "KexAlgorithms curve25519-sha256@libssh.org,diffie-hellman-group16-sha512" >> /etc/ssh/sshd_config - -# Create SSH banner -RUN echo "=================================================" > /etc/ssh/banner && \ - echo " ML Experiments Homelab Server" >> /etc/ssh/banner && \ - echo " Unauthorized access is prohibited" >> /etc/ssh/banner && \ - echo " All connections are monitored and logged" >> /etc/ssh/banner && \ - echo "=================================================" >> /etc/ssh/banner - -# Generate SSH host keys -RUN ssh-keygen -A - -# Configure fail2ban for SSH protection -RUN echo "[DEFAULT]" > /etc/fail2ban/jail.local && \ - echo "bantime = 3600" >> /etc/fail2ban/jail.local && \ - echo "findtime = 600" >> /etc/fail2ban/jail.local && \ - echo "maxretry = 3" >> /etc/fail2ban/jail.local && \ - echo "" >> /etc/fail2ban/jail.local && \ - echo "[sshd]" >> /etc/fail2ban/jail.local && \ - echo "enabled = true" >> /etc/fail2ban/jail.local && \ - echo "port = 2222" >> /etc/fail2ban/jail.local && \ - echo "filter = sshd" >> /etc/fail2ban/jail.local && \ - echo "logpath = /var/log/messages" >> /etc/fail2ban/jail.local - -# Configure sudo with restricted access -RUN echo "appuser ALL=(ALL) NOPASSWD: /app/start-security.sh" >> /etc/sudoers && \ - echo "appuser ALL=(ALL) NOPASSWD: /usr/sbin/sshd" >> /etc/sudoers && \ - echo "appuser ALL=(ALL) NOPASSWD: /usr/bin/ssh-keygen" >> /etc/sudoers && \ - echo "worker ALL=(ALL) NOPASSWD: /usr/bin/podman" >> /etc/sudoers && \ - echo "Defaults:appuser !requiretty" >> /etc/sudoers && \ - echo "Defaults:worker !requiretty" >> /etc/sudoers && \ - echo 
"Defaults:appuser !lecture" >> /etc/sudoers && \ - echo "Defaults:worker !lecture" >> /etc/sudoers - -# Security hardening - remove setuid binaries except sudo -RUN find / -perm /4000 -type f -not -path "/usr/bin/sudo" -exec chmod 755 {} \; 2>/dev/null || true - -# Create startup script for security services -RUN echo "#!/bin/sh" > /app/start-security.sh && \ - echo "ssh-keygen -A" >> /app/start-security.sh && \ - echo "/usr/sbin/sshd -D -p 2222" >> /app/start-security.sh && \ - echo "# End of security services" >> /app/start-security.sh && \ - chmod 755 /app/start-security.sh - -# Switch to app user for application -USER appuser - -# Expose ports -EXPOSE 9101 2222 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ - CMD curl -k -f https://localhost:9101/health || exit 1 - -# Default command for API server -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"] diff --git a/build/docker/secure-prod.Dockerfile b/build/docker/secure-prod.Dockerfile index 7b0ce3c..f739b1b 100644 --- a/build/docker/secure-prod.Dockerfile +++ b/build/docker/secure-prod.Dockerfile @@ -16,9 +16,9 @@ RUN go mod download # Copy source code COPY . . -# Build Go binaries with CGO enabled for SQLite -RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \ - CGO_ENABLED=1 go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go +# Build Go binaries (native libs not used in Docker since NVML unavailable in Alpine) +RUN CGO_ENABLED=1 go build -o bin/api-server ./cmd/api-server/main.go && \ + CGO_ENABLED=1 go build -o bin/worker ./cmd/worker # Final stage with Podman and secure SSH FROM alpine:3.19 diff --git a/build/docker/simple.Dockerfile b/build/docker/simple.Dockerfile index e6fa196..7d5060e 100644 --- a/build/docker/simple.Dockerfile +++ b/build/docker/simple.Dockerfile @@ -18,12 +18,13 @@ COPY . . 
# Copy and build native C++ libraries (without NVML for non-GPU systems) COPY native/ ./native/ +ENV FETCHML_DOCKER_BUILD=1 RUN rm -rf native/build && cd native && mkdir -p build && cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release -DFETCHML_DOCKER_BUILD=1 -DBUILD_NVML_GPU=OFF && \ + cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_NVML_GPU=OFF && \ make -j$(nproc) # Build Go binaries (native libs not used in Docker since NVML unavailable in Alpine) -RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \ +RUN CGO_ENABLED=1 go build -o bin/api-server ./cmd/api-server/main.go && \ CGO_ENABLED=1 go build -o bin/worker ./cmd/worker # Final stage diff --git a/build/docker/test.Dockerfile b/build/docker/test.Dockerfile deleted file mode 100644 index 41b2639..0000000 --- a/build/docker/test.Dockerfile +++ /dev/null @@ -1,62 +0,0 @@ -# Test Dockerfile - Go components only -FROM golang:1.25-alpine AS builder - -# Install dependencies -RUN apk add --no-cache git gcc musl-dev - -# Set working directory -WORKDIR /app - -# Copy go mod files -COPY go.mod go.sum ./ - -# Download dependencies -RUN go mod download - -# Copy source code -COPY . . 
-
-# Build only Go binaries (skip Zig)
-RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
-    go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go && \
-    go build -o bin/tui ./cmd/tui
-
-# Final stage
-FROM alpine:3.19
-
-# Install runtime dependencies
-RUN apk add --no-cache ca-certificates curl openssl
-
-# Create app user
-RUN addgroup -g 1001 -S appgroup && \
-    adduser -u 1001 -S appuser -G appgroup
-
-# Set working directory
-WORKDIR /app
-
-# Copy binaries from builder
-COPY --from=builder /app/bin/ /usr/local/bin/
-
-# Copy configs
-COPY --from=builder /app/configs/ /app/configs/
-
-# Create necessary directories
-RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
-    mkdir -p /data/experiments /data/datasets /data/snapshots
-
-# Generate SSL certificates for container use
-RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-    -subj "/C=US/ST=Test/L=Local/O=FetchML/OU=Tests/CN=localhost" && \
-    chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
-
-# Ensure app user can write to data/logs and read TLS material
-RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs /data
-
-# Switch to app user
-USER appuser
-
-# Expose ports
-EXPOSE 9101
-
-# Default command
-CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]
diff --git a/cmd/tui/internal/config/cli_config.go b/cmd/tui/internal/config/cli_config.go
index fe12ff5..4ca0ec9 100644
--- a/cmd/tui/internal/config/cli_config.go
+++ b/cmd/tui/internal/config/cli_config.go
@@ -14,22 +14,20 @@ import (

 // CLIConfig represents the TOML config structure used by the CLI
 type CLIConfig struct {
-	WorkerHost string `toml:"worker_host"`
-	WorkerUser string `toml:"worker_user"`
-	WorkerBase string `toml:"worker_base"`
-	WorkerPort int    `toml:"worker_port"`
-	APIKey     string `toml:"api_key"`
-
-	// User context (filled after authentication)
 	CurrentUser *UserContext `toml:"-"`
+	WorkerHost  string       `toml:"worker_host"`
+	WorkerUser  string       `toml:"worker_user"`
+	WorkerBase  string       `toml:"worker_base"`
+	APIKey      string       `toml:"api_key"`
+	WorkerPort  int          `toml:"worker_port"`
 }

 // UserContext represents the authenticated user information
 type UserContext struct {
-	Name        string          `json:"name"`
-	Admin       bool            `json:"admin"`
-	Roles       []string        `json:"roles"`
 	Permissions map[string]bool `json:"permissions"`
+	Name        string          `json:"name"`
+	Roles       []string        `json:"roles"`
+	Admin       bool            `json:"admin"`
 }

 // LoadCLIConfig loads the CLI's TOML configuration from the provided path.
diff --git a/cmd/tui/internal/config/config.go b/cmd/tui/internal/config/config.go
index ab885b3..11b274d 100644
--- a/cmd/tui/internal/config/config.go
+++ b/cmd/tui/internal/config/config.go
@@ -12,39 +12,31 @@ import (

 // Config holds TUI configuration
 type Config struct {
-	Host          string `toml:"host"`
-	User          string `toml:"user"`
-	SSHKey        string `toml:"ssh_key"`
-	Port          int    `toml:"port"`
-	BasePath      string `toml:"base_path"`
-	Mode          string `toml:"mode"` // "dev" or "prod"
-	WrapperScript string `toml:"wrapper_script"`
-	TrainScript   string `toml:"train_script"`
-	RedisAddr     string `toml:"redis_addr"`
-	RedisPassword string `toml:"redis_password"`
-	RedisDB       int    `toml:"redis_db"`
-	KnownHosts    string `toml:"known_hosts"`
-	ServerURL     string `toml:"server_url"` // WebSocket server URL (e.g., ws://localhost:8080)
-
-	// Local mode configuration
-	DBPath      string `toml:"db_path"`      // Path to SQLite database (local mode)
-	ForceLocal  bool   `toml:"force_local"`  // Force local-only mode
-	ProjectRoot string `toml:"project_root"` // Project root for local mode
-
-	// Experiment configuration
 	Experiment struct {
 		Name       string `toml:"name"`
 		Entrypoint string `toml:"entrypoint"`
 	} `toml:"experiment"`
-
-	// Authentication
-	Auth auth.Config `toml:"auth"`
-
-	// Podman settings
-	PodmanImage        string   `toml:"podman_image"`
-	ContainerWorkspace string   `toml:"container_workspace"`
-	ContainerResults   string   `toml:"container_results"`
-	GPUDevices         []string `toml:"gpu_devices"`
+	ProjectRoot        string      `toml:"project_root"` // Project root for local mode
+	ServerURL          string      `toml:"server_url"`   // WebSocket server URL (e.g., ws://localhost:8080)
+	ContainerResults   string      `toml:"container_results"`
+	BasePath           string      `toml:"base_path"`
+	Mode               string      `toml:"mode"` // "dev" or "prod"
+	WrapperScript      string      `toml:"wrapper_script"`
+	TrainScript        string      `toml:"train_script"`
+	RedisAddr          string      `toml:"redis_addr"`
+	RedisPassword      string      `toml:"redis_password"`
+	ContainerWorkspace string      `toml:"container_workspace"`
+	SSHKey             string      `toml:"ssh_key"`
+	DBPath             string      `toml:"db_path"` // Path to SQLite database (local mode)
+	KnownHosts         string      `toml:"known_hosts"`
+	PodmanImage        string      `toml:"podman_image"`
+	Host               string      `toml:"host"`
+	User               string      `toml:"user"`
+	Auth               auth.Config `toml:"auth"`
+	GPUDevices         []string    `toml:"gpu_devices"`
+	RedisDB            int         `toml:"redis_db"`
+	Port               int         `toml:"port"`
+	ForceLocal         bool        `toml:"force_local"` // Force local-only mode
 }

 // LoadConfig loads configuration from a TOML file
diff --git a/cmd/tui/internal/model/jobs.go b/cmd/tui/internal/model/jobs.go
index e7e563c..7f08094 100644
--- a/cmd/tui/internal/model/jobs.go
+++ b/cmd/tui/internal/model/jobs.go
@@ -21,21 +21,19 @@ const (

 // Job represents a job in the TUI
 type Job struct {
-	Name            string
-	Status          JobStatus
-	TaskID          string
-	Priority        int64
-	// Narrative fields for research context
+	OutcomeStatus   string // validated, invalidated, inconclusive
+	Status          JobStatus
+	TaskID          string
 	Hypothesis      string
 	Context         string
 	Intent          string
 	ExpectedOutcome string
 	ActualOutcome   string
-	OutcomeStatus   string // validated, invalidated, inconclusive
-	// GPU allocation tracking
-	GPUDeviceID     int   // -1 if not assigned
-	GPUUtilization  int   // 0-100%
-	GPUMemoryUsed   int64 // MB
+	Name            string
+	Priority        int64
+	GPUDeviceID     int   // -1 if not assigned
+	GPUUtilization  int   // 0-100%
+	GPUMemoryUsed   int64 // MB
 }

 // Title returns the job title for display
diff --git a/cmd/tui/internal/model/state.go b/cmd/tui/internal/model/state.go
index e7000d3..09b1eb7 100644
--- a/cmd/tui/internal/model/state.go
+++ b/cmd/tui/internal/model/state.go
@@ -48,50 +48,50 @@ const (

 // DatasetInfo represents dataset information in the TUI
 type DatasetInfo struct {
-	Name       string    `json:"name"`
-	SizeBytes  int64     `json:"size_bytes"`
-	Location   string    `json:"location"`
 	LastAccess time.Time `json:"last_access"`
+	Name       string    `json:"name"`
+	Location   string    `json:"location"`
+	SizeBytes  int64     `json:"size_bytes"`
 }

 // State holds the application state
 type State struct {
-	Jobs                  []Job
+	JobList               list.Model
+	LastRefresh           time.Time
+	LastGPUUpdate         time.Time
+	LastFrameTime         time.Time
+	JobStats              map[JobStatus]int
+	Status                string
+	APIKey                string
+	ErrorMsg              string
+	Keys                  KeyMap
 	QueuedTasks           []*Task
 	Datasets              []DatasetInfo
-	JobList               list.Model
+	Jobs                  []Job
+	Input                 textinput.Model
+	APIKeyInput           textinput.Model
 	GpuView               viewport.Model
-	ContainerView         viewport.Model
 	QueueView             viewport.Model
+	LogsView              viewport.Model
+	ConfigView            viewport.Model
+	ExperimentHistoryView viewport.Model
+	TeamView              viewport.Model
 	SettingsView          viewport.Model
 	DatasetView           viewport.Model
 	ExperimentsView       viewport.Model
 	NarrativeView         viewport.Model
-	TeamView              viewport.Model
-	ExperimentHistoryView viewport.Model
-	ConfigView            viewport.Model
-	LogsView              viewport.Model
-	SelectedJob           Job
-	Input                 textinput.Model
-	APIKeyInput           textinput.Model
-	Status                string
-	ErrorMsg              string
-	InputMode             bool
-	Width                 int
-	Height                int
-	ShowHelp              bool
+	ContainerView         viewport.Model
 	Spinner               spinner.Model
+	SelectedJob           Job
 	ActiveView            ViewMode
-	LastRefresh           time.Time
-	LastFrameTime         time.Time
-	RefreshRate           float64 // measured in ms
+	RefreshRate           float64 // measured in ms
 	FrameCount            int
-	LastGPUUpdate         time.Time
-	IsLoading             bool
-	JobStats              map[JobStatus]int
-	APIKey                string
+	Height                int
+	Width                 int
 	SettingsIndex         int
-	Keys                  KeyMap
+	ShowHelp              bool
+	IsLoading             bool
+	InputMode             bool
 }

 // InitialState creates the initial application state
diff --git a/cmd/tui/internal/store/store.go b/cmd/tui/internal/store/store.go
index e8d11ef..7f33cdd 100644
--- a/cmd/tui/internal/store/store.go
+++ b/cmd/tui/internal/store/store.go
@@ -18,13 +18,13 @@ type Store struct {

 // RunInfo represents a local run from SQLite
 type RunInfo struct {
+	EndTime      *string
+	PID          *int64
 	RunID        string
 	ExperimentID string
 	Name         string
 	Status       string
 	StartTime    string
-	EndTime      *string
-	PID          *int64
 	Synced       bool
 }

diff --git a/deployments/Makefile b/deployments/Makefile
index f684d96..522f177 100644
--- a/deployments/Makefile
+++ b/deployments/Makefile
@@ -1,74 +1,200 @@
 # Docker Compose Deployment Management

-.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean
+.PHONY: help dev-up dev-down dev-logs dev-restart staging-up staging-down staging-logs staging-restart staging-status homelab-secure-up homelab-secure-down prod-up prod-down prod-logs prod-restart prod-status status clean security-mode-dev security-mode-standard security-mode-hipaa rollback-staging rollback-prod check-audit-sink health-check security-scan up down

 # Default target
 help: ## Show this help message
 	@echo "Available commands:"
-	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-25s\033[0m %s\n", $$1, $$2}'

 # Development environment
 dev-up: ## Start development environment
 	@echo "Starting development environment..."
-	docker-compose -f deployments/docker-compose.dev.yml up -d
+	docker-compose -f docker-compose.dev.yml up -d
 	@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"

 dev-down: ## Stop development environment
 	@echo "Stopping development environment..."
-	docker-compose -f deployments/docker-compose.dev.yml down
+	docker-compose -f docker-compose.dev.yml down

 dev-logs: ## Show development logs
-	docker-compose -f deployments/docker-compose.dev.yml logs -f
+	docker-compose -f docker-compose.dev.yml logs -f

 dev-restart: ## Restart development environment
 	@echo "Restarting development environment..."
-	docker-compose -f deployments/docker-compose.dev.yml restart
+	docker-compose -f docker-compose.dev.yml restart
+
+# Staging environment
+staging-up: ## Start staging environment
+	@echo "Starting staging environment..."
+	@if [ ! -f .env.staging ]; then \
+		echo "Creating staging environment file..."; \
+		echo "DATA_DIR=./data/staging" > .env.staging; \
+		echo "LOG_LEVEL=info" >> .env.staging; \
+		echo "COMPLIANCE_MODE=standard" >> .env.staging; \
+	fi
+	docker-compose -f docker-compose.staging.yml up -d
+	@echo "Staging services: Caddy (9080/9443), Redis (6380), API (9102), MinIO (9002/9003)"
+
+staging-down: ## Stop staging environment
+	@echo "Stopping staging environment..."
+	docker-compose -f docker-compose.staging.yml down
+
+staging-logs: ## Show staging logs
+	docker-compose -f docker-compose.staging.yml logs -f
+
+staging-restart: ## Restart staging environment
+	@echo "Restarting staging environment..."
+	docker-compose -f docker-compose.staging.yml restart
+
+staging-status: ## Show staging status
+	docker-compose -f docker-compose.staging.yml ps

 # Homelab environment
 homelab-secure-up: ## Start secure homelab environment
 	@echo "Starting secure homelab environment..."
-	docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
+	docker-compose -f docker-compose.homelab-secure.yml up -d

 homelab-secure-down: ## Stop secure homelab environment
 	@echo "Stopping secure homelab environment..."
-	docker-compose -f deployments/docker-compose.homelab-secure.yml down
+	docker-compose -f docker-compose.homelab-secure.yml down

 # Production environment
 prod-up: ## Start production environment
 	@echo "Starting production environment..."
-	docker-compose -f deployments/docker-compose.prod.yml up -d
+	@echo "⚠ WARNING: This is production! Ensure you have proper backups."
+	@printf "Continue? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
+	docker-compose -f docker-compose.prod.yml up -d

 prod-down: ## Stop production environment
 	@echo "Stopping production environment..."
-	docker-compose -f deployments/docker-compose.prod.yml down
+	docker-compose -f docker-compose.prod.yml down
+
+prod-logs: ## Show production logs
+	docker-compose -f docker-compose.prod.yml logs -f
+
+prod-restart: ## Restart production environment
+	@echo "Restarting production environment..."
+	@printf "Restart production? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
+	docker-compose -f docker-compose.prod.yml restart
+
+prod-status: ## Show production status
+	docker-compose -f docker-compose.prod.yml ps

 # Utility commands
 status: ## Show status of all environments
 	@echo "=== Development Status ==="
-	@if [ -f deployments/docker-compose.dev.yml ]; then \
-		docker-compose -f deployments/docker-compose.dev.yml ps; \
+	@if [ -f docker-compose.dev.yml ]; then \
+		docker-compose -f docker-compose.dev.yml ps 2>/dev/null || echo "Not running"; \
+	fi
+	@echo ""
+	@echo "=== Staging Status ==="
+	@if [ -f docker-compose.staging.yml ]; then \
+		docker-compose -f docker-compose.staging.yml ps 2>/dev/null || echo "Not running"; \
 	fi
 	@echo ""
 	@echo "=== Homelab Secure Status ==="
-	@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
-		docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
+	@if [ -f docker-compose.homelab-secure.yml ]; then \
+		docker-compose -f docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
 	fi
 	@echo ""
 	@echo "=== Production Status ==="
-	@if [ -f deployments/docker-compose.prod.yml ]; then \
-		docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
+	@if [ -f docker-compose.prod.yml ]; then \
+		docker-compose -f docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
 	fi

 clean: ## Clean up all containers and volumes
 	@echo "Cleaning up all Docker resources..."
 	@echo "This will remove all containers and volumes. Continue? [y/N]"
 	@read -r confirm && [ "$$confirm" = "y" ] || exit 1
-	docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
-	docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
-	docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
+	docker-compose -f docker-compose.dev.yml down -v 2>/dev/null || true
+	docker-compose -f docker-compose.staging.yml down -v 2>/dev/null || true
+	docker-compose -f docker-compose.homelab-secure.yml down -v 2>/dev/null || true
+	docker-compose -f docker-compose.prod.yml down -v 2>/dev/null || true
 	docker system prune -f
 	@echo "Cleanup complete."

+# Security mode targets
+security-mode-dev: ## Run worker in dev security mode
+	@echo "Running with dev security mode (relaxed validation)..."
+	COMPLIANCE_MODE=dev docker-compose -f docker-compose.dev.yml up -d worker
+
+security-mode-standard: ## Run worker in standard security mode
+	@echo "Running with standard security mode..."
+	COMPLIANCE_MODE=standard docker-compose -f docker-compose.dev.yml up -d worker
+
+security-mode-hipaa: ## Run worker in HIPAA security mode
+	@echo "Running with HIPAA security mode (strict compliance)..."
+	@echo "✓ Network mode: none"
+	@echo "✓ Seccomp profile: default-hardened"
+	@echo "✓ No new privileges: enforced"
+	@echo "✓ Audit sink: required"
+	@printf "Confirm HIPAA mode deployment? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
+	COMPLIANCE_MODE=hipaa docker-compose -f docker-compose.dev.yml up -d worker
+
+# Rollback targets (services are always restarted after `down`, even when no deployment log exists)
+rollback-staging: ## Rollback staging deployment
+	@echo "Rolling back staging deployment..."
+	@echo "⚠ This rolls back the image only - queue state and audit log are NOT rolled back"
+	@printf "Continue with rollback? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
+	docker-compose -f docker-compose.staging.yml down
+	@if [ -f .staging-deployment.log ]; then \
+		PREVIOUS_TAG=$$(tail -2 .staging-deployment.log | head -1 | grep -o 'tag=[^ ]*' | cut -d'=' -f2); \
+		echo "Previous tag: $${PREVIOUS_TAG:-latest}"; \
+	fi
+	docker-compose -f docker-compose.staging.yml up -d
+	@echo "$$(date -Iseconds) | rollback | staging | actor=$$(whoami)" >> .staging-audit.log
+
+rollback-prod: ## Rollback production deployment
+	@echo "Rolling back production deployment..."
+	@echo "⚠ CRITICAL: This rolls back the image only"
+	@echo "⚠ Queue state is NOT rolled back"
+	@echo "⚠ Audit log chain is NOT rolled back (must never break chain)"
+	@echo "⚠ New artifacts remain in storage"
+	@printf "CONFIRM PRODUCTION ROLLBACK? [yes/N] " && read -r confirm && [ "$$confirm" = "yes" ] || exit 1
+	docker-compose -f docker-compose.prod.yml down
+	@if [ -f .prod-audit.log ]; then \
+		PREVIOUS_SHA=$$(tail -2 .prod-audit.log | head -1 | grep -o 'sha-[a-f0-9]*' || echo "previous"); \
+		echo "Rolling back to: $$PREVIOUS_SHA"; \
+	fi
+	docker-compose -f docker-compose.prod.yml up -d
+	@echo "$$(date -Iseconds) | rollback | prod | actor=$$(whoami)" >> .prod-audit.log
+	@echo "Rollback complete. Verify health: make prod-status"
+
+check-audit-sink: ## Check audit sink reachability
+	@echo "Checking audit sink..."
+	@if [ -f ../scripts/check-audit-sink.sh ]; then \
+		../scripts/check-audit-sink.sh --env staging; \
+	else \
+		echo "Audit sink check script not found"; \
+	fi
+
+health-check: ## Run health checks on all environments
+	@echo "=== Health Checks ==="
+	@echo "Development (localhost:9101):"
+	@curl -fsS http://localhost:9101/health 2>/dev/null && echo "✓ Healthy" || echo "✗ Not responding"
+	@echo ""
+	@echo "Staging (localhost:9102):"
+	@curl -fsS http://localhost:9102/health 2>/dev/null && echo "✓ Healthy" || echo "✗ Not responding"
+	@echo ""
+	@echo "Production (localhost:9101):"
+	@curl -fsS http://localhost:9101/health 2>/dev/null && echo "✓ Healthy" || echo "✗ Not responding"
+
+security-scan: ## Run security scanners locally
+	@echo "Running security scanners..."
+	@if command -v gosec >/dev/null 2>&1; then \
+		echo "Running gosec..."; \
+		cd .. && gosec ./... 2>/dev/null || echo "gosec found issues"; \
+	else \
+		echo "gosec not installed - skipping"; \
+	fi
+	@if command -v nancy >/dev/null 2>&1; then \
+		echo "Running nancy..."; \
+		cd .. && go list -json -deps ./... 2>/dev/null | nancy sleuth 2>/dev/null || echo "nancy found issues"; \
+	else \
+		echo "nancy not installed - skipping"; \
+	fi
+
 # Quick aliases
 up: dev-up ## Alias for dev-up
 down: dev-down ## Alias for dev-down
diff --git a/deployments/deploy.sh b/deployments/deploy.sh
index 2539325..b0cc8b2 100755
--- a/deployments/deploy.sh
+++ b/deployments/deploy.sh
@@ -37,6 +37,7 @@ show_usage() {
     echo ""
     echo "Environments:"
     echo " dev Development environment"
+    echo " staging Staging environment (pre-production)"
     echo " secure Secure homelab environment"
     echo " prod Production environment"
     echo ""
@@ -46,11 +47,17 @@ show_usage() {
     echo " restart Restart services"
     echo " logs Show logs"
     echo " status Show status"
+    echo " rollback Rollback to previous deployment (image only)"
+    echo " health-check Check service health and compliance mode"
+    echo " check-audit-sink Verify audit sink reachability"
     echo ""
     echo "Examples:"
-    echo " $0 dev up # Start development environment"
-    echo " $0 prod down # Stop production environment"
-    echo " $0 secure logs # Show secure environment logs"
+    echo " $0 dev up # Start development environment"
+    echo " $0 staging up # Start staging environment"
+    echo " $0 prod down # Stop production environment"
+    echo " $0 staging rollback # Rollback staging deployment"
+    echo " $0 prod health-check # Check production health"
+    echo " $0 prod check-audit-sink # Verify audit sink before deploy"
 }

 # Function to check if docker-compose file exists
@@ -62,6 +69,9 @@ check_compose_file() {
         "dev")
             compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.dev.yml"
             ;;
+        "staging")
+            compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.staging.yml"
+            ;;
         "secure")
             compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.homelab-secure.yml"
             ;;
@@ -154,6 +164,71 @@ main() {
             print_status "Status of $environment environment:"
             docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" ps
             ;;
+        "rollback")
+            print_warning "Rolling back $environment environment..."
+            print_warning "⚠ This rolls back the image only - queue state and audit log are NOT rolled back"
+
+            if [ "$environment" = "prod" ]; then
+                print_warning "⚠ CRITICAL: Production rollback"
+                print_warning "⚠ Queue state is NOT rolled back"
+                print_warning "⚠ Audit log chain is NOT rolled back (must never break chain)"
+                read -r -p "CONFIRM PRODUCTION ROLLBACK? [yes/N] " confirm
+                if [ "$confirm" != "yes" ]; then
+                    print_error "Rollback cancelled"
+                    exit 1
+                fi
+            fi
+
+            # Get previous deployment info
+            LOG_FILE="${FETCHML_REPO_ROOT}/deployments/.${environment}-audit.log"
+            if [ -f "$LOG_FILE" ]; then
+                PREVIOUS_SHA=$(tail -2 "$LOG_FILE" | head -1 | grep -o 'sha-[a-f0-9]*' || echo "")
+                if [ -n "$PREVIOUS_SHA" ]; then
+                    print_status "Rolling back to: $PREVIOUS_SHA"
+                fi
+            fi
+
+            docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" down
+            docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" up -d
+
+            # Write rollback entry to audit log; stay best-effort, but surface a failed write instead of hiding it
+            echo "$(date -Iseconds) | rollback | $environment | actor=$(whoami)" >> "$LOG_FILE" || print_warning "Could not record rollback in audit log: $LOG_FILE"
+
+            print_success "$environment rollback complete!"
+            print_status "Verify health with: $0 $environment health-check"
+            ;;
+        "health-check"|"health")
+            print_status "Health check for $environment environment..."
+ + # Determine port based on environment + case $environment in + dev) PORT=9101 ;; + staging) PORT=9102 ;; + prod) PORT=9101 ;; + *) PORT=9101 ;; + esac + + # Check API health + if curl -fsS "http://localhost:${PORT}/health" > /dev/null 2>&1; then + print_success "API is healthy (port $PORT)" + + # Check compliance_mode + COMPLIANCE_MODE=$(curl -fsS "http://localhost:${PORT}/health" 2>/dev/null | grep -o '"compliance_mode":"[^"]*"' | cut -d'"' -f4 || echo "unknown") + print_status "Compliance mode: $COMPLIANCE_MODE" + else + print_error "API health check failed (port $PORT)" + exit 1 + fi + ;; + "check-audit-sink") + print_status "Checking audit sink for $environment..." + + if [ -f "${FETCHML_REPO_ROOT}/scripts/check-audit-sink.sh" ]; then + "${FETCHML_REPO_ROOT}/scripts/check-audit-sink.sh" --env "$environment" + else + print_warning "Audit sink check script not found" + fi + ;; *) print_error "Unknown action: $action" show_usage diff --git a/deployments/docker-compose.dev.yml b/deployments/docker-compose.dev.yml index 70b82d3..d8095da 100644 --- a/deployments/docker-compose.dev.yml +++ b/deployments/docker-compose.dev.yml @@ -1,6 +1,6 @@ --- -# Homelab Docker Compose with Centralized Monitoring -# Includes: API, Redis, Prometheus, Grafana, Loki +# Development Docker Compose +# Includes: API, Redis, MinIO, Worker, Caddy services: caddy: image: caddy:2-alpine @@ -11,8 +11,8 @@ services: - "8443:443" volumes: - ./deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/caddy/data:/data - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/caddy/config:/config + - ${DATA_DIR:-./data/smoke}/caddy/data:/data + - ${DATA_DIR:-./data/smoke}/caddy/config:/config depends_on: api-server: condition: service_healthy @@ -42,12 +42,12 @@ services: expose: - "9101" # API and health endpoints (internal; external access via Caddy) volumes: - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/logs - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/experiments:/data/experiments - - 
${SMOKE_TEST_DATA_DIR:-./data/dev}/active:/data/active - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated - - ./configs/api/dev.yaml:/app/configs/api/dev.yaml - - ./ssl:/app/ssl + - ${DATA_DIR:-./data/smoke}/logs:/logs + - ${DATA_DIR:-./data/smoke}/experiments:/data/experiments + - ${DATA_DIR:-./data/smoke}/active:/data/active + - ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated + - ${DATA_DIR:-./data/smoke}/configs:/app/configs:ro + - ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro depends_on: - redis restart: unless-stopped @@ -62,67 +62,41 @@ services: retries: 3 start_period: 40s labels: - logging: "promtail" job: "api-server" + # MinIO for local development (single-node filesystem backend) minio: image: minio/minio:latest - container_name: ml-experiments-minio + container_name: ml-dev-minio ports: - "9000:9000" - "9001:9001" volumes: - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/minio:/data + - ${DATA_DIR:-./data/smoke}/minio:/data environment: - MINIO_ROOT_USER=minioadmin - MINIO_ROOT_PASSWORD=minioadmin123 + - MINIO_BROWSER=on command: ["server", "/data", "--console-address", ":9001"] healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] - interval: 10s + test: ["CMD", "curl", "-fsS", "http://localhost:9000/minio/health/live"] + interval: 5s timeout: 5s - retries: 10 + retries: 5 restart: unless-stopped + + # Initialize minio bucket (runs once) minio-init: - image: alpine:3.19 - container_name: ml-experiments-minio-init + image: minio/mc:latest + container_name: ml-dev-minio-init depends_on: minio: condition: service_healthy entrypoint: ["/bin/sh", "-c"] command: - | - set -eu - apk add --no-cache ca-certificates curl tar gzip - ARCH=$$(uname -m) - MC_ARCH=amd64 - if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then - MC_ARCH=arm64 - fi - curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc" - chmod +x /usr/local/bin/mc - i=0 - while ! 
mc alias set local http://minio:9000 minioadmin minioadmin123; do - i=$$((i+1)) - if [ $$i -ge 30 ]; then - echo "minio not ready after 30 attempts" >&2 - exit 1 - fi - echo "waiting for minio... ($$i/30)" - sleep 1 - done - # Skip if bucket already exists - if mc ls local/fetchml-snapshots 2>/dev/null; then - echo "Bucket fetchml-snapshots already exists, skipping init" - exit 0 - fi - mc mb -p local/fetchml-snapshots || true - mkdir -p /tmp/snapshots/snap-1 - echo -n "hello" > /tmp/snapshots/snap-1/hello.txt - tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz . - mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz - FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1) - SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1) - echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA" + mc alias set local http://minio:9000 minioadmin minioadmin123 || exit 1 + mc mb -p local/fetchml-snapshots 2>/dev/null || echo "Bucket exists" + echo "MinIO initialized" restart: "no" worker: build: @@ -133,11 +107,12 @@ services: ports: - "8888:8888" volumes: - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/logs - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/active:/data/active - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/experiments:/data/experiments - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated - - ./configs/workers/docker-dev.yaml:/app/configs/worker.yaml + - ${DATA_DIR:-./data/smoke}/logs:/logs + - ${DATA_DIR:-./data/smoke}/active:/data/active + - ${DATA_DIR:-./data/smoke}/experiments:/data/experiments + - ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated + - ${DATA_DIR:-./data/smoke}/configs/worker/docker-dev.yaml:/app/configs/worker.yaml:ro + - ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro - /sys/fs/cgroup:/sys/fs/cgroup:rw depends_on: redis: @@ -158,71 +133,6 @@ services: # Native libs enabled via build tag: -tags native_libs privileged: true command: ["/usr/local/bin/worker", "-config", 
"/app/configs/worker.yaml"] - # # Prometheus - Metrics collection - # prometheus: - # image: prom/prometheus:latest - # container_name: ml-experiments-prometheus - # ports: - # - "9090:9090" - # volumes: - # - ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - # - prometheus_data:/prometheus - # command: - # - '--config.file=/etc/prometheus/prometheus.yml' - # - '--storage.tsdb.path=/prometheus' - # - '--web.console.libraries=/etc/prometheus/console_libraries' - # - '--web.console.templates=/etc/prometheus/consoles' - # - '--web.enable-lifecycle' - # restart: unless-stopped - # - # # Grafana - Visualization - # grafana: - # image: grafana/grafana:latest - # container_name: ml-experiments-grafana - # ports: - # - "3000:3000" - # volumes: - # - grafana_data:/var/lib/grafana - # - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning - # - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards - # environment: - # - GF_SECURITY_ADMIN_PASSWORD=admin123 - # - GF_USERS_ALLOW_SIGN_UP=false - # restart: unless-stopped - # depends_on: - # - prometheus - # - loki - # - # # Loki - Log aggregation - # loki: - # image: grafana/loki:latest - # container_name: ml-experiments-loki - # ports: - # - "3100:3100" - # volumes: - # - ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml - # - loki_data:/loki - # command: -config.file=/etc/loki/local-config.yaml - # restart: unless-stopped - # Promtail - Log collector - promtail: - image: grafana/promtail:latest - container_name: ml-experiments-promtail - volumes: - - ${SMOKE_TEST_DATA_DIR:-./monitoring}/promtail-config.yml:/etc/promtail/config.yml - - ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/var/log/app - - /var/lib/docker/containers:/var/lib/docker/containers:ro - - /var/run/docker.sock:/var/run/docker.sock - command: -config.file=/etc/promtail/config.yml - restart: unless-stopped - # depends_on: - # - loki 
volumes: redis_data: driver: local - prometheus_data: - driver: local - grafana_data: - driver: local - loki_data: - driver: local diff --git a/deployments/docker-compose.homelab-secure.yml b/deployments/docker-compose.homelab-secure.yml index ab72f8b..fb05eed 100644 --- a/deployments/docker-compose.homelab-secure.yml +++ b/deployments/docker-compose.homelab-secure.yml @@ -14,8 +14,8 @@ services: - ${HOMELAB_DATA_DIR:-./data/homelab}/experiments:/data/experiments - ${HOMELAB_DATA_DIR:-./data/homelab}/active:/data/active - ${HOMELAB_DATA_DIR:-./data/homelab}/logs:/logs - - ./ssl:/app/ssl:ro - - ./configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro + - ${HOMELAB_DATA_DIR:-./data/homelab}/ssl:/app/ssl:ro + - ${HOMELAB_DATA_DIR:-./data/homelab}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro - ${FETCHML_REPO_ROOT:-..}/.env.secure:/app/.env.secure:ro depends_on: redis: @@ -32,7 +32,6 @@ services: retries: 3 start_period: 40s labels: - logging: "promtail" job: "api-server" command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"] networks: @@ -52,28 +51,27 @@ services: - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123} command: ["server", "/data", "--console-address", ":9001"] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:9000/minio/health/live"] + interval: 5s + timeout: 5s + retries: 5 restart: unless-stopped networks: - ml-backend-network minio-init: - image: alpine:3.19 + image: minio/mc:latest container_name: ml-experiments-minio-init depends_on: - - minio + minio: + condition: service_healthy entrypoint: ["/bin/sh", "-c"] command: - | - apk add --no-cache ca-certificates curl >/dev/null - curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc - chmod +x /usr/local/bin/mc - mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} 
${MINIO_ROOT_PASSWORD:-minioadmin123} - # Skip if bucket already exists - if mc ls local/fetchml-snapshots 2>/dev/null; then - echo "Bucket fetchml-snapshots already exists, skipping init" - exit 0 - fi - mc mb -p local/fetchml-snapshots || true + mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123} || exit 1 + mc mb -p local/fetchml-snapshots 2>/dev/null || echo "Bucket exists" + echo "MinIO initialized" restart: "no" networks: - ml-backend-network @@ -87,14 +85,14 @@ services: - ${HOMELAB_DATA_DIR:-./data/homelab}/experiments:/app/data/experiments - ${HOMELAB_DATA_DIR:-./data/homelab}/active:/data/active - ${HOMELAB_DATA_DIR:-./data/homelab}/logs:/logs - - ./configs/workers/homelab-secure.yaml:/app/configs/worker.yaml + - ${HOMELAB_DATA_DIR:-./data/homelab}/configs/worker/homelab-secure.yaml:/app/configs/worker.yaml:ro depends_on: redis: condition: service_healthy api-server: condition: service_healthy minio-init: - condition: service_started + condition: service_completed_successfully restart: unless-stopped environment: - LOG_LEVEL=info @@ -115,7 +113,7 @@ services: - "443:443" volumes: - ./deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro - - ./ssl:/etc/caddy/ssl:ro + - ${HOMELAB_DATA_DIR:-./data/homelab}/ssl:/etc/caddy/ssl:ro - ${HOMELAB_DATA_DIR:-./data/homelab}/caddy/data:/data - ${HOMELAB_DATA_DIR:-./data/homelab}/caddy/config:/config environment: @@ -135,7 +133,7 @@ services: - "127.0.0.1:6379:6379" # Bind to localhost only volumes: - ${HOMELAB_DATA_DIR:-./data/homelab}/redis:/data - - ./redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro + - ${HOMELAB_DATA_DIR:-./data/homelab}/configs/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro restart: unless-stopped command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD} healthcheck: diff --git a/deployments/docker-compose.local.yml b/deployments/docker-compose.local.yml index f6b4495..20d8756 100644 --- 
a/deployments/docker-compose.local.yml +++ b/deployments/docker-compose.local.yml @@ -7,11 +7,11 @@ services: ports: - "9101:9101" volumes: - - ${LOCAL_DATA_DIR:-../data/dev}/logs:/logs - - ${LOCAL_DATA_DIR:-../data/dev}/experiments:/data/experiments - - ${LOCAL_DATA_DIR:-../data/dev}/active:/data/active - - ${LOCAL_DATA_DIR:-../data/dev}/workspaces:/data/active/workspaces:delegated - - ../configs/api/dev.yaml:/app/configs/api/dev.yaml + - ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs + - ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments + - ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active + - ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated + - ${LOCAL_DATA_DIR:-./data/dev}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro environment: - LOG_LEVEL=info depends_on: @@ -30,11 +30,12 @@ services: ports: - "8888:8888" volumes: - - ${LOCAL_DATA_DIR:-../data/dev}/logs:/logs - - ${LOCAL_DATA_DIR:-../data/dev}/active:/data/active - - ${LOCAL_DATA_DIR:-../data/dev}/experiments:/data/experiments - - ${LOCAL_DATA_DIR:-../data/dev}/workspaces:/data/active/workspaces:delegated - - ../configs/workers/docker-dev.yaml:/app/configs/worker.yaml + - ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs + - ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active + - ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments + - ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated + - ${LOCAL_DATA_DIR:-./data/dev}/snapshots:/data/snapshots + - ${LOCAL_DATA_DIR:-./data/dev}/configs/worker/docker-dev.yaml:/app/configs/worker.yaml:ro - /sys/fs/cgroup:/sys/fs/cgroup:rw environment: - LOG_LEVEL=info diff --git a/deployments/docker-compose.prod.smoke.yml b/deployments/docker-compose.prod.smoke.yml index db7a96c..b038309 100644 --- a/deployments/docker-compose.prod.smoke.yml +++ b/deployments/docker-compose.prod.smoke.yml @@ -45,7 +45,7 @@ services: - ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/experiments:/data/experiments - 
${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/active:/data/active - ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/logs:/logs - - ./configs/api/dev.yaml:/app/configs/api/dev.yaml:ro + - ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"] environment: - LOG_LEVEL=info @@ -67,7 +67,7 @@ services: - PASSWORD_ACCESS=false volumes: - ./deployments/test_keys:/tmp:ro - - ${FETCHML_REPO_ROOT:-..}/bin/tui-linux:/usr/local/bin/tui:ro + - ./bin/tui:/usr/local/bin/tui:ro - ./deployments/tui-test-config.toml:/config/.ml/config.toml:ro ports: - "2222:2222" diff --git a/deployments/docker-compose.prod.yml b/deployments/docker-compose.prod.yml index 7461766..f8e2212 100644 --- a/deployments/docker-compose.prod.yml +++ b/deployments/docker-compose.prod.yml @@ -28,7 +28,7 @@ services: - ${PROD_DATA_DIR:-./data/prod}/experiments:/app/data/experiments - ${PROD_DATA_DIR:-./data/prod}/active:/data/active - ${PROD_DATA_DIR:-./data/prod}/logs:/logs - - ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml + - ${PROD_DATA_DIR:-./data/prod}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro depends_on: redis: condition: service_healthy @@ -62,7 +62,7 @@ services: - ${PROD_DATA_DIR:-./data/prod}/experiments:/app/data/experiments - ${PROD_DATA_DIR:-./data/prod}/active:/data/active - ${PROD_DATA_DIR:-./data/prod}/logs:/logs - - ./configs/workers/docker-prod.yaml:/app/configs/worker.yaml + - ${PROD_DATA_DIR:-./data/prod}/configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro depends_on: redis: condition: service_healthy