chore(deploy): update deployment configs and TUI for scheduler
Update deployment and CLI tooling: - TUI models (jobs, state) with scheduler data - TUI store with scheduler endpoints - TUI config with scheduler settings - Deployment Makefile with scheduler targets - Deploy script with scheduler registration - Docker Compose files with scheduler services - Remove obsolete Dockerfiles (api-server, full-prod, test) - Update remaining Dockerfiles with scheduler integration
This commit is contained in:
parent
4cdb68907e
commit
c459285cab
18 changed files with 357 additions and 620 deletions
|
|
@ -1,75 +0,0 @@
|
|||
# Multi-stage build for ML Experiment Manager
|
||||
FROM golang:1.25-alpine AS go-builder
|
||||
|
||||
# Install dependencies
|
||||
RUN apk add --no-cache git make podman redis
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy go mod files
|
||||
COPY go.mod go.sum ./
|
||||
|
||||
# Download dependencies
|
||||
RUN go mod download
|
||||
|
||||
# Copy source code
|
||||
COPY . .
|
||||
|
||||
# Build Go binaries
|
||||
RUN make build
|
||||
|
||||
# Zig CLI stage
|
||||
FROM alpine:3.19 AS zig-builder
|
||||
|
||||
# Install dependencies
|
||||
RUN apk add --no-cache curl xz
|
||||
|
||||
# Install Zig
|
||||
RUN curl -L https://ziglang.org/download/0.15.2/zig-linux-aarch64-0.15.2.tar.xz | tar -xJ -C /opt
|
||||
ENV PATH="/opt/zig-linux-aarch64-0.15.2:${PATH}"
|
||||
|
||||
# Copy CLI source
|
||||
COPY cli/ /app/cli/
|
||||
|
||||
# Build Zig CLI
|
||||
WORKDIR /app/cli
|
||||
RUN zig build cross
|
||||
|
||||
# Final stage
|
||||
FROM alpine:3.19
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apk add --no-cache ca-certificates rsync openssh-client redis
|
||||
|
||||
# Create app user
|
||||
RUN addgroup -g 1001 -S appgroup && \
|
||||
adduser -u 1001 -S appuser -G appgroup
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy binaries from builders
|
||||
COPY --from=go-builder /app/bin/ /usr/local/bin/
|
||||
COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/
|
||||
|
||||
# Copy configs
|
||||
COPY --from=go-builder /app/configs/ /app/configs/
|
||||
|
||||
# Create directories
|
||||
RUN mkdir -p /data/experiments /data/datasets /data/snapshots /home/appuser/.ml && \
|
||||
mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
|
||||
chown -R appuser:appgroup /data /app /home/appuser
|
||||
|
||||
# Switch to app user
|
||||
USER appuser
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 9101
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD wget --no-verbose --tries=1 --no-check-certificate --spider https://localhost:9101/health || exit 1
|
||||
|
||||
# Default command
|
||||
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]
|
||||
|
|
@ -1,76 +0,0 @@
|
|||
# Full Production Dockerfile with Podman and SSH
|
||||
FROM golang:1.25-alpine AS builder
|
||||
|
||||
# Install dependencies
|
||||
RUN apk add --no-cache git make
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy go mod files
|
||||
COPY go.mod go.sum ./
|
||||
|
||||
# Download dependencies
|
||||
RUN go mod download
|
||||
|
||||
# Copy source code
|
||||
COPY . .
|
||||
|
||||
# Build Go binaries
|
||||
RUN go build -o bin/api-server cmd/api-server/main.go && \
|
||||
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
|
||||
|
||||
# Final stage with Podman
|
||||
FROM alpine:3.19
|
||||
|
||||
# Install runtime dependencies including Podman and SSH
|
||||
RUN apk add --no-cache ca-certificates redis openssl curl podman openssh
|
||||
|
||||
# Create app user
|
||||
RUN addgroup -g 1001 -S appgroup && \
|
||||
adduser -u 1001 -S appuser -G appgroup
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy binaries from builder
|
||||
COPY --from=builder /app/bin/ /usr/local/bin/
|
||||
|
||||
# Copy configs
|
||||
COPY --from=builder /app/configs/ /app/configs/
|
||||
|
||||
# Create necessary directories
|
||||
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs && \
|
||||
mkdir -p /data/active/datasets /data/active/snapshots && \
|
||||
mkdir -p /logs && \
|
||||
chown -R appuser:appgroup /app /data /logs
|
||||
|
||||
# Generate SSL certificates
|
||||
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
|
||||
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
|
||||
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
|
||||
|
||||
# Generate SSH keys for container communication
|
||||
RUN ssh-keygen -t rsa -b 2048 -f /app/ssh/id_rsa -N "" && \
|
||||
cp /app/ssh/id_rsa.pub /app/ssh/authorized_keys && \
|
||||
chmod 600 /app/ssh/id_rsa && \
|
||||
chmod 644 /app/ssh/id_rsa.pub /app/ssh/authorized_keys
|
||||
|
||||
# Configure SSH daemon
|
||||
RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
|
||||
echo "PasswordAuthentication no" >> /etc/ssh/sshd_config && \
|
||||
echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config && \
|
||||
echo "AuthorizedKeysFile /app/ssh/authorized_keys" >> /etc/ssh/sshd_config
|
||||
|
||||
# Switch to app user
|
||||
USER appuser
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 9101 22
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
||||
CMD curl -k -f https://localhost:9101/health || exit 1
|
||||
|
||||
# Default command for API server
|
||||
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]
|
||||
|
|
@ -1,149 +0,0 @@
|
|||
# Homelab Secure Production Dockerfile
|
||||
FROM golang:1.25-alpine AS builder
|
||||
|
||||
# Install dependencies
|
||||
RUN apk add --no-cache git make
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy go mod files
|
||||
COPY go.mod go.sum ./
|
||||
|
||||
# Download dependencies
|
||||
RUN go mod download
|
||||
|
||||
# Copy source code
|
||||
COPY . .
|
||||
|
||||
# Build Go binaries
|
||||
RUN go build -o bin/api-server cmd/api-server/main.go && \
|
||||
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
|
||||
|
||||
# Final stage with security hardening
|
||||
FROM alpine:3.19
|
||||
|
||||
# Install security packages and runtime dependencies
|
||||
RUN apk add --no-cache \
|
||||
ca-certificates \
|
||||
redis \
|
||||
openssl \
|
||||
curl \
|
||||
podman \
|
||||
openssh \
|
||||
sudo \
|
||||
fail2ban \
|
||||
logrotate \
|
||||
&& rm -rf /var/cache/apk/*
|
||||
|
||||
# Create app user (no login shell: /sbin/nologin) and worker user (login shell: /bin/sh)
|
||||
RUN addgroup -g 1001 -S appgroup && \
|
||||
adduser -u 1001 -S appuser -G appgroup -s /sbin/nologin && \
|
||||
addgroup -g 1002 -S workergroup && \
|
||||
adduser -u 1002 -S worker -G workergroup -s /bin/sh && \
|
||||
echo "worker:HomelabWorker2024!" | chpasswd && \
|
||||
mkdir -p /home/worker/.ssh && \
|
||||
chown -R worker:workergroup /home/worker
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy binaries from builder
|
||||
COPY --from=builder /app/bin/ /usr/local/bin/
|
||||
|
||||
# Copy configs
|
||||
COPY --from=builder /app/configs/ /app/configs/
|
||||
|
||||
# Create necessary directories with proper permissions
|
||||
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \
|
||||
mkdir -p /data/active/datasets /data/active/snapshots && \
|
||||
mkdir -p /logs && \
|
||||
chown -R appuser:appgroup /app /data /logs && \
|
||||
chmod 750 /app/data/experiments /app/logs
|
||||
|
||||
# Generate SSL certificates with stronger crypto
|
||||
RUN openssl req -x509 -newkey rsa:4096 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
|
||||
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
|
||||
chmod 600 /app/ssl/key.pem && \
|
||||
chmod 644 /app/ssl/cert.pem
|
||||
|
||||
# Generate SSH keys with stronger crypto
|
||||
RUN ssh-keygen -t rsa -b 4096 -f /home/worker/.ssh/id_rsa -N "" && \
|
||||
cp /home/worker/.ssh/id_rsa.pub /home/worker/.ssh/authorized_keys && \
|
||||
chmod 700 /home/worker/.ssh && \
|
||||
chmod 600 /home/worker/.ssh/id_rsa && \
|
||||
chmod 644 /home/worker/.ssh/id_rsa.pub /home/worker/.ssh/authorized_keys && \
|
||||
chown -R worker:workergroup /home/worker/.ssh
|
||||
|
||||
# Configure SSH with security hardening
|
||||
RUN echo "Port 2222" >> /etc/ssh/sshd_config && \
|
||||
echo "PermitRootLogin no" >> /etc/ssh/sshd_config && \
|
||||
echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \
|
||||
echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config && \
|
||||
echo "AuthorizedKeysFile %h/.ssh/authorized_keys" >> /etc/ssh/sshd_config && \
|
||||
echo "AllowUsers worker" >> /etc/ssh/sshd_config && \
|
||||
echo "MaxAuthTries 3" >> /etc/ssh/sshd_config && \
|
||||
echo "ClientAliveInterval 300" >> /etc/ssh/sshd_config && \
|
||||
echo "ClientAliveCountMax 2" >> /etc/ssh/sshd_config && \
|
||||
echo "X11Forwarding no" >> /etc/ssh/sshd_config && \
|
||||
echo "AllowTcpForwarding no" >> /etc/ssh/sshd_config && \
|
||||
echo "Banner /etc/ssh/banner" >> /etc/ssh/sshd_config && \
|
||||
echo "Protocol 2" >> /etc/ssh/sshd_config && \
|
||||
echo "Ciphers chacha20-poly1305@openssh.com,aes256-gcm@openssh.com,aes128-gcm@openssh.com" >> /etc/ssh/sshd_config && \
|
||||
echo "MACs hmac-sha2-256-etm@openssh.com,hmac-sha2-512-etm@openssh.com,hmac-sha2-256,hmac-sha2-512" >> /etc/ssh/sshd_config && \
|
||||
echo "KexAlgorithms curve25519-sha256@libssh.org,diffie-hellman-group16-sha512" >> /etc/ssh/sshd_config
|
||||
|
||||
# Create SSH banner
|
||||
RUN echo "=================================================" > /etc/ssh/banner && \
|
||||
echo " ML Experiments Homelab Server" >> /etc/ssh/banner && \
|
||||
echo " Unauthorized access is prohibited" >> /etc/ssh/banner && \
|
||||
echo " All connections are monitored and logged" >> /etc/ssh/banner && \
|
||||
echo "=================================================" >> /etc/ssh/banner
|
||||
|
||||
# Generate SSH host keys
|
||||
RUN ssh-keygen -A
|
||||
|
||||
# Configure fail2ban for SSH protection
|
||||
RUN echo "[DEFAULT]" > /etc/fail2ban/jail.local && \
|
||||
echo "bantime = 3600" >> /etc/fail2ban/jail.local && \
|
||||
echo "findtime = 600" >> /etc/fail2ban/jail.local && \
|
||||
echo "maxretry = 3" >> /etc/fail2ban/jail.local && \
|
||||
echo "" >> /etc/fail2ban/jail.local && \
|
||||
echo "[sshd]" >> /etc/fail2ban/jail.local && \
|
||||
echo "enabled = true" >> /etc/fail2ban/jail.local && \
|
||||
echo "port = 2222" >> /etc/fail2ban/jail.local && \
|
||||
echo "filter = sshd" >> /etc/fail2ban/jail.local && \
|
||||
echo "logpath = /var/log/messages" >> /etc/fail2ban/jail.local
|
||||
|
||||
# Configure sudo with restricted access
|
||||
RUN echo "appuser ALL=(ALL) NOPASSWD: /app/start-security.sh" >> /etc/sudoers && \
|
||||
echo "appuser ALL=(ALL) NOPASSWD: /usr/sbin/sshd" >> /etc/sudoers && \
|
||||
echo "appuser ALL=(ALL) NOPASSWD: /usr/bin/ssh-keygen" >> /etc/sudoers && \
|
||||
echo "worker ALL=(ALL) NOPASSWD: /usr/bin/podman" >> /etc/sudoers && \
|
||||
echo "Defaults:appuser !requiretty" >> /etc/sudoers && \
|
||||
echo "Defaults:worker !requiretty" >> /etc/sudoers && \
|
||||
echo "Defaults:appuser !lecture" >> /etc/sudoers && \
|
||||
echo "Defaults:worker !lecture" >> /etc/sudoers
|
||||
|
||||
# Security hardening - remove setuid binaries except sudo
|
||||
RUN find / -perm /4000 -type f -not -path "/usr/bin/sudo" -exec chmod 755 {} \; 2>/dev/null || true
|
||||
|
||||
# Create startup script for security services
|
||||
RUN echo "#!/bin/sh" > /app/start-security.sh && \
|
||||
echo "ssh-keygen -A" >> /app/start-security.sh && \
|
||||
echo "/usr/sbin/sshd -D -p 2222" >> /app/start-security.sh && \
|
||||
echo "# End of security services" >> /app/start-security.sh && \
|
||||
chmod 755 /app/start-security.sh
|
||||
|
||||
# Switch to app user for application
|
||||
USER appuser
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 9101 2222
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
||||
CMD curl -k -f https://localhost:9101/health || exit 1
|
||||
|
||||
# Default command for API server
|
||||
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]
|
||||
|
|
@ -16,9 +16,9 @@ RUN go mod download
|
|||
# Copy source code
|
||||
COPY . .
|
||||
|
||||
# Build Go binaries with CGO enabled for SQLite
|
||||
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
|
||||
CGO_ENABLED=1 go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
|
||||
# Build Go binaries (native libs not used in Docker since NVML unavailable in Alpine)
|
||||
RUN CGO_ENABLED=1 go build -o bin/api-server ./cmd/api-server/main.go && \
|
||||
CGO_ENABLED=1 go build -o bin/worker ./cmd/worker
|
||||
|
||||
# Final stage with Podman and secure SSH
|
||||
FROM alpine:3.19
|
||||
|
|
|
|||
|
|
@ -18,12 +18,13 @@ COPY . .
|
|||
|
||||
# Copy and build native C++ libraries (without NVML for non-GPU systems)
|
||||
COPY native/ ./native/
|
||||
ENV FETCHML_DOCKER_BUILD=1
|
||||
RUN rm -rf native/build && cd native && mkdir -p build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release -DFETCHML_DOCKER_BUILD=1 -DBUILD_NVML_GPU=OFF && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_NVML_GPU=OFF && \
|
||||
make -j$(nproc)
|
||||
|
||||
# Build Go binaries (native libs not used in Docker since NVML unavailable in Alpine)
|
||||
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
|
||||
RUN CGO_ENABLED=1 go build -o bin/api-server ./cmd/api-server/main.go && \
|
||||
CGO_ENABLED=1 go build -o bin/worker ./cmd/worker
|
||||
|
||||
# Final stage
|
||||
|
|
|
|||
|
|
@ -1,62 +0,0 @@
|
|||
# Test Dockerfile - Go components only
|
||||
FROM golang:1.25-alpine AS builder
|
||||
|
||||
# Install dependencies
|
||||
RUN apk add --no-cache git gcc musl-dev
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy go mod files
|
||||
COPY go.mod go.sum ./
|
||||
|
||||
# Download dependencies
|
||||
RUN go mod download
|
||||
|
||||
# Copy source code
|
||||
COPY . .
|
||||
|
||||
# Build only Go binaries (skip Zig)
|
||||
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
|
||||
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go && \
|
||||
go build -o bin/tui ./cmd/tui
|
||||
|
||||
# Final stage
|
||||
FROM alpine:3.19
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apk add --no-cache ca-certificates curl openssl
|
||||
|
||||
# Create app user
|
||||
RUN addgroup -g 1001 -S appgroup && \
|
||||
adduser -u 1001 -S appuser -G appgroup
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy binaries from builder
|
||||
COPY --from=builder /app/bin/ /usr/local/bin/
|
||||
|
||||
# Copy configs
|
||||
COPY --from=builder /app/configs/ /app/configs/
|
||||
|
||||
# Create necessary directories
|
||||
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
|
||||
mkdir -p /data/experiments /data/datasets /data/snapshots
|
||||
|
||||
# Generate SSL certificates for container use
|
||||
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
|
||||
-subj "/C=US/ST=Test/L=Local/O=FetchML/OU=Tests/CN=localhost" && \
|
||||
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
|
||||
|
||||
# Ensure app user can write to data/logs and read TLS material
|
||||
RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs /data
|
||||
|
||||
# Switch to app user
|
||||
USER appuser
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 9101
|
||||
|
||||
# Default command
|
||||
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]
|
||||
|
|
@ -14,22 +14,20 @@ import (
|
|||
|
||||
// CLIConfig represents the TOML config structure used by the CLI
|
||||
type CLIConfig struct {
|
||||
WorkerHost string `toml:"worker_host"`
|
||||
WorkerUser string `toml:"worker_user"`
|
||||
WorkerBase string `toml:"worker_base"`
|
||||
WorkerPort int `toml:"worker_port"`
|
||||
APIKey string `toml:"api_key"`
|
||||
|
||||
// User context (filled after authentication)
|
||||
CurrentUser *UserContext `toml:"-"`
|
||||
WorkerHost string `toml:"worker_host"`
|
||||
WorkerUser string `toml:"worker_user"`
|
||||
WorkerBase string `toml:"worker_base"`
|
||||
APIKey string `toml:"api_key"`
|
||||
WorkerPort int `toml:"worker_port"`
|
||||
}
|
||||
|
||||
// UserContext represents the authenticated user information
|
||||
type UserContext struct {
|
||||
Name string `json:"name"`
|
||||
Admin bool `json:"admin"`
|
||||
Roles []string `json:"roles"`
|
||||
Permissions map[string]bool `json:"permissions"`
|
||||
Name string `json:"name"`
|
||||
Roles []string `json:"roles"`
|
||||
Admin bool `json:"admin"`
|
||||
}
|
||||
|
||||
// LoadCLIConfig loads the CLI's TOML configuration from the provided path.
|
||||
|
|
|
|||
|
|
@ -12,39 +12,31 @@ import (
|
|||
|
||||
// Config holds TUI configuration
|
||||
type Config struct {
|
||||
Host string `toml:"host"`
|
||||
User string `toml:"user"`
|
||||
SSHKey string `toml:"ssh_key"`
|
||||
Port int `toml:"port"`
|
||||
BasePath string `toml:"base_path"`
|
||||
Mode string `toml:"mode"` // "dev" or "prod"
|
||||
WrapperScript string `toml:"wrapper_script"`
|
||||
TrainScript string `toml:"train_script"`
|
||||
RedisAddr string `toml:"redis_addr"`
|
||||
RedisPassword string `toml:"redis_password"`
|
||||
RedisDB int `toml:"redis_db"`
|
||||
KnownHosts string `toml:"known_hosts"`
|
||||
ServerURL string `toml:"server_url"` // WebSocket server URL (e.g., ws://localhost:8080)
|
||||
|
||||
// Local mode configuration
|
||||
DBPath string `toml:"db_path"` // Path to SQLite database (local mode)
|
||||
ForceLocal bool `toml:"force_local"` // Force local-only mode
|
||||
ProjectRoot string `toml:"project_root"` // Project root for local mode
|
||||
|
||||
// Experiment configuration
|
||||
Experiment struct {
|
||||
Name string `toml:"name"`
|
||||
Entrypoint string `toml:"entrypoint"`
|
||||
} `toml:"experiment"`
|
||||
|
||||
// Authentication
|
||||
Auth auth.Config `toml:"auth"`
|
||||
|
||||
// Podman settings
|
||||
PodmanImage string `toml:"podman_image"`
|
||||
ContainerWorkspace string `toml:"container_workspace"`
|
||||
ContainerResults string `toml:"container_results"`
|
||||
GPUDevices []string `toml:"gpu_devices"`
|
||||
ProjectRoot string `toml:"project_root"`
|
||||
ServerURL string `toml:"server_url"`
|
||||
ContainerResults string `toml:"container_results"`
|
||||
BasePath string `toml:"base_path"`
|
||||
Mode string `toml:"mode"`
|
||||
WrapperScript string `toml:"wrapper_script"`
|
||||
TrainScript string `toml:"train_script"`
|
||||
RedisAddr string `toml:"redis_addr"`
|
||||
RedisPassword string `toml:"redis_password"`
|
||||
ContainerWorkspace string `toml:"container_workspace"`
|
||||
SSHKey string `toml:"ssh_key"`
|
||||
DBPath string `toml:"db_path"`
|
||||
KnownHosts string `toml:"known_hosts"`
|
||||
PodmanImage string `toml:"podman_image"`
|
||||
Host string `toml:"host"`
|
||||
User string `toml:"user"`
|
||||
Auth auth.Config `toml:"auth"`
|
||||
GPUDevices []string `toml:"gpu_devices"`
|
||||
RedisDB int `toml:"redis_db"`
|
||||
Port int `toml:"port"`
|
||||
ForceLocal bool `toml:"force_local"`
|
||||
}
|
||||
|
||||
// LoadConfig loads configuration from a TOML file
|
||||
|
|
|
|||
|
|
@ -21,21 +21,19 @@ const (
|
|||
|
||||
// Job represents a job in the TUI
|
||||
type Job struct {
|
||||
Name string
|
||||
Status JobStatus
|
||||
TaskID string
|
||||
Priority int64
|
||||
// Narrative fields for research context
|
||||
OutcomeStatus string
|
||||
Status JobStatus
|
||||
TaskID string
|
||||
Hypothesis string
|
||||
Context string
|
||||
Intent string
|
||||
ExpectedOutcome string
|
||||
ActualOutcome string
|
||||
OutcomeStatus string // validated, invalidated, inconclusive
|
||||
// GPU allocation tracking
|
||||
GPUDeviceID int // -1 if not assigned
|
||||
GPUUtilization int // 0-100%
|
||||
GPUMemoryUsed int64 // MB
|
||||
Name string
|
||||
Priority int64
|
||||
GPUDeviceID int
|
||||
GPUUtilization int
|
||||
GPUMemoryUsed int64
|
||||
}
|
||||
|
||||
// Title returns the job title for display
|
||||
|
|
|
|||
|
|
@ -48,50 +48,50 @@ const (
|
|||
|
||||
// DatasetInfo represents dataset information in the TUI
|
||||
type DatasetInfo struct {
|
||||
Name string `json:"name"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
Location string `json:"location"`
|
||||
LastAccess time.Time `json:"last_access"`
|
||||
Name string `json:"name"`
|
||||
Location string `json:"location"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
}
|
||||
|
||||
// State holds the application state
|
||||
type State struct {
|
||||
Jobs []Job
|
||||
JobList list.Model
|
||||
LastRefresh time.Time
|
||||
LastGPUUpdate time.Time
|
||||
LastFrameTime time.Time
|
||||
JobStats map[JobStatus]int
|
||||
Status string
|
||||
APIKey string
|
||||
ErrorMsg string
|
||||
Keys KeyMap
|
||||
QueuedTasks []*Task
|
||||
Datasets []DatasetInfo
|
||||
JobList list.Model
|
||||
Jobs []Job
|
||||
Input textinput.Model
|
||||
APIKeyInput textinput.Model
|
||||
GpuView viewport.Model
|
||||
ContainerView viewport.Model
|
||||
QueueView viewport.Model
|
||||
LogsView viewport.Model
|
||||
ConfigView viewport.Model
|
||||
ExperimentHistoryView viewport.Model
|
||||
TeamView viewport.Model
|
||||
SettingsView viewport.Model
|
||||
DatasetView viewport.Model
|
||||
ExperimentsView viewport.Model
|
||||
NarrativeView viewport.Model
|
||||
TeamView viewport.Model
|
||||
ExperimentHistoryView viewport.Model
|
||||
ConfigView viewport.Model
|
||||
LogsView viewport.Model
|
||||
SelectedJob Job
|
||||
Input textinput.Model
|
||||
APIKeyInput textinput.Model
|
||||
Status string
|
||||
ErrorMsg string
|
||||
InputMode bool
|
||||
Width int
|
||||
Height int
|
||||
ShowHelp bool
|
||||
ContainerView viewport.Model
|
||||
Spinner spinner.Model
|
||||
SelectedJob Job
|
||||
ActiveView ViewMode
|
||||
LastRefresh time.Time
|
||||
LastFrameTime time.Time
|
||||
RefreshRate float64 // measured in ms
|
||||
RefreshRate float64
|
||||
FrameCount int
|
||||
LastGPUUpdate time.Time
|
||||
IsLoading bool
|
||||
JobStats map[JobStatus]int
|
||||
APIKey string
|
||||
Height int
|
||||
Width int
|
||||
SettingsIndex int
|
||||
Keys KeyMap
|
||||
ShowHelp bool
|
||||
IsLoading bool
|
||||
InputMode bool
|
||||
}
|
||||
|
||||
// InitialState creates the initial application state
|
||||
|
|
|
|||
|
|
@ -18,13 +18,13 @@ type Store struct {
|
|||
|
||||
// RunInfo represents a local run from SQLite
|
||||
type RunInfo struct {
|
||||
EndTime *string
|
||||
PID *int64
|
||||
RunID string
|
||||
ExperimentID string
|
||||
Name string
|
||||
Status string
|
||||
StartTime string
|
||||
EndTime *string
|
||||
PID *int64
|
||||
Synced bool
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,74 +1,200 @@
|
|||
# Docker Compose Deployment Management
|
||||
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean
|
||||
# Command-style targets in this Makefile; declared phony so a same-named file never masks them.
.PHONY: help dev-up dev-down dev-logs dev-restart staging-up staging-down staging-logs staging-restart staging-status homelab-secure-up homelab-secure-down prod-up prod-down prod-logs prod-restart prod-status status clean rollback rollback-staging rollback-prod security-mode security-mode-dev security-mode-standard security-mode-hipaa check-audit-sink health-check security-scan
|
||||
|
||||
# Default target
|
||||
help: ## Show this help message
|
||||
@echo "Available commands:"
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-25s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
# Development environment
|
||||
dev-up: ## Start development environment
|
||||
@echo "Starting development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||
docker-compose -f docker-compose.dev.yml up -d
|
||||
@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"
|
||||
|
||||
dev-down: ## Stop development environment
|
||||
@echo "Stopping development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml down
|
||||
docker-compose -f docker-compose.dev.yml down
|
||||
|
||||
dev-logs: ## Show development logs
|
||||
docker-compose -f deployments/docker-compose.dev.yml logs -f
|
||||
docker-compose -f docker-compose.dev.yml logs -f
|
||||
|
||||
dev-restart: ## Restart development environment
|
||||
@echo "Restarting development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml restart
|
||||
docker-compose -f docker-compose.dev.yml restart
|
||||
|
||||
# Staging environment
|
||||
staging-up: ## Start staging environment
|
||||
@echo "Starting staging environment..."
|
||||
@if [ ! -f .env.staging ]; then \
|
||||
echo "Creating staging environment file..."; \
|
||||
echo "DATA_DIR=./data/staging" > .env.staging; \
|
||||
echo "LOG_LEVEL=info" >> .env.staging; \
|
||||
echo "COMPLIANCE_MODE=standard" >> .env.staging; \
|
||||
fi
|
||||
docker-compose -f docker-compose.staging.yml up -d
|
||||
@echo "Staging services: Caddy (9080/9443), Redis (6380), API (9102), MinIO (9002/9003)"
|
||||
|
||||
staging-down: ## Stop staging environment
|
||||
@echo "Stopping staging environment..."
|
||||
docker-compose -f docker-compose.staging.yml down
|
||||
|
||||
staging-logs: ## Show staging logs
|
||||
docker-compose -f docker-compose.staging.yml logs -f
|
||||
|
||||
staging-restart: ## Restart staging environment
|
||||
@echo "Restarting staging environment..."
|
||||
docker-compose -f docker-compose.staging.yml restart
|
||||
|
||||
staging-status: ## Show staging status
|
||||
docker-compose -f docker-compose.staging.yml ps
|
||||
|
||||
|
||||
# Homelab environment
|
||||
homelab-secure-up: ## Start secure homelab environment
|
||||
@echo "Starting secure homelab environment..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
||||
docker-compose -f docker-compose.homelab-secure.yml up -d
|
||||
|
||||
homelab-secure-down: ## Stop secure homelab environment
|
||||
@echo "Stopping secure homelab environment..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down
|
||||
docker-compose -f docker-compose.homelab-secure.yml down
|
||||
|
||||
# Production environment
|
||||
prod-up: ## Start production environment
|
||||
@echo "Starting production environment..."
|
||||
docker-compose -f deployments/docker-compose.prod.yml up -d
|
||||
@echo "⚠ WARNING: This is production! Ensure you have proper backups."
|
||||
@printf "Continue? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
docker-compose -f docker-compose.prod.yml up -d
|
||||
|
||||
prod-down: ## Stop production environment
|
||||
@echo "Stopping production environment..."
|
||||
docker-compose -f deployments/docker-compose.prod.yml down
|
||||
docker-compose -f docker-compose.prod.yml down
|
||||
|
||||
prod-logs: ## Show production logs
|
||||
docker-compose -f docker-compose.prod.yml logs -f
|
||||
|
||||
prod-restart: ## Restart production environment
|
||||
@echo "Restarting production environment..."
|
||||
@printf "Restart production? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
docker-compose -f docker-compose.prod.yml restart
|
||||
|
||||
prod-status: ## Show production status
|
||||
docker-compose -f docker-compose.prod.yml ps
|
||||
|
||||
# Utility commands
|
||||
status: ## Show status of all environments
|
||||
@echo "=== Development Status ==="
|
||||
@if [ -f deployments/docker-compose.dev.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.dev.yml ps; \
|
||||
@if [ -f docker-compose.dev.yml ]; then \
|
||||
docker-compose -f docker-compose.dev.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Staging Status ==="
|
||||
@if [ -f docker-compose.staging.yml ]; then \
|
||||
docker-compose -f docker-compose.staging.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Homelab Secure Status ==="
|
||||
@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
|
||||
@if [ -f docker-compose.homelab-secure.yml ]; then \
|
||||
docker-compose -f docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Production Status ==="
|
||||
@if [ -f deployments/docker-compose.prod.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
|
||||
@if [ -f docker-compose.prod.yml ]; then \
|
||||
docker-compose -f docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
|
||||
clean: ## Clean up all containers and volumes
|
||||
@echo "Cleaning up all Docker resources..."
|
||||
@echo "This will remove all containers and volumes. Continue? [y/N]"
|
||||
@read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
|
||||
docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
|
||||
docker-compose -f docker-compose.dev.yml down -v 2>/dev/null || true
|
||||
docker-compose -f docker-compose.staging.yml down -v 2>/dev/null || true
|
||||
docker-compose -f docker-compose.homelab-secure.yml down -v 2>/dev/null || true
|
||||
docker-compose -f docker-compose.prod.yml down -v 2>/dev/null || true
|
||||
docker system prune -f
|
||||
@echo "Cleanup complete."
|
||||
|
||||
# Security mode targets
|
||||
security-mode-dev: ## Run worker in dev security mode
|
||||
@echo "Running with dev security mode (relaxed validation)..."
|
||||
COMPLIANCE_MODE=dev docker-compose -f docker-compose.dev.yml up -d worker
|
||||
|
||||
security-mode-standard: ## Run worker in standard security mode
|
||||
@echo "Running with standard security mode..."
|
||||
COMPLIANCE_MODE=standard docker-compose -f docker-compose.dev.yml up -d worker
|
||||
|
||||
security-mode-hipaa: ## Run worker in HIPAA security mode
|
||||
@echo "Running with HIPAA security mode (strict compliance)..."
|
||||
@echo "✓ Network mode: none"
|
||||
@echo "✓ Seccomp profile: default-hardened"
|
||||
@echo "✓ No new privileges: enforced"
|
||||
@echo "✓ Audit sink: required"
|
||||
@printf "Confirm HIPAA mode deployment? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
COMPLIANCE_MODE=hipaa docker-compose -f docker-compose.dev.yml up -d worker
|
||||
|
||||
# Rollback targets
|
||||
rollback-staging: ## Rollback staging deployment
|
||||
@echo "Rolling back staging deployment..."
|
||||
@echo "⚠ This rolls back the image only - queue state and audit log are NOT rolled back"
|
||||
@printf "Continue with rollback? [y/N] " && read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
docker-compose -f docker-compose.staging.yml down
|
||||
@if [ -f .staging-deployment.log ]; then \
|
||||
PREVIOUS_TAG=$$(tail -2 .staging-deployment.log | head -1 | grep -o 'tag=[^ ]*' | cut -d'=' -f2); PREVIOUS_TAG=$${PREVIOUS_TAG:-latest}; \
|
||||
echo "Previous tag: $$PREVIOUS_TAG"; \
|
||||
docker-compose -f docker-compose.staging.yml up -d; \
|
||||
fi
|
||||
@echo "$$(date -Iseconds) | rollback | staging | actor=$$(whoami)" >> .staging-audit.log
|
||||
|
||||
# Roll back the production stack:
#   1. print the warnings and ask for confirmation (only a literal "yes"
#      continues; anything else, including EOF, aborts with exit status 1),
#   2. stop the compose project,
#   3. report the `sha-<hex>` value found on the second-to-last line of
#      .prod-audit.log (prints "previous" when none is found),
#   4. start the compose project again,
#   5. append a rollback entry to .prod-audit.log.
#
# The stack is always restarted after `down`, even when the audit log is
# missing. Previously a missing log left production stopped while the recipe
# still recorded the rollback and printed "Rollback complete".
#
# NOTE(review): the previous SHA is only printed. Nothing in this recipe hands
# it to docker-compose, so the image that starts is whatever the compose file
# and environment resolve to - confirm how the SHA is meant to be applied.
rollback-prod: ## Rollback production deployment
	@echo "Rolling back production deployment..."
	@echo "⚠ CRITICAL: This rolls back the image only"
	@echo "⚠ Queue state is NOT rolled back"
	@echo "⚠ Audit log chain is NOT rolled back (must never break chain)"
	@echo "⚠ New artifacts remain in storage"
	@# printf + read instead of `read -p`: -p is not part of POSIX sh, which
	@# make uses for recipes unless SHELL is overridden (fails under dash).
	@printf 'CONFIRM PRODUCTION ROLLBACK? [yes/N] '; read -r confirm && [ "$$confirm" = "yes" ] || exit 1
	docker-compose -f docker-compose.prod.yml down
	@if [ -f .prod-audit.log ]; then \
		PREVIOUS_SHA=$$(tail -2 .prod-audit.log | head -1 | grep -o 'sha-[a-f0-9]*' || echo "previous"); \
		echo "Rolling back to: $$PREVIOUS_SHA"; \
	else \
		echo "No .prod-audit.log found - previous image unknown"; \
	fi
	docker-compose -f docker-compose.prod.yml up -d
	@echo "$$(date -Iseconds) | rollback | prod | actor=$$(whoami)" >> .prod-audit.log
	@echo "Rollback complete. Verify health: make prod-status"
|
||||
|
||||
# Run ../scripts/check-audit-sink.sh for the staging environment when that
# script exists. A missing script is reported but does not fail the target;
# a failing script does, because its exit status becomes the recipe's status.
check-audit-sink: ## Check audit sink reachability
	@echo "Checking audit sink..."
	@if [ ! -f ../scripts/check-audit-sink.sh ]; then \
		echo "Audit sink check script not found"; \
	else \
		../scripts/check-audit-sink.sh --env staging; \
	fi
|
||||
|
||||
# Probe the /health endpoint of each environment on localhost and print a
# one-line verdict per environment. curl's stderr is discarded; a successful
# response body is still written to stdout ahead of the verdict line. The
# target never fails: an unreachable endpoint only yields "✗ Not responding".
# NOTE(review): Development and Production both probe port 9101 - confirm
# that this is intentional and not a copy/paste of the dev port.
health-check: ## Run health checks on all environments
	@echo "=== Health Checks ==="
	@echo "Development (localhost:9101):"
	@if curl -fsS http://localhost:9101/health 2>/dev/null; then \
		echo "✓ Healthy"; \
	else \
		echo "✗ Not responding"; \
	fi
	@echo ""
	@echo "Staging (localhost:9102):"
	@if curl -fsS http://localhost:9102/health 2>/dev/null; then \
		echo "✓ Healthy"; \
	else \
		echo "✗ Not responding"; \
	fi
	@echo ""
	@echo "Production (localhost:9101):"
	@if curl -fsS http://localhost:9101/health 2>/dev/null; then \
		echo "✓ Healthy"; \
	else \
		echo "✗ Not responding"; \
	fi
|
||||
|
||||
# Run whichever of gosec and nancy are installed, from the parent directory.
# Both scanners are best-effort: a missing tool is skipped with a message, and
# a non-zero scanner exit (or a failed `cd ..`) only prints "<tool> found
# issues", so this target itself does not fail. Scanner stderr is discarded.
security-scan: ## Run security scanners locally
	@echo "Running security scanners..."
	@if ! command -v gosec >/dev/null 2>&1; then \
		echo "gosec not installed - skipping"; \
	else \
		echo "Running gosec..."; \
		cd .. && gosec ./... 2>/dev/null || echo "gosec found issues"; \
	fi
	@if ! command -v nancy >/dev/null 2>&1; then \
		echo "nancy not installed - skipping"; \
	else \
		echo "Running nancy..."; \
		cd .. && go list -json -deps ./... 2>/dev/null | nancy sleuth 2>/dev/null || echo "nancy found issues"; \
	fi
|
||||
|
||||
# Quick aliases
|
||||
# `up` and `down` have no recipe of their own; each only lists the matching
# dev target as its prerequisite, so running the alias runs that target.
up: dev-up ## Alias for dev-up
|
||||
down: dev-down ## Alias for dev-down
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ show_usage() {
|
|||
echo ""
|
||||
echo "Environments:"
|
||||
echo " dev Development environment"
|
||||
echo " staging Staging environment (pre-production)"
|
||||
echo " secure Secure homelab environment"
|
||||
echo " prod Production environment"
|
||||
echo ""
|
||||
|
|
@ -46,11 +47,17 @@ show_usage() {
|
|||
echo " restart Restart services"
|
||||
echo " logs Show logs"
|
||||
echo " status Show status"
|
||||
echo " rollback Rollback to previous deployment (image only)"
|
||||
echo " health-check Check service health and compliance mode"
|
||||
echo " check-audit-sink Verify audit sink reachability"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 dev up # Start development environment"
|
||||
echo " $0 prod down # Stop production environment"
|
||||
echo " $0 secure logs # Show secure environment logs"
|
||||
echo " $0 dev up # Start development environment"
|
||||
echo " $0 staging up # Start staging environment"
|
||||
echo " $0 prod down # Stop production environment"
|
||||
echo " $0 staging rollback # Rollback staging deployment"
|
||||
echo " $0 prod health-check # Check production health"
|
||||
echo " $0 prod check-audit-sink # Verify audit sink before deploy"
|
||||
}
|
||||
|
||||
# Function to check if docker-compose file exists
|
||||
|
|
@ -62,6 +69,9 @@ check_compose_file() {
|
|||
"dev")
|
||||
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.dev.yml"
|
||||
;;
|
||||
"staging")
|
||||
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.staging.yml"
|
||||
;;
|
||||
"secure")
|
||||
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.homelab-secure.yml"
|
||||
;;
|
||||
|
|
@ -154,6 +164,71 @@ main() {
|
|||
print_status "Status of $environment environment:"
|
||||
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" ps
|
||||
;;
|
||||
"rollback")
|
||||
print_warning "Rolling back $environment environment..."
|
||||
print_warning "⚠ This rolls back the image only - queue state and audit log are NOT rolled back"
|
||||
|
||||
if [ "$environment" = "prod" ]; then
|
||||
print_warning "⚠ CRITICAL: Production rollback"
|
||||
print_warning "⚠ Queue state is NOT rolled back"
|
||||
print_warning "⚠ Audit log chain is NOT rolled back (must never break chain)"
|
||||
read -p "CONFIRM PRODUCTION ROLLBACK? [yes/N] " confirm
|
||||
if [ "$confirm" != "yes" ]; then
|
||||
print_error "Rollback cancelled"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Get previous deployment info
|
||||
LOG_FILE="${FETCHML_REPO_ROOT}/deployments/.${environment}-audit.log"
|
||||
if [ -f "$LOG_FILE" ]; then
|
||||
PREVIOUS_SHA=$(tail -2 "$LOG_FILE" | head -1 | grep -o 'sha-[a-f0-9]*' || echo "")
|
||||
if [ -n "$PREVIOUS_SHA" ]; then
|
||||
print_status "Rolling back to: $PREVIOUS_SHA"
|
||||
fi
|
||||
fi
|
||||
|
||||
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" down
|
||||
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" up -d
|
||||
|
||||
# Write rollback entry to audit log
|
||||
echo "$(date -Iseconds) | rollback | $environment | actor=$(whoami)" >> "$LOG_FILE" 2>/dev/null || true
|
||||
|
||||
print_success "$environment rollback complete!"
|
||||
print_status "Verify health with: $0 $environment health-check"
|
||||
;;
|
||||
"health-check"|"health")
|
||||
print_status "Health check for $environment environment..."
|
||||
|
||||
# Determine port based on environment
|
||||
case $environment in
|
||||
dev) PORT=9101 ;;
|
||||
staging) PORT=9102 ;;
|
||||
prod) PORT=9101 ;;
|
||||
*) PORT=9101 ;;
|
||||
esac
|
||||
|
||||
# Check API health
|
||||
if curl -fsS "http://localhost:${PORT}/health" > /dev/null 2>&1; then
|
||||
print_success "API is healthy (port $PORT)"
|
||||
|
||||
# Check compliance_mode
|
||||
COMPLIANCE_MODE=$(curl -fsS "http://localhost:${PORT}/health" 2>/dev/null | grep -o '"compliance_mode":"[^"]*"' | cut -d'"' -f4 || echo "unknown")
|
||||
print_status "Compliance mode: $COMPLIANCE_MODE"
|
||||
else
|
||||
print_error "API health check failed (port $PORT)"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
"check-audit-sink")
|
||||
print_status "Checking audit sink for $environment..."
|
||||
|
||||
if [ -f "${FETCHML_REPO_ROOT}/scripts/check-audit-sink.sh" ]; then
|
||||
"${FETCHML_REPO_ROOT}/scripts/check-audit-sink.sh" --env "$environment"
|
||||
else
|
||||
print_warning "Audit sink check script not found"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown action: $action"
|
||||
show_usage
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
# Homelab Docker Compose with Centralized Monitoring
|
||||
# Includes: API, Redis, Prometheus, Grafana, Loki
|
||||
# Development Docker Compose
|
||||
# Includes: API, Redis, MinIO, Worker, Caddy
|
||||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
|
|
@ -11,8 +11,8 @@ services:
|
|||
- "8443:443"
|
||||
volumes:
|
||||
- ./deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/caddy/data:/data
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/caddy/config:/config
|
||||
- ${DATA_DIR:-./data/smoke}/caddy/data:/data
|
||||
- ${DATA_DIR:-./data/smoke}/caddy/config:/config
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
|
|
@ -42,12 +42,12 @@ services:
|
|||
expose:
|
||||
- "9101" # API and health endpoints (internal; external access via Caddy)
|
||||
volumes:
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/logs
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/experiments:/data/experiments
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/active:/data/active
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
|
||||
- ./configs/api/dev.yaml:/app/configs/api/dev.yaml
|
||||
- ./ssl:/app/ssl
|
||||
- ${DATA_DIR:-./data/smoke}/logs:/logs
|
||||
- ${DATA_DIR:-./data/smoke}/experiments:/data/experiments
|
||||
- ${DATA_DIR:-./data/smoke}/active:/data/active
|
||||
- ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated
|
||||
- ${DATA_DIR:-./data/smoke}/configs:/app/configs:ro
|
||||
- ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro
|
||||
depends_on:
|
||||
- redis
|
||||
restart: unless-stopped
|
||||
|
|
@ -62,67 +62,41 @@ services:
|
|||
retries: 3
|
||||
start_period: 40s
|
||||
labels:
|
||||
logging: "promtail"
|
||||
job: "api-server"
|
||||
# MinIO for local development (single-node filesystem backend)
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-experiments-minio
|
||||
container_name: ml-dev-minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/minio:/data
|
||||
- ${DATA_DIR:-./data/smoke}/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=minioadmin
|
||||
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||
- MINIO_BROWSER=on
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 10s
|
||||
test: ["CMD", "curl", "-fsS", "http://localhost:9000/minio/health/live"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
retries: 5
|
||||
restart: unless-stopped
|
||||
|
||||
# Initialize minio bucket (runs once)
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-experiments-minio-init
|
||||
image: minio/mc:latest
|
||||
container_name: ml-dev-minio-init
|
||||
depends_on:
|
||||
minio:
|
||||
condition: service_healthy
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
set -eu
|
||||
apk add --no-cache ca-certificates curl tar gzip
|
||||
ARCH=$$(uname -m)
|
||||
MC_ARCH=amd64
|
||||
if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
|
||||
MC_ARCH=arm64
|
||||
fi
|
||||
curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
|
||||
chmod +x /usr/local/bin/mc
|
||||
i=0
|
||||
while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
|
||||
i=$$((i+1))
|
||||
if [ $$i -ge 30 ]; then
|
||||
echo "minio not ready after 30 attempts" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "waiting for minio... ($$i/30)"
|
||||
sleep 1
|
||||
done
|
||||
# Skip if bucket already exists
|
||||
if mc ls local/fetchml-snapshots 2>/dev/null; then
|
||||
echo "Bucket fetchml-snapshots already exists, skipping init"
|
||||
exit 0
|
||||
fi
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
mkdir -p /tmp/snapshots/snap-1
|
||||
echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
|
||||
tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
|
||||
mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
|
||||
FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
|
||||
SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
|
||||
echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
|
||||
mc alias set local http://minio:9000 minioadmin minioadmin123 || exit 1
|
||||
mc mb -p local/fetchml-snapshots 2>/dev/null || echo "Bucket exists"
|
||||
echo "MinIO initialized"
|
||||
restart: "no"
|
||||
worker:
|
||||
build:
|
||||
|
|
@ -133,11 +107,12 @@ services:
|
|||
ports:
|
||||
- "8888:8888"
|
||||
volumes:
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/logs
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/active:/data/active
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/experiments:/data/experiments
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
|
||||
- ./configs/workers/docker-dev.yaml:/app/configs/worker.yaml
|
||||
- ${DATA_DIR:-./data/smoke}/logs:/logs
|
||||
- ${DATA_DIR:-./data/smoke}/active:/data/active
|
||||
- ${DATA_DIR:-./data/smoke}/experiments:/data/experiments
|
||||
- ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated
|
||||
- ${DATA_DIR:-./data/smoke}/configs/worker/docker-dev.yaml:/app/configs/worker.yaml:ro
|
||||
- ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
depends_on:
|
||||
redis:
|
||||
|
|
@ -158,71 +133,6 @@ services:
|
|||
# Native libs enabled via build tag: -tags native_libs
|
||||
privileged: true
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
# # Prometheus - Metrics collection
|
||||
# prometheus:
|
||||
# image: prom/prometheus:latest
|
||||
# container_name: ml-experiments-prometheus
|
||||
# ports:
|
||||
# - "9090:9090"
|
||||
# volumes:
|
||||
# - ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
# - prometheus_data:/prometheus
|
||||
# command:
|
||||
# - '--config.file=/etc/prometheus/prometheus.yml'
|
||||
# - '--storage.tsdb.path=/prometheus'
|
||||
# - '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
# - '--web.console.templates=/etc/prometheus/consoles'
|
||||
# - '--web.enable-lifecycle'
|
||||
# restart: unless-stopped
|
||||
#
|
||||
# # Grafana - Visualization
|
||||
# grafana:
|
||||
# image: grafana/grafana:latest
|
||||
# container_name: ml-experiments-grafana
|
||||
# ports:
|
||||
# - "3000:3000"
|
||||
# volumes:
|
||||
# - grafana_data:/var/lib/grafana
|
||||
# - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
|
||||
# - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
|
||||
# environment:
|
||||
# - GF_SECURITY_ADMIN_PASSWORD=admin123
|
||||
# - GF_USERS_ALLOW_SIGN_UP=false
|
||||
# restart: unless-stopped
|
||||
# depends_on:
|
||||
# - prometheus
|
||||
# - loki
|
||||
#
|
||||
# # Loki - Log aggregation
|
||||
# loki:
|
||||
# image: grafana/loki:latest
|
||||
# container_name: ml-experiments-loki
|
||||
# ports:
|
||||
# - "3100:3100"
|
||||
# volumes:
|
||||
# - ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
|
||||
# - loki_data:/loki
|
||||
# command: -config.file=/etc/loki/local-config.yaml
|
||||
# restart: unless-stopped
|
||||
# Promtail - Log collector
|
||||
promtail:
|
||||
image: grafana/promtail:latest
|
||||
container_name: ml-experiments-promtail
|
||||
volumes:
|
||||
- ${SMOKE_TEST_DATA_DIR:-./monitoring}/promtail-config.yml:/etc/promtail/config.yml
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/dev}/logs:/var/log/app
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
restart: unless-stopped
|
||||
# depends_on:
|
||||
# - loki
|
||||
volumes:
|
||||
redis_data:
|
||||
driver: local
|
||||
prometheus_data:
|
||||
driver: local
|
||||
grafana_data:
|
||||
driver: local
|
||||
loki_data:
|
||||
driver: local
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@ services:
|
|||
- ${HOMELAB_DATA_DIR:-./data/homelab}/experiments:/data/experiments
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/active:/data/active
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/logs:/logs
|
||||
- ./ssl:/app/ssl:ro
|
||||
- ./configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/ssl:/app/ssl:ro
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
|
||||
- ${FETCHML_REPO_ROOT:-..}/.env.secure:/app/.env.secure:ro
|
||||
depends_on:
|
||||
redis:
|
||||
|
|
@ -32,7 +32,6 @@ services:
|
|||
retries: 3
|
||||
start_period: 40s
|
||||
labels:
|
||||
logging: "promtail"
|
||||
job: "api-server"
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||
networks:
|
||||
|
|
@ -52,28 +51,27 @@ services:
|
|||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-fsS", "http://localhost:9000/minio/health/live"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- ml-backend-network
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
image: minio/mc:latest
|
||||
container_name: ml-experiments-minio-init
|
||||
depends_on:
|
||||
- minio
|
||||
minio:
|
||||
condition: service_healthy
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
apk add --no-cache ca-certificates curl >/dev/null
|
||||
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||
chmod +x /usr/local/bin/mc
|
||||
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
# Skip if bucket already exists
|
||||
if mc ls local/fetchml-snapshots 2>/dev/null; then
|
||||
echo "Bucket fetchml-snapshots already exists, skipping init"
|
||||
exit 0
|
||||
fi
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123} || exit 1
|
||||
mc mb -p local/fetchml-snapshots 2>/dev/null || echo "Bucket exists"
|
||||
echo "MinIO initialized"
|
||||
restart: "no"
|
||||
networks:
|
||||
- ml-backend-network
|
||||
|
|
@ -87,14 +85,14 @@ services:
|
|||
- ${HOMELAB_DATA_DIR:-./data/homelab}/experiments:/app/data/experiments
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/active:/data/active
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/logs:/logs
|
||||
- ./configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/configs/worker/homelab-secure.yaml:/app/configs/worker.yaml:ro
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_started
|
||||
condition: service_completed_successfully
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
|
|
@ -115,7 +113,7 @@ services:
|
|||
- "443:443"
|
||||
volumes:
|
||||
- ./deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
|
||||
- ./ssl:/etc/caddy/ssl:ro
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/ssl:/etc/caddy/ssl:ro
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/caddy/data:/data
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/caddy/config:/config
|
||||
environment:
|
||||
|
|
@ -135,7 +133,7 @@ services:
|
|||
- "127.0.0.1:6379:6379" # Bind to localhost only
|
||||
volumes:
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/redis:/data
|
||||
- ./redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
|
||||
- ${HOMELAB_DATA_DIR:-./data/homelab}/configs/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
|
||||
restart: unless-stopped
|
||||
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
|
||||
healthcheck:
|
||||
|
|
|
|||
|
|
@ -7,11 +7,11 @@ services:
|
|||
ports:
|
||||
- "9101:9101"
|
||||
volumes:
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/logs:/logs
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/experiments:/data/experiments
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/active:/data/active
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/workspaces:/data/active/workspaces:delegated
|
||||
- ../configs/api/dev.yaml:/app/configs/api/dev.yaml
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
depends_on:
|
||||
|
|
@ -30,11 +30,12 @@ services:
|
|||
ports:
|
||||
- "8888:8888"
|
||||
volumes:
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/logs:/logs
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/active:/data/active
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/experiments:/data/experiments
|
||||
- ${LOCAL_DATA_DIR:-../data/dev}/workspaces:/data/active/workspaces:delegated
|
||||
- ../configs/workers/docker-dev.yaml:/app/configs/worker.yaml
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/snapshots:/data/snapshots
|
||||
- ${LOCAL_DATA_DIR:-./data/dev}/configs/worker/docker-dev.yaml:/app/configs/worker.yaml:ro
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ services:
|
|||
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/experiments:/data/experiments
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/active:/data/active
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/logs:/logs
|
||||
- ./configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
|
||||
- ${SMOKE_TEST_DATA_DIR:-./data/prod-smoke}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
|
|
@ -67,7 +67,7 @@ services:
|
|||
- PASSWORD_ACCESS=false
|
||||
volumes:
|
||||
- ./deployments/test_keys:/tmp:ro
|
||||
- ${FETCHML_REPO_ROOT:-..}/bin/tui-linux:/usr/local/bin/tui:ro
|
||||
- ./bin/tui:/usr/local/bin/tui:ro
|
||||
- ./deployments/tui-test-config.toml:/config/.ml/config.toml:ro
|
||||
ports:
|
||||
- "2222:2222"
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ services:
|
|||
- ${PROD_DATA_DIR:-./data/prod}/experiments:/app/data/experiments
|
||||
- ${PROD_DATA_DIR:-./data/prod}/active:/data/active
|
||||
- ${PROD_DATA_DIR:-./data/prod}/logs:/logs
|
||||
- ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml
|
||||
- ${PROD_DATA_DIR:-./data/prod}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
|
@ -62,7 +62,7 @@ services:
|
|||
- ${PROD_DATA_DIR:-./data/prod}/experiments:/app/data/experiments
|
||||
- ${PROD_DATA_DIR:-./data/prod}/active:/data/active
|
||||
- ${PROD_DATA_DIR:-./data/prod}/logs:/logs
|
||||
- ./configs/workers/docker-prod.yaml:/app/configs/worker.yaml
|
||||
- ${PROD_DATA_DIR:-./data/prod}/configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
|
|
|||
Loading…
Reference in a new issue