fetch_ml/deployments/docker-compose.dev.yml
Jeremie Fraeys 93d6d63d8d
chore(deploy): update Docker compose files and add MinIO lifecycle policies
Docker Compose updates:
- docker-compose.dev.yml: add GPU support, local scheduler and worker
- docker-compose.staging.yml: production-like staging with SSL termination
- docker-compose.test.yml: ephemeral test environment with seeded data

MinIO lifecycle management:
- Add lifecycle-dev.json: 7-day retention for dev artifacts
- Add lifecycle-staging.json: 30-day retention with transition to cold

Build improvements:
- Makefile: add native library build targets and cross-platform support
- scripts/release/cleanup.sh: improved artifact cleanup with dry-run mode
2026-03-12 12:06:16 -04:00

255 lines
7.7 KiB
YAML

---
# Development Docker Compose
# Includes: Caddy, Redis, API, MinIO (plus one-shot bucket init), Worker; pre-built API/Worker variants under the "local" profile
services:
# Caddy front proxy. Routing rules live in ./Caddyfile.dev (mounted read-only);
# host ports 8080/8443 map to the container's 80/443.
caddy:
  container_name: ml-dev-caddy
  image: caddy:2-alpine
  pull_policy: always
  restart: unless-stopped
  # Start only after the API reports healthy.
  depends_on:
    api-server:
      condition: service_healthy
  ports:
    - "8080:80"
    - "8443:443"
  volumes:
    - ./Caddyfile.dev:/etc/caddy/Caddyfile:ro
    # Caddy state and config are persisted under DATA_DIR.
    - ${DATA_DIR:-./data/smoke}/caddy/data:/data
    - ${DATA_DIR:-./data/smoke}/caddy/config:/config
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.component: proxy
  deploy:
    resources:
      reservations:
        cpus: '0.25'
        memory: 64M
      limits:
        cpus: '0.5'
        memory: 128M
# Redis with append-only persistence, stored in the named volume redis_data.
# Published on host port 6379.
redis:
  container_name: ml-experiments-redis
  image: redis:7-alpine
  pull_policy: always
  restart: unless-stopped
  user: "999:999"
  command: redis-server --appendonly yes
  ports:
    - "6379:6379"
  volumes:
    - redis_data:/data
  # Other services gate on this check via `condition: service_healthy`.
  healthcheck:
    test:
      - CMD
      - redis-cli
      - ping
    interval: 30s
    timeout: 10s
    retries: 3
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.component: cache
  deploy:
    resources:
      reservations:
        cpus: '0.25'
        memory: 256M
      limits:
        cpus: '0.5'
        memory: 512M
# API server built from source (build/docker/simple.Dockerfile).
# Creates its data directories at startup, then execs the api-server binary
# with the dev config mounted from CONFIG_DIR.
api-server:
  build:
    context: ..
    dockerfile: build/docker/simple.Dockerfile
  container_name: ml-experiments-api
  user: "0:0"
  restart: unless-stopped
  # Wait for Redis to pass its healthcheck, not merely to be started. The
  # short form (`- redis`) only orders container start-up.
  depends_on:
    redis:
      condition: service_healthy
  ports:
    - "9101:9101"
  expose:
    # API and health endpoints. Also published to the host above, so the API
    # is reachable both directly on 9101 and through Caddy.
    - "9101"
  volumes:
    - ${DATA_DIR:-./data/smoke}/logs:/logs
    - ${DATA_DIR:-./data/smoke}/experiments:/data/experiments
    - ${DATA_DIR:-./data/smoke}/active:/data/active
    - ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated
    - ${CONFIG_DIR:-../configs}/api/dev.yaml:/app/configs/api/dev.yaml:ro
    - ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro
  # Folded scalar: the two lines below join into a single shell command line.
  command:
    - /bin/sh
    - "-c"
    - >-
      mkdir -p /data/experiments /data/active/datasets /data/active/snapshots
      && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml
  environment:
    LOG_LEVEL: info
  # Native libs enabled via build tag: -tags native_libs
  # NOTE(review): this check assumes curl exists in the built image; confirm.
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.component: api
  deploy:
    resources:
      limits:
        cpus: '1.0'
        memory: 512M
      reservations:
        cpus: '0.5'
        memory: 256M
# MinIO for local development (single-node filesystem backend).
# S3 API on host port 9000, web console on 9001; data lives under DATA_DIR.
minio:
  container_name: ml-dev-minio
  image: minio/minio:latest
  pull_policy: always
  restart: unless-stopped
  command:
    - server
    - /data
    - "--console-address"
    - ":9001"
  ports:
    - "9000:9000"
    - "9001:9001"
  volumes:
    - ${DATA_DIR:-./data/smoke}/minio:/data
  # Dev-only credentials; the same values are used by minio-init and the workers.
  environment:
    MINIO_ROOT_USER: minioadmin
    MINIO_ROOT_PASSWORD: minioadmin123
    MINIO_BROWSER: "on"
  # minio-init waits on this check before creating the bucket.
  # NOTE(review): relies on curl being present in the unpinned `latest`
  # image; confirm against the image actually pulled.
  healthcheck:
    test:
      - CMD
      - curl
      - "-fsS"
      - "http://localhost:9000/minio/health/live"
    interval: 5s
    timeout: 5s
    retries: 5
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.data.max-age: 168h
    fetchml.component: object-store
# Initialize the MinIO bucket (one-shot job, never restarted).
# Runs once MinIO is healthy: registers an mc alias, creates the
# fetchml-snapshots bucket, then applies the lifecycle policy mounted from
# ./minio/lifecycle-dev.json. The worker service waits for this job to
# complete successfully.
minio-init:
  image: minio/mc:latest
  container_name: ml-dev-minio-init
  pull_policy: always
  restart: "no"
  depends_on:
    minio:
      condition: service_healthy
  volumes:
    - ./minio/lifecycle-dev.json:/tmp/lifecycle.json:ro
  entrypoint: ["/bin/sh", "-c"]
  command:
    - |
      mc alias set local http://minio:9000 minioadmin minioadmin123 || exit 1
      # -p (--ignore-existing) already succeeds when the bucket exists, so any
      # failure here is a real one and must not be hidden.
      mc mb -p local/fetchml-snapshots || exit 1
      # Apply lifecycle policy for automatic cleanup. mc reads the JSON from
      # STDIN, not from a path argument. Kept non-fatal, but the error is shown.
      mc ilm rule import local/fetchml-snapshots < /tmp/lifecycle.json || echo "WARNING: lifecycle policy was not applied" >&2
      echo "MinIO initialized"
# Worker built from source (same Dockerfile as api-server), run as two
# replicas. Starts only after Redis and the API are healthy and the MinIO
# bucket init job has completed successfully.
worker:
  build:
    context: ..
    dockerfile: build/docker/simple.Dockerfile
  # Remove fixed container name to allow scaling
  # container_name: ml-experiments-worker
  command:
    - /usr/local/bin/worker
    - "-config"
    - /app/configs/worker.yaml
  user: "0:0"
  # Native libs enabled via build tag: -tags native_libs
  privileged: true
  restart: unless-stopped
  depends_on:
    redis:
      condition: service_healthy
    api-server:
      condition: service_healthy
    minio-init:
      condition: service_completed_successfully
  ports:
    - "8888-8891:8888" # Port range for multiple workers
  volumes:
    - ${DATA_DIR:-./data/smoke}/logs:/logs
    - ${DATA_DIR:-./data/smoke}/active:/data/active
    - ${DATA_DIR:-./data/smoke}/experiments:/data/experiments
    - ${DATA_DIR:-./data/smoke}/workspaces:/data/active/workspaces:delegated
    - ${CONFIG_DIR:-../configs}/worker/docker-dev.yaml:/app/configs/worker.yaml:ro
    - ${DATA_DIR:-./data/smoke}/ssl:/app/ssl:ro
    # Host cgroup tree mounted read-write.
    - /sys/fs/cgroup:/sys/fs/cgroup:rw
  environment:
    LOG_LEVEL: info
    # Same dev credentials as the minio service.
    MINIO_ROOT_USER: minioadmin
    MINIO_ROOT_PASSWORD: minioadmin123
    FETCHML_JUPYTER_DEFAULT_IMAGE: quay.io/jupyter/minimal-notebook:latest
    FETCHML_JUPYTER_CONDA_ENV: base
    FETCHML_JUPYTER_KERNEL_NAME: python
    FETCHML_PODMAN_CGROUPS: disabled
  # NOTE(review): uses wget while api-server (same Dockerfile) uses curl;
  # confirm both tools exist in the built image.
  healthcheck:
    test:
      - CMD
      - wget
      - "-q"
      - "--spider"
      - "http://localhost:8888/health"
    interval: 10s
    timeout: 5s
    retries: 3
    start_period: 30s
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.component: worker
  deploy:
    replicas: 2
    resources:
      reservations:
        cpus: '1.0'
        memory: 1G
      limits:
        cpus: '2.0'
        memory: 2G
# Local profile: use pre-built images instead of building from source.
# Data is kept under LOCAL_DATA_DIR (default ./data/dev).
# Note: publishes the same host port (9101) as api-server, so the two cannot
# be bound on the host at the same time.
api-server-local:
  profiles:
    - local
  image: fetchml-api:latest
  depends_on:
    redis:
      condition: service_healthy
  ports:
    - "9101:9101"
  volumes:
    - ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs
    - ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments
    - ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active
    - ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
    - ${CONFIG_DIR:-../configs}/api/dev.yaml:/app/configs/api/dev.yaml:ro
  environment:
    LOG_LEVEL: info
  # worker-local gates on this check via `condition: service_healthy`.
  healthcheck:
    test:
      - CMD
      - curl
      - "-f"
      - "http://localhost:9101/health"
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.component: api
# Pre-built worker image for the "local" profile; starts after Redis and
# api-server-local are healthy.
# Note: host port 8888 overlaps the 8888-8891 range used by the worker service.
worker-local:
  profiles:
    - local
  image: fetchml-worker:latest
  privileged: true
  depends_on:
    redis:
      condition: service_healthy
    api-server-local:
      condition: service_healthy
  ports:
    - "8888:8888"
  volumes:
    - ${LOCAL_DATA_DIR:-./data/dev}/logs:/logs
    - ${LOCAL_DATA_DIR:-./data/dev}/active:/data/active
    - ${LOCAL_DATA_DIR:-./data/dev}/experiments:/data/experiments
    - ${LOCAL_DATA_DIR:-./data/dev}/workspaces:/data/active/workspaces:delegated
    - ${LOCAL_DATA_DIR:-./data/dev}/snapshots:/data/snapshots
    - ${CONFIG_DIR:-../configs}/worker/docker-dev.yaml:/app/configs/worker.yaml:ro
    # Host cgroup tree mounted read-write.
    - /sys/fs/cgroup:/sys/fs/cgroup:rw
  environment:
    LOG_LEVEL: info
    # Same dev credentials as the minio service.
    MINIO_ROOT_USER: minioadmin
    MINIO_ROOT_PASSWORD: minioadmin123
  labels:
    fetchml.data.lifecycle: persistent
    fetchml.component: worker
# Named volumes. redis_data backs /data in the redis service.
volumes:
  redis_data:
    driver: "local"