fetch_ml/deployments/docker-compose.dev.yml
Jeremie Fraeys bff2336db2
fix(smoke-test): use temp directory for smoke test data
Use /tmp for smoke test data to avoid file sharing issues on macOS/Colima:

- smoke-test.sh: Create temp dir with mktemp, export SMOKE_TEST_DATA_DIR
- docker-compose.dev.yml: Use SMOKE_TEST_DATA_DIR with fallback to data/dev
- Remove file sharing permission checks (no longer needed with tmp)

This avoids Docker Desktop/Colima file sharing permission issues entirely
by using a system temp directory that's always accessible.
2026-02-24 11:37:45 -05:00

228 lines
8 KiB
YAML

---
# Homelab Docker Compose with Centralized Monitoring
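# Active services: Caddy, Redis, API server, MinIO (plus a one-shot init job), worker, Promtail
# Prometheus, Grafana and Loki are defined further down but are currently commented out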
services:
caddy:
  # Caddy front end for the dev stack. Host ports 8080/8443 map to the
  # container's 80/443.
  container_name: ml-dev-caddy
  image: caddy:2-alpine
  restart: unless-stopped
  # Start only after the api-server healthcheck reports healthy.
  depends_on:
    api-server:
      condition: service_healthy
  ports:
    - "8080:80"
    - "8443:443"
  volumes:
    # Dev Caddyfile, mounted read-only.
    - ${FETCHML_REPO_ROOT:-..}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
    # Caddy state lives under SMOKE_TEST_DATA_DIR when it is set, otherwise
    # under ${FETCHML_REPO_ROOT:-..}/data/dev.
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/caddy/data:/data
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/caddy/config:/config
redis:
  # Redis 7 started with `--appendonly yes`; its /data directory is backed by
  # the redis_data named volume.
  container_name: ml-experiments-redis
  image: redis:7-alpine
  user: "999:999"
  restart: unless-stopped
  command: redis-server --appendonly yes
  ports:
    - "6379:6379"
  volumes:
    - redis_data:/data
  # Health probe: run `redis-cli ping` every 30s with a 10s timeout; three
  # consecutive failures mark the container unhealthy. Services that declare
  # `condition: service_healthy` on redis wait for this probe to pass.
  healthcheck:
    test:
      - "CMD"
      - "redis-cli"
      - "ping"
    interval: 30s
    timeout: 10s
    retries: 3
api-server:
  # API server built from build/docker/simple.Dockerfile. It listens on 9101,
  # which also serves the /health endpoint used by the healthcheck below.
  build:
    context: ${FETCHML_REPO_ROOT:-..}
    dockerfile: build/docker/simple.Dockerfile
  container_name: ml-experiments-api
  # Runs as root inside the container.
  user: "0:0"
  # 9101 is published directly on the host and additionally listed under
  # `expose` for other services on the Compose network.
  ports:
    - "9101:9101"
  expose:
    - "9101"  # API and health endpoints
  volumes:
    # Runtime data lives under SMOKE_TEST_DATA_DIR when it is set, otherwise
    # under ${FETCHML_REPO_ROOT:-..}/data/dev.
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/logs:/logs
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/experiments:/data/experiments
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/active:/data/active
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/workspaces:/data/active/workspaces:delegated
    - ${FETCHML_REPO_ROOT:-..}/configs/api/dev.yaml:/app/configs/api/dev.yaml
    - ${FETCHML_REPO_ROOT:-..}/ssl:/app/ssl
  # Wait until Redis passes its healthcheck instead of merely being started,
  # matching how caddy and worker declare their dependencies. Note that this
  # delays api-server until Redis' first successful probe.
  depends_on:
    redis:
      condition: service_healthy
  restart: unless-stopped
  # Create the data directories the server expects, then exec the server so it
  # replaces the shell as the container's main process.
  command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
  environment:
    - LOG_LEVEL=info
  # Native libs enabled via build tag: -tags native_libs
  # NOTE(review): the probe assumes curl is present in the built image; confirm.
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
  labels:
    logging: "promtail"
    job: "api-server"
minio:
  # MinIO server storing its data in /data. The server endpoint is on 9000
  # (probed by the healthcheck) and the console is bound to 9001.
  container_name: ml-experiments-minio
  image: minio/minio:latest
  restart: unless-stopped
  command:
    - "server"
    - "/data"
    - "--console-address"
    - ":9001"
  # Development credentials; minio-init and worker are configured with the
  # same values.
  environment:
    MINIO_ROOT_USER: "minioadmin"
    MINIO_ROOT_PASSWORD: "minioadmin123"
  ports:
    - "9000:9000"
    - "9001:9001"
  volumes:
    # Object data lives under SMOKE_TEST_DATA_DIR when it is set, otherwise
    # under ${FETCHML_REPO_ROOT:-..}/data/dev.
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/minio:/data
  # minio-init waits on this probe before it runs.
  # NOTE(review): assumes curl is available inside the minio image; confirm.
  healthcheck:
    test:
      - "CMD"
      - "curl"
      - "-f"
      - "http://localhost:9000/minio/health/live"
    interval: 10s
    timeout: 5s
    retries: 10
minio-init:
  # One-shot bootstrap job: downloads the MinIO client, waits for the server
  # to accept the alias, then creates the fetchml-snapshots bucket and uploads
  # a sample snapshot archive. It is never restarted; worker waits for it to
  # complete successfully.
  container_name: ml-experiments-minio-init
  image: alpine:3.19
  restart: "no"
  depends_on:
    minio:
      condition: service_healthy
  # The script below is passed to `/bin/sh -c` as a single argument. `$$` is
  # the Compose escape for a literal `$`, so variables are expanded by the
  # shell inside the container rather than by Compose.
  entrypoint:
    - "/bin/sh"
    - "-c"
  command:
    - |
      set -eu
      apk add --no-cache ca-certificates curl tar gzip
      ARCH=$$(uname -m)
      MC_ARCH=amd64
      if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
        MC_ARCH=arm64
      fi
      curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
      chmod +x /usr/local/bin/mc
      i=0
      while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
        i=$$((i+1))
        if [ $$i -ge 30 ]; then
          echo "minio not ready after 30 attempts" >&2
          exit 1
        fi
        echo "waiting for minio... ($$i/30)"
        sleep 1
      done
      # Skip if bucket already exists
      if mc ls local/fetchml-snapshots 2>/dev/null; then
        echo "Bucket fetchml-snapshots already exists, skipping init"
        exit 0
      fi
      mc mb -p local/fetchml-snapshots || true
      mkdir -p /tmp/snapshots/snap-1
      echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
      tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
      mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
      FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
      SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
      echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
worker:
  # Worker process built from the same Dockerfile as api-server and started
  # with /app/configs/worker.yaml.
  container_name: ml-experiments-worker
  build:
    context: ${FETCHML_REPO_ROOT:-..}
    dockerfile: build/docker/simple.Dockerfile
  restart: unless-stopped
  command:
    - "/usr/local/bin/worker"
    - "-config"
    - "/app/configs/worker.yaml"
  # Runs as root in a privileged container with the host cgroup tree mounted
  # read-write (see volumes).
  user: "0:0"
  privileged: true
  # Start order: Redis and the API server must be healthy and the MinIO
  # bootstrap job must have exited successfully.
  depends_on:
    redis:
      condition: service_healthy
    api-server:
      condition: service_healthy
    minio-init:
      condition: service_completed_successfully
  environment:
    LOG_LEVEL: "info"
    MINIO_ROOT_USER: "minioadmin"
    MINIO_ROOT_PASSWORD: "minioadmin123"
    FETCHML_JUPYTER_DEFAULT_IMAGE: "quay.io/jupyter/minimal-notebook:latest"
    FETCHML_JUPYTER_CONDA_ENV: "base"
    FETCHML_JUPYTER_KERNEL_NAME: "python"
    FETCHML_PODMAN_CGROUPS: "disabled"
  # Native libs enabled via build tag: -tags native_libs
  # NOTE(review): 8888 is presumably the Jupyter port; confirm.
  ports:
    - "8888:8888"
  volumes:
    # Shares the logs/active/experiments/workspaces directories with
    # api-server; they live under SMOKE_TEST_DATA_DIR when it is set, otherwise
    # under ${FETCHML_REPO_ROOT:-..}/data/dev.
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/logs:/logs
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/active:/data/active
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/experiments:/data/experiments
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/workspaces:/data/active/workspaces:delegated
    - ${FETCHML_REPO_ROOT:-..}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
    - /sys/fs/cgroup:/sys/fs/cgroup:rw
# # Prometheus - Metrics collection
# prometheus:
# image: prom/prometheus:latest
# container_name: ml-experiments-prometheus
# ports:
# - "9090:9090"
# volumes:
# - ${FETCHML_REPO_ROOT:-..}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
# - prometheus_data:/prometheus
# command:
# - '--config.file=/etc/prometheus/prometheus.yml'
# - '--storage.tsdb.path=/prometheus'
# - '--web.console.libraries=/etc/prometheus/console_libraries'
# - '--web.console.templates=/etc/prometheus/consoles'
# - '--web.enable-lifecycle'
# restart: unless-stopped
#
# # Grafana - Visualization
# grafana:
# image: grafana/grafana:latest
# container_name: ml-experiments-grafana
# ports:
# - "3000:3000"
# volumes:
# - grafana_data:/var/lib/grafana
# - ${FETCHML_REPO_ROOT:-..}/monitoring/grafana/provisioning:/etc/grafana/provisioning
# - ${FETCHML_REPO_ROOT:-..}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
# environment:
# - GF_SECURITY_ADMIN_PASSWORD=admin123
# - GF_USERS_ALLOW_SIGN_UP=false
# restart: unless-stopped
# depends_on:
# - prometheus
# - loki
#
# # Loki - Log aggregation
# loki:
# image: grafana/loki:latest
# container_name: ml-experiments-loki
# ports:
# - "3100:3100"
# volumes:
# - ${FETCHML_REPO_ROOT:-..}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
# - loki_data:/loki
# command: -config.file=/etc/loki/local-config.yaml
# restart: unless-stopped
# Promtail - Log collector
promtail:
  # Runs Promtail with the repository's monitoring/promtail-config.yml.
  container_name: ml-experiments-promtail
  image: grafana/promtail:latest
  restart: unless-stopped
  command: -config.file=/etc/promtail/config.yml
  volumes:
    - ${FETCHML_REPO_ROOT:-..}/monitoring/promtail-config.yml:/etc/promtail/config.yml
    # Same host logs directory that api-server and worker mount at /logs.
    - ${SMOKE_TEST_DATA_DIR:-${FETCHML_REPO_ROOT:-..}/data/dev}/logs:/var/log/app
    # Host Docker container directory (read-only) and the Docker socket.
    - /var/lib/docker/containers:/var/lib/docker/containers:ro
    - /var/run/docker.sock:/var/run/docker.sock
  # Disabled while the loki service is commented out.
  # depends_on:
  #   - loki
# Named volumes, all on the local driver. Only redis_data is mounted by an
# active service; prometheus_data, grafana_data and loki_data are referenced
# by the commented-out monitoring services.
volumes:
  grafana_data:
    driver: local
  loki_data:
    driver: local
  prometheus_data:
    driver: local
  redis_data:
    driver: local