# fetch_ml/deployments/docker-compose.dev.yml
# (225 lines, 7.3 KiB, YAML)

# Homelab Docker Compose (dev stack) with Centralized Monitoring
# Includes: Caddy, API server, worker, Redis, MinIO, Prometheus, Grafana, Loki, Promtail
services:
# Caddy: front door for the dev stack. The API server is only reachable from
# the host through this container (host ports 8080 -> 80 and 8443 -> 443).
caddy:
  container_name: ml-dev-caddy
  image: caddy:2-alpine
  restart: unless-stopped
  # Hold startup until the API server's healthcheck reports healthy.
  depends_on:
    api-server:
      condition: service_healthy
  ports:
    - '8080:80'
    - '8443:443'
  volumes:
    # Dev Caddyfile, mounted read-only.
    - ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
    # Caddy's /data and /config directories, kept under the repo's dev data dir.
    - ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data
    - ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config
# Redis with append-only-file persistence enabled; /data is bind-mounted from
# the repo's dev data dir.
redis:
  container_name: ml-experiments-redis
  image: redis:7-alpine
  restart: unless-stopped
  # Runs as UID:GID 999:999 rather than the image default.
  user: '999:999'
  command:
    - redis-server
    - '--appendonly'
    - 'yes'
  ports:
    - '6379:6379'
  volumes:
    - ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data
  # Healthy when `redis-cli ping` succeeds; the worker service gates on this.
  healthcheck:
    test:
      - CMD
      - redis-cli
      - ping
    interval: 30s
    timeout: 10s
    retries: 3
# API server, built from build/docker/simple.Dockerfile with the repo root as
# build context. Listens on 9101 inside the Compose network only; the caddy
# service is the externally published entry point.
api-server:
  build:
    context: ${FETCHML_REPO_ROOT:-.}
    dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
  container_name: ml-experiments-api
  # Runs as root inside the container.
  user: "0:0"
  expose:
    - "9101"  # API and health endpoints (internal; external access via Caddy)
  volumes:
    - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
    - ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
    - ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
    - ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
    - ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml
    - ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl
  # Long-form depends_on: wait until Redis passes its healthcheck, not merely
  # until its container has been started. The short list form only orders
  # container start-up and ignores the healthcheck redis already defines.
  depends_on:
    redis:
      condition: service_healthy
  restart: unless-stopped
  # Create the data directories the server expects, then exec the server so it
  # replaces the shell as the container's main process.
  command:
    - /bin/sh
    - "-c"
    - >-
      mkdir -p /data/experiments /data/active/datasets /data/active/snapshots &&
      exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml
  environment:
    - LOG_LEVEL=info
  # Healthy once GET /health on port 9101 succeeds. caddy and worker both gate
  # on this check.
  # NOTE(review): requires curl inside the built image - confirm that
  # simple.Dockerfile installs it.
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
  # Container labels; presumably consumed by the promtail config to select and
  # tag this container's logs - verify against promtail-config.yml.
  labels:
    logging: "promtail"
    job: "api-server"
# MinIO object store for the dev stack. S3 API on 9000, web console on 9001.
# Credentials are fixed dev-only defaults; minio-init and worker use the same
# values.
minio:
  image: minio/minio:latest
  container_name: ml-experiments-minio
  ports:
    - "9000:9000"
    - "9001:9001"
  volumes:
    - ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data
  environment:
    - MINIO_ROOT_USER=minioadmin
    - MINIO_ROOT_PASSWORD=minioadmin123
  command: ["server", "/data", "--console-address", ":9001"]
  # Readiness probe via the MinIO client bundled in the image. Newer
  # minio/minio images do not ship curl, so a curl-based probe never passes
  # and every service gated on `service_healthy` (minio-init, and through it
  # worker) would never start. `local` is mc's built-in alias for
  # http://localhost:9000.
  healthcheck:
    test: ["CMD", "mc", "ready", "local"]
    interval: 10s
    timeout: 5s
    retries: 10
  restart: unless-stopped
# One-shot bootstrap job for MinIO: installs the `mc` client, creates the
# fetchml-snapshots bucket, and uploads a tiny sample snapshot (snap-1).
# The worker service waits for this job to exit successfully.
minio-init:
  container_name: ml-experiments-minio-init
  image: alpine:3.19
  # Run once and exit; never restart.
  restart: "no"
  depends_on:
    minio:
      condition: service_healthy
  entrypoint:
    - /bin/sh
    - "-c"
  # The script below is passed to `sh -c`. A doubled dollar sign is Compose's
  # escape for a literal one, so the shell (not Compose) expands the variables.
  command:
    - |
      set -eu
      apk add --no-cache ca-certificates curl tar gzip

      # Pick the mc build matching the CPU: arm64 on ARM64 hosts, else amd64.
      ARCH=$$(uname -m)
      case "$$ARCH" in
        aarch64|arm64) MC_ARCH=arm64 ;;
        *) MC_ARCH=amd64 ;;
      esac
      curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
      chmod +x /usr/local/bin/mc

      # Register the server alias, retrying once per second, 30 attempts max.
      i=0
      until mc alias set local http://minio:9000 minioadmin minioadmin123; do
        i=$$((i+1))
        if [ $$i -ge 30 ]; then
          echo "minio not ready after 30 attempts" >&2
          exit 1
        fi
        echo "waiting for minio... ($$i/30)"
        sleep 1
      done

      # Create the bucket; a failure here is tolerated.
      mc mb -p local/fetchml-snapshots || true

      # Build and upload the sample snapshot archive.
      mkdir -p /tmp/snapshots/snap-1
      echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
      tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
      mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz

      # Print the snapshot digest: SHA-256 of the hex SHA-256 of hello.txt.
      FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
      SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
      echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
# Worker process, built from the same Dockerfile as the API server but started
# with the worker binary and its own config file.
worker:
  container_name: ml-experiments-worker
  build:
    context: ${FETCHML_REPO_ROOT:-.}
    dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
  command:
    - /usr/local/bin/worker
    - '-config'
    - /app/configs/worker.yaml
  restart: unless-stopped
  # Root, privileged, with the host cgroup tree mounted read-write.
  # NOTE(review): presumably needed so the worker can launch nested Podman
  # containers (see FETCHML_PODMAN_CGROUPS below) - confirm.
  user: '0:0'
  privileged: true
  # Start only after Redis and the API server are healthy and the MinIO
  # bootstrap job has exited successfully.
  depends_on:
    redis:
      condition: service_healthy
    api-server:
      condition: service_healthy
    minio-init:
      condition: service_completed_successfully
  ports:
    # NOTE(review): 8888 is presumably the Jupyter port - confirm.
    - '8888:8888'
  environment:
    LOG_LEVEL: 'info'
    # Same dev-only MinIO credentials the minio service is started with.
    MINIO_ROOT_USER: 'minioadmin'
    MINIO_ROOT_PASSWORD: 'minioadmin123'
    FETCHML_JUPYTER_DEFAULT_IMAGE: 'quay.io/jupyter/base-notebook:latest'
    FETCHML_JUPYTER_CONDA_ENV: 'base'
    FETCHML_JUPYTER_KERNEL_NAME: 'python'
    FETCHML_PODMAN_CGROUPS: 'disabled'
  volumes:
    # Same host directories the API server mounts for logs and data.
    - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
    - ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
    - ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
    - ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
    - ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
    - /sys/fs/cgroup:/sys/fs/cgroup:rw
# Prometheus - Metrics collection
# Scrape configuration comes from monitoring/prometheus/prometheus.yml; the
# TSDB is stored in the prometheus_data named volume.
prometheus:
  container_name: ml-experiments-prometheus
  image: prom/prometheus:latest
  restart: unless-stopped
  ports:
    - "9090:9090"
  volumes:
    - ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
    - prometheus_data:/prometheus
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--web.console.libraries=/etc/prometheus/console_libraries"
    - "--web.console.templates=/etc/prometheus/consoles"
    # Enables the HTTP lifecycle endpoints (config reload / shutdown).
    - "--web.enable-lifecycle"
# Grafana - Visualization
# Provisioning files and dashboards are bind-mounted from monitoring/grafana;
# Grafana's own state lives in the grafana_data named volume.
grafana:
  container_name: ml-experiments-grafana
  image: grafana/grafana:latest
  restart: unless-stopped
  # Started after its data sources; this is start order only, no health gating.
  depends_on:
    - prometheus
    - loki
  ports:
    - '3000:3000'
  environment:
    # Dev-only admin password.
    GF_SECURITY_ADMIN_PASSWORD: 'admin123'
    # Quoted so the value reaches Grafana as the string "false".
    GF_USERS_ALLOW_SIGN_UP: 'false'
  volumes:
    - grafana_data:/var/lib/grafana
    - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
    - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
# Loki - Log aggregation
# Configured from monitoring/loki-config.yml; data is kept in the loki_data
# named volume.
loki:
  container_name: ml-experiments-loki
  image: grafana/loki:latest
  restart: unless-stopped
  ports:
    - '3100:3100'
  volumes:
    - ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
    - loki_data:/loki
  command:
    - '-config.file=/etc/loki/local-config.yaml'
# Promtail - Log collector
# Has access to the application log directory, the host's Docker container
# log directory, and the Docker socket.
promtail:
  container_name: ml-experiments-promtail
  image: grafana/promtail:latest
  restart: unless-stopped
  depends_on:
    - loki
  command:
    - '-config.file=/etc/promtail/config.yml'
  volumes:
    - ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml
    # Same host directory the api-server and worker mount at /logs.
    - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app
    - /var/lib/docker/containers:/var/lib/docker/containers:ro
    # NOTE(review): presumably used for Docker service discovery - confirm
    # against promtail-config.yml.
    - /var/run/docker.sock:/var/run/docker.sock
# Named volumes for the monitoring services (Prometheus TSDB, Grafana state,
# Loki data), each using the local volume driver.
volumes:
  prometheus_data:
    driver: local

  grafana_data:
    driver: local

  loki_data:
    driver: local