---
# Homelab Docker Compose - development stack
# Active services: Caddy, Redis, API server, MinIO (+ one-shot init),
# worker, Promtail.
# Disabled (commented out below): Prometheus, Grafana, Loki.
#
# Host paths are resolved relative to FETCHML_REPO_ROOT, which defaults
# to ".." when the variable is unset.

services:
  # Caddy web server. Host ports 8080/8443 map to container ports 80/443.
  # Started only after api-server passes its healthcheck.
  caddy:
    image: caddy:2-alpine
    container_name: ml-dev-caddy
    restart: unless-stopped
    ports:
      - "8080:80"
      - "8443:443"
    volumes:
      - ${FETCHML_REPO_ROOT:-..}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
      - ${FETCHML_REPO_ROOT:-..}/data/dev/caddy/data:/data
      - ${FETCHML_REPO_ROOT:-..}/data/dev/caddy/config:/config
    depends_on:
      api-server:
        condition: service_healthy

  # Redis with append-only persistence, stored in the redis_data volume.
  redis:
    image: redis:7-alpine
    container_name: ml-experiments-redis
    user: "999:999"
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    restart: unless-stopped
    command: redis-server --appendonly yes
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 30s
      timeout: 10s
      retries: 3

  # API server built from the repo's simple.Dockerfile; runs as root (0:0).
  api-server:
    build:
      context: ${FETCHML_REPO_ROOT:-..}
      dockerfile: build/docker/simple.Dockerfile
    container_name: ml-experiments-api
    user: "0:0"
    ports:
      - "9101:9101"
    expose:
      - "9101"  # API and health endpoints (internal; external access via Caddy)
    volumes:
      - ${FETCHML_REPO_ROOT:-..}/data/dev/logs:/logs
      - ${FETCHML_REPO_ROOT:-..}/data/dev/experiments:/data/experiments
      - ${FETCHML_REPO_ROOT:-..}/data/dev/active:/data/active
      - ${FETCHML_REPO_ROOT:-..}/data/dev/workspaces:/data/active/workspaces:delegated
      - ${FETCHML_REPO_ROOT:-..}/configs/api/dev.yaml:/app/configs/api/dev.yaml
      - ${FETCHML_REPO_ROOT:-..}/ssl:/app/ssl
    # Wait for Redis to pass its healthcheck (not merely start), matching
    # the worker's dependency on Redis.
    depends_on:
      redis:
        condition: service_healthy
    restart: unless-stopped
    # Create the data directories before exec'ing the server binary.
    command:
      - "/bin/sh"
      - "-c"
      - >-
        mkdir -p /data/experiments /data/active/datasets /data/active/snapshots &&
        exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml
    environment:
      - LOG_LEVEL=info
    # Native libs enabled via build tag: -tags native_libs
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    labels:
      logging: "promtail"
      job: "api-server"

  # MinIO object store: S3 API on 9000, console on 9001.
  # NOTE(review): credentials are hard-coded development defaults; do not
  # reuse this file outside a local dev environment.
  minio:
    image: minio/minio:latest
    container_name: ml-experiments-minio
    ports:
      - "9000:9000"
      - "9001:9001"
    volumes:
      - ${FETCHML_REPO_ROOT:-..}/data/dev/minio:/data
    environment:
      - MINIO_ROOT_USER=minioadmin
      - MINIO_ROOT_PASSWORD=minioadmin123
    command: ["server", "/data", "--console-address", ":9001"]
    # NOTE(review): this check needs `curl` inside the image. The tag is
    # unpinned (`latest`); if the check fails with "executable file not
    # found", consider ["CMD", "mc", "ready", "local"] - confirm against
    # the image actually in use.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 10s
      timeout: 5s
      retries: 10
    restart: unless-stopped

  # One-shot job (restart: "no"): downloads the `mc` client, waits for
  # MinIO, creates the fetchml-snapshots bucket and uploads a sample
  # snapshot. `$$` escapes `$` so Compose passes it through to the shell.
  minio-init:
    image: alpine:3.19
    container_name: ml-experiments-minio-init
    depends_on:
      minio:
        condition: service_healthy
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -eu
        apk add --no-cache ca-certificates curl tar gzip
        ARCH=$$(uname -m)
        MC_ARCH=amd64
        if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
          MC_ARCH=arm64
        fi
        curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
        chmod +x /usr/local/bin/mc
        i=0
        while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
          i=$$((i+1))
          if [ $$i -ge 30 ]; then
            echo "minio not ready after 30 attempts" >&2
            exit 1
          fi
          echo "waiting for minio... ($$i/30)"
          sleep 1
        done
        # Skip if bucket already exists
        if mc ls local/fetchml-snapshots 2>/dev/null; then
          echo "Bucket fetchml-snapshots already exists, skipping init"
          exit 0
        fi
        mc mb -p local/fetchml-snapshots || true
        mkdir -p /tmp/snapshots/snap-1
        echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
        tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
        mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
        FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
        SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
        echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
    restart: "no"

  # Worker built from the same Dockerfile as api-server. Runs privileged
  # as root with the host cgroup filesystem mounted read-write. Starts
  # after Redis and the API are healthy and minio-init has exited 0.
  worker:
    build:
      context: ${FETCHML_REPO_ROOT:-..}
      dockerfile: build/docker/simple.Dockerfile
    container_name: ml-experiments-worker
    user: "0:0"
    ports:
      - "8888:8888"
    volumes:
      - ${FETCHML_REPO_ROOT:-..}/data/dev/logs:/logs
      - ${FETCHML_REPO_ROOT:-..}/data/dev/active:/data/active
      - ${FETCHML_REPO_ROOT:-..}/data/dev/experiments:/data/experiments
      - ${FETCHML_REPO_ROOT:-..}/data/dev/workspaces:/data/active/workspaces:delegated
      - ${FETCHML_REPO_ROOT:-..}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
      - /sys/fs/cgroup:/sys/fs/cgroup:rw
    depends_on:
      redis:
        condition: service_healthy
      api-server:
        condition: service_healthy
      minio-init:
        condition: service_completed_successfully
    restart: unless-stopped
    environment:
      - LOG_LEVEL=info
      - MINIO_ROOT_USER=minioadmin
      - MINIO_ROOT_PASSWORD=minioadmin123
      - FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/minimal-notebook:latest
      - FETCHML_JUPYTER_CONDA_ENV=base
      - FETCHML_JUPYTER_KERNEL_NAME=python
      - FETCHML_PODMAN_CGROUPS=disabled
    # Native libs enabled via build tag: -tags native_libs
    privileged: true
    command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]

  # # Prometheus - Metrics collection
  # prometheus:
  #   image: prom/prometheus:latest
  #   container_name: ml-experiments-prometheus
  #   ports:
  #     - "9090:9090"
  #   volumes:
  #     - ${FETCHML_REPO_ROOT:-..}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
  #     - prometheus_data:/prometheus
  #   command:
  #     - '--config.file=/etc/prometheus/prometheus.yml'
  #     - '--storage.tsdb.path=/prometheus'
  #     - '--web.console.libraries=/etc/prometheus/console_libraries'
  #     - '--web.console.templates=/etc/prometheus/consoles'
  #     - '--web.enable-lifecycle'
  #   restart: unless-stopped
  #
  # # Grafana - Visualization
  # grafana:
  #   image: grafana/grafana:latest
  #   container_name: ml-experiments-grafana
  #   ports:
  #     - "3000:3000"
  #   volumes:
  #     - grafana_data:/var/lib/grafana
  #     - ${FETCHML_REPO_ROOT:-..}/monitoring/grafana/provisioning:/etc/grafana/provisioning
  #     - ${FETCHML_REPO_ROOT:-..}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
  #   environment:
  #     - GF_SECURITY_ADMIN_PASSWORD=admin123
  #     - GF_USERS_ALLOW_SIGN_UP=false
  #   restart: unless-stopped
  #   depends_on:
  #     - prometheus
  #     - loki
  #
  # # Loki - Log aggregation
  # loki:
  #   image: grafana/loki:latest
  #   container_name: ml-experiments-loki
  #   ports:
  #     - "3100:3100"
  #   volumes:
  #     - ${FETCHML_REPO_ROOT:-..}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
  #     - loki_data:/loki
  #   command: -config.file=/etc/loki/local-config.yaml
  #   restart: unless-stopped

  # Promtail - Log collector
  # Mounts the shared app log directory, the Docker container log
  # directory (read-only) and the Docker socket.
  promtail:
    image: grafana/promtail:latest
    container_name: ml-experiments-promtail
    volumes:
      - ${FETCHML_REPO_ROOT:-..}/monitoring/promtail-config.yml:/etc/promtail/config.yml
      - ${FETCHML_REPO_ROOT:-..}/data/dev/logs:/var/log/app
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock
    command: -config.file=/etc/promtail/config.yml
    restart: unless-stopped
    # depends_on:
    #   - loki

# Named volumes. prometheus_data, grafana_data and loki_data are only
# referenced by the commented-out monitoring services above.
volumes:
  redis_data:
    driver: local
  prometheus_data:
    driver: local
  grafana_data:
    driver: local
  loki_data:
    driver: local