chore(config): update configurations and deployment scripts

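- Update API server and worker config schemas (add the `filesystem` queue backend, `filesystem_path`, and `fallback_to_filesystem`)
- Refine Docker Compose configurations (dev: Prometheus/Grafana/Loki commented out; prod: Caddy and MinIO services removed, Traefik labels and network added)
- Update deployment scripts (resolve compose and .env paths from FETCHML_REPO_ROOT) and documentation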
This commit is contained in:
Jeremie Fraeys 2026-02-12 12:05:37 -05:00
parent 5144d291cb
commit 2209ae24c6
No known key found for this signature in database
7 changed files with 342 additions and 402 deletions

View file

@ -1,227 +1,233 @@
# Fetch ML Configuration Schema (JSON Schema expressed as YAML)
# NOTE(review): this listing has lost its indentation; nesting must be inferred from key order.
$schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML API Server Configuration"
# Root value is an object; keys not declared under `properties` are rejected.
type: object
additionalProperties: false
# Only `auth` and `server` are mandatory at the root.
required:
- auth
- server
properties:
# Optional string paths, each carrying a documented default.
base_path:
type: string
description: Base path for experiment data
default: "/tmp/ml-experiments"
data_dir:
type: string
description: Data directory (datasets/snapshots) for integrity validation
default: "/data/active"
# Authentication settings. `enabled` is the only mandatory field.
auth:
type: object
additionalProperties: false
required:
- enabled
properties:
enabled:
type: boolean
description: Enable or disable authentication
# Registry keyed by arbitrary names: the names themselves are not constrained,
# but every entry must be an object that carries `hash`.
api_keys:
type: object
description: API key registry
additionalProperties:
type: object
additionalProperties: false
required:
- hash
properties:
hash:
type: string
description: SHA256 hash of the API key
admin:
type: boolean
default: false
roles:
type: array
items:
type: string
# Free-form map of permission name to boolean.
permissions:
type: object
additionalProperties:
type: boolean
# HTTP listener settings. `address` is mandatory.
server:
type: object
additionalProperties: false
required: [address]
properties:
address:
type: string
description: Listen address, e.g. ":9101"
# TLS options; none of the three fields is marked required by this schema.
tls:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
default: false
cert_file:
type: string
key_file:
type: string
# Monitoring settings: a Prometheus section and a health-check section, both optional.
monitoring:
type: object
additionalProperties: false
properties:
prometheus:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
# Port is limited to the valid TCP range.
port:
type: integer
minimum: 1
maximum: 65535
path:
type: string
health_checks:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
# Plain string; the schema does not constrain its format.
interval:
type: string
# Database settings. No field is required; `type` is restricted to three engines.
database:
type: object
additionalProperties: false
properties:
type:
type: string
enum: [sqlite, postgres, mysql]
default: sqlite
connection:
type: string
host:
type: string
# Port is limited to the valid TCP range.
port:
type: integer
minimum: 1
maximum: 65535
username:
type: string
password:
type: string
database:
type: string
# Redis connection settings. No field is required.
redis:
type: object
additionalProperties: false
properties:
# The URL must begin with the literal scheme `redis://`.
url:
type: string
pattern: "^redis://"
addr:
type: string
description: Optional host:port shorthand for Redis
password:
type: string
# Database index; negative values are rejected.
db:
type: integer
minimum: 0
default: 0
# Job queue settings: `backend` chooses between the two listed engines.
queue:
type: object
additionalProperties: false
properties:
backend:
type: string
enum: [redis, sqlite]
default: redis
# Not conditionally required by this schema, even when backend is sqlite.
sqlite_path:
type: string
# Logging settings: `level` is limited to four values; the two paths are free-form strings.
logging:
type: object
additionalProperties: false
properties:
level:
type: string
enum: [debug, info, warn, error]
default: "info"
file:
type: string
audit_log:
type: string
# Security settings.
# NOTE(review): indentation is lost in this listing, so which of the keys below are
# nested under `security` (versus being siblings of it) cannot be confirmed from here.
security:
type: object
additionalProperties: false
properties:
production_mode:
type: boolean
default: false
allowed_origins:
type: array
items:
type: string
# Zero is accepted by the schema; its meaning is not stated here.
api_key_rotation_days:
type: integer
minimum: 0
audit_logging:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
log_path:
type: string
ip_whitelist:
type: array
items:
type: string
# Lockout policy; `max_attempts` must be at least 1.
failed_login_lockout:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
max_attempts:
type: integer
minimum: 1
lockout_duration:
type: string
description: Duration string, e.g. "15m"
# Request rate limiting; both numeric limits must be at least 1.
rate_limit:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
default: false
requests_per_minute:
type: integer
minimum: 1
default: 60
burst_size:
type: integer
minimum: 1
# Resource limits. All fields are optional.
resources:
type: object
description: Resource configuration
additionalProperties: false
properties:
max_workers:
type: integer
minimum: 1
default: 1
desired_rps_per_worker:
type: integer
minimum: 1
requests_per_sec:
type: integer
minimum: 1
# Strings rather than numbers; the schema does not constrain their format.
podman_cpus:
type: string
podman_memory:
type: string
# Unlike the other integer fields here, zero is permitted.
request_burst:
type: integer
minimum: 0
# Fetch ML Configuration Schema (JSON Schema expressed as YAML)
# NOTE(review): this listing has lost its indentation; nesting must be inferred from key order.
$schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML API Server Configuration"
# Root value is an object; keys not declared under `properties` are rejected.
type: object
additionalProperties: false
# Only `auth` and `server` are mandatory at the root.
required:
- auth
- server
properties:
# Optional string paths, each carrying a documented default.
base_path:
type: string
description: Base path for experiment data
default: "/tmp/ml-experiments"
data_dir:
type: string
description: Data directory (datasets/snapshots) for integrity validation
default: "/data/active"
# Authentication settings. `enabled` is the only mandatory field.
auth:
type: object
additionalProperties: false
required:
- enabled
properties:
enabled:
type: boolean
description: Enable or disable authentication
# Registry keyed by arbitrary names: the names themselves are not constrained,
# but every entry must be an object that carries `hash`.
api_keys:
type: object
description: API key registry
additionalProperties:
type: object
additionalProperties: false
required:
- hash
properties:
hash:
type: string
description: SHA256 hash of the API key
admin:
type: boolean
default: false
roles:
type: array
items:
type: string
# Free-form map of permission name to boolean.
permissions:
type: object
additionalProperties:
type: boolean
# HTTP listener settings. `address` is mandatory.
server:
type: object
additionalProperties: false
required: [address]
properties:
address:
type: string
description: Listen address, e.g. ":9101"
# TLS options; none of the three fields is marked required by this schema.
tls:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
default: false
cert_file:
type: string
key_file:
type: string
# Monitoring settings: a Prometheus section and a health-check section, both optional.
monitoring:
type: object
additionalProperties: false
properties:
prometheus:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
# Port is limited to the valid TCP range.
port:
type: integer
minimum: 1
maximum: 65535
path:
type: string
health_checks:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
# Plain string; the schema does not constrain its format.
interval:
type: string
# Database settings. No field is required; `type` is restricted to three engines.
database:
type: object
additionalProperties: false
properties:
type:
type: string
enum: [sqlite, postgres, mysql]
default: sqlite
connection:
type: string
host:
type: string
# Port is limited to the valid TCP range.
port:
type: integer
minimum: 1
maximum: 65535
username:
type: string
password:
type: string
database:
type: string
# Redis connection settings. No field is required.
redis:
  type: object
  additionalProperties: false
  properties:
    url:
      type: string
      # Accept both the plain (`redis://`) and TLS (`rediss://`) URL schemes.
      # NOTE(review): confirm the server's Redis client parses `rediss://` before relying on it.
      pattern: "^rediss?://"
    addr:
      type: string
      description: Optional host:port shorthand for Redis
    password:
      type: string
    # Database index; negative values are rejected.
    db:
      type: integer
      minimum: 0
      default: 0
# Job queue settings: `backend` chooses one of the three listed engines.
# Property descriptions use the same wording as the worker schema's queue section.
queue:
  type: object
  additionalProperties: false
  properties:
    backend:
      type: string
      enum: [redis, sqlite, filesystem]
      default: redis
    sqlite_path:
      type: string
      description: Path to queue.db (sqlite backend only)
    filesystem_path:
      type: string
      description: Base directory for filesystem queue state
    fallback_to_filesystem:
      type: boolean
      default: false
      description: If true, fall back to filesystem queue when primary backend is unavailable
# Logging settings: `level` is limited to four values; the two paths are free-form strings.
logging:
type: object
additionalProperties: false
properties:
level:
type: string
enum: [debug, info, warn, error]
default: "info"
file:
type: string
audit_log:
type: string
# Security settings.
# NOTE(review): indentation is lost in this listing, so which of the keys below are
# nested under `security` (versus being siblings of it) cannot be confirmed from here.
security:
type: object
additionalProperties: false
properties:
production_mode:
type: boolean
default: false
allowed_origins:
type: array
items:
type: string
# Zero is accepted by the schema; its meaning is not stated here.
api_key_rotation_days:
type: integer
minimum: 0
audit_logging:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
log_path:
type: string
ip_whitelist:
type: array
items:
type: string
# Lockout policy; `max_attempts` must be at least 1.
failed_login_lockout:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
max_attempts:
type: integer
minimum: 1
lockout_duration:
type: string
description: Duration string, e.g. "15m"
# Request rate limiting; both numeric limits must be at least 1.
rate_limit:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
default: false
requests_per_minute:
type: integer
minimum: 1
default: 60
burst_size:
type: integer
minimum: 1
# Resource limits. All fields are optional.
resources:
type: object
description: Resource configuration
additionalProperties: false
properties:
max_workers:
type: integer
minimum: 1
default: 1
desired_rps_per_worker:
type: integer
minimum: 1
requests_per_sec:
type: integer
minimum: 1
# Strings rather than numbers; the schema does not constrain their format.
podman_cpus:
type: string
podman_memory:
type: string
# Unlike the other integer fields here, zero is permitted.
request_burst:
type: integer
minimum: 0

View file

@ -17,6 +17,17 @@ allOf:
properties:
queue:
required: [sqlite_path]
# Filesystem backend: `filesystem_path` becomes mandatory.
- if:
    properties:
      queue:
        # `properties` passes vacuously when `backend` is absent, so require the key
        # before treating the configuration as filesystem-backed.
        required: [backend]
        properties:
          backend:
            const: filesystem
    required: [queue]
  then:
    properties:
      queue:
        required: [filesystem_path]
# Guard for the Redis requirement. The `else:` that follows this item demands Redis
# settings (e.g. `redis_addr`); it must apply only when the backend is neither sqlite
# nor filesystem. Attaching that `else` to the filesystem conditional alone would force
# sqlite-backed configurations to supply Redis settings as well.
- if:
    properties:
      queue:
        required: [backend]
        properties:
          backend:
            enum: [sqlite, filesystem]
    required: [queue]
else:
anyOf:
- required: [redis_addr]
@ -70,11 +81,18 @@ properties:
properties:
backend:
type: string
enum: [redis, sqlite]
enum: [redis, sqlite, filesystem]
default: redis
sqlite_path:
type: string
description: Path to queue.db (sqlite backend only)
filesystem_path:
type: string
description: Base directory for filesystem queue state
fallback_to_filesystem:
type: boolean
default: false
description: If true, fall back to filesystem queue when primary backend is unavailable
known_hosts:
type: string
description: Path to SSH known hosts file

View file

@ -18,8 +18,8 @@ data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
endpoint: "blizzard.jfraeys.com"
secure: true
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "5m"

View file

@ -1,47 +0,0 @@
# Global options: ACME contact e-mail comes from the CADDY_EMAIL environment variable,
# the admin endpoint is turned off, and only the h1 and h2 protocols are enabled.
{
email {$CADDY_EMAIL}
admin off
servers {
protocols h1 h2
}
}
# Site block; the served hostname comes from the FETCHML_DOMAIN environment variable.
{$FETCHML_DOMAIN} {
encode gzip
# Cap request bodies at 10MB.
request_body {
max_size 10MB
}
# Response headers: drop `Server` and set the listed security headers.
header {
-Server
X-Frame-Options "DENY"
X-Content-Type-Options "nosniff"
Referrer-Policy "strict-origin-when-cross-origin"
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
}
# Named matchers: any path under /admin/, and clients in private address ranges.
@admin path /admin/*
@admin_private remote_ip private_ranges
# Both responses below are 404, so /admin/* is answered with 404 for every client.
# NOTE(review): the @admin_private branch looks redundant as written — confirm intent.
handle @admin {
respond @admin_private 404
respond 404
}
# Only /health, /ws* and /api/* are forwarded to the API server on port 9101.
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
# Fallback: anything not matched above gets 404.
handle {
respond 404
}
}

View file

@ -3,6 +3,10 @@
set -e
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
export FETCHML_REPO_ROOT="${FETCHML_REPO_ROOT:-${REPO_ROOT}}"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@ -56,13 +60,13 @@ check_compose_file() {
case $env in
"dev")
compose_file="deployments/docker-compose.dev.yml"
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.dev.yml"
;;
"secure")
compose_file="deployments/docker-compose.homelab-secure.yml"
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.homelab-secure.yml"
;;
"prod")
compose_file="deployments/docker-compose.prod.yml"
compose_file="${FETCHML_REPO_ROOT}/deployments/docker-compose.prod.yml"
;;
*)
print_error "Unknown environment: $env"
@ -83,14 +87,14 @@ check_compose_file() {
check_env_file() {
local env=$1
if [ ! -f ".env" ]; then
if [ ! -f "${FETCHML_REPO_ROOT}/.env" ]; then
print_warning ".env file not found. Creating from example..."
if [ "$env" = "dev" ]; then
cp deployments/env.dev.example .env
cp "${FETCHML_REPO_ROOT}/deployments/env.dev.example" "${FETCHML_REPO_ROOT}/.env"
elif [ "$env" = "prod" ]; then
cp deployments/env.prod.example .env
cp "${FETCHML_REPO_ROOT}/deployments/env.prod.example" "${FETCHML_REPO_ROOT}/.env"
else
cp deployments/env.dev.example .env
cp "${FETCHML_REPO_ROOT}/deployments/env.dev.example" "${FETCHML_REPO_ROOT}/.env"
fi
print_warning "Please edit .env file with your configuration"
fi
@ -120,7 +124,7 @@ main() {
case $action in
"up")
print_status "Starting $environment environment..."
docker-compose -f "$compose_file" up -d
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" up -d
print_success "$environment environment started successfully!"
# Show service URLs
@ -134,21 +138,21 @@ main() {
;;
"down")
print_status "Stopping $environment environment..."
docker-compose -f "$compose_file" down
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" down
print_success "$environment environment stopped successfully!"
;;
"restart")
print_status "Restarting $environment environment..."
docker-compose -f "$compose_file" restart
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" restart
print_success "$environment environment restarted successfully!"
;;
"logs")
print_status "Showing logs for $environment environment..."
docker-compose -f "$compose_file" logs -f
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" logs -f
;;
"status")
print_status "Status of $environment environment:"
docker-compose -f "$compose_file" ps
docker-compose --project-directory "${FETCHML_REPO_ROOT}" -f "$compose_file" ps
;;
*)
print_error "Unknown action: $action"

View file

@ -1,6 +1,6 @@
---
# Homelab Docker Compose with Centralized Monitoring
# Includes: API, Redis, Prometheus, Grafana, Loki
services:
caddy:
image: caddy:2-alpine
@ -16,7 +16,6 @@ services:
depends_on:
api-server:
condition: service_healthy
redis:
image: redis:7-alpine
container_name: ml-experiments-redis
@ -28,19 +27,20 @@ services:
restart: unless-stopped
command: redis-server --appendonly yes
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
test: ["CMD", "redis-cli", "ping"]
interval: 30s
timeout: 10s
retries: 3
api-server:
build:
context: ${FETCHML_REPO_ROOT:-.}
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-experiments-api
user: "0:0"
ports:
- "9101:9101"
expose:
- "9101" # API and health endpoints (internal; external access via Caddy)
- "9101" # API and health endpoints (internal; external access via Caddy)
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
@ -55,7 +55,7 @@ services:
environment:
- LOG_LEVEL=info
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
interval: 30s
timeout: 10s
retries: 3
@ -63,7 +63,6 @@ services:
labels:
logging: "promtail"
job: "api-server"
minio:
image: minio/minio:latest
container_name: ml-experiments-minio
@ -82,7 +81,6 @@ services:
timeout: 5s
retries: 10
restart: unless-stopped
minio-init:
image: alpine:3.19
container_name: ml-experiments-minio-init
@ -120,7 +118,6 @@ services:
SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
restart: "no"
worker:
build:
context: ${FETCHML_REPO_ROOT:-.}
@ -148,60 +145,58 @@ services:
- LOG_LEVEL=info
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin123
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/minimal-notebook:latest
- FETCHML_JUPYTER_CONDA_ENV=base
- FETCHML_JUPYTER_KERNEL_NAME=python
- FETCHML_PODMAN_CGROUPS=disabled
privileged: true
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
# Prometheus - Metrics collection
prometheus:
image: prom/prometheus:latest
container_name: ml-experiments-prometheus
ports:
- "9090:9090"
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# Grafana - Visualization
grafana:
image: grafana/grafana:latest
container_name: ml-experiments-grafana
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
- GF_USERS_ALLOW_SIGN_UP=false
restart: unless-stopped
depends_on:
- prometheus
- loki
# Loki - Log aggregation
loki:
image: grafana/loki:latest
container_name: ml-experiments-loki
ports:
- "3100:3100"
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
- loki_data:/loki
command: -config.file=/etc/loki/local-config.yaml
restart: unless-stopped
# # Prometheus - Metrics collection
# prometheus:
# image: prom/prometheus:latest
# container_name: ml-experiments-prometheus
# ports:
# - "9090:9090"
# volumes:
# - ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
# - prometheus_data:/prometheus
# command:
# - '--config.file=/etc/prometheus/prometheus.yml'
# - '--storage.tsdb.path=/prometheus'
# - '--web.console.libraries=/etc/prometheus/console_libraries'
# - '--web.console.templates=/etc/prometheus/consoles'
# - '--web.enable-lifecycle'
# restart: unless-stopped
#
# # Grafana - Visualization
# grafana:
# image: grafana/grafana:latest
# container_name: ml-experiments-grafana
# ports:
# - "3000:3000"
# volumes:
# - grafana_data:/var/lib/grafana
# - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
# - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
# environment:
# - GF_SECURITY_ADMIN_PASSWORD=admin123
# - GF_USERS_ALLOW_SIGN_UP=false
# restart: unless-stopped
# depends_on:
# - prometheus
# - loki
#
# # Loki - Log aggregation
# loki:
# image: grafana/loki:latest
# container_name: ml-experiments-loki
# ports:
# - "3100:3100"
# volumes:
# - ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
# - loki_data:/loki
# command: -config.file=/etc/loki/local-config.yaml
# restart: unless-stopped
# Promtail - Log collector
promtail:
image: grafana/promtail:latest
@ -215,7 +210,6 @@ services:
restart: unless-stopped
# depends_on is disabled because the loki service is commented out in this file;
# Compose refuses to load a project whose depends_on names an undefined service.
# Restore these two lines together with the loki service definition.
# NOTE(review): promtail presumably still ships logs to loki — confirm whether
# promtail should be disabled as well while loki is off.
# depends_on:
#   - loki
volumes:
prometheus_data:
driver: local

View file

@ -1,23 +1,5 @@
# Full Production Docker Environment with Podman and SQLite
services:
caddy:
image: caddy:2-alpine
container_name: ml-prod-caddy
restart: unless-stopped
ports:
- "80:80"
- "443:443"
volumes:
- ./Caddyfile.prod:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config
environment:
- FETCHML_DOMAIN=${FETCHML_DOMAIN}
- CADDY_EMAIL=${CADDY_EMAIL}
depends_on:
api-server:
condition: service_healthy
redis:
image: redis:7-alpine
container_name: ml-prod-redis
@ -40,8 +22,8 @@ services:
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile
container_name: ml-prod-api
expose:
- "9101" # API server port (internal; external access via Caddy)
- "2222" # Secure SSH port for Podman communication (internal)
- "9101"
- "2222"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
@ -59,37 +41,17 @@ services:
timeout: 10s
retries: 3
start_period: 40s
# Start API server (ensure data_dir exists for snapshot/dataset validation)
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
minio:
image: minio/minio:latest
container_name: ml-prod-minio
expose:
- "9000"
- "9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
command: ["server", "/data", "--console-address", ":9001"]
restart: unless-stopped
minio-init:
image: alpine:3.19
container_name: ml-prod-minio-init
depends_on:
- minio
entrypoint: ["/bin/sh", "-c"]
command:
- |
apk add --no-cache ca-certificates curl >/dev/null
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
mc mb -p local/fetchml-snapshots || true
restart: "no"
labels:
- "traefik.enable=true"
- "traefik.docker.network=${TRAEFIK_NETWORK:-traefik}"
- "traefik.http.services.fetchml.loadbalancer.server.port=9101"
- "traefik.http.routers.fetchml.rule=Host(`${FETCHML_DOMAIN}`) && (PathPrefix(`/api`) || PathPrefix(`/ws`) || Path(`/health`))"
- "traefik.http.routers.fetchml.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
- "traefik.http.routers.fetchml.tls=true"
networks:
- default
- traefik
worker:
build:
@ -106,14 +68,14 @@ services:
condition: service_healthy
api-server:
condition: service_healthy
minio-init:
condition: service_started
restart: unless-stopped
environment:
- LOG_LEVEL=info
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
privileged: true # Required for Podman to work in Docker
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
- AWS_REGION=${AWS_REGION}
privileged: true
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
volumes: {}
@ -121,3 +83,6 @@ volumes: {}
networks:
default:
name: ml-prod-network
traefik:
external: true
name: ${TRAEFIK_NETWORK:-traefik}