fetch_ml/configs/examples/prod.yaml.example
Jeremie Fraeys 8a7e7695f4
config: consolidate and cleanup configuration files
- Remove redundant config examples (distributed/, standalone/, examples/)
- Delete dev-local.yaml variants (use dev.yaml with env vars)
- Delete prod.yaml (use multi-user.yaml or homelab-secure.yaml)
- Clean up worker configs: remove docker.yaml, homelab-sandbox.yaml
- Update remaining configs with current best practices
- Simplify config schema and documentation
2026-03-04 13:22:52 -05:00

236 lines
7.2 KiB
Text

# fetch_ml Production Setup Example
#
# This configuration provides a production-ready multi-worker setup
# with the scheduler managing job distribution across workers.
#
# Usage:
# 1. Copy this file to your working directory
# 2. Adjust values for your environment
# 3. Deploy with: docker-compose -f docker-compose.prod.yml up -d
# =============================================================================
# API SERVER CONFIGURATION
# =============================================================================
# File: configs/api/multi-user.yaml
# Worker pool sizing and per-container resource limits.
resources:
  max_workers: 4                  # Maximum concurrent workers
  desired_rps_per_worker: 3       # Target requests per second per worker
  podman_cpus: "2"                # CPU limit per Podman container
  podman_memory: "4Gi"            # Memory limit per Podman container

# Scheduler: manages job distribution across workers.
# NOTE(review): `queue` and `worker_discovery` are nested under `scheduler`
# as a reconstruction of the intended structure - confirm against the API
# server's config schema before deploying.
scheduler:
  enabled: true                   # Enable the job scheduler
  # Job distribution strategy.
  # Options: round-robin, least-loaded, priority
  strategy: "round-robin"
  max_concurrent_jobs: 16         # Max jobs running across all workers

  # Queue backend shared with the workers (worker backend type must match).
  queue:
    type: "redis"                 # Queue backend
    redis_addr: "redis:6379"      # Redis connection address
    # For Redis Cluster:
    # redis_cluster_addrs: "redis-node-1:6379,redis-node-2:6379,redis-node-3:6379"

  # How the scheduler finds workers and decides they are alive.
  worker_discovery:
    mode: "dynamic"               # Dynamic worker discovery
    heartbeat_timeout: "30s"      # Worker considered dead after this timeout
    health_check_interval: "10s"  # Health check frequency
# =============================================================================
# WORKER CONFIGURATION
# =============================================================================
# File: configs/worker/docker-prod.yaml
# Queue backend the worker connects to.
backend:
  type: "redis"                   # Must match scheduler queue type
  redis:
    addr: "redis:6379"            # Redis address
    password: ""                  # Set via REDIS_PASSWORD env var
    db: 0

# Identity and capacity of this worker process.
worker:
  id: "${FETCHML_WORKER_ID}"      # Unique worker ID - MUST be unique per worker!
  mode: "distributed"             # Connect to scheduler via Redis
  heartbeat_interval: "10s"       # How often to send heartbeats
  max_concurrent_jobs: 4          # Jobs this worker can run simultaneously

  # Auto-scaling settings (optional)
  # NOTE(review): placed under `worker` as a reconstruction - confirm whether
  # the schema expects `autoscale` here or at the top level.
  autoscale:
    enabled: false                # Enable worker auto-scaling
    min_workers: 2                # Minimum workers to maintain
    max_workers: 8                # Maximum workers to scale to
    target_queue_depth: 10        # Scale up when queue exceeds this

# Sandbox used to run jobs.
sandbox:
  type: "podman"
  podman:
    socket: "/run/podman/podman.sock"
    cpus: "2"
    memory: "4Gi"
    # Security options
    # NOTE(review): placed under `podman` as a reconstruction - confirm whether
    # the schema expects these keys here or directly under `sandbox`.
    privileged: true              # Required for Podman-in-Docker
    seccomp_profile: ""           # Optional: custom seccomp profile
# =============================================================================
# DEPLOYMENT EXAMPLES
# =============================================================================
# Example 1: Docker Compose (Recommended for single node)
# --------------------------------------------------------
# Save as docker-compose.yml and run:
# docker-compose up -d
#
# services:
#   api-server:
#     image: fetchml-api:latest
#     ports:
#       - "9101:9101"
#     volumes:
#       - ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro
#     depends_on:
#       - redis
#
#   worker:
#     image: fetchml-worker:latest
#     volumes:
#       - ./configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro
#     depends_on:
#       - api-server
#       - redis
#     deploy:
#       replicas: 4  # Start with 4 workers
#     privileged: true
#
#   redis:
#     image: redis:7-alpine
#     volumes:
#       - redis_data:/data
#
# volumes:
#   redis_data:
# Example 2: Kubernetes Deployment
# --------------------------------
# Use the Helm chart or create deployments manually.
# (Abbreviated manifests: apps/v1 Deployments also require spec.selector and
# matching spec.template.metadata.labels - add them before applying.)
#
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: fetchml-api
# spec:
#   replicas: 1
#   template:
#     spec:
#       containers:
#         - name: api
#           image: fetchml-api:latest
#           env:
#             - name: FETCHML_CONFIG_PATH
#               value: /app/configs/prod.yaml
# ---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: fetchml-worker
# spec:
#   replicas: 4
#   template:
#     spec:
#       containers:
#         - name: worker
#           image: fetchml-worker:latest
#           env:
#             - name: FETCHML_WORKER_ID
#               valueFrom:
#                 fieldRef:
#                   fieldPath: metadata.name
#             - name: FETCHML_CONFIG_PATH
#               value: /app/configs/worker.yaml
#           securityContext:
#             privileged: true
# Example 3: Systemd Service (Bare Metal)
# ----------------------------------------
# /etc/systemd/system/fetchml-api.service:
#
# [Unit]
# Description=FetchML API Server
# After=network.target redis.service
#
# [Service]
# ExecStart=/usr/local/bin/fetchml-api -config /etc/fetchml/api.yaml
# Restart=always
# User=fetchml
# Group=fetchml
#
# [Install]
# WantedBy=multi-user.target
#
# /etc/systemd/system/fetchml-worker@.service (template):
#
# [Unit]
# Description=FetchML Worker %i
# After=fetchml-api.service
#
# [Service]
# ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetchml/worker.yaml
# Environment="FETCHML_WORKER_ID=worker-%i"
# Restart=always
#
# [Install]
# WantedBy=multi-user.target
#
# Enable 4 workers:
# systemctl enable fetchml-worker@{1..4}
# =============================================================================
# MONITORING & MAINTENANCE
# =============================================================================
# Check worker status
# $ curl http://localhost:9101/api/v1/workers
# View Redis queue depth
# $ redis-cli LLEN fetchml:queue:pending
# View worker heartbeats
# $ redis-cli HGETALL fetchml:worker:heartbeats
# Restart all workers
# $ docker-compose restart worker
# Scale workers dynamically
# $ docker-compose up -d --scale worker=8
# =============================================================================
# TROUBLESHOOTING
# =============================================================================
# Workers not appearing in API:
# 1. Check Redis connection: redis-cli ping
# 2. Verify worker logs: docker logs <worker-container>
# 3. Check FETCHML_WORKER_ID is unique per worker
# 4. Verify worker mode is "distributed" not "standalone"
# Jobs stuck in queue:
# 1. Check worker capacity not exceeded
# 2. Verify workers are healthy: docker ps
# 3. Check Redis queue: redis-cli LRANGE fetchml:queue:pending 0 10
# 4. Review worker logs for errors
# High job latency:
# 1. Consider increasing max_concurrent_jobs per worker
# 2. Scale to more workers: docker-compose up -d --scale worker=8
# 3. Check if jobs are CPU-bound (increase podman_cpus)
# 4. Consider using "least-loaded" strategy instead of "round-robin"
# =============================================================================
# SECURITY BEST PRACTICES
# =============================================================================
# 1. Firewall Redis - never expose to public internet
# 2. Use Redis AUTH password in production
# 3. Run workers with minimal privileges: keep `privileged: true` only when
#    Podman-in-Docker requires it, and use a custom seccomp profile
# 4. Enable TLS for API server in production
# 5. Use separate networks for API/Worker/Redis communication
# 6. Regular security updates for Podman and container images