- Remove redundant config examples (distributed/, standalone/, examples/) - Delete dev-local.yaml variants (use dev.yaml with env vars) - Delete prod.yaml (use multi-user.yaml or homelab-secure.yaml) - Clean up worker configs: remove docker.yaml, homelab-sandbox.yaml - Update remaining configs with current best practices - Simplify config schema and documentation
236 lines
7.2 KiB
Text
236 lines
7.2 KiB
Text
# fetch_ml Production Setup Example
|
|
#
|
|
# This configuration provides a production-ready multi-worker setup
|
|
# with the scheduler managing job distribution across workers.
|
|
#
|
|
# Usage:
|
|
# 1. Copy this file to your working directory
|
|
# 2. Adjust values for your environment
|
|
# 3. Deploy with: docker-compose -f docker-compose.prod.yml up -d (or plain `docker-compose up -d` if you saved Example 1 below as docker-compose.yml)
|
|
|
|
# =============================================================================
|
|
# API SERVER CONFIGURATION
|
|
# =============================================================================
|
|
# File: configs/api/multi-user.yaml
|
|
|
|
# Capacity and per-container limits (API server configuration).
resources:
  # Upper bound on workers running concurrently
  max_workers: 4
  # Requests per second each worker is expected to sustain
  desired_rps_per_worker: 3
  # CPU limit applied to each Podman container
  podman_cpus: "2"
  # Memory limit applied to each Podman container
  podman_memory: "4Gi"
|
|
|
|
# Job scheduler settings (API server configuration).
scheduler:
  # Turn the job scheduler on
  enabled: true
  # Job distribution strategy; one of: round-robin, least-loaded, priority
  strategy: "round-robin"
  # Cap on jobs running across all workers combined
  max_concurrent_jobs: 16
|
|
|
|
# Queue backend settings (API server configuration).
queue:
  # Which queue backend to use
  type: "redis"
  # Address used to connect to Redis (host:port)
  redis_addr: "redis:6379"
  # For Redis Cluster:
  # redis_cluster_addrs: "redis-node-1:6379,redis-node-2:6379,redis-node-3:6379"
|
|
|
|
# Worker discovery and liveness settings (API server configuration).
worker_discovery:
  # Workers are discovered dynamically
  mode: "dynamic"
  # A worker is considered dead once this timeout elapses
  heartbeat_timeout: "30s"
  # How often health checks run
  health_check_interval: "10s"
|
|
|
|
# =============================================================================
|
|
# WORKER CONFIGURATION
|
|
# =============================================================================
|
|
# File: configs/worker/docker-prod.yaml
|
|
|
|
# Backend the worker uses (worker configuration).
# NOTE(review): indentation was lost in this view; `redis:` and the keys after
# it are presumably nested under `backend:` - confirm against the real file.
backend:
|
|
type: "redis" # Backend kind; must match the scheduler queue type
|
|
redis:
|
|
addr: "redis:6379" # Redis address (host:port)
|
|
password: "" # Left empty here; set via the REDIS_PASSWORD env var
|
|
db: 0 # presumably the Redis logical database number - TODO confirm
|
|
|
|
# Worker identity, mode, heartbeat and capacity, followed by auto-scaling.
# NOTE(review): indentation was lost in this view; whether `autoscale:` nests
# under `worker:` or is a top-level key cannot be determined here - confirm
# against the real file.
worker:
|
|
id: "${FETCHML_WORKER_ID}" # Unique worker ID, expanded from the FETCHML_WORKER_ID env var - MUST be unique per worker!
|
|
mode: "distributed" # Connect to scheduler via Redis
|
|
heartbeat_interval: "10s" # How often this worker sends heartbeats
|
|
max_concurrent_jobs: 4 # Jobs this worker can run simultaneously
|
|
|
|
# Auto-scaling settings (optional)
|
|
autoscale:
|
|
enabled: false # Enable worker auto-scaling (off in this example)
|
|
min_workers: 2 # Minimum workers to maintain
|
|
max_workers: 8 # Maximum workers to scale to
|
|
target_queue_depth: 10 # Scale up when queue depth exceeds this value
|
|
|
|
# Sandbox settings (worker configuration); this example selects Podman.
# NOTE(review): indentation was lost in this view; `privileged` and
# `seccomp_profile` presumably belong to `podman:` but may sit directly under
# `sandbox:` - confirm against the real file.
sandbox:
|
|
type: "podman" # Sandbox backend
|
|
podman:
|
|
socket: "/run/podman/podman.sock" # Podman socket path
|
|
cpus: "2" # presumably the per-container CPU limit - TODO confirm
|
|
memory: "4Gi" # presumably the per-container memory limit - TODO confirm
|
|
# Security options
|
|
privileged: true # Required for Podman-in-Docker
|
|
seccomp_profile: "" # Optional: custom seccomp profile (none set in this example)
|
|
|
|
# =============================================================================
|
|
# DEPLOYMENT EXAMPLES
|
|
# =============================================================================
|
|
|
|
# Example 1: Docker Compose (Recommended for single node)
|
|
# --------------------------------------------------------
|
|
# Save as docker-compose.yml and run:
|
|
# docker-compose up -d
|
|
#
|
|
# services:
|
|
# api-server:
|
|
# image: fetchml-api:latest
|
|
# ports:
|
|
# - "9101:9101"
|
|
# volumes:
|
|
# - ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro
|
|
# depends_on:
|
|
# - redis
|
|
#
|
|
# worker:
|
|
# image: fetchml-worker:latest
|
|
# volumes:
|
|
# - ./configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro
|
|
# depends_on:
|
|
# - api-server
|
|
# - redis
|
|
# deploy:
|
|
# replicas: 4 # Start with 4 workers
|
|
# privileged: true
|
|
#
|
|
# redis:
|
|
# image: redis:7-alpine
|
|
# volumes:
|
|
# - redis_data:/data
|
|
#
|
|
# volumes:
|
|
# redis_data:
|
|
|
|
# Example 2: Kubernetes Deployment
|
|
# --------------------------------
|
|
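# Use the Helm chart or create deployments manually (abridged manifests: add spec.selector, matching pod template labels, and a config volume mount before applying):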
|
|
#
|
|
# apiVersion: apps/v1
|
|
# kind: Deployment
|
|
# metadata:
|
|
# name: fetchml-api
|
|
# spec:
|
|
# replicas: 1
|
|
# template:
|
|
# spec:
|
|
# containers:
|
|
# - name: api
|
|
# image: fetchml-api:latest
|
|
# env:
|
|
# - name: FETCHML_CONFIG_PATH
|
|
# value: /app/configs/prod.yaml
|
|
# ---
|
|
# apiVersion: apps/v1
|
|
# kind: Deployment
|
|
# metadata:
|
|
# name: fetchml-worker
|
|
# spec:
|
|
# replicas: 4
|
|
# template:
|
|
# spec:
|
|
# containers:
|
|
# - name: worker
|
|
# image: fetchml-worker:latest
|
|
# env:
|
|
# - name: FETCHML_WORKER_ID
|
|
# valueFrom:
|
|
# fieldRef:
|
|
# fieldPath: metadata.name
|
|
# - name: FETCHML_CONFIG_PATH
|
|
# value: /app/configs/worker.yaml
|
|
# securityContext:
|
|
# privileged: true
|
|
|
|
# Example 3: Systemd Service (Bare Metal)
|
|
# ----------------------------------------
|
|
# /etc/systemd/system/fetchml-api.service:
|
|
#
|
|
# [Unit]
|
|
# Description=FetchML API Server
|
|
# After=network.target redis.service
|
|
#
|
|
# [Service]
|
|
# ExecStart=/usr/local/bin/fetchml-api -config /etc/fetchml/api.yaml
|
|
# Restart=always
|
|
# User=fetchml
|
|
# Group=fetchml
|
|
#
|
|
# [Install]
|
|
# WantedBy=multi-user.target
|
|
#
|
|
# /etc/systemd/system/fetchml-worker@.service (template):
|
|
#
|
|
# [Unit]
|
|
# Description=FetchML Worker %i
|
|
# After=fetchml-api.service
|
|
#
|
|
# [Service]
|
|
# ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetchml/worker.yaml
|
|
# Environment="FETCHML_WORKER_ID=worker-%i"
|
|
# Restart=always
|
|
#
|
|
# [Install]
|
|
# WantedBy=multi-user.target
|
|
#
|
|
# Enable 4 workers:
|
|
# systemctl enable fetchml-worker@{1..4}
|
|
|
|
# =============================================================================
|
|
# MONITORING & MAINTENANCE
|
|
# =============================================================================
|
|
|
|
# Check worker status
|
|
# $ curl http://localhost:9101/api/v1/workers
|
|
|
|
# View Redis queue depth
|
|
# $ redis-cli LLEN fetchml:queue:pending
|
|
|
|
# View worker heartbeats
|
|
# $ redis-cli HGETALL fetchml:worker:heartbeats
|
|
|
|
# Restart all workers
|
|
# $ docker-compose restart worker
|
|
|
|
# Scale workers dynamically
|
|
# $ docker-compose up -d --scale worker=8
|
|
|
|
# =============================================================================
|
|
# TROUBLESHOOTING
|
|
# =============================================================================
|
|
|
|
# Workers not appearing in API:
|
|
# 1. Check Redis connection: redis-cli ping
|
|
# 2. Verify worker logs: docker logs <worker-container>
|
|
# 3. Check FETCHML_WORKER_ID is set and unique per worker (the Docker Compose example above does not set it explicitly)
|
|
# 4. Verify worker mode is "distributed" not "standalone"
|
|
|
|
# Jobs stuck in queue:
|
|
# 1. Check that worker capacity is not exceeded
|
|
# 2. Verify workers are healthy: docker ps
|
|
# 3. Check Redis queue: redis-cli LRANGE fetchml:queue:pending 0 10
|
|
# 4. Review worker logs for errors
|
|
|
|
# High job latency:
|
|
# 1. Consider increasing max_concurrent_jobs per worker
|
|
# 2. Scale to more workers: docker-compose up -d --scale worker=8
|
|
# 3. Check if jobs are CPU-bound (increase podman_cpus)
|
|
# 4. Consider using "least-loaded" strategy instead of "round-robin"
|
|
|
|
# =============================================================================
|
|
# SECURITY BEST PRACTICES
|
|
# =============================================================================
|
|
|
|
# 1. Firewall Redis - never expose it to the public internet
|
|
# 2. Use Redis AUTH password in production
|
|
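# 3. Run workers with minimal privileges (use custom seccomp profile); note that a privileged container, as used for Podman-in-Docker above, runs with seccomp filtering disabled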
|
|
# 4. Enable TLS for API server in production
|
|
# 5. Use separate networks for API/Worker/Redis communication
|
|
# 6. Apply regular security updates to Podman and container images
|