# fetch_ml Production Setup Example
#
# This configuration provides a production-ready multi-worker setup
# with the scheduler managing job distribution across workers.
#
# Usage:
#   1. Copy this file to your working directory
#   2. Adjust values for your environment
#   3. Deploy with: docker-compose -f docker-compose.prod.yml up -d
#
# This file bundles two separate configs (API server and worker) for
# reference; split them into the files named in each section header.
#
# NOTE(review): the nesting of scheduler.queue, scheduler.worker_discovery,
# worker.autoscale and sandbox.privileged / sandbox.seccomp_profile should be
# confirmed against the config loader's schema before relying on this layout.

# =============================================================================
# API SERVER CONFIGURATION
# =============================================================================
# File: configs/api/multi-user.yaml

resources:
  max_workers: 4             # Maximum concurrent workers
  desired_rps_per_worker: 3  # Target requests per second per worker
  podman_cpus: "2"           # CPU limit per Podman container
  podman_memory: "4Gi"       # Memory limit per Podman container

scheduler:
  enabled: true  # Enable the job scheduler
  # Job distribution strategy.
  # Options: round-robin, least-loaded, priority
  strategy: "round-robin"
  max_concurrent_jobs: 16  # Max jobs running across all workers

  queue:
    type: "redis"             # Queue backend
    redis_addr: "redis:6379"  # Redis connection address
    # For Redis Cluster:
    # redis_cluster_addrs: "redis-node-1:6379,redis-node-2:6379,redis-node-3:6379"

  worker_discovery:
    mode: "dynamic"              # Dynamic worker discovery
    heartbeat_timeout: "30s"     # Worker considered dead after this timeout
    health_check_interval: "10s" # Health check frequency

# =============================================================================
# WORKER CONFIGURATION
# =============================================================================
# File: configs/worker/docker-prod.yaml

backend:
  type: "redis"  # Must match scheduler queue type
  redis:
    addr: "redis:6379"  # Redis address
    password: ""        # Set via REDIS_PASSWORD env var
    db: 0

worker:
  id: "${FETCHML_WORKER_ID}"  # Unique worker ID - MUST be unique per worker!
  mode: "distributed"         # Connect to scheduler via Redis
  heartbeat_interval: "10s"   # How often to send heartbeats
  max_concurrent_jobs: 4      # Jobs this worker can run simultaneously

  # Auto-scaling settings (optional)
  autoscale:
    enabled: false          # Enable worker auto-scaling
    min_workers: 2          # Minimum workers to maintain
    max_workers: 8          # Maximum workers to scale to
    target_queue_depth: 10  # Scale up when queue exceeds this

sandbox:
  type: "podman"
  podman:
    socket: "/run/podman/podman.sock"
    cpus: "2"
    memory: "4Gi"

  # Security options
  privileged: true     # Required for Podman-in-Docker
  seccomp_profile: ""  # Optional: custom seccomp profile

# =============================================================================
# DEPLOYMENT EXAMPLES
# =============================================================================

# Example 1: Docker Compose (Recommended for single node)
# --------------------------------------------------------
# Save as docker-compose.yml and run:
#   docker-compose up -d
#
# NOTE(review): this example does not set FETCHML_WORKER_ID, yet the worker
# config requires a unique ID per worker. Confirm how each replica obtains
# its ID before running more than one worker this way.
#
# services:
#   api-server:
#     image: fetchml-api:latest
#     ports:
#       - "9101:9101"
#     volumes:
#       - ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro
#     depends_on:
#       - redis
#
#   worker:
#     image: fetchml-worker:latest
#     volumes:
#       - ./configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro
#     depends_on:
#       - api-server
#       - redis
#     deploy:
#       replicas: 4  # Start with 4 workers
#     privileged: true
#
#   redis:
#     image: redis:7-alpine
#     volumes:
#       - redis_data:/data
#
# volumes:
#   redis_data:

# Example 2: Kubernetes Deployment
# --------------------------------
# Use the Helm chart or create deployments manually.
# apps/v1 Deployments require spec.selector to match the pod template labels.
#
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: fetchml-api
# spec:
#   replicas: 1
#   selector:
#     matchLabels:
#       app: fetchml-api
#   template:
#     metadata:
#       labels:
#         app: fetchml-api
#     spec:
#       containers:
#         - name: api
#           image: fetchml-api:latest
#           env:
#             - name: FETCHML_CONFIG_PATH
#               value: /app/configs/prod.yaml
# ---
# apiVersion: apps/v1
# kind: Deployment
# metadata:
#   name: fetchml-worker
# spec:
#   replicas: 4
#   selector:
#     matchLabels:
#       app: fetchml-worker
#   template:
#     metadata:
#       labels:
#         app: fetchml-worker
#     spec:
#       containers:
#         - name: worker
#           image: fetchml-worker:latest
#           env:
#             # The pod name is unique per replica, so it serves as worker ID.
#             - name: FETCHML_WORKER_ID
#               valueFrom:
#                 fieldRef:
#                   fieldPath: metadata.name
#             - name: FETCHML_CONFIG_PATH
#               value: /app/configs/worker.yaml
#           securityContext:
#             privileged: true

# Example 3: Systemd Service (Bare Metal)
# ----------------------------------------
# /etc/systemd/system/fetchml-api.service:
#
# [Unit]
# Description=FetchML API Server
# After=network.target redis.service
#
# [Service]
# ExecStart=/usr/local/bin/fetchml-api -config /etc/fetchml/api.yaml
# Restart=always
# User=fetchml
# Group=fetchml
#
# [Install]
# WantedBy=multi-user.target
#
# /etc/systemd/system/fetchml-worker@.service (template):
#
# [Unit]
# Description=FetchML Worker %i
# After=fetchml-api.service
#
# [Service]
# ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetchml/worker.yaml
# Environment="FETCHML_WORKER_ID=worker-%i"
# Restart=always
#
# [Install]
# WantedBy=multi-user.target
#
# Enable 4 workers:
#   systemctl enable fetchml-worker@{1..4}

# =============================================================================
# MONITORING & MAINTENANCE
# =============================================================================

# Check worker status
#   $ curl http://localhost:9101/api/v1/workers

# View Redis queue depth
#   $ redis-cli LLEN fetchml:queue:pending

# View worker heartbeats
#   $ redis-cli HGETALL fetchml:worker:heartbeats

# Restart all workers
#   $ docker-compose restart worker

# Scale workers dynamically
#   $ docker-compose up -d --scale worker=8

# =============================================================================
# TROUBLESHOOTING
# =============================================================================

# Workers not appearing in API:
#   1. Check Redis connection: redis-cli ping
#   2. Verify worker logs: docker logs CONTAINER
#      (or: docker-compose logs worker)
#   3. Check FETCHML_WORKER_ID is unique per worker
#   4. Verify worker mode is "distributed" not "standalone"

# Jobs stuck in queue:
#   1. Check that worker capacity is not exceeded
#   2. Verify workers are healthy: docker ps
#   3. Check Redis queue: redis-cli LRANGE fetchml:queue:pending 0 10
#   4. Review worker logs for errors

# High job latency:
#   1. Consider increasing max_concurrent_jobs per worker
#   2. Scale to more workers: docker-compose up -d --scale worker=8
#   3. Check if jobs are CPU-bound (increase podman_cpus)
#   4. Consider using "least-loaded" strategy instead of "round-robin"

# =============================================================================
# SECURITY BEST PRACTICES
# =============================================================================

# 1. Firewall Redis - never expose it to the public internet
# 2. Use a Redis AUTH password in production
# 3. Run workers with minimal privileges (use a custom seccomp profile)
# 4. Enable TLS for the API server in production
# 5. Use separate networks for API/Worker/Redis communication
# 6. Apply regular security updates for Podman and container images