# fetch_ml/configs/worker/docker-prod.yaml
#
# Provenance (from git web UI capture — kept as comments so the file parses):
#   Commit 8a7e7695f4 by Jeremie Fraeys, 2026-03-04 13:22:52 -05:00
#   config: consolidate and cleanup configuration files
#   - Remove redundant config examples (distributed/, standalone/, examples/)
#   - Delete dev-local.yaml variants (use dev.yaml with env vars)
#   - Delete prod.yaml (use multi-user.yaml or homelab-secure.yaml)
#   - Clean up worker configs: remove docker.yaml, homelab-sandbox.yaml
#   - Update remaining configs with current best practices
#   - Simplify config schema and documentation
#   (89 lines, 1.9 KiB, YAML)

---
# Production worker configuration for Docker-based deployment.
# NOTE(review): the pasted source had all indentation stripped; the nesting
# below is reconstructed from key semantics — confirm against the worker's
# config schema (in particular that metrics_flush_interval belongs under
# metrics, and that max_gpu_per_instance / max_memory_per_instance belong
# directly under each plugin rather than under security).

# Worker identity and job execution
worker_id: "docker-worker"
base_path: "/tmp/fetchml-jobs"
entrypoint: "train.py"
redis_url: "redis://redis:6379/0"
local_mode: true
max_workers: 1
poll_interval_seconds: 2

# Dataset handling
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"

# Object store for job snapshots
snapshot_store:
  enabled: true
  endpoint: "blizzard.jfraeys.com"
  secure: true
  bucket: "fetchml-snapshots"
  prefix: "snapshots"
  timeout: "5m"
  max_retries: 3

# Container runtime
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"

# GPU passthrough
gpu_vendor: "nvidia"
gpu_visible_devices: [0]
gpu_devices: ["/dev/nvidia0"]

# Per-worker resource limits
resources:
  max_workers: 1
  desired_rps_per_worker: 2
  podman_cpus: "2"
  podman_memory: "4Gi"

# Prometheus-style metrics endpoint
metrics:
  enabled: true
  listen_addr: ":9100"
  metrics_flush_interval: "500ms"

# Task lifecycle
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

# Plugin Configuration
plugins:
  # Jupyter Notebook/Lab Service
  jupyter:
    enabled: true
    image: "quay.io/jupyter/base-notebook:latest"
    default_port: 8888
    # Security settings
    security:
      trusted_channels:
        - "conda-forge"
        - "defaults"
        - "pytorch"
      blocked_packages:
        - "requests"
        - "urllib3"
        - "httpx"
      require_password: true
    # Resource limits (enforced by scheduler quota system)
    max_gpu_per_instance: 1
    max_memory_per_instance: "8Gi"
  # vLLM Inference Service
  vllm:
    enabled: true
    image: "vllm/vllm-openai:latest"
    default_port: 8000
    # Model cache location
    model_cache: "/models"
    # Supported quantization methods: awq, gptq, fp8, squeezellm
    default_quantization: ""  # empty = no quantization
    # Resource limits
    max_gpu_per_instance: 4
    max_model_len: 4096
    # Environment variables passed to container
    env:
      - "HF_HOME=/models"
      - "VLLM_WORKER_MULTIPROC_METHOD=spawn"