fetch_ml/configs/worker/worker-prod.toml
Jeremie Fraeys 86f9ae5a7e
docs(config): reorganize configuration structure and add documentation
Restructure configuration files for better organization:
- Add scheduler configuration examples (scheduler.yaml.example)
- Reorganize worker configs into subdirectories:
  - distributed/ - Multi-node cluster configurations
  - standalone/ - Single-node deployment configs
- Add environment-specific configs:
  - dev-local.yaml, docker-dev.yaml, docker-prod.yaml
  - homelab-secure.yaml, worker-prod.toml
- Add deployment configs for different security modes:
  - docker-standard.yaml, docker-hipaa.yaml, docker-dev.yaml

Add documentation:
- configs/README.md with configuration guidelines
- configs/SECURITY.md with security configuration best practices
2026-02-26 12:04:11 -05:00

47 lines
981 B
TOML

# Worker identity and the base directory for experiment data.
worker_id = "worker-prod-01"
base_path = "/data/ml-experiments"
# NOTE(review): max_workers is also set under [resources]; confirm which one
# the worker reads and keep the two values in sync.
max_workers = 4

# Redis connection. The password below is a placeholder and must be replaced
# with the real secret before this file is deployed.
redis_addr = "localhost:6379"
redis_password = "CHANGE_ME_REDIS_PASSWORD"
redis_db = 0

# SSH connection (for remote operations).
# TOML itself does not expand "~" in ssh_key; presumably the worker resolves
# it to the user's home directory — verify.
host = "localhost"
user = "ml-user"
port = 22
ssh_key = "~/.ssh/id_rsa"

# Podman configuration. No GPU is selected here: gpu_vendor is "none" and
# both device lists are empty.
podman_image = "ml-training:latest"
gpu_vendor = "none"
gpu_visible_devices = []
gpu_devices = []
container_workspace = "/workspace"
container_results = "/results"
train_script = "train.py"

# Dataset management.
auto_fetch_data = true
data_dir = "/data/datasets"
data_manager_path = "/usr/local/bin/data_manager"
dataset_cache_ttl = "24h"

# Task management. Intervals are written as duration strings ("1h", "30s"),
# except poll_interval_seconds, which is a bare integer.
task_lease_duration = "1h"
heartbeat_interval = "30s"
graceful_timeout = "5m"
poll_interval_seconds = 1
metrics_flush_interval = "10s"
# Resource settings for the worker and its Podman containers.
# NOTE(review): max_workers duplicates the root-level key of the same name;
# confirm which one the worker reads and keep the two values in sync.
[resources]
max_workers = 4
desired_rps_per_worker = 2
# podman_cpus is a quoted string while the counts above are integers.
# NOTE(review): confirm the consumer expects a string before changing the type.
podman_cpus = "4"
podman_memory = "16g"
# Metrics exporter.
# listen_addr gives only a port (9100) with no host part; presumably this
# binds on all interfaces — verify against the worker's listener setup.
[metrics]
enabled = true
listen_addr = ":9100"