# Worker configuration.
#
# Keys above the first [table] header belong to the root table.
# Duration-like values ("24h", "30s", ...) are plain strings; TOML has no
# duration type, so their interpretation is up to the consuming application.

worker_id = "worker-prod-01"
base_path = "/data/ml-experiments"
# NOTE(review): max_workers is also defined under [resources]. These are two
# distinct keys (root table vs. resources table); confirm which one the
# consumer reads and keep them in sync.
max_workers = 4

# Redis connection
redis_addr = "localhost:6379"
# Placeholder value: replace before deploying.
redis_password = "CHANGE_ME_REDIS_PASSWORD"
redis_db = 0

# SSH connection (for remote operations)
host = "localhost"
user = "ml-user"
port = 22
# TOML performs no "~" or variable expansion; this is the literal string
# "~/.ssh/id_rsa". Presumably the consumer expands it; verify.
ssh_key = "~/.ssh/id_rsa"

# Podman configuration
podman_image = "ml-training:latest"
gpu_vendor = "none"
gpu_visible_devices = []
gpu_devices = []
container_workspace = "/workspace"
container_results = "/results"
train_script = "train.py"

# Dataset management
auto_fetch_data = true
data_dir = "/data/datasets"
data_manager_path = "/usr/local/bin/data_manager"
dataset_cache_ttl = "24h"

# Task management
task_lease_duration = "1h"
heartbeat_interval = "30s"
graceful_timeout = "5m"
poll_interval_seconds = 1
metrics_flush_interval = "10s"

[resources]
max_workers = 4
desired_rps_per_worker = 2
# Kept as strings, exactly as in the original; presumably passed through to
# Podman as-is. TODO confirm before changing the type.
podman_cpus = "4"
podman_memory = "16g"

# Metrics exporter
[metrics]
enabled = true
listen_addr = ":9100"