# Worker configuration.
# Root-table keys come first; grouped settings follow in their own tables.

worker_id = "worker-prod-01"
base_path = "/data/ml-experiments"
# NOTE(review): max_workers is also set under [resources] with the same
# value; confirm which one the worker actually reads and keep them in sync.
max_workers = 4

# Redis connection
redis_addr = "localhost:6379"
# Placeholder value; replace with the real secret before deploying.
redis_password = "CHANGE_ME_REDIS_PASSWORD"
redis_db = 0

# SSH connection (for remote operations)
host = "localhost"
user = "ml-user"
port = 22
# TOML performs no `~` expansion; the value is passed through literally,
# so home-directory expansion is up to the consuming application.
ssh_key = "~/.ssh/id_rsa"

# Podman configuration
podman_image = "ml-training:latest"
gpu_vendor = "none"
gpu_visible_devices = []
gpu_devices = []
container_workspace = "/workspace"
container_results = "/results"
train_script = "train.py"

# Dataset management
auto_fetch_data = true
data_dir = "/data/datasets"
data_manager_path = "/usr/local/bin/data_manager"
dataset_cache_ttl = "24h"

# Task management
# Durations are strings with a unit suffix ("1h", "30s", "5m");
# poll_interval_seconds is a plain integer.
task_lease_duration = "1h"
heartbeat_interval = "30s"
graceful_timeout = "5m"
poll_interval_seconds = 1
metrics_flush_interval = "10s"

[resources]
max_workers = 4
desired_rps_per_worker = 2
podman_cpus = "4"
podman_memory = "16g"

# Metrics exporter
[metrics]
enabled = true
listen_addr = ":9100"

# Plugin Configuration
[plugins]

[plugins.jupyter]
enabled = true
image = "quay.io/jupyter/base-notebook:latest"
default_port = 8888
mode = "lab"
max_gpu_per_instance = 1
max_memory_per_instance = "8Gi"

[plugins.jupyter.security]
require_password = true
trusted_channels = ["conda-forge", "defaults", "pytorch"]
blocked_packages = ["requests", "urllib3", "httpx"]

[plugins.vllm]
enabled = true
image = "vllm/vllm-openai:latest"
default_port = 8000
model_cache = "/models"
default_quantization = ""  # Options: awq, gptq, fp8, squeezellm
max_gpu_per_instance = 2
max_model_len = 4096
tensor_parallel_size = 1

# Environment variables for vLLM
[plugins.vllm.env]
HF_HOME = "/models"
VLLM_WORKER_MULTIPROC_METHOD = "spawn"