Restructure configuration files for better organization:

- Add scheduler configuration examples (scheduler.yaml.example)
- Reorganize worker configs into subdirectories:
  - distributed/ - Multi-node cluster configurations
  - standalone/ - Single-node deployment configs
- Add environment-specific configs:
  - dev-local.yaml, docker-dev.yaml, docker-prod.yaml
  - homelab-secure.yaml, worker-prod.toml
- Add deployment configs for different security modes:
  - docker-standard.yaml, docker-hipaa.yaml, docker-dev.yaml

Add documentation:

- configs/README.md with configuration guidelines
- configs/SECURITY.md with security configuration best practices
47 lines
981 B
TOML
47 lines
981 B
TOML
# Worker identity and base settings.
worker_id = "worker-prod-01"
base_path = "/data/ml-experiments"
# NOTE(review): max_workers is also set (to the same value) under [resources];
# confirm which one the worker reads and keep the two in sync.
max_workers = 4

# Redis connection
redis_addr = "localhost:6379"
# Placeholder value: replace it with the real password before deploying.
# TOML performs no environment-variable interpolation, so the secret must be
# written here or injected by the consumer - TODO confirm which is intended.
redis_password = "CHANGE_ME_REDIS_PASSWORD"
redis_db = 0

# SSH connection (for remote operations)
host = "localhost"
user = "ml-user"
port = 22
# TOML does not expand "~"; the value is the literal string below, so the
# consumer has to resolve the home directory itself - TODO confirm it does.
ssh_key = "~/.ssh/id_rsa"

# Podman configuration
podman_image = "ml-training:latest"
# No GPU is configured here: the vendor is "none" and both device lists are
# empty. NOTE(review): the difference between gpu_visible_devices and
# gpu_devices is not visible in this file - confirm against the worker code.
gpu_vendor = "none"
gpu_visible_devices = []
gpu_devices = []
# Presumably paths inside the container - TODO confirm.
container_workspace = "/workspace"
container_results = "/results"
train_script = "train.py"

# Dataset management
auto_fetch_data = true
data_dir = "/data/datasets"
data_manager_path = "/usr/local/bin/data_manager"
# Duration written as a string with a unit suffix, not a native TOML type;
# the accepted format is defined by the consumer - TODO confirm.
dataset_cache_ttl = "24h"

# Task management
# Durations in this group are strings with a unit suffix ("1h", "30s", "5m",
# "10s"). poll_interval_seconds is the exception: a bare integer whose unit is
# carried by the key name.
task_lease_duration = "1h"
heartbeat_interval = "30s"
graceful_timeout = "5m"
poll_interval_seconds = 1
metrics_flush_interval = "10s"

[resources]
# NOTE(review): max_workers is also set (to the same value) at the top level of
# this file; confirm which one the worker reads and keep the two in sync.
max_workers = 4
desired_rps_per_worker = 2
# Both limits are strings, not numbers. Presumably forwarded to Podman's CPU
# and memory limit options - TODO confirm before changing the type.
podman_cpus = "4"
podman_memory = "16g"

# Metrics exporter
[metrics]
enabled = true
# No host part is given, only a port. NOTE(review): this presumably binds on
# all interfaces - confirm that is acceptable for a production worker.
listen_addr = ":9100"