- Reorganize configs into environments/, workers/, deprecated/ folders
- Reorganize scripts into testing/, deployment/, maintenance/, benchmarks/ folders
- Add comprehensive testing guide documentation
- Add new Makefile targets: test-full, test-auth, test-status
- Update script paths in Makefile to match new organization
- Create testing protocol documentation
- Add cleanup status checking functionality

Testing framework now includes:
- Quick authentication tests (make test-auth)
- Full test suite runner (make test-full)
- Cleanup status monitoring (make test-status)
- Comprehensive documentation and troubleshooting guides
45 lines
955 B
TOML
45 lines
955 B
TOML
# Worker settings (root table).
worker_id = "worker-prod-01"
base_path = "/data/ml-experiments"
# NOTE(review): `max_workers` is also set under [resources]; confirm which
# one the consumer reads and keep a single source of truth.
max_workers = 4

# Redis connection
redis_addr = "localhost:6379"
# NOTE(review): plaintext credential stored in the config file. TOML has no
# env-var interpolation, so moving it out requires support in the consuming
# application; rotate this value if the file has been shared or committed.
redis_password = "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
redis_db = 0

# SSH connection (for remote operations)
host = "localhost"
user = "ml-user"
port = 22
# `~` is a literal character in TOML; any home-directory expansion is up to
# the consumer — TODO confirm it expands this path.
ssh_key = "~/.ssh/id_rsa"

# Podman configuration
podman_image = "ml-training:latest"
gpu_access = true
# Paths below are presumably mount points inside the container — verify
# against the consumer.
container_workspace = "/workspace"
container_results = "/results"
train_script = "train.py"

# NOTE(review): the two groups below originally followed the [resources]
# header, so TOML parsed them as resources.* keys. They are placed ahead of
# the header so they stay in the root table like the other commented groups
# in this file — confirm against the consumer's schema.

# Dataset management
auto_fetch_data = true
data_dir = "/data/datasets"
data_manager_path = "/usr/local/bin/data_manager"
dataset_cache_ttl = "24h"

# Task management
# TOML has no duration type; these are plain strings that the consumer is
# presumably expected to parse (e.g. "1h", "100ms").
task_lease_duration = "1h"
heartbeat_interval = "30s"
graceful_timeout = "5m"
poll_interval = "100ms"
metrics_flush_interval = "10s"

# Every key after a [table] header belongs to that table, so only the
# resource limits live here.
[resources]
# NOTE(review): duplicates the root-level `max_workers`; confirm which one
# the consumer reads.
max_workers = 4
desired_rps_per_worker = 2
# Kept as strings exactly as in the original; presumably passed through to
# podman's resource flags — verify before changing the type.
podman_cpus = "4"
podman_memory = "16g"

# Metrics exporter
[metrics]
enabled = true
# Port only, no host part. NOTE(review): presumably binds on all
# interfaces — confirm, and restrict if the endpoint must stay private.
listen_addr = ":9090"