# - Fix YAML tags in auth config struct (json -> yaml)
# - Update CLI configs to use pre-hashed API keys
# - Remove double hashing in WebSocket client
# - Fix port mapping (9102 -> 9103) in CLI commands
# - Update permission keys to use jobs:read, jobs:create, etc.
# - Clean up all debug logging from CLI and server
# - All user roles now authenticate correctly:
#   * Admin: Can queue jobs and see all jobs
#   * Researcher: Can queue jobs and see own jobs
#   * Analyst: Can see status (read-only access)
#
# Multi-user authentication is now fully functional.
45 lines
955 B
TOML
45 lines
955 B
TOML
# Worker identity and local storage
worker_id = "worker-prod-01"
base_path = "/data/ml-experiments"
# NOTE(review): max_workers is also set under [resources]; confirm which one
# the loader reads and keep the two values in agreement.
max_workers = 4

# Redis connection
redis_addr = "localhost:6379"
# NOTE(review): plaintext credential stored in the config file. Rotate it and,
# if the loader supports it, supply it from a secret store or the environment
# instead. TOML itself performs no variable or environment interpolation.
redis_password = "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
redis_db = 0

# SSH connection (for remote operations)
host = "localhost"
user = "ml-user"
port = 22
# TOML does not expand "~"; the consuming application has to resolve it.
ssh_key = "~/.ssh/id_rsa"

# Podman configuration
podman_image = "ml-training:latest"
gpu_access = true
container_workspace = "/workspace"
container_results = "/results"
train_script = "train.py"

# Dataset management
# NOTE(review): this group and "Task management" below were written after the
# [resources] header, which makes TOML parse them as resources.<key>. They are
# placed ahead of the header here on the assumption that the loader reads them
# from the root table, like the other commented groups — confirm against the
# config loader.
auto_fetch_data = true
data_dir = "/data/datasets"
data_manager_path = "/usr/local/bin/data_manager"
dataset_cache_ttl = "24h"

# Task management
# Durations are strings because TOML has no duration type; the unit suffixes
# (h, m, s, ms) are interpreted by the consuming application.
task_lease_duration = "1h"
heartbeat_interval = "30s"
graceful_timeout = "5m"
poll_interval = "100ms"
metrics_flush_interval = "10s"

# Per-worker resource limits. Every key from this header down to the next
# table header belongs to the `resources` table.
[resources]
max_workers = 4
desired_rps_per_worker = 2
# Kept as strings exactly as in the original file.
# NOTE(review): confirm the loader expects strings here rather than numbers.
podman_cpus = "4"
podman_memory = "16g"

# Metrics exporter
[metrics]
enabled = true
# Host part is empty, only the port is given.
# NOTE(review): presumably this binds on all interfaces — confirm, and
# restrict the address if the endpoint must not be reachable externally.
listen_addr = ":9090"