From 86f9ae5a7e0967a216da42f112795b574fef5ae4 Mon Sep 17 00:00:00 2001
From: Jeremie Fraeys <jfaeys@gmail.com>
Date: Thu, 26 Feb 2026 12:04:11 -0500
Subject: [PATCH] docs(config): reorganize configuration structure and add
 documentation

Restructure configuration files for better organization:
- Add scheduler configuration examples (scheduler.yaml.example)
- Reorganize worker configs into subdirectories:
  - distributed/ - Multi-node cluster configurations
  - standalone/ - Single-node deployment configs
- Rename configs/workers/ to configs/worker/ (file contents unchanged):
  - dev-local.yaml, docker.yaml, docker-dev.yaml, docker-prod.yaml
  - homelab-secure.yaml, worker-prod.toml
- Add deployment configs for different security modes:
  - docker-standard.yaml, docker-hipaa.yaml, docker-dev.yaml

Add documentation:
- configs/README.md with configuration guidelines
- configs/SECURITY.md with security configuration best practices
---
 configs/README.md                             |  60 ++++++++
 configs/SECURITY.md                           | 130 ++++++++++++++++++
 configs/scheduler/scheduler.yaml.example      |  32 +++++
 configs/{workers => worker}/dev-local.yaml    |   0
 .../worker/distributed/worker.yaml.example    |  33 +++++
 configs/{workers => worker}/docker-dev.yaml   |   0
 configs/{workers => worker}/docker-prod.yaml  |   0
 configs/{workers => worker}/docker.yaml       |   0
 .../{workers => worker}/homelab-secure.yaml   |   0
 configs/worker/standalone/worker.yaml.example |  32 +++++
 configs/{workers => worker}/worker-prod.toml  |   0
 configs/workers/examples/prewarm-worker.yaml  |  27 ----
 deployments/configs/worker/docker-dev.yaml    |  31 +++++
 deployments/configs/worker/docker-hipaa.yaml  |  53 +++++++
 .../configs/worker/docker-standard.yaml       |  35 +++++
 15 files changed, 406 insertions(+), 27 deletions(-)
 create mode 100644 configs/README.md
 create mode 100644 configs/SECURITY.md
 create mode 100644 configs/scheduler/scheduler.yaml.example
 rename configs/{workers => worker}/dev-local.yaml (100%)
 create mode 100644 configs/worker/distributed/worker.yaml.example
 rename configs/{workers => worker}/docker-dev.yaml (100%)
 rename configs/{workers => worker}/docker-prod.yaml (100%)
 rename configs/{workers => worker}/docker.yaml (100%)
 rename configs/{workers => worker}/homelab-secure.yaml (100%)
 create mode 100644 configs/worker/standalone/worker.yaml.example
 rename configs/{workers => worker}/worker-prod.toml (100%)
 delete mode 100644 configs/workers/examples/prewarm-worker.yaml
 create mode 100644 deployments/configs/worker/docker-dev.yaml
 create mode 100644 deployments/configs/worker/docker-hipaa.yaml
 create mode 100644 deployments/configs/worker/docker-standard.yaml

diff --git a/configs/README.md b/configs/README.md
new file mode 100644
index 0000000..31e5042
--- /dev/null
+++ b/configs/README.md
@@ -0,0 +1,60 @@
+# fetch_ml Configuration Guide
+
+## Quick Start
+
+### Standalone Mode (Existing Behavior)
+```bash
+# Single worker, direct queue access (first copy worker.yaml.example to worker.yaml)
+go run ./cmd/worker -config configs/worker/standalone/worker.yaml
+```
+
+### Distributed Mode
+```bash
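+# Terminal 1: Start scheduler (first copy scheduler.yaml.example to scheduler.yaml)
+go run ./cmd/scheduler -config configs/scheduler/scheduler.yaml
+
+# Terminal 2: Start worker (first copy worker.yaml.example to worker.yaml and set its token)
+go run ./cmd/worker -config configs/worker/distributed/worker.yaml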
+```
+
+### Single-Node Mode (Zero Config)
+```bash
+# Both scheduler and worker in one process
+go run ./cmd/fetch_ml -config configs/multi-node/single-node.yaml
+```
+
+## Config Structure
+
+```
+configs/
+├── scheduler/
+│   └── scheduler.yaml       # Central scheduler configuration
+├── worker/
+│   ├── standalone/
+│   │   └── worker.yaml      # Direct queue access (Redis/SQLite)
+│   └── distributed/
+│       └── worker.yaml      # WebSocket to scheduler
+└── multi-node/
+    └── single-node.yaml     # Combined scheduler+worker
+```
+
+## Key Configuration Modes
+
+| Mode | Use Case | Backend |
+|------|----------|---------|
+| `standalone` | Single machine, existing behavior | Redis/SQLite/Filesystem |
+| `distributed` | Multiple workers, central scheduler | WebSocket to scheduler |
+| `both` | Quick testing, single process | In-process scheduler |
+
+## Worker Mode Selection
+
+Set `worker.mode` to switch between implementations:
+
+```yaml
+worker:
+  mode: "standalone"    # Uses Redis/SQLite queue.Backend
+  # OR (set exactly one `mode` key; duplicate keys are invalid YAML):
+  # mode: "distributed" # Uses SchedulerBackend over WebSocket
+```
+
+The worker code is unchanged — only the backend implementation changes.
diff --git a/configs/SECURITY.md b/configs/SECURITY.md
new file mode 100644
index 0000000..7138ff1
--- /dev/null
+++ b/configs/SECURITY.md
@@ -0,0 +1,130 @@
+# Security Guidelines for fetch_ml Distributed Mode
+
+## Token Management
+
+### Quick Start (Recommended)
+
+```bash
+# 1. Generate config with tokens
+scheduler -init -config scheduler.yaml
+
+# 2. Or generate a single token
+scheduler -generate-token
+```
+
+### Generating Tokens
+
+**Option 1: Initialize full config (recommended)**
+```bash
+# Generate config with 3 worker tokens
+scheduler -init -config /etc/fetch_ml/scheduler.yaml
+
+# Generate with more tokens
+scheduler -init -config /etc/fetch_ml/scheduler.yaml -tokens 5
+```
+
+**Option 2: Generate single token**
+```bash
+# Generate one token
+scheduler -generate-token
+# Output: wkr_abc123...
+```
+
+**Option 3: Using OpenSSL**
+```bash
+openssl rand -hex 32
+```
+
+### Token Storage
+
+- **NEVER commit tokens to git** — config files with real tokens are gitignored
+- Store tokens in environment variables or secure secret management
+- Use `.env` files locally (already gitignored)
+- Rotate tokens periodically
+
+### Config File Security
+
+```
+configs/
+├── scheduler/scheduler.yaml          # ⛔ NEVER commit with real tokens
+├── scheduler/scheduler.yaml.example  # ✅ Safe to commit (placeholders)
+└── worker/distributed/worker.yaml    # ⛔ NEVER commit with real tokens
+```
+
+All `*.yaml` files in `configs/` subdirectories are gitignored by default.
+
+### Distribution Workflow
+
+```bash
+# On scheduler host:
+$ scheduler -init -config /etc/fetch_ml/scheduler.yaml -tokens 2
+Config generated: /etc/fetch_ml/scheduler.yaml
+
+Generated 2 worker tokens. Copy the appropriate token to each worker's config.
+
+=== Generated Worker Tokens ===
+Copy these to your worker configs:
+
+Worker: worker-01
+Token:  wkr_abc123...
+
+Worker: worker-02
+Token:  wkr_def456...
+
+# On each worker host - copy the appropriate token:
+$ cat > /etc/fetch_ml/worker.yaml <<EOF
+scheduler:
+  address: "scheduler-host:7777"
+  cert: "/etc/fetch_ml/scheduler.crt"
+  token: "wkr_abc123..."  # Copy from above
+EOF
+```
+
+## TLS Configuration
+
+### Self-Signed Certs (Development)
+
+```yaml
+scheduler:
+  auto_generate_certs: true
+  cert_file: "/etc/fetch_ml/scheduler.crt"
+  key_file: "/etc/fetch_ml/scheduler.key"
+```
+
+Auto-generated certs are for development only. The scheduler prints the cert path on first run — copy that certificate (the `.crt` file, never the `.key` file) to each worker over a secure channel.
+
+### Production TLS
+
+Use proper certificates from your CA:
+
+```yaml
+scheduler:
+  auto_generate_certs: false
+  cert_file: "/etc/ssl/certs/fetch_ml.crt"
+  key_file: "/etc/ssl/private/fetch_ml.key"
+```
+
+## Network Security
+
+- Scheduler bind address defaults to `0.0.0.0:7777` — firewall appropriately
+- WebSocket connections use WSS with cert pinning (no CA chain required)
+- Token authentication on every WebSocket connection
+- Metrics endpoint (`/metrics`) has no auth — bind to localhost or add proxy auth
+
+## Audit Logging
+
+Enable audit logging to track job lifecycle:
+
+```yaml
+scheduler:
+  audit_log: "/var/log/fetch_ml/audit.log"
+```
+
+## Security Checklist
+
+- [ ] Tokens generated via `scheduler -init` or `scheduler -generate-token`
+- [ ] Config files with tokens NOT in git
+- [ ] TLS certs distributed securely to workers
+- [ ] Scheduler bind address firewalled
+- [ ] Metrics endpoint protected (if exposed)
+- [ ] Audit logging enabled
diff --git a/configs/scheduler/scheduler.yaml.example b/configs/scheduler/scheduler.yaml.example
new file mode 100644
index 0000000..704df59
--- /dev/null
+++ b/configs/scheduler/scheduler.yaml.example
@@ -0,0 +1,32 @@
+# Scheduler Configuration Example
+# Copy this to scheduler.yaml and replace placeholders with real values
+# DO NOT commit the actual scheduler.yaml with real tokens
+
+scheduler:
+  bind_addr: "0.0.0.0:7777"
+  
+  # Auto-generate self-signed certs if files don't exist
+  auto_generate_certs: true
+  cert_file: "/etc/fetch_ml/scheduler.crt"
+  key_file: "/etc/fetch_ml/scheduler.key"
+  
+  state_dir: "/var/lib/fetch_ml"
+  
+  default_batch_slots: 3
+  default_service_slots: 1
+  
+  starvation_threshold_mins: 5
+  priority_aging_rate: 0.1
+  
+  gang_alloc_timeout_secs: 60
+  acceptance_timeout_secs: 30
+  
+  metrics_addr: "0.0.0.0:9090"  # /metrics has no auth: use 127.0.0.1 or put it behind a firewall/proxy
+  
+  # Generate tokens using: openssl rand -hex 32
+  # Example: wkr_abc123... (64 hex chars after wkr_)
+  worker_tokens:
+    - id: "worker-01"
+      token: "wkr_PLACEHOLDER_GENERATE_WITH_OPENSSL_RAND_HEX_32"
+    - id: "worker-02"
+      token: "wkr_PLACEHOLDER_GENERATE_WITH_OPENSSL_RAND_HEX_32"
diff --git a/configs/workers/dev-local.yaml b/configs/worker/dev-local.yaml
similarity index 100%
rename from configs/workers/dev-local.yaml
rename to configs/worker/dev-local.yaml
diff --git a/configs/worker/distributed/worker.yaml.example b/configs/worker/distributed/worker.yaml.example
new file mode 100644
index 0000000..8465519
--- /dev/null
+++ b/configs/worker/distributed/worker.yaml.example
@@ -0,0 +1,33 @@
+# Distributed Worker Configuration Example
+# Copy this to worker.yaml and replace placeholders with real values
+# DO NOT commit the actual worker.yaml with real tokens
+
+node:
+  role: "worker"
+  id: ""           # Auto-generated UUID if empty
+
+worker:
+  mode: "distributed"
+  max_workers: 3
+
+scheduler:
+  address: "192.168.1.10:7777"
+  cert: "/etc/fetch_ml/scheduler.crt"
+  # Copy token from scheduler config for this worker
+  token: "wkr_COPY_FROM_SCHEDULER_CONFIG"
+
+slots:
+  service_slots: 1
+  ports:
+    service_range_start: 8000
+    service_range_end:   8099
+
+gpu:
+  vendor: "auto"
+
+prewarm:
+  enabled: true
+
+log:
+  level:  "info"
+  format: "json"
diff --git a/configs/workers/docker-dev.yaml b/configs/worker/docker-dev.yaml
similarity index 100%
rename from configs/workers/docker-dev.yaml
rename to configs/worker/docker-dev.yaml
diff --git a/configs/workers/docker-prod.yaml b/configs/worker/docker-prod.yaml
similarity index 100%
rename from configs/workers/docker-prod.yaml
rename to configs/worker/docker-prod.yaml
diff --git a/configs/workers/docker.yaml b/configs/worker/docker.yaml
similarity index 100%
rename from configs/workers/docker.yaml
rename to configs/worker/docker.yaml
diff --git a/configs/workers/homelab-secure.yaml b/configs/worker/homelab-secure.yaml
similarity index 100%
rename from configs/workers/homelab-secure.yaml
rename to configs/worker/homelab-secure.yaml
diff --git a/configs/worker/standalone/worker.yaml.example b/configs/worker/standalone/worker.yaml.example
new file mode 100644
index 0000000..c9e5ee8
--- /dev/null
+++ b/configs/worker/standalone/worker.yaml.example
@@ -0,0 +1,32 @@
+# Standalone Worker Configuration Example
+# Copy this to worker.yaml and adjust for your environment
+
+node:
+  role: "worker"
+  id: ""
+
+worker:
+  mode: "standalone"
+  max_workers: 3
+
+queue:
+  backend: "redis"
+  redis_addr: "localhost:6379"
+  redis_password: ""      # Set if Redis requires auth
+  redis_db: 0
+
+slots:
+  service_slots: 1
+  ports:
+    service_range_start: 8000
+    service_range_end:   8099
+
+gpu:
+  vendor: "auto"
+
+prewarm:
+  enabled: true
+
+log:
+  level:  "info"
+  format: "json"
diff --git a/configs/workers/worker-prod.toml b/configs/worker/worker-prod.toml
similarity index 100%
rename from configs/workers/worker-prod.toml
rename to configs/worker/worker-prod.toml
diff --git a/configs/workers/examples/prewarm-worker.yaml b/configs/workers/examples/prewarm-worker.yaml
deleted file mode 100644
index 2d8c8b4..0000000
--- a/configs/workers/examples/prewarm-worker.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-worker_id: "test-prewarm-worker"
-host: "localhost"
-port: 8081
-base_path: "/tmp/fetch-ml-test"
-data_dir: "/tmp/fetch-ml-test/data"
-max_workers: 2
-local_mode: true
-auto_fetch_data: true
-prewarm_enabled: true
-metrics:
-  enabled: true
-  listen_addr: ":9102"
-train_script: "train.py"
-snapshot_store:
-  enabled: false
-  endpoint: ""
-  secure: false
-  region: ""
-  bucket: ""
-  prefix: ""
-  access_key: ""
-  secret_key: ""
-  session_token: ""
-  max_retries: 3
-  timeout: 0s
-gpu_devices: []
-gpu_access: "none"
diff --git a/deployments/configs/worker/docker-dev.yaml b/deployments/configs/worker/docker-dev.yaml
new file mode 100644
index 0000000..fbad8e1
--- /dev/null
+++ b/deployments/configs/worker/docker-dev.yaml
@@ -0,0 +1,31 @@
+# Development mode worker configuration
+# Relaxed validation for fast iteration
+host: localhost
+port: 22
+user: dev-user
+base_path: /tmp/fetchml_dev
+train_script: train.py
+
+# Redis configuration
+redis_url: redis://redis:6379
+
+# Development mode - relaxed security
+compliance_mode: dev
+max_workers: 4
+
+# Sandbox settings (relaxed for development)
+sandbox:
+  network_mode: bridge
+  seccomp_profile: ""
+  no_new_privileges: false
+  allowed_secrets: []  # All secrets allowed in dev
+
+# GPU configuration
+gpu_vendor: none
+
+# Artifact handling (relaxed limits)
+max_artifact_files: 10000
+max_artifact_total_bytes: 1073741824  # 1GB
+
+# Provenance: best-effort is off — NOTE(review): same value as the strict docker-hipaa.yaml profile; confirm this is intended for dev
+provenance_best_effort: false
diff --git a/deployments/configs/worker/docker-hipaa.yaml b/deployments/configs/worker/docker-hipaa.yaml
new file mode 100644
index 0000000..3fbd6b4
--- /dev/null
+++ b/deployments/configs/worker/docker-hipaa.yaml
@@ -0,0 +1,53 @@
+# HIPAA compliance mode worker configuration
+# Strict validation, no network, PHI protection
+host: localhost
+port: 22
+user: hipaa-worker
+base_path: /var/lib/fetchml/secure
+train_script: train.py
+
+# Redis configuration (must use env var for password)
+redis_url: redis://redis:6379
+redis_password: ${REDIS_PASSWORD}
+
+# HIPAA mode - strict compliance
+compliance_mode: hipaa
+max_workers: 1
+
+# Sandbox settings (strict isolation required by HIPAA)
+sandbox:
+  # Network must be disabled for HIPAA compliance
+  network_mode: none
+  # Seccomp profile must be set
+  seccomp_profile: default-hardened
+  # No new privileges must be enforced
+  no_new_privileges: true
+  # Only approved secrets allowed (no PHI fields)
+  allowed_secrets:
+    - HF_TOKEN
+    - WANDB_API_KEY
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    # PHI-related names are deliberately NOT on this allowlist and must never be added:
+    # - PATIENT_ID
+    # - SSN
+    # - MEDICAL_RECORD_NUMBER
+    # - DIAGNOSIS_CODE
+    # - DOB
+    # - INSURANCE_ID
+
+# GPU configuration
+gpu_vendor: none
+
+# Artifact handling (strict limits for HIPAA)
+max_artifact_files: 100
+max_artifact_total_bytes: 104857600  # 100MB
+
+# Provenance (strictly required for HIPAA)
+provenance_best_effort: false
+
+# SSH key must use environment variable
+ssh_key: ${SSH_KEY_PATH}
+
+# Config hash computation enabled (required for audit)
+# This is automatically computed by Validate()
diff --git a/deployments/configs/worker/docker-standard.yaml b/deployments/configs/worker/docker-standard.yaml
new file mode 100644
index 0000000..c121476
--- /dev/null
+++ b/deployments/configs/worker/docker-standard.yaml
@@ -0,0 +1,35 @@
+# Standard security mode worker configuration
+# Normal sandbox, network isolation
+host: localhost
+port: 22
+user: worker-user
+base_path: /var/lib/fetchml
+train_script: train.py
+
+# Redis configuration
+redis_url: redis://redis:6379
+
+# Standard mode - normal security
+compliance_mode: standard
+max_workers: 2
+
+# Sandbox settings (standard isolation)
+sandbox:
+  network_mode: none
+  seccomp_profile: default
+  no_new_privileges: true
+  allowed_secrets:
+    - HF_TOKEN
+    - WANDB_API_KEY
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+
+# GPU configuration
+gpu_vendor: none
+
+# Artifact handling (reasonable limits)
+max_artifact_files: 1000
+max_artifact_total_bytes: 536870912  # 512MB
+
+# Provenance (enabled)
+provenance_best_effort: true