chore(ops): reorganize deployments/monitoring and remove legacy scripts
This commit is contained in:
parent
5ef24e4c6d
commit
f726806770
101 changed files with 3598 additions and 4982 deletions
56
configs/api/dev.yaml
Normal file
56
configs/api/dev.yaml
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
base_path: "/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: false
|
||||
|
||||
server:
|
||||
address: "0.0.0.0:9101"
|
||||
tls:
|
||||
enabled: false
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
production_mode: false
|
||||
allowed_origins:
|
||||
- "http://localhost:3000"
|
||||
api_key_rotation_days: 90
|
||||
audit_logging:
|
||||
enabled: true
|
||||
log_path: "/tmp/fetchml-audit.log"
|
||||
rate_limit:
|
||||
enabled: false
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
addr: "redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/tmp/fetchml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: ""
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
71
configs/api/homelab-secure.yaml
Normal file
71
configs/api/homelab-secure.yaml
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
base_path: "/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_admin:
|
||||
hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY"
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
"*": true
|
||||
homelab_user:
|
||||
hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY"
|
||||
admin: false
|
||||
roles:
|
||||
- researcher
|
||||
permissions:
|
||||
experiments: true
|
||||
datasets: true
|
||||
jupyter: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
production_mode: true
|
||||
allowed_origins:
|
||||
- "https://ml-experiments.example.com"
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "192.168.0.0/16"
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/data/experiments/fetch_ml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/fetch_ml.log"
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
74
configs/api/multi-user.yaml
Normal file
74
configs/api/multi-user.yaml
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
admin_user:
|
||||
hash: "CHANGE_ME_SHA256_ADMIN_USER_KEY"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
"*": true
|
||||
researcher1:
|
||||
hash: "CHANGE_ME_SHA256_RESEARCHER1_KEY"
|
||||
admin: false
|
||||
roles: ["user", "researcher"]
|
||||
permissions:
|
||||
"jobs:read": true
|
||||
"jobs:create": true
|
||||
"jobs:update": true
|
||||
"jobs:delete": false
|
||||
analyst1:
|
||||
hash: "CHANGE_ME_SHA256_ANALYST1_KEY"
|
||||
admin: false
|
||||
roles: ["user", "analyst"]
|
||||
permissions:
|
||||
"jobs:read": true
|
||||
"jobs:create": false
|
||||
"jobs:update": false
|
||||
"jobs:delete": false
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
security:
|
||||
production_mode: false
|
||||
allowed_origins: []
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 20
|
||||
ip_whitelist: []
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/app.log"
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 3
|
||||
desired_rps_per_worker: 3
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
59
configs/api/prod.yaml
Normal file
59
configs/api/prod.yaml
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
admin:
|
||||
hash: "replace-with-sha256-of-your-api-key"
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
"*": true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
production_mode: false
|
||||
allowed_origins: []
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
addr: "redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/fetch_ml.log"
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 2
|
||||
desired_rps_per_worker: 5
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# Local development config (TOML)
|
||||
# Used by both CLI and TUI when no overrides are set
|
||||
|
||||
worker_host = "127.0.0.1"
|
||||
worker_user = "dev_user"
|
||||
worker_base = "/tmp/ml-experiments"
|
||||
worker_port = 9101
|
||||
api_key = "your-api-key-here"
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
dev_user:
|
||||
hash: "replace-with-sha256-of-your-api-key"
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
'*': true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: false
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
|
||||
logging:
|
||||
level: info
|
||||
console: true
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: false
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.db"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
|
||||
logging:
|
||||
level: "debug"
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 30
|
||||
ip_whitelist: []
|
||||
|
||||
# SQLite database for persistence
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/fetch_ml.db"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/app/logs/app.log"
|
||||
audit_file: "/app/logs/audit.log"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "8g"
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 30
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "::1"
|
||||
- "192.168.0.0/16"
|
||||
- "10.0.0.0/8"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/app/logs/app.log"
|
||||
audit_file: "/app/logs/audit.log"
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
# Secure Homelab Configuration
|
||||
# IMPORTANT: Keep your API keys safe and never share them!
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_admin:
|
||||
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
'*': true
|
||||
homelab_user:
|
||||
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
|
||||
admin: false
|
||||
roles:
|
||||
- researcher
|
||||
permissions:
|
||||
'experiments': true
|
||||
'datasets': true
|
||||
'jupyter': true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
key_file: "/app/ssl/key.pem"
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "logs/fetch_ml.log"
|
||||
console: true
|
||||
|
||||
resources:
|
||||
cpu_limit: "2"
|
||||
memory_limit: "4Gi"
|
||||
gpu_limit: 0
|
||||
disk_limit: "10Gi"
|
||||
|
||||
# Prometheus metrics
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
tls:
|
||||
enabled: false
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_admin:
|
||||
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
'*': true
|
||||
homelab_user:
|
||||
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
|
||||
admin: false
|
||||
roles:
|
||||
- researcher
|
||||
permissions:
|
||||
'experiments': true
|
||||
'datasets': true
|
||||
'jupyter': true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "::1"
|
||||
- "172.21.0.1" # Docker gateway
|
||||
|
||||
# Prometheus metrics
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
admin_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
researcher1:
|
||||
hash: "ef92b778ba7a6c8f2150019a5678047b6a9a2b95cef8189518f9b35c54d2e3ae" # "research123"
|
||||
admin: false
|
||||
roles: ["user", "researcher"]
|
||||
permissions:
|
||||
jobs:read: true
|
||||
jobs:create: true
|
||||
jobs:update: true
|
||||
jobs:delete: false
|
||||
analyst1:
|
||||
hash: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" # "analyst123"
|
||||
admin: false
|
||||
roles: ["user", "analyst"]
|
||||
permissions:
|
||||
jobs:read: true
|
||||
jobs:create: false
|
||||
jobs:update: false
|
||||
jobs:delete: false
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 20
|
||||
ip_whitelist: []
|
||||
cors:
|
||||
enabled: true
|
||||
allowed_origins: ["https://localhost:9103", "https://localhost:3000"]
|
||||
allowed_methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
|
||||
allowed_headers: ["Content-Type", "Authorization"]
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.db"
|
||||
max_connections: 20
|
||||
connection_timeout: "30s"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 15
|
||||
connection_timeout: "10s"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/app/logs/app.log"
|
||||
max_size: "100MB"
|
||||
max_backups: 5
|
||||
compress: true
|
||||
|
||||
resources:
|
||||
max_workers: 3
|
||||
desired_rps_per_worker: 3
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4g"
|
||||
job_timeout: "30m"
|
||||
|
||||
monitoring:
|
||||
enabled: true
|
||||
metrics_path: "/metrics"
|
||||
health_check_interval: "30s"
|
||||
|
|
@ -1,59 +0,0 @@
|
|||
base_path: "./data/ml-experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
apikeys:
|
||||
homelab_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false # Disabled for local testing
|
||||
cert_file: "./ssl/cert.pem"
|
||||
key_file: "./ssl/key.pem"
|
||||
min_version: "1.3"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "::1"
|
||||
- "localhost"
|
||||
- "10.0.0.0/8"
|
||||
- "192.168.0.0/16"
|
||||
- "172.16.0.0/12"
|
||||
failed_login_lockout:
|
||||
enabled: true
|
||||
max_attempts: 5
|
||||
lockout_duration: "15m"
|
||||
|
||||
# SQLite database for production
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "data/fetch_ml.db"
|
||||
|
||||
redis:
|
||||
url: "redis://localhost:6379"
|
||||
addr: "localhost:6379"
|
||||
password: "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "logs/fetch_ml.log"
|
||||
audit_log: "logs/audit.log"
|
||||
|
||||
resources:
|
||||
max_workers: 2
|
||||
desired_rps_per_worker: 5
|
||||
podman_cpus: "8"
|
||||
podman_memory: "32g"
|
||||
|
|
@ -1,13 +1,17 @@
|
|||
# Fetch ML Configuration Example for PostgreSQL
|
||||
# This example shows how to configure Fetch ML to use PostgreSQL as the database
|
||||
|
||||
base_path: "./data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
apikeys:
|
||||
api_keys:
|
||||
admin:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password"
|
||||
admin: true
|
||||
roles: ["admin"]
|
||||
permissions:
|
||||
"*": true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
|
|
@ -25,40 +29,34 @@ database:
|
|||
# connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
|
||||
|
||||
redis:
|
||||
host: "localhost"
|
||||
port: 6379
|
||||
addr: "localhost:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
pool_size: 10
|
||||
max_retries: 3
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
console: true
|
||||
format: "text"
|
||||
file: ""
|
||||
audit_log: ""
|
||||
|
||||
security:
|
||||
secret_key: "your-secret-key-here-at-least-16-characters"
|
||||
jwt_expiry: "24h"
|
||||
production_mode: false
|
||||
rate_limit:
|
||||
enabled: false
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
containers:
|
||||
runtime: "podman"
|
||||
registry: "docker.io"
|
||||
pull_policy: "missing"
|
||||
resources:
|
||||
cpu_limit: "2"
|
||||
memory_limit: "4Gi"
|
||||
gpu_limit: 1
|
||||
|
||||
storage:
|
||||
data_path: "data"
|
||||
results_path: "results"
|
||||
temp_path: "/tmp/fetch_ml"
|
||||
cleanup:
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
max_age_hours: 168
|
||||
max_size_gb: 10
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
# Fetch ML Configuration Example
|
||||
# Copy this file to config.yaml and customize for your environment
|
||||
|
||||
base_path: "./data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
|
|
@ -13,54 +15,43 @@ auth:
|
|||
"*": true
|
||||
|
||||
server:
|
||||
host: "localhost"
|
||||
port: 8080
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "data/fetch_ml.db"
|
||||
host: ""
|
||||
port: 5432
|
||||
username: ""
|
||||
password: ""
|
||||
database: "fetch_ml"
|
||||
|
||||
redis:
|
||||
url: "redis://localhost:6379"
|
||||
host: "localhost"
|
||||
port: 6379
|
||||
addr: "localhost:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
pool_size: 10
|
||||
max_retries: 3
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "logs/fetch_ml.log"
|
||||
format: "text"
|
||||
console: true
|
||||
audit_log: "logs/audit.log"
|
||||
|
||||
security:
|
||||
secret_key: "your-secret-key-at-least-16-chars"
|
||||
jwt_expiry: "24h"
|
||||
rate_limit:
|
||||
enabled: false
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
production_mode: false
|
||||
|
||||
containers:
|
||||
runtime: "podman"
|
||||
registry: "docker.io"
|
||||
pull_policy: "missing"
|
||||
resources:
|
||||
cpu_limit: "2"
|
||||
memory_limit: "4Gi"
|
||||
gpu_limit: 1
|
||||
|
||||
storage:
|
||||
data_path: "data"
|
||||
results_path: "results"
|
||||
temp_path: "/tmp/fetch_ml"
|
||||
cleanup:
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
max_age_hours: 168
|
||||
max_size_gb: 10
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
|
|
|||
|
|
@ -12,6 +12,10 @@ properties:
|
|||
type: string
|
||||
description: Base path for experiment data
|
||||
default: "/tmp/ml-experiments"
|
||||
data_dir:
|
||||
type: string
|
||||
description: Data directory (datasets/snapshots) for integrity validation
|
||||
default: "/data/active"
|
||||
auth:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
|
|
@ -40,7 +44,6 @@ properties:
|
|||
type: array
|
||||
items:
|
||||
type: string
|
||||
enum: [admin, data_scientist, data_engineer, viewer, operator]
|
||||
permissions:
|
||||
type: object
|
||||
additionalProperties:
|
||||
|
|
@ -64,9 +67,30 @@ properties:
|
|||
type: string
|
||||
key_file:
|
||||
type: string
|
||||
min_version:
|
||||
monitoring:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
prometheus:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
port:
|
||||
type: integer
|
||||
minimum: 1
|
||||
maximum: 65535
|
||||
path:
|
||||
type: string
|
||||
health_checks:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
interval:
|
||||
type: string
|
||||
description: Minimum TLS version (e.g. "1.3")
|
||||
database:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
|
|
@ -99,58 +123,56 @@ properties:
|
|||
addr:
|
||||
type: string
|
||||
description: Optional host:port shorthand for Redis
|
||||
host:
|
||||
type: string
|
||||
default: "localhost"
|
||||
port:
|
||||
type: integer
|
||||
minimum: 1
|
||||
maximum: 65535
|
||||
default: 6379
|
||||
password:
|
||||
type: string
|
||||
db:
|
||||
type: integer
|
||||
minimum: 0
|
||||
default: 0
|
||||
pool_size:
|
||||
type: integer
|
||||
minimum: 1
|
||||
default: 10
|
||||
max_retries:
|
||||
type: integer
|
||||
minimum: 0
|
||||
default: 3
|
||||
queue:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
backend:
|
||||
type: string
|
||||
enum: [redis, sqlite]
|
||||
default: redis
|
||||
sqlite_path:
|
||||
type: string
|
||||
logging:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
level:
|
||||
type: string
|
||||
enum: [debug, info, warn, error, fatal]
|
||||
enum: [debug, info, warn, error]
|
||||
default: "info"
|
||||
file:
|
||||
type: string
|
||||
audit_log:
|
||||
type: string
|
||||
format:
|
||||
type: string
|
||||
enum: [text, json]
|
||||
default: "text"
|
||||
console:
|
||||
type: boolean
|
||||
default: true
|
||||
security:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
secret_key:
|
||||
type: string
|
||||
minLength: 16
|
||||
jwt_expiry:
|
||||
type: string
|
||||
pattern: "^\\d+[smhd]$"
|
||||
default: "24h"
|
||||
production_mode:
|
||||
type: boolean
|
||||
default: false
|
||||
allowed_origins:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
api_key_rotation_days:
|
||||
type: integer
|
||||
minimum: 0
|
||||
audit_logging:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
log_path:
|
||||
type: string
|
||||
ip_whitelist:
|
||||
type: array
|
||||
items:
|
||||
|
|
@ -183,23 +205,23 @@ properties:
|
|||
minimum: 1
|
||||
resources:
|
||||
type: object
|
||||
description: Resource configuration defaults
|
||||
description: Resource configuration
|
||||
additionalProperties: false
|
||||
properties:
|
||||
cpu_limit:
|
||||
type: string
|
||||
description: Default CPU limit (e.g., "2" or "500m")
|
||||
default: "2"
|
||||
memory_limit:
|
||||
type: string
|
||||
description: Default memory limit (e.g., "1Gi" or "512Mi")
|
||||
default: "4Gi"
|
||||
gpu_limit:
|
||||
max_workers:
|
||||
type: integer
|
||||
description: Default GPU limit
|
||||
minimum: 0
|
||||
default: 0
|
||||
disk_limit:
|
||||
minimum: 1
|
||||
default: 1
|
||||
desired_rps_per_worker:
|
||||
type: integer
|
||||
minimum: 1
|
||||
requests_per_sec:
|
||||
type: integer
|
||||
minimum: 1
|
||||
podman_cpus:
|
||||
type: string
|
||||
description: Default disk limit
|
||||
default: "10Gi"
|
||||
podman_memory:
|
||||
type: string
|
||||
request_burst:
|
||||
type: integer
|
||||
minimum: 0
|
||||
|
|
|
|||
|
|
@ -2,10 +2,28 @@ $schema: "http://json-schema.org/draft-07/schema#"
|
|||
title: "Fetch ML Worker Configuration"
|
||||
type: object
|
||||
additionalProperties: false
|
||||
allOf:
|
||||
# forbid both index and UUID at once (allow zero or one)
|
||||
- not:
|
||||
required: [gpu_visible_devices, gpu_visible_device_ids]
|
||||
- if:
|
||||
properties:
|
||||
queue:
|
||||
properties:
|
||||
backend:
|
||||
const: sqlite
|
||||
required: [queue]
|
||||
then:
|
||||
properties:
|
||||
queue:
|
||||
required: [sqlite_path]
|
||||
else:
|
||||
anyOf:
|
||||
- required: [redis_addr]
|
||||
- required: [redis_url]
|
||||
required:
|
||||
- base_path
|
||||
- worker_id
|
||||
- redis_addr
|
||||
- podman_image
|
||||
- container_workspace
|
||||
- container_results
|
||||
|
|
@ -31,6 +49,9 @@ properties:
|
|||
train_script:
|
||||
type: string
|
||||
description: Path to training script
|
||||
redis_url:
|
||||
type: string
|
||||
description: Legacy Redis URL (if set, redis_addr/password/db are derived)
|
||||
redis_addr:
|
||||
type: string
|
||||
description: Redis server address
|
||||
|
|
@ -42,6 +63,18 @@ properties:
|
|||
minimum: 0
|
||||
default: 0
|
||||
description: Redis database number
|
||||
queue:
|
||||
type: object
|
||||
description: Queue backend configuration (optional; defaults to redis)
|
||||
additionalProperties: false
|
||||
properties:
|
||||
backend:
|
||||
type: string
|
||||
enum: [redis, sqlite]
|
||||
default: redis
|
||||
sqlite_path:
|
||||
type: string
|
||||
description: Path to queue.db (sqlite backend only)
|
||||
known_hosts:
|
||||
type: string
|
||||
description: Path to SSH known hosts file
|
||||
|
|
@ -116,6 +149,48 @@ properties:
|
|||
type: string
|
||||
description: Dataset cache TTL duration
|
||||
default: "30m"
|
||||
snapshot_store:
|
||||
type: object
|
||||
description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id)
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
default: false
|
||||
endpoint:
|
||||
type: string
|
||||
description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
|
||||
secure:
|
||||
type: boolean
|
||||
default: true
|
||||
region:
|
||||
type: string
|
||||
bucket:
|
||||
type: string
|
||||
prefix:
|
||||
type: string
|
||||
description: Object key prefix where snapshots are stored
|
||||
access_key:
|
||||
type: string
|
||||
description: Optional static access key (otherwise uses env credentials)
|
||||
secret_key:
|
||||
type: string
|
||||
description: Optional static secret key (otherwise uses env credentials)
|
||||
session_token:
|
||||
type: string
|
||||
description: Optional session token for temporary credentials
|
||||
timeout:
|
||||
type: string
|
||||
description: Duration string (e.g., "10m")
|
||||
default: "10m"
|
||||
max_retries:
|
||||
type: integer
|
||||
minimum: 0
|
||||
default: 3
|
||||
prewarm_enabled:
|
||||
type: boolean
|
||||
description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off.
|
||||
default: false
|
||||
podman_image:
|
||||
type: string
|
||||
minLength: 1
|
||||
|
|
@ -126,10 +201,40 @@ properties:
|
|||
container_results:
|
||||
type: string
|
||||
description: Container results path
|
||||
gpu_access:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Enable GPU access
|
||||
gpu_devices:
|
||||
type: array
|
||||
description: GPU device paths to expose to the container (e.g. ["/dev/dri"]).
|
||||
items:
|
||||
type: string
|
||||
gpu_vendor:
|
||||
type: string
|
||||
enum: [nvidia, amd, apple, none]
|
||||
description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none).
|
||||
default: "none"
|
||||
gpu_visible_devices:
|
||||
type: array
|
||||
description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
|
||||
items:
|
||||
type: integer
|
||||
gpu_visible_device_ids:
|
||||
type: array
|
||||
description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
|
||||
items:
|
||||
type: string
|
||||
apple_gpu:
|
||||
type: object
|
||||
description: Apple M-series GPU configuration
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
default: false
|
||||
metal_device:
|
||||
type: string
|
||||
description: Path to Metal device node (e.g. /dev/metal)
|
||||
mps_runtime:
|
||||
type: string
|
||||
description: Path to MPS runtime device node (e.g. /dev/mps)
|
||||
task_lease_duration:
|
||||
type: string
|
||||
description: Task lease duration
|
||||
|
|
|
|||
58
configs/workers/docker-dev.yaml
Normal file
58
configs/workers/docker-dev.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
worker_id: "docker-worker"
|
||||
base_path: "/data/experiments"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_url: "redis://redis:6379/0"
|
||||
|
||||
local_mode: true
|
||||
|
||||
prewarm_enabled: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 2
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
snapshot_store:
|
||||
enabled: true
|
||||
endpoint: "minio:9000"
|
||||
secure: false
|
||||
bucket: "fetchml-snapshots"
|
||||
prefix: "snapshots"
|
||||
timeout: "2m"
|
||||
max_retries: 3
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_devices:
|
||||
- "/dev/dri"
|
||||
gpu_vendor: "apple"
|
||||
gpu_visible_devices: []
|
||||
|
||||
# Apple M-series GPU configuration
|
||||
apple_gpu:
|
||||
enabled: true
|
||||
metal_device: "/dev/metal"
|
||||
mps_runtime: "/dev/mps"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
50
configs/workers/docker-prod.yaml
Normal file
50
configs/workers/docker-prod.yaml
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
worker_id: "docker-worker"
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_url: "redis://redis:6379/0"
|
||||
|
||||
local_mode: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 2
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
snapshot_store:
|
||||
enabled: true
|
||||
endpoint: "minio:9000"
|
||||
secure: false
|
||||
bucket: "fetchml-snapshots"
|
||||
prefix: "snapshots"
|
||||
timeout: "5m"
|
||||
max_retries: 3
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_vendor: "nvidia"
|
||||
gpu_visible_devices: [0]
|
||||
gpu_devices: ["/dev/nvidia0"]
|
||||
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
43
configs/workers/docker.yaml
Normal file
43
configs/workers/docker.yaml
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
worker_id: "docker-worker"
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_addr: "redis:6379"
|
||||
redis_password: ""
|
||||
redis_db: 0
|
||||
|
||||
local_mode: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 5
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
snapshot_store:
|
||||
enabled: false
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_devices: []
|
||||
gpu_vendor: "none"
|
||||
gpu_visible_devices: []
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
27
configs/workers/examples/prewarm-worker.yaml
Normal file
27
configs/workers/examples/prewarm-worker.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
worker_id: "test-prewarm-worker"
|
||||
host: "localhost"
|
||||
port: 8081
|
||||
base_path: "/tmp/fetch-ml-test"
|
||||
data_dir: "/tmp/fetch-ml-test/data"
|
||||
max_workers: 2
|
||||
local_mode: true
|
||||
auto_fetch_data: true
|
||||
prewarm_enabled: true
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9102"
|
||||
train_script: "train.py"
|
||||
snapshot_store:
|
||||
enabled: false
|
||||
endpoint: ""
|
||||
secure: false
|
||||
region: ""
|
||||
bucket: ""
|
||||
prefix: ""
|
||||
access_key: ""
|
||||
secret_key: ""
|
||||
session_token: ""
|
||||
max_retries: 3
|
||||
timeout: 0s
|
||||
gpu_devices: []
|
||||
gpu_access: "none"
|
||||
47
configs/workers/homelab-secure.yaml
Normal file
47
configs/workers/homelab-secure.yaml
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
worker_id: "homelab-worker"
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0"
|
||||
|
||||
local_mode: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 2
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
snapshot_store:
|
||||
enabled: true
|
||||
endpoint: "minio:9000"
|
||||
secure: false
|
||||
bucket: "fetchml-snapshots"
|
||||
prefix: "snapshots"
|
||||
timeout: "5m"
|
||||
max_retries: 3
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_devices: []
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
# Worker configuration for Docker production-like testing
|
||||
worker_id: "docker-test-worker-1"
|
||||
|
||||
# Redis configuration
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
# Local mode settings
|
||||
local_mode: false # Use Podman for containerized job execution
|
||||
|
||||
# Job paths
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
|
||||
# Container workspace (not used in local mode)
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
|
||||
# Podman settings (not used in local mode)
|
||||
podman_image: "python:3.9-slim"
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4g"
|
||||
|
||||
# Worker configuration
|
||||
heartbeat_interval: "30s"
|
||||
lease_duration: "5m"
|
||||
max_concurrent_tasks: 1
|
||||
|
||||
# Data manager settings
|
||||
data_manager:
|
||||
enabled: false
|
||||
base_path: "/data"
|
||||
|
||||
# SSH settings for Podman communication
|
||||
ssh:
|
||||
enabled: true
|
||||
host: "localhost"
|
||||
port: 2222
|
||||
user: "worker"
|
||||
password: "SecureWorkerPass2024!"
|
||||
key_path: "/home/worker/.ssh/id_rsa"
|
||||
|
||||
# Logging
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/worker.log"
|
||||
|
||||
# Metrics
|
||||
metrics:
|
||||
enabled: true
|
||||
endpoint: ":9100"
|
||||
|
|
@ -1,79 +0,0 @@
|
|||
# Worker configuration for Homelab secure environment
|
||||
worker_id: "homelab-secure-worker-1"
|
||||
|
||||
# Redis configuration with connection pooling
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
connection_timeout: "10s"
|
||||
read_timeout: "5s"
|
||||
write_timeout: "5s"
|
||||
|
||||
# Local mode disabled for containerized execution
|
||||
local_mode: false
|
||||
|
||||
# Job paths with security considerations
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
|
||||
# Podman settings with resource limits
|
||||
podman_image: "python:3.11-slim"
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4g"
|
||||
podman_network: "ml-job-network"
|
||||
podman_timeout: "30m"
|
||||
|
||||
# Worker configuration with security
|
||||
heartbeat_interval: "30s"
|
||||
lease_duration: "5m"
|
||||
max_concurrent_tasks: 2
|
||||
task_timeout: "30m"
|
||||
|
||||
# Data manager settings
|
||||
data_manager:
|
||||
enabled: true
|
||||
base_path: "/data"
|
||||
encryption_enabled: true
|
||||
backup_enabled: true
|
||||
|
||||
# SSH settings with secure configuration
|
||||
ssh:
|
||||
enabled: true
|
||||
host: "localhost"
|
||||
port: 2222
|
||||
user: "worker"
|
||||
password: "HomelabWorker2024!"
|
||||
key_path: "/home/worker/.ssh/id_rsa"
|
||||
max_retries: 3
|
||||
connection_timeout: "30s"
|
||||
strict_host_key_checking: false
|
||||
|
||||
# Logging with rotation and security
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/worker.log"
|
||||
max_size: "50MB"
|
||||
max_backups: 5
|
||||
compress: true
|
||||
audit_enabled: true
|
||||
|
||||
# Metrics and monitoring
|
||||
metrics:
|
||||
enabled: true
|
||||
endpoint: ":9100"
|
||||
path: "/metrics"
|
||||
|
||||
# Security settings
|
||||
security:
|
||||
enable_job_isolation: true
|
||||
sandbox_enabled: true
|
||||
resource_monitoring: true
|
||||
audit_commands: true
|
||||
|
||||
# Health check configuration
|
||||
health_check:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
timeout: "10s"
|
||||
failure_threshold: 3
|
||||
|
|
@ -4,7 +4,7 @@ max_workers = 4
|
|||
|
||||
# Redis connection
|
||||
redis_addr = "localhost:6379"
|
||||
redis_password = "your-redis-password"
|
||||
redis_password = "CHANGE_ME_REDIS_PASSWORD"
|
||||
redis_db = 0
|
||||
|
||||
# SSH connection (for remote operations)
|
||||
|
|
@ -15,17 +15,13 @@ ssh_key = "~/.ssh/id_rsa"
|
|||
|
||||
# Podman configuration
|
||||
podman_image = "ml-training:latest"
|
||||
gpu_access = true
|
||||
gpu_vendor = "none"
|
||||
gpu_visible_devices = []
|
||||
gpu_devices = []
|
||||
container_workspace = "/workspace"
|
||||
container_results = "/results"
|
||||
train_script = "train.py"
|
||||
|
||||
[resources]
|
||||
max_workers = 4
|
||||
desired_rps_per_worker = 2
|
||||
podman_cpus = "4"
|
||||
podman_memory = "16g"
|
||||
|
||||
# Dataset management
|
||||
auto_fetch_data = true
|
||||
data_dir = "/data/datasets"
|
||||
|
|
@ -36,10 +32,16 @@ dataset_cache_ttl = "24h"
|
|||
task_lease_duration = "1h"
|
||||
heartbeat_interval = "30s"
|
||||
graceful_timeout = "5m"
|
||||
poll_interval = "100ms"
|
||||
poll_interval_seconds = 1
|
||||
metrics_flush_interval = "10s"
|
||||
|
||||
[resources]
|
||||
max_workers = 4
|
||||
desired_rps_per_worker = 2
|
||||
podman_cpus = "4"
|
||||
podman_memory = "16g"
|
||||
|
||||
# Metrics exporter
|
||||
[metrics]
|
||||
enabled = true
|
||||
listen_addr = ":9090"
|
||||
listen_addr = ":9100"
|
||||
|
|
|
|||
45
deployments/Caddyfile.dev
Normal file
45
deployments/Caddyfile.dev
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
{
|
||||
auto_https off
|
||||
admin off
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
http://localhost {
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
|
||||
https://localhost {
|
||||
tls internal
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
44
deployments/Caddyfile.homelab-secure
Normal file
44
deployments/Caddyfile.homelab-secure
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
admin off
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
{$FETCHML_DOMAIN} {
|
||||
encode gzip
|
||||
|
||||
tls /etc/caddy/ssl/cert.pem /etc/caddy/ssl/key.pem
|
||||
|
||||
header {
|
||||
-Server
|
||||
X-Frame-Options "DENY"
|
||||
X-Content-Type-Options "nosniff"
|
||||
Referrer-Policy "strict-origin-when-cross-origin"
|
||||
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
|
||||
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
|
||||
}
|
||||
|
||||
@admin path /admin/*
|
||||
@admin_private remote_ip private_ranges
|
||||
handle @admin {
|
||||
respond @admin_private 404
|
||||
respond 404
|
||||
}
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
47
deployments/Caddyfile.prod
Normal file
47
deployments/Caddyfile.prod
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
{
|
||||
email {$CADDY_EMAIL}
|
||||
admin off
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
{$FETCHML_DOMAIN} {
|
||||
encode gzip
|
||||
|
||||
request_body {
|
||||
max_size 10MB
|
||||
}
|
||||
|
||||
header {
|
||||
-Server
|
||||
X-Frame-Options "DENY"
|
||||
X-Content-Type-Options "nosniff"
|
||||
Referrer-Policy "strict-origin-when-cross-origin"
|
||||
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
|
||||
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
|
||||
}
|
||||
|
||||
@admin path /admin/*
|
||||
@admin_private remote_ip private_ranges
|
||||
handle @admin {
|
||||
respond @admin_private 404
|
||||
respond 404
|
||||
}
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
23
deployments/Caddyfile.smoke
Normal file
23
deployments/Caddyfile.smoke
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
auto_https off
|
||||
}
|
||||
|
||||
localhost {
|
||||
tls internal
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
76
deployments/Makefile
Normal file
76
deployments/Makefile
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
# Docker Compose Deployment Management
|
||||
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean
|
||||
|
||||
# Default target
|
||||
help: ## Show this help message
|
||||
@echo "Available commands:"
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
# Development environment
|
||||
dev-up: ## Start development environment
|
||||
@echo "Starting development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||
@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"
|
||||
|
||||
dev-down: ## Stop development environment
|
||||
@echo "Stopping development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml down
|
||||
|
||||
dev-logs: ## Show development logs
|
||||
docker-compose -f deployments/docker-compose.dev.yml logs -f
|
||||
|
||||
dev-restart: ## Restart development environment
|
||||
@echo "Restarting development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml restart
|
||||
|
||||
|
||||
# Homelab environment
|
||||
homelab-secure-up: ## Start secure homelab environment
|
||||
@echo "Starting secure homelab environment..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
||||
|
||||
homelab-secure-down: ## Stop secure homelab environment
|
||||
@echo "Stopping secure homelab environment..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down
|
||||
|
||||
# Production environment
|
||||
prod-up: ## Start production environment
|
||||
@echo "Starting production environment..."
|
||||
docker-compose -f deployments/docker-compose.prod.yml up -d
|
||||
|
||||
prod-down: ## Stop production environment
|
||||
@echo "Stopping production environment..."
|
||||
docker-compose -f deployments/docker-compose.prod.yml down
|
||||
|
||||
# Utility commands
|
||||
status: ## Show status of all environments
|
||||
@echo "=== Development Status ==="
|
||||
@if [ -f deployments/docker-compose.dev.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.dev.yml ps; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Homelab Secure Status ==="
|
||||
@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Production Status ==="
|
||||
@if [ -f deployments/docker-compose.prod.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
|
||||
clean: ## Clean up all containers and volumes
|
||||
@echo "Cleaning up all Docker resources..."
|
||||
@echo "This will remove all containers and volumes. Continue? [y/N]"
|
||||
@read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
|
||||
docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
|
||||
docker system prune -f
|
||||
@echo "Cleanup complete."
|
||||
|
||||
# Quick aliases
|
||||
up: dev-up ## Alias for dev-up
|
||||
down: dev-down ## Alias for dev-down
|
||||
logs: dev-logs ## Alias for dev-logs
|
||||
restart: dev-restart ## Alias for dev-restart
|
||||
|
|
@ -2,33 +2,123 @@
|
|||
|
||||
This directory contains Docker Compose configurations for different deployment environments.
|
||||
|
||||
## Files
|
||||
## Environment Configurations
|
||||
|
||||
- `docker-compose.homelab-secure.yml` - Secure homelab deployment with TLS and authentication
|
||||
- `docker-compose.prod.yml` - Production deployment configuration
|
||||
### Development (`docker-compose.dev.yml`)
|
||||
- Full development stack with monitoring
|
||||
- Includes: API, Worker, Redis, MinIO (snapshots), Prometheus, Grafana, Loki, Promtail
|
||||
- Optimized for local development and testing
|
||||
- **Usage**: `docker-compose -f deployments/docker-compose.dev.yml up -d`
|
||||
|
||||
## Usage
|
||||
### Homelab - Secure (`docker-compose.homelab-secure.yml`)
|
||||
- Secure homelab deployment with authentication and a Caddy reverse proxy
|
||||
- TLS is terminated at the reverse proxy (Approach A)
|
||||
- Includes: API, Redis (password protected), Caddy reverse proxy
|
||||
- **Usage**: `docker-compose -f deployments/docker-compose.homelab-secure.yml up -d`
|
||||
|
||||
### Production (`docker-compose.prod.yml`)
|
||||
- Production deployment configuration
|
||||
- Optimized for performance and security
|
||||
- External services assumed (Redis, monitoring)
|
||||
- **Usage**: `docker-compose -f deployments/docker-compose.prod.yml up -d`
|
||||
|
||||
Note: `docker-compose.prod.yml` is a reproducible staging/testing harness. Real production deployments do not require Docker; you can run the Go services directly (systemd) and use Caddy for TLS/WSS termination.
|
||||
|
||||
## TLS / WSS Policy
|
||||
|
||||
- The Zig CLI currently supports `ws://` only (native `wss://` is not implemented).
|
||||
- Production deployments terminate TLS/WSS at a reverse proxy (Caddy in `docker-compose.prod.yml`) and keep the API server on internal `ws://`.
|
||||
- Homelab deployments terminate TLS/WSS at a reverse proxy (Caddy) and keep the API server on internal `ws://`.
|
||||
- Health checks in compose files should use `http://localhost:9101/health` when `server.tls.enabled: false`.
|
||||
|
||||
## Required Volume Mounts
|
||||
|
||||
- `base_path` (experiments) must be writable by the API server.
|
||||
- `data_dir` should be mounted if you want snapshot/dataset integrity validation via `ml validate`.
|
||||
|
||||
For the default configs:
|
||||
|
||||
- `base_path`: `/data/experiments` (dev/homelab configs) or `/app/data/experiments` (prod configs)
|
||||
- `data_dir`: `/data/active`
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Development
|
||||
```bash
|
||||
# Use the main docker-compose.yml in project root
|
||||
docker-compose up -d
|
||||
# Development (most common)
|
||||
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||
|
||||
# Check status
|
||||
docker-compose -f deployments/docker-compose.dev.yml ps
|
||||
|
||||
# View logs
|
||||
docker-compose -f deployments/docker-compose.dev.yml logs -f api-server
|
||||
|
||||
# Stop services
|
||||
docker-compose -f deployments/docker-compose.dev.yml down
|
||||
```
|
||||
|
||||
### Homelab (Secure)
|
||||
```bash
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
||||
```
|
||||
## Dev: MinIO-backed snapshots (smoke test)
|
||||
|
||||
### Production
|
||||
```bash
|
||||
docker-compose -f deployments/docker-compose.prod.yml up -d
|
||||
```
|
||||
The dev compose file provisions a MinIO bucket and uploads a small example snapshot object at:
|
||||
|
||||
`s3://fetchml-snapshots/snapshots/snap-1.tar.gz`
|
||||
|
||||
To queue a task that forces the worker to pull the snapshot from MinIO:
|
||||
|
||||
1. Start the dev stack:
|
||||
`docker-compose -f deployments/docker-compose.dev.yml up -d`
|
||||
|
||||
2. Read the `snapshot_sha256` printed by the init job:
|
||||
`docker-compose -f deployments/docker-compose.dev.yml logs minio-init`
|
||||
|
||||
3. Queue a job using the snapshot fields:
|
||||
`ml queue <job-name> --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||
|
||||
## Smoke tests
|
||||
|
||||
- `make dev-smoke` runs the development stack smoke test.
|
||||
- `make prod-smoke` runs a Docker-based staging smoke test for the production stack, using a localhost-only Caddy configuration.
|
||||
|
||||
Note: `ml queue` by itself will generate a random commit ID. For full provenance enforcement (manifest + dependency manifest), use `ml sync ./your-project --queue` so the server has real code + dependency files.
|
||||
|
||||
Examples:
|
||||
- `ml queue train-mnist --priority 3 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||
- `ml queue train-a train-b train-c --priority 5 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||
|
||||
## Environment Variables
|
||||
|
||||
Each deployment may require specific environment variables. Refer to the individual compose files for requirements.
|
||||
Create a `.env` file in the project root:
|
||||
|
||||
```bash
|
||||
# Grafana
|
||||
GRAFANA_ADMIN_PASSWORD=your_secure_password
|
||||
|
||||
# API Configuration
|
||||
LOG_LEVEL=info
|
||||
|
||||
# TLS (for secure deployments)
|
||||
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||
TLS_KEY_PATH=/app/ssl/key.pem
|
||||
```
|
||||
|
||||
## Service Ports
|
||||
|
||||
| Service | Development | Homelab | Production |
|
||||
|---------|-------------|---------|------------|
|
||||
| API Server | 9101 | 9101 | 9101 |
|
||||
| Redis | 6379 | 6379 | - |
|
||||
| Prometheus | 9090 | - | - |
|
||||
| Grafana | 3000 | - | - |
|
||||
| Loki | 3100 | - | - |
|
||||
|
||||
## Monitoring
|
||||
|
||||
Performance monitoring configurations are in `monitoring/docker-compose.performance.yml`
|
||||
- **Development**: Full monitoring stack included
|
||||
- **Homelab**: Basic monitoring (configurable)
|
||||
- **Production**: External monitoring assumed
|
||||
|
||||
## Security Notes
|
||||
|
||||
- If you need HTTPS externally, terminate TLS at a reverse proxy.
|
||||
- API keys should be managed via environment variables
|
||||
- Database credentials should use secrets management in production
|
||||
|
|
|
|||
162
deployments/deploy.sh
Executable file
162
deployments/deploy.sh
Executable file
|
|
@ -0,0 +1,162 @@
|
|||
#!/bin/bash
|
||||
# Quick deployment script for fetch_ml
|
||||
|
||||
set -e
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Function to print colored output
|
||||
print_status() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
print_success() {
|
||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
# Function to show usage
|
||||
show_usage() {
|
||||
echo "Usage: $0 [ENVIRONMENT] [ACTION]"
|
||||
echo ""
|
||||
echo "Environments:"
|
||||
echo " dev Development environment"
|
||||
echo " secure Secure homelab environment"
|
||||
echo " prod Production environment"
|
||||
echo ""
|
||||
echo "Actions:"
|
||||
echo " up Start services"
|
||||
echo " down Stop services"
|
||||
echo " restart Restart services"
|
||||
echo " logs Show logs"
|
||||
echo " status Show status"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 dev up # Start development environment"
|
||||
echo " $0 prod down # Stop production environment"
|
||||
echo " $0 secure logs # Show secure environment logs"
|
||||
}
|
||||
|
||||
# Function to check if docker-compose file exists
|
||||
check_compose_file() {
|
||||
local env=$1
|
||||
local compose_file=""
|
||||
|
||||
case $env in
|
||||
"dev")
|
||||
compose_file="deployments/docker-compose.dev.yml"
|
||||
;;
|
||||
"secure")
|
||||
compose_file="deployments/docker-compose.homelab-secure.yml"
|
||||
;;
|
||||
"prod")
|
||||
compose_file="deployments/docker-compose.prod.yml"
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown environment: $env"
|
||||
show_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ ! -f "$compose_file" ]; then
|
||||
print_error "Docker Compose file not found: $compose_file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "$compose_file"
|
||||
}
|
||||
|
||||
# Function to check if .env file exists
|
||||
check_env_file() {
|
||||
local env=$1
|
||||
|
||||
if [ ! -f ".env" ]; then
|
||||
print_warning ".env file not found. Creating from example..."
|
||||
if [ "$env" = "dev" ]; then
|
||||
cp deployments/env.dev.example .env
|
||||
elif [ "$env" = "prod" ]; then
|
||||
cp deployments/env.prod.example .env
|
||||
else
|
||||
cp deployments/env.dev.example .env
|
||||
fi
|
||||
print_warning "Please edit .env file with your configuration"
|
||||
fi
|
||||
}
|
||||
|
||||
# Main script
|
||||
main() {
|
||||
if [ $# -ne 2 ]; then
|
||||
show_usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local environment=$1
|
||||
local action=$2
|
||||
|
||||
print_status "Environment: $environment"
|
||||
print_status "Action: $action"
|
||||
|
||||
# Check compose file
|
||||
compose_file=$(check_compose_file "$environment")
|
||||
print_status "Using: $compose_file"
|
||||
|
||||
# Check .env file
|
||||
check_env_file "$environment"
|
||||
|
||||
# Execute action
|
||||
case $action in
|
||||
"up")
|
||||
print_status "Starting $environment environment..."
|
||||
docker-compose -f "$compose_file" up -d
|
||||
print_success "$environment environment started successfully!"
|
||||
|
||||
# Show service URLs
|
||||
echo ""
|
||||
print_status "Service URLs:"
|
||||
echo " API Server: http://localhost:9101"
|
||||
if [ "$environment" = "dev" ]; then
|
||||
echo " Grafana: http://localhost:3000 (admin/admin123)"
|
||||
echo " Prometheus: http://localhost:9090"
|
||||
fi
|
||||
;;
|
||||
"down")
|
||||
print_status "Stopping $environment environment..."
|
||||
docker-compose -f "$compose_file" down
|
||||
print_success "$environment environment stopped successfully!"
|
||||
;;
|
||||
"restart")
|
||||
print_status "Restarting $environment environment..."
|
||||
docker-compose -f "$compose_file" restart
|
||||
print_success "$environment environment restarted successfully!"
|
||||
;;
|
||||
"logs")
|
||||
print_status "Showing logs for $environment environment..."
|
||||
docker-compose -f "$compose_file" logs -f
|
||||
;;
|
||||
"status")
|
||||
print_status "Status of $environment environment:"
|
||||
docker-compose -f "$compose_file" ps
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown action: $action"
|
||||
show_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
225
deployments/docker-compose.dev.yml
Normal file
225
deployments/docker-compose.dev.yml
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
# Homelab Docker Compose with Centralized Monitoring
|
||||
# Includes: API, Redis, Prometheus, Grafana, Loki
|
||||
|
||||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: ml-dev-caddy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8080:80"
|
||||
- "8443:443"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-experiments-redis
|
||||
user: "999:999"
|
||||
ports:
|
||||
- "6379:6379"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data
|
||||
restart: unless-stopped
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
test: [ "CMD", "redis-cli", "ping" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
api-server:
|
||||
build:
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-api
|
||||
user: "0:0"
|
||||
expose:
|
||||
- "9101" # API and health endpoints (internal; external access via Caddy)
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl
|
||||
depends_on:
|
||||
- redis
|
||||
restart: unless-stopped
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
labels:
|
||||
logging: "promtail"
|
||||
job: "api-server"
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-experiments-minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=minioadmin
|
||||
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
restart: unless-stopped
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-experiments-minio-init
|
||||
depends_on:
|
||||
minio:
|
||||
condition: service_healthy
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
set -eu
|
||||
apk add --no-cache ca-certificates curl tar gzip
|
||||
ARCH=$$(uname -m)
|
||||
MC_ARCH=amd64
|
||||
if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
|
||||
MC_ARCH=arm64
|
||||
fi
|
||||
curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
|
||||
chmod +x /usr/local/bin/mc
|
||||
i=0
|
||||
while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
|
||||
i=$$((i+1))
|
||||
if [ $$i -ge 30 ]; then
|
||||
echo "minio not ready after 30 attempts" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "waiting for minio... ($$i/30)"
|
||||
sleep 1
|
||||
done
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
mkdir -p /tmp/snapshots/snap-1
|
||||
echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
|
||||
tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
|
||||
mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
|
||||
FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
|
||||
SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
|
||||
echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
|
||||
restart: "no"
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-worker
|
||||
user: "0:0"
|
||||
ports:
|
||||
- "8888:8888"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_completed_successfully
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
- MINIO_ROOT_USER=minioadmin
|
||||
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest
|
||||
- FETCHML_JUPYTER_CONDA_ENV=base
|
||||
- FETCHML_JUPYTER_KERNEL_NAME=python
|
||||
- FETCHML_PODMAN_CGROUPS=disabled
|
||||
privileged: true
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
|
||||
# Prometheus - Metrics collection
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: ml-experiments-prometheus
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
- '--web.console.templates=/etc/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
restart: unless-stopped
|
||||
|
||||
# Grafana - Visualization
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: ml-experiments-grafana
|
||||
ports:
|
||||
- "3000:3000"
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin123
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- prometheus
|
||||
- loki
|
||||
|
||||
# Loki - Log aggregation
|
||||
loki:
|
||||
image: grafana/loki:latest
|
||||
container_name: ml-experiments-loki
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
|
||||
- loki_data:/loki
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
restart: unless-stopped
|
||||
|
||||
# Promtail - Log collector
|
||||
promtail:
|
||||
image: grafana/promtail:latest
|
||||
container_name: ml-experiments-promtail
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- loki
|
||||
|
||||
volumes:
|
||||
prometheus_data:
|
||||
driver: local
|
||||
grafana_data:
|
||||
driver: local
|
||||
loki_data:
|
||||
driver: local
|
||||
|
|
@ -1,104 +1,152 @@
|
|||
# Homelab Secure Docker Environment
|
||||
services:
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-homelab-redis
|
||||
ports:
|
||||
- "6379:6379"
|
||||
volumes:
|
||||
- redis_homelab_data:/data
|
||||
restart: unless-stopped
|
||||
command: >
|
||||
redis-server
|
||||
--appendonly yes
|
||||
--requirepass "HomelabRedis2024!"
|
||||
--maxmemory 512mb
|
||||
--maxmemory-policy allkeys-lru
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "-a", "HomelabRedis2024!", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- ml-homelab-network
|
||||
# Secure Homelab Docker Compose Configuration
|
||||
# Use with: docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
|
||||
|
||||
services:
|
||||
api-server:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/homelab-secure.Dockerfile
|
||||
container_name: ml-homelab-api
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-api
|
||||
ports:
|
||||
- "9104:9101" # API server port
|
||||
- "2223:2222" # Secure SSH port
|
||||
- "9101:9100" # Prometheus metrics
|
||||
- "9101:9101"
|
||||
- "9100:9100" # Prometheus metrics endpoint
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/config-homelab-secure.yaml:/app/configs/config.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/.env.secure:/app/.env.secure:ro
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
|
||||
- LOG_LEVEL=info
|
||||
- TZ=America/New_York
|
||||
# Load secure environment variables
|
||||
- JWT_SECRET_FILE=/app/.env.secure
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-k", "-f", "https://localhost:9101/health"]
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
command: >
|
||||
sh -c "
|
||||
sudo /app/start-security.sh &
|
||||
/usr/local/bin/api-server -config /app/configs/config.yaml
|
||||
"
|
||||
labels:
|
||||
logging: "promtail"
|
||||
job: "api-server"
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||
networks:
|
||||
- ml-homelab-network
|
||||
- ml-experiments-network
|
||||
# Add internal network for secure communication
|
||||
- ml-backend-network
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-experiments-minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- ml-backend-network
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-experiments-minio-init
|
||||
depends_on:
|
||||
- minio
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
apk add --no-cache ca-certificates curl >/dev/null
|
||||
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||
chmod +x /usr/local/bin/mc
|
||||
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
restart: "no"
|
||||
networks:
|
||||
- ml-backend-network
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/homelab-secure.Dockerfile
|
||||
container_name: ml-homelab-worker
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-worker
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/worker-homelab-secure.yaml:/app/configs/worker.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/app/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_started
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
|
||||
- LOG_LEVEL=info
|
||||
- TZ=America/New_York
|
||||
privileged: true # Required for Podman
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
cap_drop:
|
||||
- ALL
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
command: >
|
||||
sh -c "
|
||||
sudo /app/start-security.sh &
|
||||
/usr/local/bin/worker -config /app/configs/worker.yaml
|
||||
"
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
privileged: true
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
networks:
|
||||
- ml-homelab-network
|
||||
- ml-backend-network
|
||||
|
||||
volumes:
|
||||
redis_homelab_data:
|
||||
driver: local
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: ml-experiments-caddy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/ssl:/etc/caddy/ssl:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/config:/config
|
||||
environment:
|
||||
- FETCHML_DOMAIN=${FETCHML_DOMAIN:-ml.local}
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- ml-experiments-network
|
||||
|
||||
# Redis with authentication
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-experiments-redis
|
||||
user: "999:999"
|
||||
ports:
|
||||
- "127.0.0.1:6379:6379" # Bind to localhost only
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/redis:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
|
||||
restart: unless-stopped
|
||||
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- ml-backend-network
|
||||
environment:
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
|
||||
volumes: {}
|
||||
|
||||
networks:
|
||||
ml-homelab-network:
|
||||
ml-experiments-network:
|
||||
driver: bridge
|
||||
ml-backend-network:
|
||||
driver: bridge
|
||||
ipam:
|
||||
config:
|
||||
- subnet: 172.25.0.0/16
|
||||
|
|
|
|||
75
deployments/docker-compose.prod.smoke.yml
Normal file
75
deployments/docker-compose.prod.smoke.yml
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
environment:
|
||||
- FETCHML_DOMAIN=localhost
|
||||
- CADDY_EMAIL=smoke@example.invalid
|
||||
ports:
|
||||
- "8080:80"
|
||||
- "8443:443"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/config:/config
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
cat > /etc/caddy/Caddyfile <<'EOF'
|
||||
{
|
||||
debug
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
https://localhost {
|
||||
tls internal {
|
||||
protocols tls1.2 tls1.3
|
||||
}
|
||||
|
||||
handle {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
}
|
||||
EOF
|
||||
exec caddy run --config /etc/caddy/Caddyfile
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
user: "999:999"
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- "6379"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/redis:/data
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
test: [ "CMD", "redis-cli", "ping" ]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
api-server:
|
||||
build:
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
user: "0:0"
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- "9101"
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
volumes: {}
|
||||
|
|
@ -1,12 +1,31 @@
|
|||
# Full Production Docker Environment with Podman and SQLite
|
||||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: ml-prod-caddy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
volumes:
|
||||
- ./Caddyfile.prod:/etc/caddy/Caddyfile:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config
|
||||
environment:
|
||||
- FETCHML_DOMAIN=${FETCHML_DOMAIN}
|
||||
- CADDY_EMAIL=${CADDY_EMAIL}
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-prod-redis
|
||||
ports:
|
||||
- "6379:6379"
|
||||
user: "999:999"
|
||||
expose:
|
||||
- "6379"
|
||||
volumes:
|
||||
- redis_prod_data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/redis:/data
|
||||
restart: unless-stopped
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
|
|
@ -17,57 +36,87 @@ services:
|
|||
|
||||
api-server:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/secure-prod.Dockerfile
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile
|
||||
container_name: ml-prod-api
|
||||
ports:
|
||||
- "9103:9101" # API server port
|
||||
- "2222:2222" # Secure SSH port for Podman communication
|
||||
- "9100:9100" # Prometheus metrics
|
||||
expose:
|
||||
- "9101" # API server port (internal; external access via Caddy)
|
||||
- "2222" # Secure SSH port for Podman communication (internal)
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/config-multi-user.yaml:/app/configs/config.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- LOG_LEVEL=info
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
# Start SSH daemon for Podman communication
|
||||
command: ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
|
||||
# Start API server (ensure data_dir exists for snapshot/dataset validation)
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-prod-minio
|
||||
expose:
|
||||
- "9000"
|
||||
- "9001"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
restart: unless-stopped
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-prod-minio-init
|
||||
depends_on:
|
||||
- minio
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
apk add --no-cache ca-certificates curl >/dev/null
|
||||
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||
chmod +x /usr/local/bin/mc
|
||||
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
restart: "no"
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/secure-prod.Dockerfile
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-prod-worker
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/worker-docker.yaml:/app/configs/worker.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-prod.yaml:/app/configs/worker.yaml
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_started
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- LOG_LEVEL=info
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
privileged: true # Required for Podman to work in Docker
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
|
||||
volumes:
|
||||
redis_prod_data:
|
||||
driver: local
|
||||
volumes: {}
|
||||
|
||||
networks:
|
||||
default:
|
||||
|
|
|
|||
17
deployments/env.dev.example
Normal file
17
deployments/env.dev.example
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Development Environment Variables
|
||||
# Copy this file to .env and modify as needed
|
||||
|
||||
# Grafana
|
||||
GRAFANA_ADMIN_PASSWORD=admin123
|
||||
|
||||
# API Configuration
|
||||
LOG_LEVEL=info
|
||||
|
||||
# TLS (development uses self-signed certs)
|
||||
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||
TLS_KEY_PATH=/app/ssl/key.pem
|
||||
|
||||
# Development-specific
|
||||
ENVIRONMENT=development
|
||||
DEBUG=true
|
||||
API_KEY=development_key_only
|
||||
28
deployments/env.prod.example
Normal file
28
deployments/env.prod.example
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Production Environment Variables
|
||||
# Copy this file to .env and modify as needed
|
||||
|
||||
# Grafana (if using)
|
||||
GRAFANA_ADMIN_PASSWORD=CHANGE_ME_SECURE_PASSWORD
|
||||
|
||||
# API Configuration
|
||||
LOG_LEVEL=warn
|
||||
|
||||
# TLS (production should use CA-signed certs)
|
||||
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||
TLS_KEY_PATH=/app/ssl/key.pem
|
||||
|
||||
# Caddy (TLS/WSS termination)
|
||||
FETCHML_DOMAIN=ml.example.com
|
||||
CADDY_EMAIL=admin@example.com
|
||||
|
||||
# Production-specific
|
||||
ENVIRONMENT=production
|
||||
DEBUG=false
|
||||
|
||||
# Security
|
||||
API_KEY=CHANGE_ME_SECURE_API_KEY
|
||||
ALLOWED_ORIGINS=https://yourdomain.com
|
||||
|
||||
# External services (if applicable)
|
||||
EXTERNAL_REDIS_URL=redis://external-redis:6379
|
||||
EXTERNAL_PROMETHEUS_URL=http://external-prometheus:9090
|
||||
112
deployments/setup.sh
Normal file
112
deployments/setup.sh
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage: ./deployments/setup.sh
|
||||
|
||||
This script DOES NOT install dependencies.
|
||||
It prints the manual steps and required/optional dependencies for a real (non-Docker) production deployment.
|
||||
EOF
|
||||
}
|
||||
|
||||
if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
|
||||
usage
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cat <<'EOF'
|
||||
== FetchML production setup (non-Docker) ==
|
||||
|
||||
Required (core):
|
||||
- Go-built binaries: api-server, worker
|
||||
- Redis (reachable from api-server + worker)
|
||||
- A writable base_path for experiments
|
||||
- A writable data_dir if you want snapshot/dataset staging + integrity validation
|
||||
|
||||
Required (TLS/WSS):
|
||||
- Caddy (recommended) OR another reverse proxy that can terminate TLS and proxy WebSockets
|
||||
|
||||
Optional:
|
||||
- systemd (recommended) for service supervision
|
||||
- MinIO / S3-compatible storage (only if you use remote snapshot_store)
|
||||
- Podman (only if your worker executes jobs in Podman)
|
||||
|
||||
Notes:
|
||||
- The Zig CLI currently supports ws:// only. In production, keep the API server internal on ws:// and terminate TLS/WSS at Caddy.
|
||||
- This script is informational; it will not modify your system.
|
||||
|
||||
---
|
||||
1) Build binaries
|
||||
|
||||
make prod
|
||||
|
||||
Artifacts:
|
||||
./bin/api-server
|
||||
./bin/worker
|
||||
|
||||
---
|
||||
2) Create a dedicated user (recommended)
|
||||
|
||||
useradd --system --create-home --shell /usr/sbin/nologin fetchml
|
||||
|
||||
---
|
||||
3) Create directories (example paths)
|
||||
|
||||
mkdir -p /var/lib/fetchml/experiments
|
||||
mkdir -p /var/lib/fetchml/active/datasets /var/lib/fetchml/active/snapshots
|
||||
mkdir -p /var/log/fetchml
|
||||
|
||||
Ensure ownership:
|
||||
chown -R fetchml:fetchml /var/lib/fetchml /var/log/fetchml
|
||||
|
||||
---
|
||||
4) Configure the API server
|
||||
|
||||
- Start from: configs/api/prod.yaml (or your multi-user config)
|
||||
- For real production, keep server.tls.enabled: false
|
||||
- Ensure monitoring.health_checks.enabled is set appropriately
|
||||
|
||||
Example flags:
|
||||
./bin/api-server -config /etc/fetchml/api.yaml
|
||||
|
||||
---
|
||||
5) Configure Caddy (TLS/WSS termination)
|
||||
|
||||
- Recommended: use deployments/Caddyfile.prod as a baseline.
|
||||
- Caddy should listen on 443 and reverse proxy to the API server (internal) on 9101.
|
||||
|
||||
Example layout:
|
||||
/etc/caddy/Caddyfile
|
||||
/var/lib/caddy
|
||||
|
||||
---
|
||||
6) Configure Redis
|
||||
|
||||
- Use Redis AUTH in production.
|
||||
- Ensure the api-server + worker can reach it.
|
||||
|
||||
---
|
||||
7) Run under systemd (recommended)
|
||||
|
||||
Create unit files (example):
|
||||
/etc/systemd/system/fetchml-api.service
|
||||
/etc/systemd/system/fetchml-worker.service
|
||||
/etc/systemd/system/caddy.service (if not already provided)
|
||||
|
||||
Then:
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now fetchml-api
|
||||
systemctl enable --now fetchml-worker
|
||||
systemctl enable --now caddy
|
||||
|
||||
---
|
||||
8) Smoke check
|
||||
|
||||
Internal health (no TLS):
|
||||
curl -f http://127.0.0.1:9101/health
|
||||
|
||||
External health (through Caddy TLS termination):
|
||||
curl -f https://YOUR_DOMAIN/health
|
||||
|
||||
EOF
|
||||
|
|
@ -1,13 +1,52 @@
|
|||
# Centralized Monitoring Stack
|
||||
# Monitoring Stack
|
||||
|
||||
## Directory Structure (Canonical)
|
||||
|
||||
All monitoring configuration lives under `monitoring/`.
|
||||
|
||||
```text
|
||||
monitoring/
|
||||
prometheus/
|
||||
prometheus.yml # Prometheus scrape configuration
|
||||
grafana/
|
||||
dashboards/ # Grafana dashboards (JSON)
|
||||
provisioning/
|
||||
datasources/ # Grafana data sources (Prometheus/Loki)
|
||||
dashboards/ # Grafana dashboard provider (points at dashboards/)
|
||||
loki-config.yml # Loki configuration
|
||||
promtail-config.yml # Promtail configuration
|
||||
```
|
||||
|
||||
### What is "Grafana provisioning"?
|
||||
|
||||
Grafana provisioning is how Grafana auto-configures itself on startup (no clicking in the UI):
|
||||
|
||||
- **`grafana/provisioning/datasources/*.yml`**
|
||||
- Defines where Grafana reads data from (e.g. Prometheus at `http://prometheus:9090`, Loki at `http://loki:3100`).
|
||||
- **`grafana/provisioning/dashboards/*.yml`**
|
||||
- Tells Grafana to load dashboard JSON files from `/var/lib/grafana/dashboards`.
|
||||
- **`grafana/dashboards/*.json`**
|
||||
- The dashboards themselves.
|
||||
|
||||
### Source of truth
|
||||
|
||||
- **Dashboards**: edit/add JSON in `monitoring/grafana/dashboards/`.
|
||||
- **Grafana provisioning**: edit files in `monitoring/grafana/provisioning/`.
|
||||
- **Prometheus scrape config**: edit `monitoring/prometheus/prometheus.yml`.
|
||||
|
||||
`scripts/setup_monitoring.py` is intentionally **provisioning-only**:
|
||||
|
||||
- It (re)writes Grafana **datasources** and the **dashboard provider**.
|
||||
- It does **not** create or overwrite any dashboard JSON files.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Start everything
|
||||
docker-compose up -d
|
||||
# Start deployment
|
||||
make deploy-up
|
||||
|
||||
# Access services
|
||||
open http://localhost:3000 # Grafana (admin/admin)
|
||||
open http://localhost:3000 # Grafana (admin/admin123)
|
||||
open http://localhost:9090 # Prometheus
|
||||
```
|
||||
|
||||
|
|
@ -15,137 +54,80 @@ open http://localhost:9090 # Prometheus
|
|||
|
||||
### Grafana (Port 3000)
|
||||
**Main monitoring dashboard**
|
||||
- Username: `admin`
|
||||
- Password: `admin`
|
||||
- Pre-configured datasources: Prometheus + Loki
|
||||
- Pre-loaded ML Queue dashboard
|
||||
- Username: `admin`
|
||||
- Password: `admin123`
|
||||
- Data source: Prometheus (http://localhost:9090)
|
||||
|
||||
### Prometheus (Port 9090)
|
||||
**Metrics collection**
|
||||
- Scrapes metrics from API server (`:9100/metrics`)
|
||||
- 15s scrape interval
|
||||
- Data retention: 15 days (default)
|
||||
**Metrics collection and storage**
|
||||
|
||||
### Loki (Port 3100)
|
||||
**Log aggregation**
|
||||
- Collects logs from all containers
|
||||
- Collects application logs from `./logs/`
|
||||
- Retention: 7 days
|
||||
|
||||
### Promtail
|
||||
**Log shipping**
|
||||
- Watches Docker container logs
|
||||
- Watches `./logs/*.log`
|
||||
- Sends to Loki
|
||||
## Dashboards
|
||||
|
||||
## Viewing Data
|
||||
Available dashboard configurations in `grafana/dashboards/`:
|
||||
|
||||
### Metrics
|
||||
1. Open Grafana: http://localhost:3000
|
||||
2. Go to "ML Task Queue Monitoring" dashboard
|
||||
3. See: queue depth, task duration, error rates, etc.
|
||||
- `load-test-performance.json` - Load test metrics
|
||||
- `websocket-performance.json` - WebSocket performance
|
||||
- `system-health.json` - System health monitoring
|
||||
- `rsync-performance.json` - Rsync performance metrics
|
||||
|
||||
### Logs
|
||||
1. Open Grafana → Explore
|
||||
2. Select "Loki" datasource
|
||||
3. Query examples:
|
||||
```logql
|
||||
{job="app_logs"} # All app logs
|
||||
{job="docker",service="api-server"} # API server logs
|
||||
{job="docker"} |= "error" # All errors
|
||||
```
|
||||
### Importing Dashboards
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────┐
|
||||
│ API Server │──┐
|
||||
└─────────────┘ │
|
||||
├──► Prometheus ──► Grafana
|
||||
┌─────────────┐ │ ▲
|
||||
│ Worker │──┘ │
|
||||
└─────────────┘ │
|
||||
│
|
||||
┌─────────────┐ │
|
||||
│ App Logs │──┐ │
|
||||
└─────────────┘ │ │
|
||||
├──► Promtail ──► Loki ┘
|
||||
┌─────────────┐ │
|
||||
│Docker Logs │──┘
|
||||
└─────────────┘
|
||||
```
|
||||
1. Go to Grafana → "+" → "Import"
|
||||
2. Upload JSON files from `grafana/dashboards/` directory
|
||||
3. Select Prometheus data source
|
||||
|
||||
## Configuration Files
|
||||
|
||||
- `prometheus.yml` - Metrics scraping config
|
||||
- `loki-config.yml` - Log storage config
|
||||
- `promtail-config.yml` - Log collection config
|
||||
- `grafana/provisioning/` - Auto-configuration
|
||||
- `prometheus/prometheus.yml` - Prometheus configuration
|
||||
- `loki-config.yml` - Loki configuration
|
||||
- `promtail-config.yml` - Promtail configuration
|
||||
- `security_rules.yml` - Security rules
|
||||
|
||||
## Customization
|
||||
## Usage
|
||||
|
||||
### Add More Scrapers
|
||||
Edit `monitoring/prometheus.yml`:
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'my-service'
|
||||
static_configs:
|
||||
- targets: ['my-service:9100']
|
||||
```
|
||||
1. Start monitoring stack: `make deploy-up`
|
||||
2. Access Grafana: http://localhost:3000 (admin/admin123)
|
||||
3. Import dashboards from `grafana/dashboards/` directory
|
||||
4. View metrics and test results in real-time
|
||||
|
||||
### Change Retention
|
||||
**Prometheus:** Add to command in docker-compose:
|
||||
```yaml
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
```
|
||||
## Health Endpoints
|
||||
|
||||
**Loki:** Edit `loki-config.yml`:
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 720h # 30 days
|
||||
```
|
||||
The API server provides health check endpoints for monitoring:
|
||||
|
||||
## Troubleshooting
|
||||
- **`/health`** - Overall service health (for Docker healthcheck)
|
||||
- **`/health/live`** - Liveness probe (is the service running?)
|
||||
- **`/health/ready`** - Readiness probe (can the service accept traffic?)
|
||||
|
||||
**No metrics showing:**
|
||||
```bash
|
||||
# Check if Prometheus can reach targets
|
||||
curl http://localhost:9090/api/v1/targets
|
||||
|
||||
# Check if API exposes metrics
|
||||
curl http://localhost:9100/metrics
|
||||
```
|
||||
|
||||
**No logs showing:**
|
||||
```bash
|
||||
# Check Promtail status
|
||||
docker logs ml-experiments-promtail
|
||||
|
||||
# Verify Loki is receiving logs
|
||||
curl http://localhost:3100/ready
|
||||
```
|
||||
|
||||
**Grafana can't connect to datasources:**
|
||||
```bash
|
||||
# Restart Grafana
|
||||
docker-compose restart grafana
|
||||
```
|
||||
|
||||
## Profiling Quick Start
|
||||
|
||||
To capture CPU profiles while exercising real workloads:
|
||||
### Testing Health Endpoints
|
||||
|
||||
```bash
|
||||
# HTTP LoadTestSuite (MediumLoad scenario)
|
||||
make profile-load
|
||||
# Basic health check
|
||||
curl -k https://localhost:9101/health
|
||||
|
||||
# WebSocket → Redis queue → worker integration
|
||||
make profile-ws-queue
|
||||
# Liveness check (for K8s or monitoring)
|
||||
curl -k https://localhost:9101/health/live
|
||||
|
||||
# Readiness check (verifies dependencies)
|
||||
curl -k https://localhost:9101/health/ready
|
||||
```
|
||||
|
||||
Then inspect profiles with:
|
||||
See `health-testing.md` for detailed testing procedures.
|
||||
|
||||
```bash
|
||||
go tool pprof cpu_load.out # HTTP load
|
||||
go tool pprof cpu_ws.out # WebSocket/queue/worker
|
||||
```
|
||||
## Prometheus Integration
|
||||
|
||||
Prometheus scrapes the following endpoints:
|
||||
- `api-server:9101/metrics` - Application metrics (future)
|
||||
- `api-server:9101/health` - Health status monitoring
|
||||
- `host.docker.internal:9100/metrics` - Worker metrics (when the worker runs on the host)
|
||||
- `worker:9100/metrics` - Worker metrics (when the worker runs as a container in the compose network)
|
||||
|
||||
## Cleanup (deprecated paths)
|
||||
|
||||
These legacy paths may still exist in the repo but are **not used** by the current dev compose config:
|
||||
|
||||
- `monitoring/dashboards/` (old dashboards location)
|
||||
- `monitoring/prometheus.yml` (old Prometheus config location)
|
||||
- `monitoring/grafana/provisioning/dashboards/dashboard.yml` (duplicate of `dashboards.yml`)
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"title": "ML Task Queue Monitoring",
|
||||
"tags": [
|
||||
"ml",
|
||||
"queue",
|
||||
"fetch_ml"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"title": "Queue Depth",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetch_ml_queue_depth",
|
||||
"legendFormat": "Queue Depth"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Active Tasks",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(fetch_ml_active_tasks) by (worker_id)",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Task Duration (p50, p95, p99)",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Task Completion Rate",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_ml_tasks_completed_total[5m])",
|
||||
"legendFormat": "{{status}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Failure Rate by Error Category",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_ml_task_failures_total[5m])",
|
||||
"legendFormat": "{{error_category}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Retry Rate",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_ml_task_retries_total[5m])",
|
||||
"legendFormat": "{{error_category}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Dead Letter Queue Size",
|
||||
"type": "stat",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetch_ml_dlq_size"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Lease Expirations",
|
||||
"type": "stat",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 24
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetch_ml_lease_expirations_total"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -1,278 +0,0 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"title": "Application Logs",
|
||||
"tags": [
|
||||
"logs",
|
||||
"loki",
|
||||
"fetch_ml"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"graphTooltip": 1,
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"title": "Log Stream",
|
||||
"type": "logs",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"w": 24,
|
||||
"h": 12
|
||||
},
|
||||
"id": 1,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"app_logs\"}",
|
||||
"refId": "A",
|
||||
"datasource": "Loki"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": false,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"dedupStrategy": "none",
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Log Level Distribution",
|
||||
"type": "bargauge",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 12,
|
||||
"w": 8,
|
||||
"h": 8
|
||||
},
|
||||
"id": 2,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (level) (count_over_time({job=\"app_logs\"} | logfmt | level != \"\" [5m]))",
|
||||
"refId": "A",
|
||||
"datasource": "Loki",
|
||||
"legendFormat": "{{level}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "INFO"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "green"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "WARN"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "yellow"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "ERROR"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "red"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Error Logs (Last Hour)",
|
||||
"type": "table",
|
||||
"gridPos": {
|
||||
"x": 8,
|
||||
"y": 12,
|
||||
"w": 16,
|
||||
"h": 8
|
||||
},
|
||||
"id": 3,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"app_logs\"} | logfmt | level=\"ERROR\"",
|
||||
"refId": "A",
|
||||
"datasource": "Loki"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Logs by Component",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 20,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"id": 4,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (component) (rate({job=\"app_logs\"} | logfmt [1m]))",
|
||||
"refId": "A",
|
||||
"datasource": "Loki",
|
||||
"legendFormat": "{{component}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never",
|
||||
"stacking": {
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Warning Logs Timeline",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 20,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"id": 5,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({job=\"app_logs\"} | logfmt | level=\"WARN\" [1m]))",
|
||||
"refId": "A",
|
||||
"datasource": "Loki",
|
||||
"legendFormat": "Warnings"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 50
|
||||
},
|
||||
"color": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "yellow"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Search Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 28,
|
||||
"w": 24,
|
||||
"h": 10
|
||||
},
|
||||
"id": 6,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"app_logs\"} |= \"$search_term\"",
|
||||
"refId": "A",
|
||||
"datasource": "Loki"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true
|
||||
}
|
||||
}
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "search_term",
|
||||
"type": "textbox",
|
||||
"label": "Search Term",
|
||||
"current": {
|
||||
"value": "",
|
||||
"text": ""
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"refresh": "30s"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,157 +0,0 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "loki",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkAPIServerCreateJobSimple\"",
|
||||
"legendFormat": "API Job Creation",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkMLExperimentExecution/SmallExperiment\"",
|
||||
"legendFormat": "ML Small Experiment",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkDatasetOperations/DatasetCreation\"",
|
||||
"legendFormat": "Dataset Creation",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "API Performance Trends",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": "Time (ns/op)",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"datasource": "loki",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"showLabels": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"Performance Summary\"",
|
||||
"legendFormat": "{{timestamp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Latest Performance Summary",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 27,
|
||||
"style": "dark",
|
||||
"tags": ["fetchml", "performance"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Fetch ML Performance Dashboard",
|
||||
"uid": "fetchml-performance",
|
||||
"version": 1
|
||||
}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
ports:
|
||||
- "9090:9090"
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
- '--web.console.templates=/etc/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus-data:/prometheus
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
loki:
|
||||
image: grafana/loki:2.9.0
|
||||
ports:
|
||||
- "3100:3100"
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
volumes:
|
||||
- ./loki-performance-config.yaml:/etc/loki/local-config.yaml
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:latest
|
||||
volumes:
|
||||
- ./promtail-performance-config.yaml:/etc/promtail/config.yml
|
||||
- /var/log:/var/log:ro
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
pushgateway:
|
||||
image: prom/pushgateway:latest
|
||||
ports:
|
||||
- "9091:9091"
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3001:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
|
||||
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
volumes:
|
||||
loki-data:
|
||||
grafana-data:
|
||||
prometheus-data:
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
51
monitoring/grafana/dashboards/load-test-performance.json
Normal file
51
monitoring/grafana/dashboards/load-test-performance.json
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Load Test Performance",
|
||||
"tags": [
|
||||
"load-test",
|
||||
"performance"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Service Health",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(http_requests_total[5m])",
|
||||
"legendFormat": "RPS"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
1
monitoring/grafana/dashboards/load-test-simple.json
Normal file
1
monitoring/grafana/dashboards/load-test-simple.json
Normal file
|
|
@ -0,0 +1 @@
|
|||
{"dashboard": {"id": null, "title": "Load Test Performance", "tags": ["load-test", "performance"], "panels": [{"id": 1, "title": "Service Status", "type": "stat", "targets": [{"expr": "up", "legendFormat": "{{job}}"}]}]}}
|
||||
51
monitoring/grafana/dashboards/loki-logs.json
Normal file
51
monitoring/grafana/dashboards/loki-logs.json
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Log Analysis",
|
||||
"tags": [
|
||||
"loki",
|
||||
"logs"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Error Logs",
|
||||
"type": "logs",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=~\".+\"} |= \"error\"",
|
||||
"legendFormat": "Errors"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "All Logs",
|
||||
"type": "logs",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=~\".+\"}",
|
||||
"legendFormat": "All logs"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-30m",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "30s"
|
||||
}
|
||||
}
|
||||
135
monitoring/grafana/dashboards/prewarm-performance.txt
Normal file
135
monitoring/grafana/dashboards/prewarm-performance.txt
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
# Grafana Dashboard: Prewarm Performance
|
||||
# Import this JSON into Grafana to create a prewarm monitoring dashboard
|
||||
|
||||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Prewarm Performance",
|
||||
"tags": ["prewarm", "performance", "worker"],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Environment Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Snapshot Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 0},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Environment Prewarm Hits vs Misses",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_env_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
|
||||
{"expr": "rate(fetchml_prewarm_env_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"yAxes": [{"unit": "reqps"}]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Snapshot Prewarm Hits vs Misses",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
|
||||
{"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"yAxes": [{"unit": "reqps"}]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Environment Build Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", "legendFormat": "build time {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"yAxes": [{"unit": "seconds"}]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Snapshot Prewarm Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", "legendFormat": "prewarm time {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||
"yAxes": [{"unit": "seconds"}]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Environment Images Built",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "increase(fetchml_prewarm_env_built_total[1h])", "legendFormat": "built {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
|
||||
"yAxes": [{"unit": "short"}]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Snapshots Prewarmed",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", "legendFormat": "prewarmed {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
|
||||
"yAxes": [{"unit": "short"}]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Prewarm Efficiency",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "fetchml_prewarm_env_hit_total + fetchml_prewarm_snapshot_hit_total", "legendFormat": "total hits {{worker_id}}"},
|
||||
{"expr": "fetchml_prewarm_env_miss_total + fetchml_prewarm_snapshot_miss_total", "legendFormat": "total misses {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
|
||||
"yAxes": [{"unit": "short"}]
|
||||
}
|
||||
],
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
86
monitoring/grafana/dashboards/rsync-performance.json
Normal file
86
monitoring/grafana/dashboards/rsync-performance.json
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Rsync Performance",
|
||||
"tags": [
|
||||
"rsync",
|
||||
"sync",
|
||||
"performance"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Rsync Operations",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rsync_operations_total[5m])",
|
||||
"legendFormat": "Operations/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Data Transfer Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rsync_bytes_transferred_total[5m])",
|
||||
"legendFormat": "Bytes/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Sync Duration",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rsync_sync_duration_seconds",
|
||||
"legendFormat": "Duration"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Sync Errors",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rsync_errors_total[5m])",
|
||||
"legendFormat": "Errors/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
51
monitoring/grafana/dashboards/system-health.json
Normal file
51
monitoring/grafana/dashboards/system-health.json
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "System Health",
|
||||
"tags": [
|
||||
"system",
|
||||
"health"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Service Status",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Memory Usage",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "10s"
|
||||
}
|
||||
}
|
||||
68
monitoring/grafana/dashboards/websocket-performance.json
Normal file
68
monitoring/grafana/dashboards/websocket-performance.json
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "WebSocket Performance",
|
||||
"tags": [
|
||||
"websocket",
|
||||
"performance"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "WebSocket Connections",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "websocket_connections_active",
|
||||
"legendFormat": "Active Connections"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "WebSocket Messages",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(websocket_messages_total[5m])",
|
||||
"legendFormat": "Messages/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Connection Errors",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(websocket_connection_errors_total[5m])",
|
||||
"legendFormat": "Errors/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
280
monitoring/grafana/dashboards/worker-resources.json
Normal file
280
monitoring/grafana/dashboards/worker-resources.json
Normal file
|
|
@ -0,0 +1,280 @@
|
|||
{
|
||||
"id": null,
|
||||
"title": "Worker Resources",
|
||||
"tags": [
|
||||
"worker",
|
||||
"resources"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "CPU Free",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_cpu_free",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "CPU Total",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_cpu_total",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "CPU Utilization (%)",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "GPU Slots Free",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_gpu_slots_free",
|
||||
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Acquire Wait / Timeout (Totals)",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_wait_total",
|
||||
"legendFormat": "wait {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_timeout_total",
|
||||
"legendFormat": "timeout {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_total",
|
||||
"legendFormat": "total {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Avg Acquire Wait (seconds)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Acquire Wait Ratio",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 14
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Environment Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Snapshot Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 14
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Prewarm Hits vs Misses",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
|
||||
"legendFormat": "env hits {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
|
||||
"legendFormat": "env misses {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
|
||||
"legendFormat": "snapshot hits {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
|
||||
"legendFormat": "snapshot misses {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"yAxes": [
|
||||
{"unit": "reqps"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Prewarm Build Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
|
||||
"legendFormat": "env build {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
|
||||
"legendFormat": "snapshot prewarm {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 28
|
||||
},
|
||||
"yAxes": [
|
||||
{"unit": "seconds"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Prewarm Builds",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
|
||||
"legendFormat": "env built {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
|
||||
"legendFormat": "snapshots prewarmed {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 28
|
||||
},
|
||||
"yAxes": [
|
||||
{"unit": "short"}
|
||||
]
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
|
|
|
|||
9
monitoring/grafana/provisioning/datasources/loki.yml
Normal file
9
monitoring/grafana/provisioning/datasources/loki.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
editable: true
|
||||
jsonData:
|
||||
maxLines: 1000
|
||||
|
|
@ -1,16 +1,10 @@
|
|||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: false
|
||||
editable: false
|
||||
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
isDefault: true
|
||||
editable: false
|
||||
editable: true
|
||||
jsonData:
|
||||
timeInterval: "5s"
|
||||
100
monitoring/health-testing.md
Normal file
100
monitoring/health-testing.md
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# Testing Health Endpoints with Monitoring Stack
|
||||
|
||||
## Verify Health Endpoints
|
||||
|
||||
```bash
|
||||
# 1. Start the monitoring stack
|
||||
cd deployments
|
||||
docker-compose -f docker-compose.dev.yml up -d
|
||||
|
||||
# 2. Wait for services to start (30 seconds)
|
||||
sleep 30
|
||||
|
||||
# 3. Test health endpoints
|
||||
curl -k https://localhost:9101/health
|
||||
# Expected: {"status":"healthy","timestamp":"...","checks":{}}
|
||||
|
||||
curl -k https://localhost:9101/health/live
|
||||
# Expected: {"status":"alive","timestamp":"..."}
|
||||
|
||||
curl -k https://localhost:9101/health/ready
|
||||
# Expected: {"status":"ready","timestamp":"...","checks":{"queue":"ok","experiments":"ok"}}
|
||||
|
||||
# 4. Check Docker health status
|
||||
docker ps | grep api-server
|
||||
# Should show: (healthy)
|
||||
|
||||
# 5. Access Grafana
|
||||
open http://localhost:3000
|
||||
# Login: admin / admin123
|
||||
|
||||
# 6. Access Prometheus
|
||||
open http://localhost:9090
|
||||
# Check targets: Status > Targets
|
||||
# Should see: api-server, api-server-health
|
||||
|
||||
# 7. Query health metrics in Prometheus
|
||||
# Go to Graph and enter: up{job="api-server-health"}
|
||||
# Should show: value=1 (service is up)
|
||||
```
|
||||
|
||||
## Health Check Integration
|
||||
|
||||
### Docker Compose
|
||||
The health check is configured in `deployments/docker-compose.dev.yml`:
|
||||
```yaml
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
```
|
||||
|
||||
### Prometheus Monitoring
|
||||
Prometheus scrapes health status every 30s from:
|
||||
- `/health` - Overall service health
|
||||
- `/metrics` - Future Prometheus metrics (when implemented)
|
||||
|
||||
### Kubernetes (Future)
|
||||
Health endpoints ready for K8s probes:
|
||||
```yaml
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/live
|
||||
port: 9101
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/ready
|
||||
port: 9101
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
```
|
||||
|
||||
## Monitoring Stack Services
|
||||
|
||||
- **Grafana** (port 3000): Dashboards and visualization
|
||||
- **Prometheus** (port 9090): Metrics collection
|
||||
- **Loki** (port 3100): Log aggregation
|
||||
- **Promtail**: Log shipping
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
```bash
|
||||
# Check API server logs
|
||||
docker logs ml-experiments-api
|
||||
|
||||
# Check Prometheus targets
|
||||
curl http://localhost:9090/api/v1/targets
|
||||
|
||||
# Check health endpoint directly
|
||||
docker exec ml-experiments-api curl -k https://localhost:9101/health
|
||||
|
||||
# Restart services
|
||||
docker-compose -f deployments/docker-compose.dev.yml restart api-server
|
||||
```
|
||||
|
|
@ -12,7 +12,7 @@ common:
|
|||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
instance_addr: 0.0.0.0
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
|
|
|
|||
|
|
@ -1,40 +0,0 @@
|
|||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
|
||||
ingester:
|
||||
lifecycler:
|
||||
address: 127.0.0.1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
replication_factor: 1
|
||||
final_sleep: 0s
|
||||
min_ready_duration: 0s
|
||||
chunk_idle_period: 1h
|
||||
max_chunk_age: 1h
|
||||
chunk_target_size: 1048576
|
||||
chunk_retain_period: 30s
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
storage_config:
|
||||
boltdb_shipper:
|
||||
active_index_directory: /loki/boltdb-shipper-active
|
||||
cache_location: /loki/boltdb-shipper-cache
|
||||
filesystem:
|
||||
directory: /loki/chunks
|
||||
|
||||
limits_config:
|
||||
reject_old_samples: true
|
||||
reject_old_samples_max_age: 168h
|
||||
allow_structured_metadata: false
|
||||
|
|
@ -5,39 +5,35 @@ global:
|
|||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# API Server metrics
|
||||
# API Server metrics and health
|
||||
- job_name: 'api-server'
|
||||
scheme: http
|
||||
static_configs:
|
||||
- targets: ['api-server:9100']
|
||||
- targets: ['api-server:9101']
|
||||
labels:
|
||||
service: 'api-server'
|
||||
metrics_path: /metrics # Future: Prometheus metrics endpoint
|
||||
|
||||
# Worker metrics (if running in docker)
|
||||
# Benchmark metrics from Pushgateway
|
||||
- job_name: 'benchmark'
|
||||
static_configs: []
|
||||
|
||||
# Worker metrics (ResourceManager + task execution)
|
||||
# For docker-compose dev on macOS/Windows, Prometheus can reach a locally running worker
|
||||
# via host.docker.internal.
|
||||
- job_name: 'worker'
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets: ['worker:9100']
|
||||
labels:
|
||||
service: 'worker'
|
||||
# Allow failures if worker not running
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
|
||||
# Benchmark metrics from Pushgateway
|
||||
- job_name: 'benchmark'
|
||||
static_configs:
|
||||
- targets: ['localhost:9091']
|
||||
labels:
|
||||
service: 'benchmark'
|
||||
target_type: 'container'
|
||||
metrics_path: /metrics
|
||||
honor_labels: true
|
||||
|
||||
# Loki metrics
|
||||
- job_name: 'loki'
|
||||
static_configs:
|
||||
- targets: ['ml-experiments-loki:3100']
|
||||
- targets: ['loki:3100']
|
||||
labels:
|
||||
service: 'loki'
|
||||
metrics_path: /metrics
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: fetchml-performance
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: fetchml-performance
|
||||
__path__: /reports/performance.log
|
||||
|
||||
pipeline_stages:
|
||||
- json:
|
||||
expressions:
|
||||
timestamp: timestamp
|
||||
git_commit: git_commit
|
||||
benchmark_name: name
|
||||
time_per_op: time_per_op_ns
|
||||
memory_per_op: memory_per_op_b
|
||||
allocs_per_op: allocs_per_op
|
||||
|
||||
- labels:
|
||||
benchmark_name:
|
||||
git_commit:
|
||||
|
||||
- output:
|
||||
source: output
|
||||
|
||||
- job_name: fetchml-performance-summary
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: fetchml-performance
|
||||
__path__: /reports/performance_summary.log
|
||||
|
||||
pipeline_stages:
|
||||
- regex:
|
||||
expression: "=== Performance Summary ==="
|
||||
|
||||
- output:
|
||||
source: output
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
groups:
|
||||
- name: security.rules
|
||||
rules:
|
||||
# High rate of failed authentication attempts
|
||||
- alert: HighFailedAuthRate
|
||||
expr: rate(failed_auth_total[5m]) > 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High rate of failed authentication attempts"
|
||||
description: "More than 10 failed auth attempts per minute for the last 2 minutes"
|
||||
|
||||
# Potential brute force attack
|
||||
- alert: BruteForceAttack
|
||||
expr: rate(failed_auth_total[1m]) > 30
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Potential brute force attack detected"
|
||||
description: "More than 30 failed auth attempts per minute"
|
||||
|
||||
# Unusual WebSocket connection patterns
|
||||
- alert: UnusualWebSocketActivity
|
||||
expr: rate(websocket_connections_total[5m]) > 100
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Unusual WebSocket connection activity"
|
||||
description: "WebSocket connection rate is unusually high"
|
||||
|
||||
# Rate limit breaches
|
||||
- alert: RateLimitBreached
|
||||
expr: rate(rate_limit_exceeded_total[5m]) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Rate limits being exceeded"
|
||||
description: "Rate limit exceeded more than 5 times per minute"
|
||||
|
||||
# SSL certificate expiration warning
|
||||
- alert: SSLCertificateExpiring
|
||||
expr: ssl_certificate_expiry_days < 30
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SSL certificate expiring soon"
|
||||
description: "SSL certificate will expire in less than 30 days"
|
||||
|
||||
# High memory usage
|
||||
- alert: HighMemoryUsage
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage detected"
|
||||
description: "Memory usage is above 90%"
|
||||
|
||||
# High CPU usage
|
||||
- alert: HighCPUUsage
|
||||
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage detected"
|
||||
description: "CPU usage is above 80%"
|
||||
|
||||
# Disk space running low
|
||||
- alert: LowDiskSpace
|
||||
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Low disk space"
|
||||
description: "Disk space is below 10%"
|
||||
|
||||
# Service down
|
||||
- alert: ServiceDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service is down"
|
||||
description: "{{ $labels.instance }} service has been down for more than 1 minute"
|
||||
|
||||
# Unexpected error rates
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate detected"
|
||||
description: "Error rate is above 10%"
|
||||
|
||||
# Suspicious IP activity
|
||||
- alert: SuspiciousIPActivity
|
||||
expr: rate(requests_by_ip[5m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Suspicious IP activity"
|
||||
description: "IP address making unusually many requests"
|
||||
|
|
@ -118,7 +118,7 @@ jupyter>=1.0.0
|
|||
"allow_network": false,
|
||||
"blocked_packages": ["requests", "urllib3", "httpx"],
|
||||
"max_execution_time": 3600,
|
||||
"gpu_access": true,
|
||||
"gpu_devices": ["/dev/dri"],
|
||||
"ml_env": "ml_env",
|
||||
"package_manager": "mamba"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,6 +32,10 @@ RUN mamba install -n ml_env \
|
|||
-c pytorch -c conda-forge -y && \
|
||||
conda clean -afy
|
||||
|
||||
# Poetry (for pyproject.toml + poetry.lock projects)
|
||||
RUN mamba install -n ml_env poetry -c conda-forge -y && \
|
||||
conda clean -afy
|
||||
|
||||
# Copy security wrapper
|
||||
COPY secure_runner.py /usr/local/bin/secure_runner.py
|
||||
COPY security_policy.json /etc/ml_runner/security_policy.json
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ class SecurityPolicy:
|
|||
],
|
||||
"max_execution_time": 3600,
|
||||
"max_memory_gb": 16,
|
||||
"gpu_access": True,
|
||||
"gpu_devices": ["/dev/dri"],
|
||||
"allow_file_writes": True,
|
||||
"resource_limits": {
|
||||
"cpu_count": 4,
|
||||
|
|
@ -106,97 +106,197 @@ class CondaRunner:
|
|||
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
|
||||
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
|
||||
|
||||
def setup_environment(self, requirements_file: Path) -> bool:
|
||||
"""Setup Conda environment with mamba"""
|
||||
self.gpu_devices = self.security_policy.policy.get("gpu_devices", [])
|
||||
|
||||
def setup_environment(self, deps_file: Path) -> bool:
|
||||
"""Setup Conda environment based on a dependency manifest."""
|
||||
try:
|
||||
# Read requirements
|
||||
with open(requirements_file, "r") as f:
|
||||
requirements = [
|
||||
line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
name = deps_file.name
|
||||
|
||||
# Check each package for security
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
if not self.security_policy.check_package_safety(package_name):
|
||||
print(
|
||||
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
|
||||
)
|
||||
return False
|
||||
print(f"[MANIFEST] Using dependency manifest: {name}")
|
||||
|
||||
# Install packages with mamba (super fast!)
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
|
||||
# Check if already installed with conda
|
||||
check_cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"python",
|
||||
"-c",
|
||||
f"import {package_name.replace('-', '_')}",
|
||||
]
|
||||
result = subprocess.run(
|
||||
check_cmd, capture_output=True, text=True
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] {package_name} already installed in conda env")
|
||||
continue
|
||||
|
||||
# Try conda-forge first (faster and more reliable)
|
||||
print(
|
||||
f"[INSTALL] Installing {req} with {self.package_manager}..."
|
||||
)
|
||||
install_cmd = [
|
||||
if name in ("environment.yml", "environment.yaml"):
|
||||
print(f"[SETUP] Applying conda environment file: {deps_file}")
|
||||
cmd = [
|
||||
self.package_manager,
|
||||
"install",
|
||||
"env",
|
||||
"update",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
req,
|
||||
"-c",
|
||||
"conda-forge",
|
||||
"-f",
|
||||
str(deps_file),
|
||||
"-y",
|
||||
]
|
||||
result = subprocess.run(
|
||||
install_cmd, capture_output=True, text=True, timeout=300
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||
if result.returncode != 0:
|
||||
print(f"[ERROR] Failed to apply environment file: {result.stderr}")
|
||||
return False
|
||||
return True
|
||||
|
||||
if name == "poetry.lock":
|
||||
pyproject = self.workspace_dir / "pyproject.toml"
|
||||
if not pyproject.exists():
|
||||
print("[ERROR] poetry.lock provided but pyproject.toml is missing")
|
||||
return False
|
||||
|
||||
print(f"[SETUP] Installing dependencies from Poetry lockfile: {deps_file}")
|
||||
env = os.environ.copy()
|
||||
env.update(
|
||||
{
|
||||
"POETRY_VIRTUALENVS_CREATE": "false",
|
||||
"POETRY_NO_INTERACTION": "1",
|
||||
}
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] Installed {req} with {self.package_manager}")
|
||||
continue
|
||||
# Ensure Poetry is available in the conda env.
|
||||
check = subprocess.run(
|
||||
["conda", "run", "-n", self.conda_env, "poetry", "--version"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
)
|
||||
if check.returncode != 0:
|
||||
print("[ERROR] Poetry is not available in the container environment")
|
||||
print(check.stderr)
|
||||
return False
|
||||
|
||||
# Fallback to pip if conda fails
|
||||
print(f"[FALLBACK] Trying pip for {req}...")
|
||||
pip_cmd = [
|
||||
# Install into the conda env (no separate venv).
|
||||
install = subprocess.run(
|
||||
[
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"poetry",
|
||||
"install",
|
||||
"--no-ansi",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=900,
|
||||
cwd=str(self.workspace_dir),
|
||||
env=env,
|
||||
)
|
||||
if install.returncode != 0:
|
||||
print("[ERROR] Poetry install failed")
|
||||
print(install.stderr)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
if name == "pyproject.toml":
|
||||
# Use pip's PEP517/pyproject support (no Poetry required).
|
||||
# This installs the project itself; dependencies may be fetched as needed.
|
||||
print(f"[SETUP] Installing project from pyproject.toml: {deps_file}")
|
||||
cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"pip",
|
||||
"install",
|
||||
req,
|
||||
str(self.workspace_dir),
|
||||
"--no-cache-dir",
|
||||
]
|
||||
result = subprocess.run(
|
||||
pip_cmd, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||
if result.returncode != 0:
|
||||
print(f"[ERROR] Failed to install {req}: {result.stderr}")
|
||||
print(f"[ERROR] Failed to install project from pyproject.toml: {result.stderr}")
|
||||
return False
|
||||
return True
|
||||
|
||||
print(f"[OK] Installed {req} with pip")
|
||||
if name == "requirements.txt":
|
||||
# Read requirements
|
||||
with open(deps_file, "r") as f:
|
||||
requirements = [
|
||||
line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
|
||||
return True
|
||||
# Check each package for security
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
if not self.security_policy.check_package_safety(package_name):
|
||||
print(
|
||||
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
|
||||
)
|
||||
return False
|
||||
|
||||
# Install packages with mamba (super fast!)
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
|
||||
# Check if already installed with conda
|
||||
check_cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"python",
|
||||
"-c",
|
||||
f"import {package_name.replace('-', '_')}",
|
||||
]
|
||||
result = subprocess.run(
|
||||
check_cmd, capture_output=True, text=True
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] {package_name} already installed in conda env")
|
||||
continue
|
||||
|
||||
# Try conda-forge first (faster and more reliable)
|
||||
print(
|
||||
f"[INSTALL] Installing {req} with {self.package_manager}..."
|
||||
)
|
||||
install_cmd = [
|
||||
self.package_manager,
|
||||
"install",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
req,
|
||||
"-c",
|
||||
"conda-forge",
|
||||
"-y",
|
||||
]
|
||||
result = subprocess.run(
|
||||
install_cmd, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] Installed {req} with {self.package_manager}")
|
||||
continue
|
||||
|
||||
# Fallback to pip if conda fails
|
||||
print(f"[FALLBACK] Trying pip for {req}...")
|
||||
pip_cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"pip",
|
||||
"install",
|
||||
req,
|
||||
"--no-cache-dir",
|
||||
]
|
||||
result = subprocess.run(
|
||||
pip_cmd, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"[ERROR] Failed to install {req}: {result.stderr}")
|
||||
return False
|
||||
|
||||
print(f"[OK] Installed {req} with pip")
|
||||
|
||||
return True
|
||||
|
||||
print(f"[ERROR] Unsupported dependency manifest: {deps_file}")
|
||||
print("Supported: environment.yml, environment.yaml, poetry.lock (requires pyproject.toml), pyproject.toml, requirements.txt")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Environment setup failed: {e}")
|
||||
|
|
@ -217,7 +317,7 @@ class CondaRunner:
|
|||
env.update(
|
||||
{
|
||||
"CONDA_DEFAULT_ENV": self.conda_env,
|
||||
"CUDA_VISIBLE_DEVICES": "0", # Allow GPU access
|
||||
"CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", ""), # Allow GPU access
|
||||
"SECURE_MODE": "1",
|
||||
"NETWORK_ACCESS": (
|
||||
"1"
|
||||
|
|
@ -280,7 +380,7 @@ class CondaRunner:
|
|||
"stdout": stdout,
|
||||
"stderr": stderr,
|
||||
"return_code": process.returncode,
|
||||
"gpu_accessible": True,
|
||||
"gpu_accessible": len(self.gpu_devices) > 0,
|
||||
"security_mode": "enabled",
|
||||
"container_type": "conda",
|
||||
"conda_env": self.conda_env,
|
||||
|
|
@ -338,8 +438,12 @@ def main():
|
|||
parser.add_argument(
|
||||
"--workspace", default="/workspace", help="Workspace directory"
|
||||
)
|
||||
parser.add_argument("--requirements", help="Requirements file path")
|
||||
parser.add_argument("--deps", help="Dependency manifest path (environment.yml | poetry.lock | pyproject.toml | requirements.txt)")
|
||||
parser.add_argument("--requirements", help="Deprecated alias for --deps")
|
||||
parser.add_argument("--script", help="Training script path")
|
||||
parser.add_argument(
|
||||
"--prepare-only", action="store_true", help="Only prepare dependencies and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--args",
|
||||
nargs=argparse.REMAINDER,
|
||||
|
|
@ -383,17 +487,26 @@ def main():
|
|||
if args.check_gpu:
|
||||
return 0
|
||||
|
||||
deps_arg = args.deps or args.requirements
|
||||
if not deps_arg:
|
||||
print("[ERROR] Missing dependency manifest. Provide --deps.")
|
||||
return 1
|
||||
|
||||
# Setup environment
|
||||
requirements_path = Path(args.requirements)
|
||||
if not requirements_path.exists():
|
||||
print(f"[ERROR] Requirements file not found: {requirements_path}")
|
||||
deps_path = Path(deps_arg)
|
||||
if not deps_path.exists():
|
||||
print(f"[ERROR] Dependency manifest not found: {deps_path}")
|
||||
return 1
|
||||
|
||||
print("[SETUP] Setting up secure environment...")
|
||||
if not runner.setup_environment(requirements_path):
|
||||
if not runner.setup_environment(deps_path):
|
||||
print("[ERROR] Failed to setup secure environment")
|
||||
return 1
|
||||
|
||||
if args.prepare_only:
|
||||
print("[DONE] Environment prepared successfully")
|
||||
return 0
|
||||
|
||||
# Run experiment
|
||||
script_path = Path(args.script)
|
||||
if not script_path.exists():
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
],
|
||||
"max_execution_time": 3600,
|
||||
"max_memory_gb": 16,
|
||||
"gpu_access": true,
|
||||
"gpu_devices": ["/dev/dri"],
|
||||
"allow_file_writes": true,
|
||||
"resource_limits": {
|
||||
"cpu_count": 4,
|
||||
|
|
|
|||
|
|
@ -20,19 +20,12 @@ This directory contains setup and utility scripts for FetchML.
|
|||
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group
|
||||
```
|
||||
|
||||
### `validate-prod-config.sh`
|
||||
**Purpose**: Validates production configuration files
|
||||
**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]`
|
||||
**What it does**:
|
||||
- Checks config file syntax
|
||||
- Verifies base_path consistency
|
||||
- Tests Redis connectivity
|
||||
- Validates Podman setup
|
||||
- Checks directory permissions
|
||||
### Configuration validation
|
||||
Validate configs using the built-in config lint targets:
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml
|
||||
make configlint
|
||||
make worker-configlint
|
||||
```
|
||||
|
||||
## Legacy Setup Scripts (Deprecated)
|
||||
|
|
@ -44,12 +37,11 @@ The following scripts are from earlier iterations and are **deprecated** in favo
|
|||
- `auto_setup.sh` - Old automated setup (superseded)
|
||||
- `setup_common.sh` - Common functions (integrated into setup-prod.sh)
|
||||
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
|
||||
- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh)
|
||||
|
||||
|
||||
### Cleanup Recommendation
|
||||
These legacy scripts can be removed or archived. The current production setup only needs:
|
||||
- `setup-prod.sh`
|
||||
- `validate-prod-config.sh`
|
||||
|
||||
## Usage Workflow
|
||||
|
||||
|
|
@ -59,8 +51,8 @@ These legacy scripts can be removed or archived. The current production setup on
|
|||
sudo ./scripts/setup-prod.sh
|
||||
|
||||
# 2. Copy and configure
|
||||
sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml
|
||||
sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml
|
||||
sudo cp configs/api/prod.yaml /etc/fetch_ml/config.yaml
|
||||
sudo cp configs/workers/worker-prod.toml /etc/fetch_ml/worker.toml
|
||||
sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc.
|
||||
|
||||
# 3. Build and install
|
||||
|
|
@ -68,7 +60,8 @@ make prod
|
|||
sudo make install
|
||||
|
||||
# 4. Validate
|
||||
./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml
|
||||
./bin/configlint --schema configs/schema/api_server_config.yaml /etc/fetch_ml/config.yaml
|
||||
./bin/configlint --schema configs/schema/worker_config_schema.yaml /etc/fetch_ml/worker.toml
|
||||
|
||||
# 5. Start services
|
||||
sudo systemctl start fetchml-api fetchml-worker
|
||||
|
|
@ -82,7 +75,7 @@ docker-compose up -d
|
|||
|
||||
# Or run components directly
|
||||
make dev
|
||||
./bin/api-server -config configs/config-local.yaml
|
||||
./bin/api-server -config configs/api/dev.yaml
|
||||
```
|
||||
|
||||
## Script Maintenance
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ set -e
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
|
||||
RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP"
|
||||
|
||||
|
|
@ -168,14 +169,25 @@ if [ -f "$SCRIPT_DIR/cleanup-benchmarks.sh" ]; then
|
|||
"$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks
|
||||
else
|
||||
# Fallback cleanup if script not available
|
||||
echo "Cleaning old benchmark runs (keeping last 10)..."
|
||||
echo "Archiving old benchmark runs (keeping last 10)..."
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || echo "No old runs to clean"
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
|
||||
[ -n "$run" ] || continue
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
|
||||
# Clean temporary files
|
||||
echo "Cleaning temporary files..."
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
echo "Archiving temporary files..."
|
||||
tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
|
||||
mkdir -p "$tmp_archive_dir"
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
|
||||
# Clean Go build cache
|
||||
echo "Cleaning Go build cache..."
|
||||
|
|
|
|||
|
|
@ -1,49 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Create a Bitwarden item for a FetchML API user.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash>
|
||||
#
|
||||
# Requirements:
|
||||
# - Bitwarden CLI (bw) installed
|
||||
# - You are logged in and unlocked (bw login; bw unlock)
|
||||
# - jq installed
|
||||
#
|
||||
# This script does NOT run on the homelab server. Run it from your
|
||||
# own machine where you manage Bitwarden.
|
||||
|
||||
if [[ $# -ne 3 ]]; then
|
||||
echo "Usage: $0 <username> <api_key> <api_key_hash>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
USER_NAME="$1"
|
||||
API_KEY="$2"
|
||||
API_KEY_HASH="$3"
|
||||
|
||||
ITEM_NAME="FetchML API $USER_NAME"
|
||||
|
||||
# Get base item template
|
||||
TEMPLATE_JSON=$(bw get template item)
|
||||
|
||||
# Build item JSON with jq
|
||||
ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \
|
||||
--arg name "$ITEM_NAME" \
|
||||
--arg username "$USER_NAME" \
|
||||
--arg password "$API_KEY" \
|
||||
--arg hash "$API_KEY_HASH" \
|
||||
'.name = $name
|
||||
| .login.username = $username
|
||||
| .login.password = $password
|
||||
| .notes = "FetchML API key for user " + $username
|
||||
| .fields = [{"name":"api_key_hash","value":$hash,"type":1}]')
|
||||
|
||||
# Create item in Bitwarden
|
||||
# If you ever want to edit instead, you can capture the ID from this call
|
||||
# and use: bw edit item <id> <json>
|
||||
|
||||
echo "$ITEM_JSON" | bw encode | bw create item
|
||||
|
||||
echo "Created Bitwarden item: $ITEM_NAME"
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Setup auto-cleanup service for fetch_ml
|
||||
# This creates a systemd timer that runs cleanup daily
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
|
||||
# Colors
|
||||
GREEN='\033[0;32m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||
}
|
||||
|
||||
log_info "Setting up auto-cleanup service..."
|
||||
|
||||
# Check if running on macOS or Linux
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
log_info "Detected macOS - setting up launchd agent"
|
||||
|
||||
# Create launchd plist
|
||||
cat > ~/Library/LaunchAgents/com.fetchml.cleanup.plist << EOF
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.fetchml.cleanup</string>
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>$PROJECT_DIR/scripts/cleanup.sh</string>
|
||||
<string>--force</string>
|
||||
</array>
|
||||
<key>StartInterval</key>
|
||||
<integer>86400</integer>
|
||||
<key>RunAtLoad</key>
|
||||
<false/>
|
||||
<key>StandardOutPath</key>
|
||||
<string>/tmp/fetchml-cleanup.log</string>
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/tmp/fetchml-cleanup.error.log</string>
|
||||
</dict>
|
||||
</plist>
|
||||
EOF
|
||||
|
||||
# Load the launchd agent
|
||||
launchctl load ~/Library/LaunchAgents/com.fetchml.cleanup.plist
|
||||
|
||||
log_success "Auto-cleanup service installed for macOS"
|
||||
log_info "Logs will be in /tmp/fetchml-cleanup.log"
|
||||
|
||||
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
|
||||
log_info "Detected Linux - setting up systemd timer"
|
||||
|
||||
# Copy service files
|
||||
sudo cp "$SCRIPT_DIR/auto-cleanup.service" /etc/systemd/system/
|
||||
sudo cp "$SCRIPT_DIR/auto-cleanup.timer" /etc/systemd/system/
|
||||
|
||||
# Reload systemd and enable timer
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable auto-cleanup.timer
|
||||
sudo systemctl start auto-cleanup.timer
|
||||
|
||||
log_success "Auto-cleanup service installed for Linux"
|
||||
log_info "Check status with: systemctl status auto-cleanup.timer"
|
||||
|
||||
else
|
||||
echo "Unsupported OS: $OSTYPE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_info "Auto-cleanup will run daily"
|
||||
log_info "To uninstall:"
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
echo " launchctl unload ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
|
||||
echo " rm ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
|
||||
else
|
||||
echo " sudo systemctl stop auto-cleanup.timer"
|
||||
echo " sudo systemctl disable auto-cleanup.timer"
|
||||
echo " sudo rm /etc/systemd/system/auto-cleanup.*"
|
||||
fi
|
||||
|
|
@ -1,275 +0,0 @@
|
|||
#!/bin/bash
# Production Monitoring Stack Setup for Linux
# Deploys Prometheus/Grafana/Loki/Promtail as Podman containers with systemd
# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc.

# Abort on the first failing command.
set -e

# ANSI color codes used by all status output below.
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[0;33m'
NC='\033[0m'  # reset

echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n"
||||
# Detect Linux distribution and package manager
|
||||
# Populate DISTRO, DISTRO_VERSION and PKG_MANAGER for the current host.
# DISTRO comes from /etc/os-release (fallback: the RedHat marker file);
# PKG_MANAGER is the first known package manager found on PATH.
detect_distro() {
    if [ -f /etc/os-release ]; then
        . /etc/os-release
        DISTRO=$ID
        DISTRO_VERSION=$VERSION_ID
    elif [ -f /etc/redhat-release ]; then
        DISTRO="rhel"
    else
        DISTRO="unknown"
    fi

    # Detect package manager: probe candidates in priority order.
    PKG_MANAGER="unknown"
    for candidate in dnf yum apt-get pacman zypper; do
        if command -v "$candidate" &>/dev/null; then
            PKG_MANAGER=$candidate
            break
        fi
    done
    # The rest of the tooling refers to apt-get by its family name.
    if [ "$PKG_MANAGER" = "apt-get" ]; then
        PKG_MANAGER="apt"
    fi
    if [ "$PKG_MANAGER" = "unknown" ]; then
        echo -e "${YELLOW}Warning: No known package manager found${NC}"
    fi

    echo "Detected distribution: $DISTRO (using $PKG_MANAGER)"
}
|
||||
|
||||
# Run the distro/package-manager detection defined above.
detect_distro

# Configuration (positional overrides: data path, runtime user, runtime group).
DATA_PATH="${1:-/data/monitoring}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"

echo "Configuration:"
echo " Monitoring data path: $DATA_PATH"
echo " User: $ML_USER"
echo " Group: $ML_GROUP"
echo ""

# Create pod for monitoring stack
POD_NAME="monitoring"

# 1. Create directories
echo -e "${BLUE}[1/6]${NC} Creating directory structure..."
sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config}
sudo mkdir -p /etc/fetch_ml/monitoring
sudo mkdir -p /var/lib/grafana/dashboards

# Quote expansions so unusual user/group/path values survive word splitting.
sudo chown -R "$ML_USER:$ML_GROUP" "$DATA_PATH"
sudo chmod 755 "$DATA_PATH"

echo -e "${GREEN}✓${NC} Directories created"

# 2. Copy configuration files
echo -e "${BLUE}[2/6]${NC} Copying configuration files..."
sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/
# Fix: options must precede operands for portable cp usage (was "cp SRC DST -r").
sudo cp -r monitoring/grafana/provisioning /etc/fetch_ml/monitoring/
sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json
sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json

sudo chown -R "$ML_USER:$ML_GROUP" /etc/fetch_ml/monitoring
sudo chown -R "$ML_USER:$ML_GROUP" /var/lib/grafana

echo -e "${GREEN}✓${NC} Configuration copied"

# 3. Create Podman pod
# Fix: outside a heredoc a line continuation is a single backslash; "\\" is an
# escaped literal backslash, which terminated the command on the first line and
# made each following option line execute as its own (failing) command.
echo -e "${BLUE}[3/6]${NC} Creating Podman pod..."
sudo -u "$ML_USER" podman pod create \
    --name "$POD_NAME" \
    -p 3000:3000 \
    -p 9090:9090 \
    -p 3100:3100 \
    || echo "Pod may already exist"

echo -e "${GREEN}✓${NC} Pod created"
|
||||
|
||||
# 4. Create systemd service for monitoring pod
# NOTE: each heredoc delimiter below is unquoted, so $VARS and $(...) expand
# while the unit file is generated, and "\\" collapses to a single "\"
# (systemd's line-continuation character) in the written file.
echo -e "${BLUE}[4/6]${NC} Creating systemd services..."

# Prometheus service
sudo tee /etc/systemd/system/prometheus.service >/dev/null <<EOF
[Unit]
Description=Prometheus Monitoring
After=network.target
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 9090:9090
ExecStart=/usr/bin/podman run --rm --name prometheus \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro \\
    -v ${DATA_PATH}/prometheus:/prometheus \\
    docker.io/prom/prometheus:latest \\
    --config.file=/etc/prometheus/prometheus.yml \\
    --storage.tsdb.path=/prometheus \\
    --web.enable-lifecycle

ExecStop=/usr/bin/podman stop -t 10 prometheus

[Install]
WantedBy=multi-user.target
EOF

# Loki service
sudo tee /etc/systemd/system/loki.service >/dev/null <<EOF
[Unit]
Description=Loki Log Aggregation
After=network.target
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 3100:3100
ExecStart=/usr/bin/podman run --rm --name loki \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro \\
    -v ${DATA_PATH}/loki:/loki \\
    docker.io/grafana/loki:latest \\
    -config.file=/etc/loki/local-config.yaml

ExecStop=/usr/bin/podman stop -t 10 loki

[Install]
WantedBy=multi-user.target
EOF

# Grafana service
# NOTE(review): in the GF_SECURITY_ADMIN_PASSWORD line below, "\$" defers the
# outer expansion but the inner $(openssl rand ...) still runs at install time,
# baking a one-off random default into the unit file; systemd does not perform
# shell-style ${VAR:-default} expansion in Environment args — confirm the
# intended fallback behavior.
sudo tee /etc/systemd/system/grafana.service >/dev/null <<EOF
[Unit]
Description=Grafana Visualization
After=network.target prometheus.service loki.service
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 3000:3000
ExecStart=/usr/bin/podman run --rm --name grafana \\
    --pod $POD_NAME \\
    -v ${DATA_PATH}/grafana:/var/lib/grafana \\
    -v /etc/fetch_ml/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro \\
    -v /var/lib/grafana/dashboards:/var/lib/grafana/dashboards:ro \\
    -e GF_SECURITY_ADMIN_PASSWORD=\${GRAFANA_ADMIN_PASSWORD:-$(openssl rand -base64 32)} \\
    -e GF_USERS_ALLOW_SIGN_UP=false \\
    -e GF_AUTH_ANONYMOUS_ENABLED=false \\
    docker.io/grafana/grafana:latest

ExecStop=/usr/bin/podman stop -t 10 grafana

[Install]
WantedBy=multi-user.target
EOF

# Promtail service (ships /var/log/fetch_ml into Loki)
sudo tee /etc/systemd/system/promtail.service >/dev/null <<EOF
[Unit]
Description=Promtail Log Collector
After=network.target loki.service
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME
ExecStart=/usr/bin/podman run --rm --name promtail \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/promtail-config.yml:/etc/promtail/config.yml:ro \\
    -v /var/log/fetch_ml:/var/log/app:ro \\
    docker.io/grafana/promtail:latest \\
    -config.file=/etc/promtail/config.yml

ExecStop=/usr/bin/podman stop -t 10 promtail

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"
|
||||
|
||||
# 5. Create monitoring pod service
echo -e "${BLUE}[5/6]${NC} Creating pod management service..."
# Fix: outside a heredoc the continuation must be a single backslash; "\\"
# escaped the backslash itself, terminating the command before the pipeline.
sudo -u "$ML_USER" podman generate systemd --new --name "$POD_NAME" \
    | sudo tee /etc/systemd/system/$POD_NAME-pod.service >/dev/null

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Pod service created"

# 6. Setup firewall rules
echo -e "${BLUE}[6/6]${NC} Configuring firewall..."
if command -v firewall-cmd &>/dev/null; then
    # RHEL/Rocky/Fedora (firewalld)
    sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana
    sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus
    sudo firewall-cmd --reload
    echo -e "${GREEN}✓${NC} Firewall configured (firewalld)"
elif command -v ufw &>/dev/null; then
    # Ubuntu/Debian (ufw)
    sudo ufw allow 3000/tcp comment 'Grafana'
    sudo ufw allow 9090/tcp comment 'Prometheus'
    echo -e "${GREEN}✓${NC} Firewall configured (ufw)"
else
    echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090"
fi

# Summary
echo ""
echo -e "${BOLD}=== Monitoring Stack Setup Complete! ===${NC}"
echo ""
echo "Services created:"
echo " - prometheus.service (Metrics collection)"
echo " - loki.service (Log aggregation)"
echo " - grafana.service (Visualization)"
echo " - promtail.service (Log shipping)"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Start services:"
echo " sudo systemctl start prometheus"
echo " sudo systemctl start loki"
echo " sudo systemctl start promtail"
echo " sudo systemctl start grafana"
echo ""
echo "2. Enable on boot:"
echo " sudo systemctl enable prometheus loki promtail grafana"
echo ""
echo "3. Access Grafana:"
echo " http://YOUR_SERVER_IP:3000"
echo " Username: admin"
echo " Password: admin (change on first login)"
echo ""
echo "4. Check logs:"
echo " sudo journalctl -u prometheus -f"
echo " sudo journalctl -u grafana -f"
echo ""
|
||||
|
|
@ -1,229 +0,0 @@
|
|||
#!/bin/bash
# Production Setup Script for Rocky Linux (Bare Metal)
# This script sets up the complete FetchML environment on bare metal

# Abort on the first failing command.
set -e

# ANSI color codes for status output.
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'  # reset

echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n"

# Configuration (positional overrides: base path, service user, service group).
BASE_PATH="${1:-/data/ml-experiments}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"

echo "Configuration:"
echo " Base path: $BASE_PATH"
echo " ML user: $ML_USER"
echo " ML group: $ML_GROUP"
echo ""

# 1. Create system user if it doesn't exist
echo -e "${BLUE}[1/8]${NC} Creating system user..."
if id "$ML_USER" &>/dev/null; then
    echo " User $ML_USER already exists"
else
    # -r: system account; -m/-d: create a real home so tooling can run as it.
    sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER
    echo -e "${GREEN}✓${NC} Created user: $ML_USER"
fi

# 2. Create directory structure
echo -e "${BLUE}[2/8]${NC} Creating directory structure..."
sudo mkdir -p "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets}
sudo mkdir -p /var/log/fetch_ml
sudo mkdir -p /etc/fetch_ml

echo -e "${GREEN}✓${NC} Created directories:"
echo " $BASE_PATH/experiments/"
echo " $BASE_PATH/pending/"
echo " $BASE_PATH/running/"
echo " $BASE_PATH/finished/"
echo " $BASE_PATH/failed/"
echo " $BASE_PATH/datasets/"
echo " /var/log/fetch_ml/"
echo " /etc/fetch_ml/"

# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."
sudo chown -R $ML_USER:$ML_GROUP $BASE_PATH
sudo chmod 755 $BASE_PATH
sudo chmod 700 $BASE_PATH/experiments # Restrict experiment data

sudo chown -R $ML_USER:$ML_GROUP /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml

echo -e "${GREEN}✓${NC} Permissions set"

# 4. Install system dependencies (Rocky Linux)
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."
sudo dnf install -y \
    golang \
    podman \
    redis \
    git \
    make \
    gcc \
    || echo "Some packages may already be installed"

echo -e "${GREEN}✓${NC} Dependencies installed"

# 5. Configure Podman for GPU access (if NVIDIA GPU present)
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if lspci | grep -i nvidia &>/dev/null; then
    echo " NVIDIA GPU detected, configuring GPU access..."

    # Install nvidia-container-toolkit if not present
    if ! command -v nvidia-container-toolkit &>/dev/null; then
        echo " Installing nvidia-container-toolkit..."
        sudo dnf config-manager --add-repo \
            https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
        sudo dnf install -y nvidia-container-toolkit
    fi

    # Configure Podman CDI (Container Device Interface) spec for the GPUs.
    sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
    echo -e "${GREEN}✓${NC} GPU support configured"
else
    echo " No NVIDIA GPU detected, skipping GPU setup"
fi

# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
sudo systemctl enable redis
sudo systemctl start redis || echo "Redis may already be running"

# Set Redis password if not already configured
# NOTE(review): path assumes /etc/redis/redis.conf; some RHEL-family releases
# ship /etc/redis.conf instead — confirm on the target OS version.
# NOTE(review): the generated password is echoed to stdout (and any captured
# logs) below — confirm that is acceptable for this environment.
if ! sudo grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then
    REDIS_PASSWORD=$(openssl rand -base64 32)
    echo "requirepass $REDIS_PASSWORD" | sudo tee -a /etc/redis/redis.conf >/dev/null
    sudo systemctl restart redis
    echo " Generated Redis password: $REDIS_PASSWORD"
    echo " Save this password for your configuration!"
else
    echo " Redis password already configured"
fi

echo -e "${GREEN}✓${NC} Redis configured"

# 7. Setup systemd services
# The heredoc delimiters are unquoted so $ML_USER/$ML_GROUP/$BASE_PATH expand
# into the generated unit files at install time.
echo -e "${BLUE}[7/8]${NC} Creating systemd services..."

# API Server service
sudo tee /etc/systemd/system/fetchml-api.service >/dev/null <<EOF
[Unit]
Description=FetchML API Server
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-api -config /etc/fetch_ml/config.yaml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/api.log
StandardError=append:/var/log/fetch_ml/api-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF

# Worker service
sudo tee /etc/systemd/system/fetchml-worker.service >/dev/null <<EOF
[Unit]
Description=FetchML Worker
After=network.target redis.service fetchml-api.service
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetch_ml/worker.toml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/worker.log
StandardError=append:/var/log/fetch_ml/worker-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"

# 8. Setup logrotate
echo -e "${BLUE}[8/8]${NC} Configuring log rotation..."
sudo tee /etc/logrotate.d/fetchml >/dev/null <<EOF
/var/log/fetch_ml/*.log {
    daily
    rotate 14
    compress
    delaycompress
    notifempty
    missingok
    create 0640 $ML_USER $ML_GROUP
    sharedscripts
    postrotate
        systemctl reload fetchml-api >/dev/null 2>&1 || true
        systemctl reload fetchml-worker >/dev/null 2>&1 || true
    endscript
}
EOF

echo -e "${GREEN}✓${NC} Log rotation configured"

# Summary
echo ""
echo -e "${BOLD}=== Setup Complete! ===${NC}"
echo ""
echo "Directory structure created at: $BASE_PATH"
echo "Logs will be written to: /var/log/fetch_ml/"
echo "Configuration directory: /etc/fetch_ml/"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Copy your config files:"
echo " sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml"
echo " sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml"
echo ""
echo "2. Build and install binaries:"
echo " make build"
echo " sudo cp bin/api-server /usr/local/bin/fetchml-api"
echo " sudo cp bin/worker /usr/local/bin/fetchml-worker"
echo ""
echo "3. Update config files with your settings (Redis password, API keys, etc.)"
echo ""
echo "4. Start services:"
echo " sudo systemctl start fetchml-api"
echo " sudo systemctl start fetchml-worker"
echo ""
echo "5. Enable services to start on boot:"
echo " sudo systemctl enable fetchml-api"
echo " sudo systemctl enable fetchml-worker"
echo ""
echo "6. Check status:"
echo " sudo systemctl status fetchml-api"
echo " sudo systemctl status fetchml-worker"
echo " sudo journalctl -u fetchml-api -f"
echo ""
|
||||
|
|
@ -1,455 +0,0 @@
|
|||
#!/bin/bash

# Automatic Setup Script for ML Experiment Manager
# Handles complete environment setup with security features

# Strict mode: fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'  # reset

# Leveled status printers: a colored severity tag followed by the message.
print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# Map $OSTYPE onto a coarse platform label printed on stdout:
# "macos" for Darwin, "linux" for GNU/Linux, "unknown" otherwise.
detect_os() {
    case "$OSTYPE" in
        darwin*)
            echo "macos"
            ;;
        linux-gnu*)
            echo "linux"
            ;;
        *)
            echo "unknown"
            ;;
    esac
}
|
||||
|
||||
# Install the Go toolchain: Homebrew on macOS, official tarball on Linux.
# Returns 1 on macOS when Homebrew is missing; other platforms are a no-op.
install_go() {
    print_info "Installing Go..."

    local os=$(detect_os)
    local go_version="1.23.0"  # only used on the Linux tarball path below

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install go
        else
            print_error "Homebrew not found. Please install Go manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Replace any previous installation under /usr/local/go wholesale.
        wget -q "https://go.dev/dl/go${go_version}.linux-amd64.tar.gz"
        sudo rm -rf /usr/local/go
        sudo tar -C /usr/local -xzf "go${go_version}.linux-amd64.tar.gz"
        rm "go${go_version}.linux-amd64.tar.gz"

        # Add to PATH (persisted for future shells, exported for this one).
        # NOTE(review): appends unconditionally, so repeated runs duplicate the
        # line in ~/.bashrc.
        echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
        export PATH=$PATH:/usr/local/go/bin
    fi

    print_success "Go installed"
}
|
||||
|
||||
# Install the Zig compiler: Homebrew on macOS, upstream binary tarball on Linux.
# Returns 1 on macOS when Homebrew is missing.
install_zig() {
    print_info "Installing Zig..."

    local os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install zig
        else
            print_error "Homebrew not found. Please install Zig manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Download Zig binary
        local zig_version="0.13.0"
        wget -q "https://ziglang.org/download/${zig_version}/zig-linux-x86_64-${zig_version}.tar.xz"
        tar -xf "zig-linux-x86_64-${zig_version}.tar.xz"
        # NOTE(review): only the `zig` executable is kept; the extracted
        # tree's sibling lib/ directory is deleted below — confirm the
        # standalone binary works for this project's builds.
        sudo mv "zig-linux-x86_64-${zig_version}/zig" /usr/local/bin/
        rm -rf "zig-linux-x86_64-${zig_version}.tar.xz" "zig-linux-x86_64-${zig_version}"
    fi

    print_success "Zig installed"
}
|
||||
|
||||
# Install Docker: Desktop cask on macOS, get.docker.com convenience script on
# Linux (plus docker group membership and systemd enablement).
install_docker() {
    print_info "Installing Docker..."

    local os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install --cask docker
            print_warning "Docker Desktop installed. Please start it manually."
        else
            print_error "Homebrew not found. Please install Docker manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Install Docker using official script
        curl -fsSL https://get.docker.com -o get-docker.sh
        sudo sh get-docker.sh
        # Allow the invoking user to run docker without sudo (needs re-login).
        sudo usermod -aG docker $USER
        rm get-docker.sh

        # Start Docker
        sudo systemctl enable docker
        sudo systemctl start docker

        print_success "Docker installed. You may need to log out and log back in."
    fi
}
|
||||
|
||||
# Install and start Redis: Homebrew services on macOS, apt + systemd on Linux.
# Returns 1 on macOS when Homebrew is missing.
install_redis() {
    print_info "Installing Redis..."

    local os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install redis
            brew services start redis
        else
            print_error "Homebrew not found. Please install Redis manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # NOTE(review): assumes a Debian/Ubuntu host (apt-get); dnf/pacman
        # systems supported elsewhere in this repo are not handled here —
        # confirm the intended target hosts.
        sudo apt-get update
        sudo apt-get install -y redis-server
        sudo systemctl enable redis-server
        sudo systemctl start redis-server
    fi

    print_success "Redis installed and started"
}
|
||||
|
||||
# Install auxiliary tooling: openssl/curl/jq (plus build-essential on Linux)
# and, when Go is available, the golangci-lint and goimports Go tools.
install_dependencies() {
    print_info "Installing dependencies..."

    local os=$(detect_os)

    # Install basic tools
    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install openssl curl jq
        fi
    elif [[ "$os" == "linux" ]]; then
        # NOTE(review): apt-get only, same Debian/Ubuntu assumption as
        # install_redis above.
        sudo apt-get update
        sudo apt-get install -y openssl curl jq build-essential
    fi

    # Install Go tools
    if command -v go &> /dev/null; then
        go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
        go install golang.org/x/tools/cmd/goimports@latest
    fi

    print_success "Dependencies installed"
}
|
||||
|
||||
# Prepare the working tree: create the runtime directories, then build every
# binary — via the Makefile when available, otherwise one `go build` per
# cmd/ target (and the Zig CLI when a Zig toolchain is present).
setup_project() {
    print_info "Setting up project..."

    # Create directories
    for dir in bin data logs db ssl configs scripts; do
        mkdir -p "$dir"
    done

    # Build project
    if command -v make &> /dev/null; then
        make build
        if command -v zig &> /dev/null; then
            make cli-build
        fi
    else
        print_warning "Make not found, building manually..."
        for target in worker tui data_manager user_manager api-server; do
            go build -o "bin/${target}" "./cmd/${target}"
        done

        if command -v zig &> /dev/null; then
            cd cli && zig build && cd ..
        fi
    fi

    print_success "Project setup completed"
}
|
||||
|
||||
# Generate development security material: a self-signed TLS certificate pair
# under ssl/, a secured API config (configs/security-config.yaml), and a
# .env.dev with generated Redis/JWT/Grafana secrets.
setup_security() {
    print_info "Setting up security features..."

    # Generate SSL certificates
    # Self-signed, 1-year, localhost-only; failure is non-fatal (warning only).
    if command -v openssl &> /dev/null; then
        openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
            print_warning "Failed to generate SSL certificates"
        }
        # NOTE(review): this success message prints even when the || branch
        # above fired — the two are not mutually exclusive.
        print_success "SSL certificates generated"
    fi

    # Generate secure configuration (hard-coded dev fallbacks if openssl fails).
    local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
    local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")

    # Unquoted heredoc: $(...) and ${...} below expand at generation time, so
    # the API-key hash and secrets are baked into the written YAML.
    cat > configs/security-config.yaml << EOF
base_path: "/data/ml-experiments"

auth:
  enabled: true
  api_keys:
    test_user:
      hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
      admin: true
      roles: ["data_scientist", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
    min_version: "1.3"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
    ip_whitelist:
      - "127.0.0.1"
      - "::1"
      - "10.0.0.0/8"
      - "192.168.0.0/16"
      - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 5
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"
  password: "${redis_password}"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  audit_log: "logs/audit.log"
EOF

    cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF

    print_success "Security configuration created"
}
|
||||
|
||||
# Smoke-test the installation. Counts pass/total per tool; optional tools
# (Zig, Docker, Redis) that are entirely absent are removed from the
# denominator (total is decremented after having been incremented), so they
# do not drag down the reported success rate.
test_installation() {
    print_info "Testing installation..."

    local tests_passed=0
    local tests_total=0

    # Test Go (required)
    tests_total=$((tests_total + 1))
    if command -v go &> /dev/null; then
        print_success "Go: Installed"
        tests_passed=$((tests_passed + 1))
    else
        print_error "Go: Not found"
    fi

    # Test Zig (optional: absence excluded from the total)
    tests_total=$((tests_total + 1))
    if command -v zig &> /dev/null; then
        print_success "Zig: Installed"
        tests_passed=$((tests_passed + 1))
    else
        print_warning "Zig: Not found (optional)"
        tests_total=$((tests_total - 1))
    fi

    # Test Docker (optional)
    tests_total=$((tests_total + 1))
    if command -v docker &> /dev/null; then
        print_success "Docker: Installed"
        tests_passed=$((tests_passed + 1))
    else
        print_warning "Docker: Not found (optional)"
        tests_total=$((tests_total - 1))
    fi

    # Test Redis (optional when absent; counted but failed when installed
    # and not answering PONG)
    tests_total=$((tests_total + 1))
    if command -v redis-cli &> /dev/null; then
        if redis-cli ping | grep -q "PONG"; then
            print_success "Redis: Running"
            tests_passed=$((tests_passed + 1))
        else
            print_warning "Redis: Not running"
        fi
    else
        print_warning "Redis: Not found (optional)"
        tests_total=$((tests_total - 1))
    fi

    # Test binaries (only when the api-server binary was built)
    if [[ -f "bin/api-server" ]]; then
        tests_total=$((tests_total + 1))
        if ./bin/api-server --help > /dev/null 2>&1; then
            print_success "API Server: Built"
            tests_passed=$((tests_passed + 1))
        else
            print_error "API Server: Build failed"
        fi
    fi

    # Report an integer success percentage; guard against division by zero.
    if [[ $tests_total -gt 0 ]]; then
        local success_rate=$((tests_passed * 100 / tests_total))
        print_info "Tests: $tests_passed/$tests_total passed ($success_rate%)"
    fi

    print_success "Installation testing completed"
}
|
||||
|
||||
# Print the post-setup usage guide (pure output; no side effects).
show_next_steps() {
    print_success "Automatic setup completed!"
    echo
    echo "Next Steps:"
    echo "==========="
    echo ""
    echo "1. Load environment variables:"
    echo " source .env.dev"
    echo ""
    echo "2. Start the API server:"
    echo " ./bin/api-server -config configs/config.yaml"
    echo ""
    echo "3. Test the Zig CLI (if installed):"
    echo " ./cli/zig-out/bin/ml --help"
    echo ""
    echo "4. Deploy with Docker (optional):"
    echo " make docker-run"
    echo ""
    echo "5. Docker Compose deployment:"
    echo " docker-compose up -d"
    echo ""
    echo "Configuration Files:"
    echo " configs/config.yaml # Main configuration"
    echo " configs/config_local.yaml # Local development"
    echo " ssl/cert.pem, ssl/key.pem # TLS certificates"
    echo ""
    echo "Documentation:"
    echo " docs/DEPLOYMENT.md # Deployment guide"
    echo ""
    echo "Quick Commands:"
    echo " make help # Show all commands"
    echo " make test # Run tests"
    echo " docker-compose up -d # Start services"
    echo ""
    print_success "Ready to use ML Experiment Manager!"
}
|
||||
|
||||
# Main setup function
|
||||
# Entry point for the full setup: installs each missing toolchain piece,
# then installs auxiliary dependencies, builds the project, generates the
# security material, smoke-tests the result, and prints the usage guide.
main() {
    echo "ML Experiment Manager Automatic Setup"
    echo "====================================="
    echo ""

    print_info "Starting automatic setup..."
    echo ""

    # Check and install dependencies (each installer runs only when the
    # corresponding command is absent from PATH).
    command -v go &> /dev/null || { print_info "Go not found, installing..."; install_go; }
    command -v zig &> /dev/null || { print_info "Zig not found, installing..."; install_zig; }
    command -v docker &> /dev/null || { print_info "Docker not found, installing..."; install_docker; }
    command -v redis-cli &> /dev/null || { print_info "Redis not found, installing..."; install_redis; }

    # Install additional dependencies
    install_dependencies

    # Setup project
    setup_project

    # Setup security
    setup_security

    # Test installation
    test_installation

    # Show next steps
    show_next_steps
}
|
||||
|
||||
# Handle command line arguments
# Dispatch on the first argument (defaulting to the full "setup" run);
# unknown commands exit non-zero after pointing at the help text.
case "${1:-setup}" in
    "setup")
        main
        ;;
    "deps")
        install_dependencies
        ;;
    "test")
        test_installation
        ;;
    "help"|"-h"|"--help")
        echo "Automatic Setup Script"
        echo "Usage: $0 {setup|deps|test|help}"
        echo ""
        echo "Commands:"
        echo " setup - Run full automatic setup"
        echo " deps - Install dependencies only"
        echo " test - Test installation"
        echo " help - Show this help"
        ;;
    *)
        print_error "Unknown command: $1"
        echo "Use '$0 help' for usage information"
        exit 1
        ;;
esac
|
||||
|
|
@ -1,314 +0,0 @@
|
|||
#!/usr/bin/env bash

# Fetch ML Quick Start Script with Security
# Sets up development environment with security features and creates test user

set -euo pipefail

# ANSI color codes consumed by the print_* helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
||||
|
||||
# Print an informational message with a colored [INFO] prefix.
print_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
|
||||
|
||||
# Print a success message with a colored [SUCCESS] prefix.
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
|
||||
|
||||
# Print a warning message with a colored [WARNING] prefix.
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
|
||||
|
||||
# Print an error message with a colored [ERROR] prefix.
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||
|
||||
# Verify the tooling the quick start relies on. Go is mandatory; the
# remaining tools only produce warnings when absent.
check_prerequisites() {
    print_info "Checking prerequisites..."

    # Go is the only hard requirement.
    if ! command -v go &> /dev/null; then
        print_error "Go is not installed. Please install Go 1.25 or later."
        exit 1
    fi
    local go_version
    go_version=$(go version | awk '{print $3}' | sed 's/go//')
    print_info "Go version: $go_version"

    # Optional: Zig (CLI), Docker (containers), Redis, OpenSSL (TLS).
    if command -v zig &> /dev/null; then
        local zig_version
        zig_version=$(zig version)
        print_info "Zig version: $zig_version"
    else
        print_warning "Zig is not installed. CLI features will not be available."
    fi

    if ! command -v docker &> /dev/null; then
        print_warning "Docker is not installed. Container features will not work."
    fi

    if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then
        print_warning "Redis is not installed. Starting local Redis..."
    fi

    if ! command -v openssl &> /dev/null; then
        print_warning "OpenSSL is not installed. TLS certificates will not be generated."
    fi

    print_success "Prerequisites checked"
}
|
||||
|
||||
# Create the working directory skeleton used by the rest of the setup.
setup_project() {
    print_info "Setting up Fetch ML project..."

    # mkdir -p is idempotent, so a rerun is harmless.
    mkdir -p bin data logs db ssl configs

    print_success "Project directories created"
}
|
||||
|
||||
# Compile the Go services and, when a Zig toolchain exists, the CLI.
build_project() {
    print_info "Building Fetch ML..."

    # Build Go binaries
    make build

    # Build Zig CLI if available
    if command -v zig &> /dev/null; then
        make cli-build
        print_success "Zig CLI built"
    fi

    print_success "Build completed"
}
|
||||
|
||||
# Generate a self-signed localhost certificate pair under ssl/ for
# development. Skipped (with a warning) when openssl is unavailable.
generate_ssl_certificates() {
    print_info "Generating SSL certificates..."

    if ! command -v openssl &> /dev/null; then
        print_warning "OpenSSL not available, skipping SSL certificates"
        return
    fi

    # Self-signed, valid one year, with a SAN for localhost/127.0.0.1.
    openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
        -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
        -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
        print_warning "Failed to generate SSL certificates"
        return 1
    }

    print_success "SSL certificates generated in ssl/"
    print_info "Certificates are self-signed (development only)"
}
|
||||
|
||||
# Start a local Redis daemon on the default port when one is installed
# and not already running.
setup_redis() {
    print_info "Setting up Redis..."

    if ! command -v redis-server &> /dev/null; then
        print_warning "Redis not available, some features may be limited"
        return
    fi

    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
    else
        redis-server --daemonize yes --port 6379
        print_success "Redis started"
    fi
}
|
||||
|
||||
# Write configs/config.yaml and .env.dev with freshly generated secrets,
# falling back to fixed development values when openssl is missing.
# NOTE(review): the redis password written here is not applied to the
# Redis instance started by setup_redis — confirm before relying on it.
create_secure_config() {
    print_info "Creating secure development configuration..."

    # Generate secure passwords and secrets
    local redis_password jwt_secret
    redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
    jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")

    # Create development config
    cat > configs/config.yaml << EOF
base_path: "/data/ml-experiments"

auth:
  enabled: true
  api_keys:
    test_user:
      hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
      admin: true
      roles: ["data_scientist", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
    min_version: "1.3"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
    ip_whitelist:
      - "127.0.0.1"
      - "::1"
      - "10.0.0.0/8"
      - "192.168.0.0/16"
      - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 5
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"
  password: "${redis_password}"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  audit_log: "logs/audit.log"
EOF

    # Create environment file
    cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF

    print_success "Secure configuration created"
    print_warning "Using development certificates and passwords"
}
|
||||
|
||||
# Print the credentials of the built-in development user. The key hash
# here must match the one embedded in configs/config.yaml.
create_test_user() {
    print_info "Creating test user..."

    # Generate API key for test user
    local api_key="dev_test_api_key_12345"
    local api_key_hash
    api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1)

    print_success "Test user created successfully"
    echo "Username: test_user"
    echo "API Key: $api_key"
    echo "API Key Hash: $api_key_hash"
    echo "Store this key safely!"
    echo ""
    echo "Environment variables in .env.dev"
    echo "Run: source .env.dev"
}
|
||||
|
||||
# Smoke-test the build artifacts, Redis connectivity, and the TLS
# certificates. Every probe is best-effort: failures warn, never abort.
test_setup() {
    print_info "Testing setup..."

    # Go binaries: just confirm they execute at all.
    if [[ -f "bin/api-server" ]]; then
        ./bin/api-server --help > /dev/null 2>&1 || true
        print_success "API server binary OK"
    fi
    if [[ -f "bin/worker" ]]; then
        ./bin/worker --help > /dev/null 2>&1 || true
        print_success "Worker binary OK"
    fi

    # Zig CLI
    if [[ -f "cli/zig-out/bin/ml" ]]; then
        ./cli/zig-out/bin/ml --help > /dev/null 2>&1 || true
        print_success "Zig CLI binary OK"
    fi

    # Redis connectivity
    if command -v redis-cli &> /dev/null; then
        if redis-cli ping > /dev/null 2>&1; then
            print_success "Redis connection OK"
        else
            print_warning "Redis not responding"
        fi
    fi

    # TLS certificates: -checkend 86400 = still valid for 24h.
    if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then
        if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then
            print_success "SSL certificates valid"
        else
            print_warning "SSL certificates expired or invalid"
        fi
    fi
}
|
||||
|
||||
# Print the post-setup checklist. The body is a quoted heredoc so the
# text is emitted verbatim, with no shell expansion.
show_next_steps() {
    print_success "Secure quick start completed!"
    cat <<'EOF'

Next steps:
1. Load environment variables:
 source .env.dev

2. Start API server:
 ./bin/api-server -config configs/config.yaml

3. Test Zig CLI:
 ./cli/zig-out/bin/ml --help

4. Test with curl (HTTPS):
 curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health

5. Deploy with Docker:
 docker-compose up -d

Features Enabled:
 ✅ HTTPS/TLS encryption
 ✅ API key authentication
 ✅ Rate limiting
 ✅ IP whitelisting
 ✅ Security headers
 ✅ Audit logging

Configuration Files:
 configs/config.yaml # Main configuration
 .env.dev # Environment variables
 ssl/cert.pem, ssl/key.pem # TLS certificates

Documentation:
 docs/DEPLOYMENT.md # Deployment guide

EOF
    print_success "Ready to run ML experiments!"
}
|
||||
|
||||
# Main function: run the whole quick start in order — prerequisites,
# build, TLS, Redis, config, test user, smoke test, next steps.
main() {
    echo "Fetch ML Quick Start Script (with Security & Zig CLI)"
    echo "===================================================="
    echo ""

    check_prerequisites
    setup_project
    build_project
    generate_ssl_certificates
    setup_redis
    create_secure_config
    create_test_user
    test_setup
    show_next_steps
}

# Run main function
main "$@"
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
#!/usr/bin/env bash

# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky)
set -euo pipefail

# ANSI color codes for the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration defaults shared by all platform setup scripts.
FETCH_ML_USER="fetchml"
FETCH_ML_HOME="/opt/fetchml"
SERVICE_DIR="/etc/systemd/system"
LOG_DIR="/var/log/fetchml"
DATA_DIR="/var/lib/fetchml"
CONFIG_DIR="$FETCH_ML_HOME/configs"
|
||||
|
||||
# Leveled, colorized log helpers shared by every setup script.
log_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
log_error()   { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||
|
||||
# Download file with checksum verification
|
||||
# Args: url, checksum, dest
|
||||
# Download a file over HTTPS and verify its SHA-256 checksum.
# Args: $1 url, $2 expected sha256 hex digest, $3 destination path.
# On mismatch the partial file is removed and the script exits.
secure_download() {
    local url="$1" checksum="$2" dest="$3"
    curl -fsSL "$url" -o "$dest"
    # BUG FIX: GNU `sha256sum --check` requires TWO spaces (or " *")
    # between the digest and the filename. With a single space the line
    # is rejected as improperly formatted, so every download failed
    # verification even when the checksum was correct.
    echo "$checksum  $dest" | sha256sum --check --status || {
        log_error "Checksum verification failed for $dest"
        rm -f "$dest"
        exit 1
    }
}
|
||||
|
||||
# Remove any temp files registered in TMP_FILES (space-separated list).
# Installed as an EXIT trap so downloads are cleaned up on any exit path.
cleanup_temp() {
    if [[ -n "${TMP_FILES:-}" ]]; then
        # Intentionally unquoted: TMP_FILES may hold several paths.
        rm -f $TMP_FILES || true
    fi
}
trap cleanup_temp EXIT
|
||||
|
||||
# Create the service account if missing and add it to the podman group.
# Idempotent; the usermod is best-effort (group may not exist yet).
ensure_user() {
    id "$FETCH_ML_USER" &>/dev/null || \
        useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
    usermod -aG podman "$FETCH_ML_USER" || true
}
|
||||
|
||||
# Create the runtime directory tree and hand ownership to the service user.
create_directories() {
    mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" "$FETCH_ML_HOME/bin" "$CONFIG_DIR"
    chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"
}
|
||||
|
||||
# Render and install a systemd unit for one Fetch ML service.
# Args: $1 service name (unit file becomes <name>.service), $2 ExecStart line.
# ${name^} capitalizes the first letter for the unit Description.
setup_systemd_service() {
    local name="$1" exec="$2"
    cat > "$SERVICE_DIR/${name}.service" <<EOF
[Unit]
Description=Fetch ML ${name^} Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$exec
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_${name}

[Install]
WantedBy=multi-user.target
EOF
}
|
||||
|
||||
# Install the logrotate policy for the shared Fetch ML log directory
# (quoted heredoc: written verbatim, no expansion).
setup_logrotate() {
    cat > /etc/logrotate.d/fetch_ml <<'EOF'
/var/log/fetchml/*.log {
    daily
    missingok
    rotate 14
    compress
    delaycompress
    notifempty
    create 0640 fetchml fetchml
}
EOF
}
|
||||
|
||||
# Raise file-descriptor limits for the service user (guarded so reruns
# do not duplicate lines) and enable automatic security updates where
# the distribution supports them.
hardening_steps() {
    if ! grep -q fetchml /etc/security/limits.conf; then
        cat >> /etc/security/limits.conf <<'EOF'
fetchml soft nofile 65536
fetchml hard nofile 65536
EOF
    fi

    # Enable unattended security upgrades if available
    if command -v apt-get &>/dev/null; then
        apt-get install -y unattended-upgrades >/dev/null || true
    elif command -v dnf &>/dev/null; then
        dnf install -y dnf-automatic >/dev/null || true
    fi
}
|
||||
|
||||
# Print advice for running under SELinux. Purely informational; makes
# no changes to the system.
selinux_guidance() {
    command -v getenforce &>/dev/null || return 0
    local mode
    mode=$(getenforce)
    log_info "SELinux mode: $mode"
    if [[ "$mode" == "Enforcing" ]]; then
        log_info "Ensure systemd units and directories have proper contexts. Example:"
        echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'"
        echo " restorecon -Rv $FETCH_ML_HOME/bin"
    fi
}
|
||||
|
|
@ -1,417 +0,0 @@
|
|||
#!/usr/bin/env bash

# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9

set -euo pipefail

# Pull in the shared log_*, download, user, and unit helpers.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
|
||||
|
||||
# Abort unless running with root privileges.
check_root() {
    if [[ $EUID -ne 0 ]]; then
        log_error "This script must be run as root"
        exit 1
    fi
}
|
||||
|
||||
# Abort unless this is a dnf/yum based system; report the Rocky release
# and select the package manager. Sets the global PKG_MANAGER used by
# every later install step (dnf preferred over yum).
check_rocky() {
    if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
        log_error "This script is designed for Rocky Linux systems"
        exit 1
    fi

    # FIX: /etc/rocky-release does not exist on other RHEL derivatives
    # that still pass the dnf/yum check; guard the read so the version
    # report degrades gracefully instead of spewing a cat error.
    local rocky_version="unknown"
    if [[ -r /etc/rocky-release ]]; then
        rocky_version=$(grep -oE '[0-9]+\.[0-9]+' /etc/rocky-release | head -1)
    fi
    log_info "Rocky Linux version: $rocky_version"

    # Use dnf for Rocky 9+, yum for Rocky 8
    if command -v dnf &> /dev/null; then
        PKG_MANAGER="dnf"
    else
        PKG_MANAGER="yum"
    fi
}
|
||||
|
||||
# Refresh/upgrade packages and install the basic download tooling.
update_system() {
    log_info "Updating system packages..."
    $PKG_MANAGER update -y
    $PKG_MANAGER upgrade -y
    $PKG_MANAGER install -y curl wget gnupg2
}
|
||||
|
||||
# Enable the EPEL repository plus the builder repo many EPEL packages
# depend on. Idempotent: returns early when EPEL is already configured.
enable_epel() {
    log_info "Enabling EPEL repository..."

    if $PKG_MANAGER repolist | grep -q "epel"; then
        log_info "EPEL already enabled"
        return
    fi

    $PKG_MANAGER install -y epel-release
    # FIX: the repo is named "powertools" on Rocky 8 but "crb" on
    # Rocky 9; hard-coding powertools broke the script on Rocky 9.
    $PKG_MANAGER config-manager --set-enabled crb 2>/dev/null || \
        $PKG_MANAGER config-manager --set-enabled powertools

    log_success "EPEL repository enabled"
}
|
||||
|
||||
# Install Go 1.25 from the official tarball (checksum-verified) unless
# a Go toolchain is already on PATH.
install_go() {
    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        local go_version
        go_version=$(go version | awk '{print $3}' | sed 's/go//')
        log_info "Go already installed: $go_version"
        return
    fi

    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    cd /tmp
    TMP_FILES="$tarball" # registered for the EXIT cleanup trap
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
        "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
        "$tarball"
    tar -C /usr/local -xzf "$tarball"

    # Make the toolchain visible to login shells and to this process.
    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
||||
|
||||
# Install Podman with netavark/crun configuration and enable rootless
# user namespaces. Skipped when podman is already installed.
install_podman() {
    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    # Install Podman and related tools
    $PKG_MANAGER install -y podman podman-compose containernetworking-plugins

    # Configure Podman
    mkdir -p /etc/containers
    cat > /etc/containers/containers.conf << EOF
[containers]
user_namespace_enable = 1
runtime = "crun"

[network]
network_backend = "netavark"

[engine]
cgroup_manager = "systemd"
EOF

    # Enable user namespaces: persist (only once, so reruns do not
    # duplicate the line) and apply immediately.
    # FIX: the original ran `sysctl -p user.max_user_namespaces=15000`,
    # but -p treats its argument as a *file name* and always fails;
    # -w is the flag that sets a key at runtime.
    if ! grep -q '^user.max_user_namespaces=' /etc/sysctl.conf; then
        echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf
    fi
    sysctl -w user.max_user_namespaces=15000

    log_success "Podman installed"
}
|
||||
|
||||
# Install Redis from the distro repos, bind it to loopback only, and
# run it under systemd. Skipped when redis-server already exists.
install_redis() {
    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    $PKG_MANAGER install -y redis

    # systemd supervision + loopback-only bind (Rocky keeps the config
    # at /etc/redis.conf).
    sed -i 's/supervised no/supervised systemd/' /etc/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis.conf

    systemctl enable redis
    systemctl start redis

    log_success "Redis installed and configured"
}
|
||||
|
||||
# Detect an NVIDIA GPU and, when present, install the driver and CUDA
# toolkit from NVIDIA's RHEL repository. A reboot is required before
# the freshly installed driver is usable.
install_nvidia_drivers() {
    log_info "Checking for NVIDIA GPU..."

    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    if ! lspci | grep -i nvidia &> /dev/null; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Enable NVIDIA repository (rpm -E %rhel expands the major version).
    $PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo

    # Clean and install
    $PKG_MANAGER clean all
    $PKG_MANAGER module enable -y nvidia-driver:latest-dkms
    $PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit

    # Probe GPU access through Podman (only works before reboot on some
    # systems, hence best-effort).
    if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
        log_success "NVIDIA drivers installed and GPU access verified"
    else
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    fi

    # Reboot required
    log_warning "System reboot required for NVIDIA drivers"
    log_info "Run: reboot"
}
|
||||
|
||||
# Install Python, the system libraries ML builds commonly need, and a
# baseline set of Python packages (CPU-only PyTorch wheels).
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."

    # Python and ML packages
    $PKG_MANAGER install -y python3 python3-pip python3-devel

    # System dependencies for ML
    $PKG_MANAGER groupinstall -y "Development Tools"
    $PKG_MANAGER install -y cmake git pkgconfig
    $PKG_MANAGER install -y libjpeg-turbo-devel libpng-devel libtiff-devel
    $PKG_MANAGER install -y mesa-libGL-devel mesa-libGLU-devel
    $PKG_MANAGER install -y gtk3-devel
    $PKG_MANAGER install -y atlas-devel blas-devel lapack-devel

    # Install common ML libraries
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
||||
|
||||
# Create the fetchml service account plus its home/log/data directories.
# NOTE(review): overlaps with ensure_user/create_directories from
# setup_common.sh; main() calls those helpers, not this function.
create_user() {
    log_info "Creating fetchml user..."

    if id "$FETCH_ML_USER" &>/dev/null; then
        log_info "User $FETCH_ML_USER already exists"
        return
    fi

    useradd -m -d $FETCH_ML_HOME -s /bin/bash $FETCH_ML_USER
    usermod -aG podman $FETCH_ML_USER

    # Create directories
    mkdir -p $FETCH_ML_HOME/.config/containers
    mkdir -p $FETCH_ML_HOME/go/bin
    mkdir -p $LOG_DIR
    mkdir -p $DATA_DIR

    chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME $LOG_DIR $DATA_DIR

    log_success "User $FETCH_ML_USER created"
}
|
||||
|
||||
# Open the service ports via firewalld (skipped when not installed).
# NOTE(review): 6379 is opened although install_redis binds Redis to
# 127.0.0.1 only — confirm the Redis port really needs exposing.
setup_firewall() {
    log_info "Configuring firewall..."

    if ! command -v firewall-cmd &> /dev/null; then
        log_warning "Firewalld not available, skipping firewall configuration"
        return
    fi

    systemctl enable firewalld
    systemctl start firewalld

    firewall-cmd --permanent --add-service=ssh
    firewall-cmd --permanent --add-port=8080/tcp # Worker API
    firewall-cmd --permanent --add-port=8081/tcp # Data manager API
    firewall-cmd --permanent --add-port=6379/tcp # Redis
    firewall-cmd --reload

    firewall-cmd --list-all
}
|
||||
|
||||
# Write and enable the worker and data-manager systemd units. These
# Rocky units differ from the shared template (Restart=always and a
# FETCH_ML_HOME environment entry), so they are written inline here.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    # Fetch ML Worker service
    cat > $SERVICE_DIR/fetch_ml_worker.service << EOF
[Unit]
Description=Fetch ML Worker Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_worker

[Install]
WantedBy=multi-user.target
EOF

    # Fetch ML Data Manager service
    cat > $SERVICE_DIR/fetch_ml_data_manager.service << EOF
[Unit]
Description=Fetch ML Data Manager Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_data_manager

[Install]
WantedBy=multi-user.target
EOF

    # Enable services
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
||||
|
||||
# Write a Rocky-specific logrotate policy for $LOG_DIR.
# NOTE(review): main() calls setup_logrotate (from setup_common.sh)
# instead of this function, so this variant is currently unused.
setup_log_rotation() {
    log_info "Setting up log rotation..."

    cat > /etc/logrotate.d/fetch_ml << EOF
$LOG_DIR/*.log {
    daily
    missingok
    rotate 30
    compress
    delaycompress
    notifempty
    create 0644 $FETCH_ML_USER $FETCH_ML_USER
    postrotate
        systemctl reload fetch_ml_worker || true
        systemctl reload fetch_ml_data_manager || true
    endscript
}
EOF

    log_success "Log rotation configured"
}
|
||||
|
||||
# Tune the host for ML workloads: file-descriptor limits, network/VM
# sysctls, GPU persistence mode. FIX: appends are now guarded so
# re-running the script does not stack duplicate lines into
# /etc/security/limits.conf and /etc/sysctl.conf.
optimize_system() {
    log_info "Optimizing system for ML workloads..."

    # Increase file limits (only once).
    if ! grep -q '^\* soft nofile 65536' /etc/security/limits.conf; then
        echo "* soft nofile 65536" >> /etc/security/limits.conf
        echo "* hard nofile 65536" >> /etc/security/limits.conf
    fi

    # Optimize kernel parameters for ML (only once, keyed on the
    # marker comment).
    if ! grep -q '^# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi

    sysctl -p

    # Configure GPU persistence mode if NVIDIA available
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    # Disable SELinux for better container compatibility (optional)
    if [[ -f /etc/selinux/config ]]; then
        log_warning "Consider setting SELinux to permissive mode for better container compatibility"
        log_info "Edit /etc/selinux/config and set SELINUX=permissive"
    fi

    log_success "System optimized for ML workloads"
}
|
||||
|
||||
# Build Fetch ML from a pre-cloned checkout and install binaries and
# config under $FETCH_ML_HOME. No-op (with a hint) if the repo is absent.
install_fetch_ml() {
    log_info "Installing Fetch ML..."

    cd $FETCH_ML_HOME

    if [[ ! -d "fetch_ml" ]]; then
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml

    # Build
    export PATH=$PATH:/usr/local/go/bin
    make build

    # Copy binaries
    cp bin/* $FETCH_ML_HOME/bin/
    chmod +x $FETCH_ML_HOME/bin/*

    # Copy configs
    mkdir -p $FETCH_ML_HOME/configs
    cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml

    # Set permissions
    chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME

    log_success "Fetch ML installed"
}
|
||||
|
||||
# Main entry point: run every setup phase in dependency order, then
# print the post-install checklist.
main() {
    log_info "Starting Fetch ML Rocky Linux server setup..."

    check_root
    check_rocky

    update_system
    enable_epel
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    selinux_guidance
    install_fetch_ml

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
    echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
    echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
    echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
    echo "5. View logs: journalctl -u fetch_ml_worker -f"
    echo
    log_info "Services will be available at:"
    echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
    echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}

# Run main function
main "$@"
|
||||
|
|
@ -1,294 +0,0 @@
|
|||
#!/usr/bin/env bash

# Fetch ML Ubuntu Server Setup Script
# Optimized for ML experiments on Ubuntu 20.04/22.04

set -euo pipefail

# Pull in the shared log_*, download, user, and unit helpers.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
|
||||
|
||||
# Abort unless running with root privileges.
check_root() {
    if [[ $EUID -ne 0 ]]; then
        log_error "This script must be run as root"
        exit 1
    fi
}
|
||||
|
||||
# Abort unless this is an apt based system; report the Ubuntu release
# and warn (without aborting) on releases older than 20.04.
check_ubuntu() {
    if ! command -v apt-get &> /dev/null; then
        log_error "This script is designed for Ubuntu systems"
        exit 1
    fi

    local ubuntu_version
    ubuntu_version=$(lsb_release -rs)
    log_info "Ubuntu version: $ubuntu_version"

    # FIX: the original piped to `bc`, which is not part of a minimal
    # Ubuntu install, and compared release strings as floats. dpkg's
    # version comparison is always present on apt systems and handles
    # dotted versions correctly.
    if dpkg --compare-versions "$ubuntu_version" lt "20.04"; then
        log_warning "Ubuntu version < 20.04 may not support all features"
    fi
}
|
||||
|
||||
# Refresh/upgrade packages and install the base tooling later steps need.
update_system() {
    log_info "Updating system packages..."
    apt-get update -y
    apt-get upgrade -y
    apt-get install -y curl wget gnupg lsb-release software-properties-common
}
|
||||
|
||||
# Install Go 1.25 from the official tarball (checksum-verified) unless
# a Go toolchain is already on PATH.
install_go() {
    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        local go_version
        go_version=$(go version | awk '{print $3}' | sed 's/go//')
        log_info "Go already installed: $go_version"
        return
    fi

    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    cd /tmp
    TMP_FILES="$tarball" # registered for the EXIT cleanup trap
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
        "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
        "$tarball"
    tar -C /usr/local -xzf "$tarball"

    # Make the toolchain visible to login shells and to this process.
    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
||||
|
||||
# Install Podman from the Kubic repository and enable rootless settings.
# NOTE(review): apt-key is deprecated (removed after Ubuntu 22.04); a
# signed-by keyring should replace it when this repo setup is revisited.
install_podman() {
    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    # Add official Podman repository
    echo "deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list
    curl -L "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/Release.key" | apt-key add -

    apt-get update -y
    apt-get install -y podman podman-compose

    # Configure Podman for rootless operation.
    # FIX: /etc/containers may not exist before podman's first run, in
    # which case the appends below failed under `set -e`.
    mkdir -p /etc/containers
    echo "user_namespace_enable = 1" >> /etc/containers/containers.conf
    echo "runtime = \"crun\"" >> /etc/containers/containers.conf

    log_success "Podman installed"
}
|
||||
|
||||
# Install Redis, bind it to loopback only, and run it under systemd.
# Skipped when redis-server already exists.
install_redis() {
    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    apt-get install -y redis-server

    # systemd supervision + loopback-only bind (Ubuntu keeps the
    # config at /etc/redis/redis.conf).
    sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf

    systemctl enable redis-server
    systemctl start redis-server

    log_success "Redis installed and configured"
}
|
||||
|
||||
# Detect an NVIDIA GPU and, when present, install the 535 driver and
# CUDA toolkit via NVIDIA's keyring package (checksum-verified).
install_nvidia_drivers() {
    log_info "Checking for NVIDIA GPU..."

    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    if ! lspci | grep -i nvidia &> /dev/null; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Add NVIDIA repository via the cuda-keyring package.
    TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb"
    secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. -f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb"
    dpkg -i /tmp/cuda-keyring_1.1-1_all.deb
    apt-get update -y

    # Install drivers
    apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit

    # Probe GPU access through Podman (best-effort; a reboot may be
    # needed before the driver is usable).
    if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
        log_success "NVIDIA drivers installed and GPU access verified"
    else
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    fi
}
|
||||
|
||||
# Install Python, the system libraries ML builds commonly need, and a
# baseline set of Python packages (CPU-only PyTorch wheels).
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."

    # Python and ML packages
    apt-get install -y python3 python3-pip python3-venv

    # System dependencies for ML
    apt-get install -y build-essential cmake git pkg-config
    apt-get install -y libjpeg-dev libpng-dev libtiff-dev
    apt-get install -y libavcodec-dev libavformat-dev libswscale-dev
    apt-get install -y libgtk2.0-dev libcanberra-gtk-module
    apt-get install -y libxvidcore-dev libx264-dev
    apt-get install -y libatlas-base-dev gfortran

    # Install common ML libraries
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
||||
|
||||
# Create the fetchml user and directory tree via the shared helpers
# from setup_common.sh.
create_user() {
    log_info "Creating fetchml user..."
    ensure_user
    create_directories
    log_success "User $FETCH_ML_USER and directories created"
}
|
||||
|
||||
# Open the service ports via ufw (skipped when ufw is not installed).
# NOTE(review): 6379 is opened although install_redis binds Redis to
# 127.0.0.1 only — confirm the Redis port really needs exposing.
setup_firewall() {
    log_info "Configuring firewall..."

    if ! command -v ufw &> /dev/null; then
        log_warning "UFW not available, skipping firewall configuration"
        return
    fi

    ufw --force enable
    ufw allow ssh
    ufw allow 8080/tcp # Worker API
    ufw allow 8081/tcp # Data manager API
    ufw allow 6379/tcp # Redis
    ufw status
}
|
||||
|
||||
# Install and enable the worker and data-manager units using the
# shared unit template from setup_common.sh.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    setup_systemd_service "fetch_ml_worker" "$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml"
    setup_systemd_service "fetch_ml_data_manager" "$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml"

    # Enable services
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
||||
|
||||
# Thin wrapper around the shared logrotate helper, with logging.
setup_log_rotation() {
    log_info "Setting up log rotation..."
    setup_logrotate
    log_success "Log rotation configured"
}
|
||||
|
||||
# Tune the host for ML workloads: shared hardening, network/VM sysctls,
# GPU persistence mode. FIX: the sysctl append is now guarded so
# re-running the script does not stack duplicate blocks into
# /etc/sysctl.conf.
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    hardening_steps

    # Optimize kernel parameters for ML (only once, keyed on the
    # marker comment).
    if ! grep -q '^# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi

    sysctl -p

    # Configure GPU persistence mode if NVIDIA available
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    log_success "System optimized for ML workloads"
}
|
||||
|
||||
# Build Fetch ML from a pre-cloned checkout and install binaries and
# config under $FETCH_ML_HOME. No-op (with a hint) if the repo is absent.
install_fetch_ml() {
    log_info "Installing Fetch ML..."

    cd $FETCH_ML_HOME

    if [[ ! -d "fetch_ml" ]]; then
        # This would be replaced with actual repository URL
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml

    # Build
    export PATH=$PATH:/usr/local/go/bin
    make build

    # Copy binaries
    cp bin/* $FETCH_ML_HOME/bin/
    chmod +x $FETCH_ML_HOME/bin/*

    # Copy configs
    mkdir -p $FETCH_ML_HOME/configs
    cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml

    # Set permissions
    chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME

    log_success "Fetch ML installed"
}
|
||||
|
||||
# Entry point: provision an Ubuntu host end to end, then print follow-up
# instructions for the operator.  The step order matters: packages and
# runtimes first, then the service account and host configuration, and
# the source build last.
main() {
    log_info "Starting Fetch ML Ubuntu server setup..."

    # Preconditions: must run as root on a supported Ubuntu release.
    check_root
    check_ubuntu

    # System packages and runtimes.
    update_system
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools

    # Service account, directory layout, and host configuration.
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    install_fetch_ml

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    cat << EOF
1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml
2. Configure: $FETCH_ML_HOME/configs/config-local.yaml
3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager
4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager
5. View logs: journalctl -u fetch_ml_worker -f
EOF
    echo
    log_info "Services will be available at:"
    cat << EOF
- Worker API: http://$(hostname -I | awk '{print $1}'):8080
- Data Manager: http://$(hostname -I | awk '{print $1}'):8081
EOF
}

# Run main function
main "$@"
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
#!/bin/bash
set -e

echo "=== Test Tools Harness ==="

# Single EXIT trap for all cleanup.  Fixes over the previous version:
# the smoke-test API server is now killed even when a later step fails
# (previously `set -e` could exit between the server start and the
# explicit `kill`, leaking the background process), and the temporary
# Redis shutdown shares this one trap instead of installing a second
# trap that would have replaced it.
STARTED_REDIS=false
API_PID=""
cleanup() {
    if [ -n "$API_PID" ]; then
        kill "$API_PID" 2>/dev/null || true
    fi
    if [ "$STARTED_REDIS" = true ]; then
        echo "Stopping temporary Redis..."
        redis-cli shutdown || true
    fi
}
trap cleanup EXIT

# Check if Redis is running; start a temporary instance if needed.
ensure_redis() {
    if ! redis-cli ping >/dev/null 2>&1; then
        echo "Starting temporary Redis instance..."
        redis-server --daemonize yes --port 6379
        sleep 2
        if ! redis-cli ping >/dev/null 2>&1; then
            echo "Failed to start Redis"
            exit 1
        fi
        echo "Redis started successfully"
        STARTED_REDIS=true
    else
        echo "Redis is already running"
    fi
}

# Step 1: Build Go binaries
echo "Building Go binaries..."
go build -o bin/api-server ./cmd/api-server
go build -o bin/worker ./cmd/worker
go build -o bin/data_manager ./cmd/data_manager
go build -o bin/user_manager ./cmd/user_manager

# Step 2: Build Zig CLI
echo "Building Zig CLI..."
cd cli
zig build
cd ..

# Step 3: Ensure Redis is running
ensure_redis

# Step 4: Run Go tests
echo "Running Go tests..."
go test ./...

# Step 5: Run Zig tests
# NOTE(review): bare `zig test` without a file argument may fail or do
# nothing depending on the zig version — confirm `zig build test` isn't
# what was intended here.
echo "Running Zig CLI tests..."
cd cli
zig test
cd ..

# Step 6: Run Go E2E tests (Redis is already available)
echo "Running Go E2E tests..."
go test ./tests/e2e/...

# Step 7: Smoke test API server and CLI
echo "Running smoke test..."
# Start API server in background on a non-default port.
./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 &
API_PID=$!
sleep 2

# Test CLI status (the server is killed by the EXIT trap on success or failure).
./cli/zig-out/bin/ml status -server http://localhost:19101

echo "=== All tests completed successfully ==="
|
||||
|
|
@ -5,7 +5,7 @@ Requires=docker.service
|
|||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/cleanup.sh --force
|
||||
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/maintenance/cleanup.sh --dry-run
|
||||
User=jfraeys
|
||||
Group=staff
|
||||
StandardOutput=journal
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ set -e
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
|
|
@ -43,22 +44,34 @@ cleanup_benchmark_artifacts() {
|
|||
|
||||
case "${1:-keep-10}" in
|
||||
"all")
|
||||
print_status "Removing ALL benchmark artifacts..."
|
||||
rm -rf "$LOCAL_ARTIFACTS_DIR"
|
||||
print_success "Removed all artifacts (was $size_before)"
|
||||
print_status "Archiving ALL benchmark artifacts..."
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
|
||||
print_success "Archived all artifacts (was $size_before)"
|
||||
;;
|
||||
"keep-5")
|
||||
print_status "Keeping last 5 runs, removing older ones..."
|
||||
print_status "Keeping last 5 runs, archiving older ones..."
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +6 | xargs rm -rf 2>/dev/null || true
|
||||
ls -1t run_* 2>/dev/null | tail -n +6 | while read -r run; do
|
||||
[ -n "$run" ] || continue
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
||||
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
||||
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
||||
;;
|
||||
"keep-10")
|
||||
print_status "Keeping last 10 runs, removing older ones..."
|
||||
print_status "Keeping last 10 runs, archiving older ones..."
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || true
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
|
||||
[ -n "$run" ] || continue
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
||||
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
||||
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
||||
|
|
@ -80,12 +93,18 @@ cleanup_temp_files() {
|
|||
# Clean temp directories
|
||||
local temp_cleaned=0
|
||||
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
local tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
|
||||
mkdir -p "$tmp_archive_dir"
|
||||
|
||||
# /tmp cleanup
|
||||
if [ -d "/tmp" ]; then
|
||||
local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||
if [ "$tmp_files" -gt 0 ]; then
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
print_success "Cleaned $tmp_files temporary files from /tmp"
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
print_success "Archived $tmp_files temporary files from /tmp"
|
||||
temp_cleaned=$((temp_cleaned + tmp_files))
|
||||
fi
|
||||
fi
|
||||
|
|
@ -94,8 +113,10 @@ cleanup_temp_files() {
|
|||
if [ -d "/var/tmp" ]; then
|
||||
local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||
if [ "$vartmp_files" -gt 0 ]; then
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
print_success "Cleaned $vartmp_files temporary files from /var/tmp"
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
print_success "Archived $vartmp_files temporary files from /var/tmp"
|
||||
temp_cleaned=$((temp_cleaned + vartmp_files))
|
||||
fi
|
||||
fi
|
||||
|
|
@ -104,8 +125,10 @@ cleanup_temp_files() {
|
|||
if [ -d "$HOME/tmp" ]; then
|
||||
local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||
if [ "$user_tmp_files" -gt 0 ]; then
|
||||
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
print_success "Cleaned $user_tmp_files temporary files from ~/tmp"
|
||||
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
print_success "Archived $user_tmp_files temporary files from ~/tmp"
|
||||
temp_cleaned=$((temp_cleaned + user_tmp_files))
|
||||
fi
|
||||
fi
|
||||
|
|
@ -177,9 +200,16 @@ cleanup_logs() {
|
|||
for log_dir in "${log_dirs[@]}"; do
|
||||
if [ -d "$log_dir" ]; then
|
||||
local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
||||
# Remove log files older than 7 days
|
||||
find "$log_dir" -name "*.log" -type f -mtime +7 -delete 2>/dev/null || true
|
||||
find "$log_dir" -name "*.log.*" -type f -mtime +7 -delete 2>/dev/null || true
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
local log_archive_dir="$log_dir/archive/$stamp"
|
||||
mkdir -p "$log_archive_dir"
|
||||
# Move log files older than 7 days to archive
|
||||
find "$log_dir" -name "*.log" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$log_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
find "$log_dir" -name "*.log.*" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$log_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
||||
if [ "$log_size_before" != "$log_size_after" ]; then
|
||||
print_success "Cleaned old logs in $log_dir: $log_size_before → $log_size_after"
|
||||
|
|
|
|||
|
|
@ -144,12 +144,12 @@ else
|
|||
log_info "No running containers found"
|
||||
fi
|
||||
|
||||
# Remove containers
|
||||
# Remove containers
|
||||
log_info "Removing containers..."
|
||||
containers=$(docker ps -aq --filter "name=ml-")
|
||||
if [ -n "$containers" ]; then
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
echo "$containers" | xargs docker rm -f
|
||||
echo "$containers" | xargs docker rm
|
||||
log_success "Containers removed"
|
||||
fi
|
||||
else
|
||||
|
|
@ -168,9 +168,9 @@ else
|
|||
log_info "No networks found"
|
||||
fi
|
||||
|
||||
# Remove volumes (with caution)
|
||||
log_warning "Removing volumes (this will delete data)..."
|
||||
if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
|
||||
# Remove volumes (with caution)
|
||||
log_warning "Skipping volumes by default (use --all to remove them)"
|
||||
if [ "$ALL" = true ]; then
|
||||
volumes=$(docker volume ls -q --filter "name=ml-")
|
||||
if [ -n "$volumes" ]; then
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
|
|
@ -181,16 +181,16 @@ if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
|
|||
log_info "No volumes found"
|
||||
fi
|
||||
else
|
||||
log_info "Skipping volumes (use --force or --all to remove them)"
|
||||
log_info "Skipping volumes"
|
||||
fi
|
||||
|
||||
# Remove images if requested
|
||||
# Remove images if requested
|
||||
if [ "$ALL" = true ]; then
|
||||
log_info "Removing images..."
|
||||
images=$(docker images -q --filter "reference=fetch_ml-*")
|
||||
if [ -n "$images" ]; then
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
echo "$images" | xargs docker rmi -f
|
||||
echo "$images" | xargs docker rmi
|
||||
log_success "Images removed"
|
||||
fi
|
||||
else
|
||||
|
|
@ -200,11 +200,15 @@ else
|
|||
log_info "Skipping images (use --all to remove them)"
|
||||
fi
|
||||
|
||||
# General Docker cleanup
|
||||
log_info "Running general Docker cleanup..."
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
docker system prune -f
|
||||
log_success "General cleanup completed"
|
||||
# General Docker cleanup
|
||||
if [ "$ALL" = true ]; then
|
||||
log_info "Running general Docker cleanup (docker system prune)..."
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
docker system prune -f
|
||||
log_success "General cleanup completed"
|
||||
fi
|
||||
else
|
||||
log_info "Skipping docker system prune (use --all to enable)"
|
||||
fi
|
||||
|
||||
# Show final state
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ set -e
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||
|
||||
# Create artifacts directory if it doesn't exist
|
||||
mkdir -p "$LOCAL_ARTIFACTS_DIR"
|
||||
|
|
@ -41,17 +42,21 @@ case "${1:-help}" in
|
|||
echo "=== Cleaning Artifacts ==="
|
||||
case "${2:-all}" in
|
||||
"all")
|
||||
echo "Removing all artifacts..."
|
||||
rm -rf "$LOCAL_ARTIFACTS_DIR"
|
||||
echo "All artifacts removed"
|
||||
echo "Archiving all artifacts..."
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
|
||||
echo "All artifacts archived"
|
||||
;;
|
||||
"old")
|
||||
keep_count="${3:-10}"
|
||||
echo "Keeping last $keep_count runs, removing older ones..."
|
||||
echo "Keeping last $keep_count runs, archiving older ones..."
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do
|
||||
echo "Removing: $run"
|
||||
rm -rf "$run"
|
||||
echo "Archiving: $run"
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
;;
|
||||
"run")
|
||||
|
|
@ -64,8 +69,10 @@ case "${1:-help}" in
|
|||
fi
|
||||
run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id"
|
||||
if [ -d "$run_dir" ]; then
|
||||
echo "Removing run: $run_id"
|
||||
rm -rf "$run_dir"
|
||||
echo "Archiving run: $run_id"
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
mv "$run_dir" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
else
|
||||
echo "Run not found: $run_id"
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -1,169 +0,0 @@
|
|||
#!/bin/bash

# Secure Homelab Setup Script for Fetch ML
# This script generates secure API keys and TLS certificates
#
# Fixes over the previous version:
#  - hash_key falls back to `shasum -a 256` so the script also works on
#    macOS, where coreutils' sha256sum is not installed by default;
#  - the .gitignore check uses grep -F (previously "." matched any
#    character) and suppresses the error a missing .gitignore printed;
#  - CONFIG_DIR is created instead of being assumed to exist.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
CONFIG_DIR="$PROJECT_ROOT/configs/environments"
SSL_DIR="$PROJECT_ROOT/ssl"

echo "🔒 Setting up secure homelab configuration..."

# Create output directories up front.
mkdir -p "$SSL_DIR" "$CONFIG_DIR"

# Generate TLS certificates (self-signed, 1 year, localhost + hostname SANs).
echo "📜 Generating TLS certificates..."
if [[ ! -f "$SSL_DIR/cert.pem" ]] || [[ ! -f "$SSL_DIR/key.pem" ]]; then
    openssl req -x509 -newkey rsa:4096 -keyout "$SSL_DIR/key.pem" -out "$SSL_DIR/cert.pem" -days 365 -nodes \
        -subj "/C=US/ST=Homelab/L=Local/O=FetchML/OU=Homelab/CN=localhost" \
        -addext "subjectAltName=DNS:localhost,DNS:$(hostname),IP:127.0.0.1"
    chmod 600 "$SSL_DIR/key.pem"
    chmod 644 "$SSL_DIR/cert.pem"
    echo "✅ TLS certificates generated in $SSL_DIR/"
else
    echo "ℹ️ TLS certificates already exist, skipping generation"
fi

# Generate secure API keys
echo "🔑 Generating secure API keys..."
generate_api_key() {
    openssl rand -hex 32
}

# SHA-256 hex digest of the argument; portable across Linux and macOS.
hash_key() {
    if command -v sha256sum >/dev/null 2>&1; then
        echo -n "$1" | sha256sum | cut -d' ' -f1
    else
        echo -n "$1" | shasum -a 256 | cut -d' ' -f1
    fi
}

# Generate keys
ADMIN_KEY=$(generate_api_key)
USER_KEY=$(generate_api_key)
ADMIN_HASH=$(hash_key "$ADMIN_KEY")
USER_HASH=$(hash_key "$USER_KEY")

# Create secure config (unquoted heredoc: hashes and paths are expanded now).
echo "⚙️ Creating secure configuration..."
cat > "$CONFIG_DIR/config-homelab-secure.yaml" << EOF
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!

redis:
  url: "redis://localhost:6379"
  max_connections: 10

auth:
  enabled: true
  api_keys:
    homelab_admin:
      hash: $ADMIN_HASH
      admin: true
      roles:
        - admin
      permissions:
        '*': true
    homelab_user:
      hash: $USER_HASH
      admin: false
      roles:
        - researcher
      permissions:
        'experiments': true
        'datasets': true
        'jupyter': true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "$SSL_DIR/cert.pem"
    key_file: "$SSL_DIR/key.pem"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "localhost"
    - "192.168.1.0/24" # Adjust to your network
    - "10.0.0.0/8"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  console: true

resources:
  cpu_limit: "2"
  memory_limit: "4Gi"
  gpu_limit: 0
  disk_limit: "10Gi"

# Prometheus metrics
metrics:
  enabled: true
  listen_addr: ":9100"
  tls:
    enabled: false
EOF

# Save API keys to a secure file (mode 600, git-ignored below).
echo "🔐 Saving API keys..."
cat > "$PROJECT_ROOT/.api-keys" << EOF
# Fetch ML Homelab API Keys
# IMPORTANT: Keep this file secure and never commit to version control!

ADMIN_API_KEY: $ADMIN_KEY
USER_API_KEY: $USER_KEY

# Usage examples:
# curl -H "X-API-Key: $ADMIN_KEY" https://localhost:9101/health
# curl -H "X-API-Key: $USER_KEY" https://localhost:9101/api/jupyter/services
EOF

chmod 600 "$PROJECT_ROOT/.api-keys"

# Create environment file for JWT secret
JWT_SECRET=$(generate_api_key)
cat > "$PROJECT_ROOT/.env.secure" << EOF
# Secure environment variables for Fetch ML
# IMPORTANT: Keep this file secure and never commit to version control!

JWT_SECRET=$JWT_SECRET

# Source this file before running the server:
# source .env.secure
EOF

chmod 600 "$PROJECT_ROOT/.env.secure"

# Update .gitignore to exclude sensitive files (creates it if absent).
echo "📝 Updating .gitignore..."
if ! grep -qF ".api-keys" "$PROJECT_ROOT/.gitignore" 2>/dev/null; then
    echo -e "\n# Security files\n.api-keys\n.env.secure\nssl/\n*.pem\n*.key" >> "$PROJECT_ROOT/.gitignore"
fi

# NOTE(review): the generated keys are echoed to the terminal below by
# design for homelab use — avoid running this on a shared/recorded shell.
echo ""
echo "🎉 Secure homelab setup complete!"
echo ""
echo "📋 Next steps:"
echo "1. Review and adjust the IP whitelist in config-homelab-secure.yaml"
echo "2. Start the server with: ./api-server -config configs/environments/config-homelab-secure.yaml"
echo "3. Source the environment: source .env.secure"
echo "4. Your API keys are saved in .api-keys"
echo ""
echo "🔐 API Keys:"
echo " Admin: $ADMIN_KEY"
echo " User: $USER_KEY"
echo ""
echo "⚠️ IMPORTANT:"
echo " - Never share your API keys"
echo " - Never commit .api-keys or .env.secure to version control"
echo " - Backup your SSL certificates and API keys securely"
echo " - Consider using a password manager for storing keys"
|
||||
311
scripts/setup.sh
311
scripts/setup.sh
|
|
@ -1,311 +0,0 @@
|
|||
#!/bin/bash
# setup.sh: One-shot homelab setup (security + core services)
# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity

set -euo pipefail

# ANSI color palette used by the print_* helpers below.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Shared formatter: colored [TAG] prefix followed by the message.
_print_tagged() {
    echo -e "${1}[${2}]${NC} ${3}"
}

print_info()    { _print_tagged "$BLUE" "INFO" "$1"; }
print_success() { _print_tagged "$GREEN" "SUCCESS" "$1"; }
print_warning() { _print_tagged "$YELLOW" "WARNING" "$1"; }
print_error()   { _print_tagged "$RED" "ERROR" "$1"; }
|
||||
|
||||
# Simple dependency check
|
||||
# Verify that every required tool is on PATH; on any miss, print install
# hints for macOS/Ubuntu and exit non-zero.
check_deps() {
    print_info "Checking dependencies..."

    local missing=()
    local tool
    for tool in go zig redis-server docker; do
        if ! command -v "$tool" &> /dev/null; then
            missing+=("$tool")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        print_error "Missing dependencies: ${missing[*]}"
        echo ""
        echo "Install with:"
        echo " macOS: brew install ${missing[*]}"
        echo " Ubuntu: sudo apt-get install ${missing[*]}"
        exit 1
    fi

    print_success "Dependencies OK"
}
|
||||
|
||||
# Simple setup
|
||||
# Create the directory layout, a self-signed TLS pair, and the default
# config file for the homelab deployment.
#
# Fix over the previous version: the certificate check now also verifies
# ssl/key.pem, so a missing private key (with a leftover cert.pem) is
# regenerated instead of leaving the server unable to start TLS.
setup_project() {
    print_info "Setting up project..."

    # Create essential directories
    mkdir -p ssl logs configs data monitoring

    # Generate a simple self-signed cert when either half is missing.
    if [[ ! -f "ssl/cert.pem" || ! -f "ssl/key.pem" ]]; then
        openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null
        print_success "SSL certificates generated"
    fi

    # Create balanced config.
    # NOTE(review): the shipped hash is sha256("password") with admin
    # rights — operators must replace it before exposing the service.
    cat > configs/config.yaml << 'EOF'
base_path: "./data/experiments"

auth:
  enabled: true
  api_keys:
    homelab_user:
      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
      admin: true
      roles: ["user", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 30
    burst_size: 5
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "192.168.0.0/16"
    - "10.0.0.0/8"
    - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 3
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"

logging:
  level: "info"
  file: "./logs/app.log"
  audit_log: "./logs/audit.log"
  access_log: "./logs/access.log"

monitoring:
  enabled: true
  metrics_port: 9090
  health_check_interval: "30s"
EOF

    print_success "Configuration created"
}
|
||||
|
||||
# Simple build
|
||||
# Compile the Go services and the Zig CLI.
build_project() {
    print_info "Building project..."

    # Go binaries.
    go build -o bin/api-server ./cmd/api-server
    go build -o bin/worker ./cmd/worker
    go build -o bin/tui ./cmd/tui

    # Zig CLI — built in a subshell so the caller's working directory is
    # untouched even on failure.
    (cd cli && zig build)

    print_success "Build completed"
}
|
||||
|
||||
# Setup Fail2Ban
|
||||
# Install Fail2Ban jails and filters for SSH and the ML API, if Fail2Ban
# is present.  Config files are staged in /tmp, copied in with sudo, and
# the staged copies are removed afterwards; a copy failure (no sudo
# rights) degrades to a warning rather than aborting the setup.
setup_fail2ban() {
    print_info "Setting up Fail2Ban..."

    if ! command -v fail2ban-server &> /dev/null; then
        print_warning "Fail2Ban not installed, skipping..."
        return
    fi

    # Create Fail2Ban configuration directory (tolerate missing sudo).
    sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true

    # Jail definitions.
    # NOTE(review): logpath is relative (./logs/audit.log) — fail2ban
    # runs as a daemon with its own working directory, so this likely
    # needs an absolute path; confirm before relying on these jails.
    cat > /tmp/ml-experiments-jail.conf << 'EOF'
[DEFAULT]
bantime = 3600
findtime = 600
maxretry = 3
backend = systemd

[sshd]
enabled = true
port = ssh
logpath = /var/log/auth.log
maxretry = 3

[ml-experiments-api]
enabled = true
port = 9101
filter = ml-experiments-api
logpath = ./logs/audit.log
maxretry = 5
bantime = 7200

[ml-experiments-auth]
enabled = true
filter = ml-experiments-auth
logpath = ./logs/audit.log
maxretry = 3
bantime = 3600
EOF

    # Filter: any 401/403 response in the audit log counts as a failure.
    cat > /tmp/ml-experiments-api.conf << 'EOF'
[Definition]
failregex = ^.*<HOST>.*"status":40[13].*$
ignoreregex =
EOF

    # Filter: structured failed_login audit events.
    cat > /tmp/ml-experiments-auth.conf << 'EOF'
[Definition]
failregex = ^.*"event":"failed_login".*"client_ip":"<HOST>".*$
ignoreregex =
EOF

    # Try to install configurations; restart is best-effort.
    if sudo cp /tmp/ml-experiments-jail.conf /etc/fail2ban/jail.d/ 2>/dev/null; then
        sudo cp /tmp/ml-experiments-*.conf /etc/fail2ban/filter.d/ 2>/dev/null || true
        sudo systemctl restart fail2ban 2>/dev/null || true
        print_success "Fail2Ban configured"
    else
        print_warning "Could not configure Fail2Ban (requires sudo)"
    fi

    rm -f /tmp/ml-experiments-*.conf
}
|
||||
|
||||
# Setup Redis
|
||||
# Start a local Redis daemon on the default port unless one is already
# running (detected by process name).
setup_redis() {
    print_info "Setting up Redis..."

    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
    else
        redis-server --daemonize yes --port 6379
        print_success "Redis started"
    fi
}
|
||||
|
||||
# Create simple management script
|
||||
# Write ./manage.sh, a minimal start/stop/status/logs/test helper for the
# services configured by this setup.  The heredoc delimiter is quoted
# ('EOF') so nothing inside is expanded at generation time — the script
# is emitted verbatim.
create_manage_script() {
    cat > manage.sh << 'EOF'
#!/bin/bash

# Simple management script

case "${1:-status}" in
    "start")
        echo "Starting services..."
        redis-server --daemonize yes --port 6379 2>/dev/null || true
        ./bin/api-server -config configs/config.yaml &
        echo "Services started"
        ;;
    "stop")
        echo "Stopping services..."
        pkill -f "api-server" || true
        redis-cli shutdown 2>/dev/null || true
        echo "Services stopped"
        ;;
    "status")
        echo "=== Status ==="
        if pgrep -f "redis-server" > /dev/null; then
            echo "✅ Redis: Running"
        else
            echo "❌ Redis: Stopped"
        fi

        if pgrep -f "api-server" > /dev/null; then
            echo "✅ API Server: Running"
        else
            echo "❌ API Server: Stopped"
        fi
        ;;
    "logs")
        echo "=== Recent Logs ==="
        tail -20 logs/app.log 2>/dev/null || echo "No logs yet"
        ;;
    "test")
        echo "=== Testing ==="
        curl -k -s https://localhost:9101/health || echo "API server not responding"
        ;;
    *)
        echo "Usage: $0 {start|stop|status|logs|test}"
        ;;
esac
EOF

    chmod +x manage.sh
    print_success "Management script created"
}
|
||||
|
||||
# Show next steps
|
||||
# Print the post-setup checklist for the operator.
#
# Fix over the previous version: the instructions referred to
# ./tools/manage.sh, but create_manage_script writes the helper as
# ./manage.sh in the current directory — the printed paths now match.
show_next_steps() {
    print_success "Setup completed!"
    echo ""
    echo "🎉 Setup complete!"
    echo ""
    echo "Next steps:"
    echo " 1. Start services: ./manage.sh start"
    echo " 2. Check status: ./manage.sh status"
    echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health"
    echo ""
    echo "Configuration: configs/config.yaml"
    echo "Logs: logs/app.log and logs/audit.log"
    echo ""
    print_success "Ready for homelab use!"
}
|
||||
|
||||
# Main setup
|
||||
# Orchestrate the full homelab setup.
#
# Fix over the previous version: setup_fail2ban was defined but never
# invoked, contradicting the file header's promise to keep Fail2Ban; it
# is now called (the function itself degrades to a warning when
# fail2ban or sudo rights are unavailable).
main() {
    echo "ML Experiment Manager - Homelab Setup"
    echo "====================================="
    echo ""

    check_deps
    setup_project
    build_project
    setup_fail2ban
    setup_redis
    create_manage_script
    show_next_steps
}

main "$@"
|
||||
62
scripts/setup_monitoring.py
Normal file
62
scripts/setup_monitoring.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
#!/usr/bin/env python3
"""Generate Grafana provisioning files for the monitoring stack.

Writes prometheus.yml and loki.yml under
monitoring/grafana/provisioning/datasources and dashboards.yml under
monitoring/grafana/provisioning/dashboards (relative to the repo root,
which is taken to be the parent of this script's directory), creating
the directories as needed.
"""
import os

# Resolve <repo>/monitoring/grafana relative to this script's location.
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
grafana_dir = os.path.join(repo_root, 'monitoring', 'grafana')

datasources_dir = os.path.join(grafana_dir, 'provisioning', 'datasources')
providers_dir = os.path.join(grafana_dir, 'provisioning', 'dashboards')

for target_dir in (datasources_dir, providers_dir):
    os.makedirs(target_dir, exist_ok=True)

_PROMETHEUS_DS = """apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "5s"
"""

_LOKI_DS = """apiVersion: 1
datasources:
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true
    jsonData:
      maxLines: 1000
"""

_DASHBOARD_PROVIDER = """apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
"""

# Map each output path directly to its content: datasource definitions go
# under datasources/, the dashboard provider under dashboards/.
outputs = {
    os.path.join(datasources_dir, 'prometheus.yml'): _PROMETHEUS_DS,
    os.path.join(datasources_dir, 'loki.yml'): _LOKI_DS,
    os.path.join(providers_dir, 'dashboards.yml'): _DASHBOARD_PROVIDER,
}

for path, content in outputs.items():
    with open(path, 'w') as f:
        f.write(content)

print("Monitoring setup completed!")
|
||||
111
scripts/smoke-test.sh
Normal file
111
scripts/smoke-test.sh
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
set -euo pipefail;
|
||||
|
||||
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
export FETCHML_REPO_ROOT="$repo_root"
|
||||
|
||||
env="${1:-dev}";
|
||||
if [ "$env" != "dev" ] && [ "$env" != "prod" ]; then
|
||||
echo "usage: $0 [dev|prod]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Probe an HTTPS health endpoint over a raw openssl s_client session and
# succeed (exit 0) only when the first response line is an HTTP 200.
# Used for the prod stack where curl may reject the self-signed chain.
# NOTE(review): -tls1_2 pins the probe to TLS 1.2 — confirm the target
# stack still negotiates that version.
probe_https_health_openssl() {
    host="$1"
    port="$2"
    path="$3"

    # Hand-built HTTP/1.1 request; %b expands the \r\n escapes.
    req="GET ${path} HTTP/1.1\r\nHost: ${host}\r\nConnection: close\r\n\r\n"
    resp=$(printf "%b" "$req" | openssl s_client -connect "127.0.0.1:${port}" -servername "${host}" -tls1_2 -quiet 2>/dev/null || true)
    # Strip CRs and check only the status line.
    printf "%s" "$resp" | tr -d '\r' | head -n 1 | grep -Eq '^HTTP/1\.[01] 200'
}
|
||||
|
||||
compose_cmd="docker-compose";
|
||||
if ! command -v docker-compose >/dev/null 2>&1; then
|
||||
compose_cmd="docker compose";
|
||||
fi
|
||||
|
||||
compose_files=()
|
||||
compose_project_args=("--project-directory" "$repo_root")
|
||||
api_base=""
|
||||
prometheus_base=""
|
||||
stack_name=""
|
||||
|
||||
if [ "$env" = "dev" ]; then
|
||||
mkdir -p \
|
||||
"$repo_root/data/dev/redis" \
|
||||
"$repo_root/data/dev/minio" \
|
||||
"$repo_root/data/dev/prometheus" \
|
||||
"$repo_root/data/dev/grafana" \
|
||||
"$repo_root/data/dev/loki" \
|
||||
"$repo_root/data/dev/logs" \
|
||||
"$repo_root/data/dev/experiments" \
|
||||
"$repo_root/data/dev/active" \
|
||||
"$repo_root/data/dev/workspaces"
|
||||
|
||||
stack_name="dev"
|
||||
compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml")
|
||||
api_base="https://localhost:9101"
|
||||
if ! curl -skf "$api_base/health" >/dev/null 2>&1; then
|
||||
api_base="http://localhost:9101"
|
||||
fi
|
||||
prometheus_base="http://localhost:9090"
|
||||
else
|
||||
mkdir -p \
|
||||
"$repo_root/data/prod-smoke/caddy/data" \
|
||||
"$repo_root/data/prod-smoke/caddy/config" \
|
||||
"$repo_root/data/prod-smoke/redis" \
|
||||
"$repo_root/data/prod-smoke/logs" \
|
||||
"$repo_root/data/prod-smoke/experiments" \
|
||||
"$repo_root/data/prod-smoke/active"
|
||||
|
||||
stack_name="prod"
|
||||
compose_files=("-f" "$repo_root/deployments/docker-compose.prod.smoke.yml")
|
||||
api_base="https://localhost:8443"
|
||||
export FETCHML_DOMAIN=localhost
|
||||
export CADDY_EMAIL=smoke@example.invalid
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
status=$?;
|
||||
if [ "$status" -ne 0 ]; then
|
||||
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" logs --no-color || true;
|
||||
fi
|
||||
if [ "${KEEP_STACK:-0}" != "1" ]; then
|
||||
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" down -v >/dev/null 2>&1 || true;
|
||||
fi
|
||||
exit "$status";
|
||||
}
|
||||
|
||||
trap cleanup EXIT;
|
||||
echo "Starting $stack_name stack for smoke test...";
|
||||
|
||||
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null;
|
||||
echo "Waiting for API to become healthy...";
|
||||
|
||||
deadline=$(($(date +%s) + 90));
|
||||
while true; do
|
||||
if [ "$env" = "dev" ]; then
|
||||
if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi;
|
||||
else
|
||||
if probe_https_health_openssl "localhost" "8443" "/health"; then break; fi;
|
||||
fi
|
||||
if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for $api_base/health"; exit 1; fi;
|
||||
sleep 2;
|
||||
done;
|
||||
|
||||
if [ "$env" = "dev" ]; then
|
||||
echo "Checking metrics endpoint...";
|
||||
curl -skf "$api_base/metrics" >/dev/null;
|
||||
|
||||
echo "Waiting for Prometheus target api-server to be up...";
|
||||
deadline=$(($(date +%s) + 90));
|
||||
query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D";
|
||||
|
||||
while true; do
|
||||
resp=$(curl -sf "$query_url" || true);
|
||||
resp_compact=$(printf "%s" "$resp" | tr -d '\n' | tr -d '\r');
|
||||
if echo "$resp_compact" | grep -Fq '"instance":"api-server:9101"' && echo "$resp_compact" | grep -Fq ',"1"]'; then break; fi;
|
||||
if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for Prometheus api-server target to be up"; echo "$resp"; exit 1; fi;
|
||||
sleep 2;
|
||||
done;
|
||||
fi
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Homelab Secure Test Environment Script
|
||||
set -e
|
||||
|
||||
echo "Starting Homelab Secure Production Environment..."
|
||||
|
||||
# Clean up any existing containers
|
||||
echo "Cleaning up existing containers..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v
|
||||
|
||||
# Create necessary directories with proper permissions
|
||||
echo "Creating directories..."
|
||||
mkdir -p data logs
|
||||
chmod 750 data logs
|
||||
|
||||
# Build and start services
|
||||
echo "Building and starting services..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up --build -d
|
||||
|
||||
# Wait for services to be healthy
|
||||
echo "Waiting for services to be healthy..."
|
||||
sleep 20
|
||||
|
||||
# Check service health
|
||||
echo "Checking service health..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml ps
|
||||
|
||||
# Test API server with TLS
|
||||
echo "Testing API server..."
|
||||
curl -k -s https://localhost:9104/health || echo "API health check failed"
|
||||
|
||||
# Test Redis with authentication
|
||||
echo "Testing Redis with authentication..."
|
||||
docker exec ml-homelab-redis redis-cli -a "HomelabRedis2024!" ping || echo "Redis health check failed"
|
||||
|
||||
# Test SSH connectivity with security
|
||||
echo "Testing SSH connectivity..."
|
||||
docker exec -u worker ml-homelab-worker ssh -o StrictHostKeyChecking=no -o Port=2222 worker@localhost "echo 'SSH OK'" || echo "SSH test failed"
|
||||
|
||||
# Test fail2ban status
|
||||
echo "Testing fail2ban..."
|
||||
docker exec ml-homelab-api fail2ban-client status sshd || echo "fail2ban check failed"
|
||||
|
||||
echo ""
|
||||
echo "Homelab secure production environment is ready!"
|
||||
echo ""
|
||||
echo "Services:"
|
||||
echo " - API Server: https://localhost:9104"
|
||||
echo " - SSH: localhost:2223 (worker user)"
|
||||
echo " - Redis: localhost:6379 (with password)"
|
||||
echo " - Metrics: http://localhost:9101"
|
||||
echo ""
|
||||
echo "Security Features:"
|
||||
echo " ✓ Strong TLS 1.3 with modern ciphers"
|
||||
echo " ✓ SSH with fail2ban protection"
|
||||
echo " ✓ Redis with password authentication"
|
||||
echo " ✓ SQLite database with encryption"
|
||||
echo " ✓ Container security hardening"
|
||||
echo " ✓ Rate limiting and CORS protection"
|
||||
echo " ✓ Security headers and CSRF protection"
|
||||
echo " ✓ Podman sandboxed job execution"
|
||||
echo " ✓ Audit logging and monitoring"
|
||||
echo ""
|
||||
echo "Credentials:"
|
||||
echo " - API User: homelab_user / password"
|
||||
echo " - SSH User: worker / HomelabWorker2024!"
|
||||
echo " - Redis Password: HomelabRedis2024!"
|
||||
echo ""
|
||||
echo "To test with CLI:"
|
||||
echo " ./cli/zig-out/bin/ml queue homelab-secure-test"
|
||||
echo " ./cli/zig-out/bin/ml status"
|
||||
echo ""
|
||||
echo "To view logs:"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f api-server"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f worker"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"
|
||||
echo ""
|
||||
echo "To stop:"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"
|
||||
64
scripts/track_performance.sh
Executable file
64
scripts/track_performance.sh
Executable file
|
|
@ -0,0 +1,64 @@
|
|||
#!/bin/bash
|
||||
# Simple performance tracking script
|
||||
|
||||
RESULTS_DIR="test_results/performance"
|
||||
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
||||
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
|
||||
|
||||
mkdir -p "$RESULTS_DIR"
|
||||
|
||||
echo "Running load test performance tracking..."
|
||||
echo "Timestamp: $TIMESTAMP"
|
||||
|
||||
# Run tests and capture results
|
||||
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RESULTS_DIR/raw_$TIMESTAMP.log"
|
||||
|
||||
# Extract key metrics
|
||||
{
|
||||
echo "{"
|
||||
echo " \"timestamp\": \"$TIMESTAMP\","
|
||||
echo " \"tests\": ["
|
||||
|
||||
# Parse light load
|
||||
LIGHT_RPS=$(grep -A1 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
|
||||
LIGHT_ERROR=$(grep -A2 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
|
||||
LIGHT_P99=$(grep -A4 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
|
||||
|
||||
echo " {"
|
||||
echo " \"name\": \"LightLoad\","
|
||||
echo " \"throughput_rps\": $LIGHT_RPS,"
|
||||
echo " \"error_rate_percent\": $LIGHT_ERROR,"
|
||||
echo " \"p99_latency_ms\": \"$LIGHT_P99\""
|
||||
echo " },"
|
||||
|
||||
# Parse medium load
|
||||
MEDIUM_RPS=$(grep -A1 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
|
||||
MEDIUM_ERROR=$(grep -A2 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
|
||||
MEDIUM_P99=$(grep -A4 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
|
||||
|
||||
echo " {"
|
||||
echo " \"name\": \"MediumLoad\","
|
||||
echo " \"throughput_rps\": $MEDIUM_RPS,"
|
||||
echo " \"error_rate_percent\": $MEDIUM_ERROR,"
|
||||
echo " \"p99_latency_ms\": \"$MEDIUM_P99\""
|
||||
echo " }"
|
||||
echo " ]"
|
||||
echo "}"
|
||||
} > "$RESULTS_FILE"
|
||||
|
||||
echo "Results saved to: $RESULTS_FILE"
|
||||
echo "Raw logs: $RESULTS_DIR/raw_$TIMESTAMP.log"
|
||||
|
||||
# Show comparison with previous run if exists
|
||||
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json | sed -n '2p')
|
||||
if [ -n "$PREV_FILE" ]; then
|
||||
echo ""
|
||||
echo "=== Comparison with previous run ==="
|
||||
echo "Previous: $(basename $PREV_FILE)"
|
||||
echo "Current: $(basename $RESULTS_FILE)"
|
||||
echo ""
|
||||
echo "Light Load Throughput:"
|
||||
echo " Previous: $(jq -r '.tests[0].throughput_rps' "$PREV_FILE") RPS"
|
||||
echo " Current: $(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") RPS"
|
||||
echo " Change: $(echo "$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") - $(jq -r '.tests[0].throughput_rps' "$PREV_FILE")" | bc -l) RPS"
|
||||
fi
|
||||
|
|
@ -1,204 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Production Configuration Validator
|
||||
# Verifies all paths and configs are consistent for experiment lifecycle
|
||||
|
||||
set -e
|
||||
|
||||
BOLD='\033[1m'
|
||||
GREEN='\033[0;32m'
|
||||
RED='\033[0;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo -e "${BOLD}=== FetchML Production Configuration Validator ===${NC}\n"
|
||||
|
||||
# Configuration file paths
|
||||
API_CONFIG="${1:-configs/config-prod.yaml}"
|
||||
WORKER_CONFIG="${2:-configs/worker-prod.toml}"
|
||||
|
||||
errors=0
|
||||
warnings=0
|
||||
|
||||
check_pass() {
|
||||
echo -e "${GREEN}✓${NC} $1"
|
||||
}
|
||||
|
||||
check_fail() {
|
||||
echo -e "${RED}✗${NC} $1"
|
||||
((errors++))
|
||||
}
|
||||
|
||||
check_warn() {
|
||||
echo -e "${YELLOW}⚠${NC} $1"
|
||||
((warnings++))
|
||||
}
|
||||
|
||||
# 1. Check API server config exists
|
||||
echo -e "${BOLD}Checking API Server Configuration${NC}"
|
||||
if [ ! -f "$API_CONFIG" ]; then
|
||||
check_fail "API config not found: $API_CONFIG"
|
||||
else
|
||||
check_pass "API config found: $API_CONFIG"
|
||||
|
||||
# Extract base_path from API config
|
||||
API_BASE_PATH=$(grep 'base_path:' "$API_CONFIG" | head -1 | awk '{print $2}' | tr -d '"')
|
||||
echo " Base path: $API_BASE_PATH"
|
||||
|
||||
# Check if path is absolute
|
||||
if [[ "$API_BASE_PATH" != /* ]]; then
|
||||
check_fail "base_path must be absolute: $API_BASE_PATH"
|
||||
else
|
||||
check_pass "base_path is absolute"
|
||||
fi
|
||||
|
||||
# Check Redis config
|
||||
if grep -q 'redis:' "$API_CONFIG"; then
|
||||
check_pass "Redis configuration present"
|
||||
else
|
||||
check_fail "Redis configuration missing"
|
||||
fi
|
||||
|
||||
# Check auth enabled
|
||||
if grep -q 'enabled: true' "$API_CONFIG"; then
|
||||
check_pass "Authentication enabled"
|
||||
else
|
||||
check_warn "Authentication disabled (not recommended for production)"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 2. Check Worker config (if provided)
|
||||
if [ -f "$WORKER_CONFIG" ]; then
|
||||
echo -e "${BOLD}Checking Worker Configuration${NC}"
|
||||
check_pass "Worker config found: $WORKER_CONFIG"
|
||||
|
||||
# Extract base_path from worker config
|
||||
WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
||||
echo " Base path: $WORKER_BASE_PATH"
|
||||
|
||||
# Compare paths
|
||||
if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then
|
||||
check_pass "API and Worker base_path match"
|
||||
else
|
||||
check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH"
|
||||
fi
|
||||
|
||||
# Check podman_image configured
|
||||
if grep -q 'podman_image' "$WORKER_CONFIG"; then
|
||||
PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
||||
check_pass "Podman image configured: $PODMAN_IMAGE"
|
||||
else
|
||||
check_fail "podman_image not configured"
|
||||
fi
|
||||
else
|
||||
check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 3. Check directory structure (if base_path exists)
|
||||
if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then
|
||||
echo -e "${BOLD}Checking Directory Structure${NC}"
|
||||
check_pass "Base directory exists: $API_BASE_PATH"
|
||||
|
||||
# Check subdirectories
|
||||
for dir in experiments pending running finished failed; do
|
||||
if [ -d "$API_BASE_PATH/$dir" ]; then
|
||||
check_pass "$dir/ directory exists"
|
||||
else
|
||||
check_warn "$dir/ directory missing (will be created automatically)"
|
||||
fi
|
||||
done
|
||||
|
||||
# Check permissions
|
||||
if [ -w "$API_BASE_PATH" ]; then
|
||||
check_pass "Base directory is writable"
|
||||
else
|
||||
check_fail "Base directory is not writable (check permissions)"
|
||||
fi
|
||||
|
||||
elif [ -n "$API_BASE_PATH" ]; then
|
||||
check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 4. Check Redis connectivity (if server is running)
|
||||
echo -e "${BOLD}Checking Redis Connectivity${NC}"
|
||||
if command -v redis-cli &> /dev/null; then
|
||||
if redis-cli ping &> /dev/null; then
|
||||
check_pass "Redis server is running and accessible"
|
||||
|
||||
# Check queue
|
||||
QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0")
|
||||
echo " Queue size: $QUEUE_SIZE tasks"
|
||||
else
|
||||
check_warn "Redis server not accessible (start with: redis-server)"
|
||||
fi
|
||||
else
|
||||
check_warn "redis-cli not installed (cannot verify Redis connectivity)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 5. Check Podman (if worker config exists)
|
||||
if [ -f "$WORKER_CONFIG" ]; then
|
||||
echo -e "${BOLD}Checking Podman${NC}"
|
||||
if command -v podman &> /dev/null; then
|
||||
check_pass "Podman is installed"
|
||||
|
||||
# Check if image exists
|
||||
if [ -n "$PODMAN_IMAGE" ]; then
|
||||
if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then
|
||||
check_pass "Podman image exists: $PODMAN_IMAGE"
|
||||
else
|
||||
check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check GPU access (if configured)
|
||||
if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then
|
||||
if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then
|
||||
check_pass "GPU access working"
|
||||
else
|
||||
check_warn "GPU access configured but not working (check nvidia-container-toolkit)"
|
||||
fi
|
||||
fi
|
||||
else
|
||||
check_fail "Podman not installed (required for worker)"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 6. Check CLI config consistency
|
||||
echo -e "${BOLD}Checking CLI Configuration${NC}"
|
||||
CLI_CONFIG="$HOME/.ml/config.toml"
|
||||
if [ -f "$CLI_CONFIG" ]; then
|
||||
check_pass "CLI config found: $CLI_CONFIG"
|
||||
|
||||
CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
||||
if [ "$CLI_BASE" = "$API_BASE_PATH" ]; then
|
||||
check_pass "CLI worker_base matches server base_path"
|
||||
else
|
||||
check_warn "CLI worker_base ($CLI_BASE) differs from server ($API_BASE_PATH)"
|
||||
fi
|
||||
else
|
||||
check_warn "CLI config not found (run: ml init)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Summary
|
||||
echo -e "${BOLD}=== Summary ===${NC}"
|
||||
if [ $errors -eq 0 ] && [ $warnings -eq 0 ]; then
|
||||
echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}"
|
||||
exit 0
|
||||
elif [ $errors -eq 0 ]; then
|
||||
echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}"
|
||||
exit 0
|
||||
else
|
||||
echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
148
scripts/verify_release.sh
Normal file
148
scripts/verify_release.sh
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage:
|
||||
scripts/verify_release.sh --dir <release_dir> [--repo <org>/<repo>]
|
||||
|
||||
What it does:
|
||||
- Verifies checksums.txt signature (keyless cosign) if cosign + checksums.txt.sig/.cert are present
|
||||
- Verifies *.tar.gz files against checksums.txt
|
||||
|
||||
Notes:
|
||||
- --repo enables strict Sigstore identity checking against the release workflow.
|
||||
- Without cosign, the script still verifies SHA256 hashes.
|
||||
|
||||
Examples:
|
||||
scripts/verify_release.sh --dir ./release --repo jfraeys/fetch_ml
|
||||
scripts/verify_release.sh --dir .
|
||||
EOF
|
||||
}
|
||||
|
||||
release_dir=""
|
||||
repo=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--dir)
|
||||
release_dir="${2:-}"
|
||||
shift 2
|
||||
;;
|
||||
--repo)
|
||||
repo="${2:-}"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "unknown argument: $1" >&2
|
||||
usage >&2
|
||||
exit 2
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [[ -z "$release_dir" ]]; then
|
||||
echo "missing --dir" >&2
|
||||
usage >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [[ ! -d "$release_dir" ]]; then
|
||||
echo "directory not found: $release_dir" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
cd "$release_dir"
|
||||
|
||||
if [[ ! -f checksums.txt ]]; then
|
||||
echo "missing checksums.txt in $release_dir" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
has_cosign=false
|
||||
if command -v cosign >/dev/null 2>&1; then
|
||||
has_cosign=true
|
||||
fi
|
||||
|
||||
verify_sigstore() {
|
||||
if [[ ! -f checksums.txt.sig ]] || [[ ! -f checksums.txt.cert ]]; then
|
||||
echo "[verify] cosign available, but checksums.txt.sig/.cert not found; skipping signature verification" >&2
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -z "$repo" ]]; then
|
||||
echo "[verify] verifying signature (no repo identity pin; pass --repo to pin identity)" >&2
|
||||
COSIGN_YES=true cosign verify-blob \
|
||||
--certificate checksums.txt.cert \
|
||||
--signature checksums.txt.sig \
|
||||
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
|
||||
checksums.txt >/dev/null
|
||||
echo "[ok] checksums.txt signature verified (un-pinned identity)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
local identity
|
||||
identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$"
|
||||
|
||||
COSIGN_YES=true cosign verify-blob \
|
||||
--certificate checksums.txt.cert \
|
||||
--signature checksums.txt.sig \
|
||||
--certificate-identity-regexp "$identity" \
|
||||
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
|
||||
checksums.txt >/dev/null
|
||||
|
||||
echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)"
|
||||
}
|
||||
|
||||
verify_hashes() {
|
||||
local failures=0
|
||||
|
||||
local has_sha256sum=false
|
||||
if command -v sha256sum >/dev/null 2>&1; then
|
||||
has_sha256sum=true
|
||||
fi
|
||||
|
||||
while IFS= read -r expected file; do
|
||||
[[ -z "${expected}" ]] && continue
|
||||
[[ -z "${file}" ]] && continue
|
||||
|
||||
if [[ ! -f "$file" ]]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
local actual
|
||||
if [[ "$has_sha256sum" == true ]]; then
|
||||
actual="$(sha256sum "$file" | awk '{print $1}')"
|
||||
else
|
||||
actual="$(shasum -a 256 "$file" | awk '{print $1}')"
|
||||
fi
|
||||
|
||||
if [[ "$actual" != "$expected" ]]; then
|
||||
echo "[fail] $file" >&2
|
||||
echo " expected: $expected" >&2
|
||||
echo " actual: $actual" >&2
|
||||
failures=$((failures+1))
|
||||
fi
|
||||
done < <(awk '{print $1, $2}' checksums.txt)
|
||||
|
||||
if [[ $failures -gt 0 ]]; then
|
||||
echo "[fail] checksum verification failed ($failures file(s))" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "[ok] all available artifacts match checksums.txt"
|
||||
}
|
||||
|
||||
if [[ "$has_cosign" == true ]]; then
|
||||
verify_sigstore
|
||||
else
|
||||
echo "[verify] cosign not installed; skipping signature verification" >&2
|
||||
fi
|
||||
|
||||
verify_hashes
|
||||
|
||||
echo "[ok] release verification complete"
|
||||
|
|
@ -5,6 +5,10 @@
|
|||
|
||||
set -euo pipefail
|
||||
|
||||
make_target_exists() {
|
||||
make -n "$1" >/dev/null 2>&1
|
||||
}
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
|
|
@ -45,7 +49,7 @@ show_status() {
|
|||
|
||||
# Check Go apps
|
||||
print_app "Go Applications:"
|
||||
local go_apps=("api-server" "worker" "tui" "data_manager" "user_manager")
|
||||
local go_apps=("api-server" "worker" "tui")
|
||||
for app in "${go_apps[@]}"; do
|
||||
if [[ -f "bin/$app" ]]; then
|
||||
echo " ✅ $app: Built"
|
||||
|
|
@ -85,7 +89,7 @@ show_status() {
|
|||
|
||||
# Check configuration
|
||||
print_app "Configuration:"
|
||||
if [[ -f "configs/config-local.yaml" ]]; then
|
||||
if [[ -f "configs/api/dev.yaml" ]]; then
|
||||
echo " ✅ Security config: Found"
|
||||
else
|
||||
echo " ⚠️ Security config: Not found"
|
||||
|
|
@ -110,14 +114,14 @@ build_all() {
|
|||
echo "============================="
|
||||
echo ""
|
||||
|
||||
print_info "Building Go applications..."
|
||||
make build
|
||||
|
||||
if command -v zig &> /dev/null; then
|
||||
print_info "Building Zig CLI..."
|
||||
make cli-build
|
||||
print_info "Building all components (Go + Zig CLI)..."
|
||||
make build
|
||||
else
|
||||
print_warning "Zig not found, skipping CLI build"
|
||||
print_warning "Zig not found, building Go components only"
|
||||
go build -o bin/api-server cmd/api-server/main.go
|
||||
go build -o bin/worker cmd/worker/worker_server.go
|
||||
go build -o bin/tui ./cmd/tui
|
||||
fi
|
||||
|
||||
print_success "Build completed!"
|
||||
|
|
@ -128,11 +132,13 @@ test_all() {
|
|||
echo "===================="
|
||||
echo ""
|
||||
|
||||
print_info "Running main test suite..."
|
||||
make test
|
||||
|
||||
print_info "Running comprehensive tests..."
|
||||
make test-all
|
||||
if make_target_exists test-full; then
|
||||
print_info "Running full test suite..."
|
||||
make test-full
|
||||
else
|
||||
print_info "Running test suite..."
|
||||
make test
|
||||
fi
|
||||
|
||||
print_success "All tests completed!"
|
||||
}
|
||||
|
|
@ -156,8 +162,8 @@ start_services() {
|
|||
# Start API server if built
|
||||
if [[ -f "bin/api-server" ]]; then
|
||||
print_info "Starting API server..."
|
||||
if [[ -f "configs/config-local.yaml" ]]; then
|
||||
./bin/api-server --config configs/config-local.yaml &
|
||||
if [[ -f "configs/api/dev.yaml" ]]; then
|
||||
./bin/api-server --config configs/api/dev.yaml &
|
||||
else
|
||||
print_warning "No config found, using defaults"
|
||||
./bin/api-server &
|
||||
|
|
@ -187,13 +193,25 @@ check_health() {
|
|||
print_info "Port 9101 is open, checking API health endpoint..."
|
||||
|
||||
# Try the health endpoint
|
||||
response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null)
|
||||
local api_key_header=""
|
||||
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
|
||||
api_key_header="-H X-API-Key: ${FETCH_ML_API_KEY}"
|
||||
fi
|
||||
|
||||
response=$(curl -s --max-time 3 ${api_key_header} http://localhost:9101/health 2>/dev/null || true)
|
||||
if [[ -z "$response" ]]; then
|
||||
response=$(curl -k -s --max-time 3 ${api_key_header} https://localhost:9101/health 2>/dev/null || true)
|
||||
fi
|
||||
|
||||
if [[ "$response" == "OK" ]]; then
|
||||
print_success "API is healthy: $response"
|
||||
elif [[ "$response" == *"IP not whitelisted"* ]]; then
|
||||
print_warning "API running but IP not whitelisted (expected behavior)"
|
||||
print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health"
|
||||
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
|
||||
print_info "Try: curl -k -H 'X-API-Key: $FETCH_ML_API_KEY' https://localhost:9101/health"
|
||||
else
|
||||
print_info "Try: curl -k https://localhost:9101/health"
|
||||
fi
|
||||
else
|
||||
print_error "Unexpected response: $response"
|
||||
fi
|
||||
|
|
@ -229,19 +247,36 @@ run_security() {
|
|||
case "${1:-check}" in
|
||||
"check")
|
||||
print_info "Running security checks..."
|
||||
make security-check
|
||||
if make_target_exists security-check; then
|
||||
make security-check
|
||||
else
|
||||
print_warning "No 'security-check' Make target found"
|
||||
print_info "Try: make ci-local"
|
||||
fi
|
||||
;;
|
||||
"monitor")
|
||||
print_info "Starting security monitoring..."
|
||||
make security-monitor
|
||||
if make_target_exists security-monitor; then
|
||||
make security-monitor
|
||||
else
|
||||
print_warning "No 'security-monitor' Make target found"
|
||||
fi
|
||||
;;
|
||||
"deploy")
|
||||
print_info "Deploying with security..."
|
||||
make security-deploy
|
||||
if make_target_exists security-deploy; then
|
||||
make security-deploy
|
||||
else
|
||||
print_warning "No 'security-deploy' Make target found"
|
||||
fi
|
||||
;;
|
||||
"audit")
|
||||
print_info "Running security audit..."
|
||||
make security-audit
|
||||
if make_target_exists security-audit; then
|
||||
make security-audit
|
||||
else
|
||||
print_warning "No 'security-audit' Make target found"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 security {check|monitor|deploy|audit}"
|
||||
|
|
@ -258,15 +293,22 @@ run_development() {
|
|||
case "${1:-setup}" in
|
||||
"setup")
|
||||
print_info "Setting up development environment..."
|
||||
./scripts/auto_setup.sh
|
||||
print_warning "Legacy setup scripts were removed; using Makefile/deployments instead"
|
||||
print_info "Try: make dev"
|
||||
print_info "Or: ./deployments/deploy.sh dev up"
|
||||
;;
|
||||
"quick")
|
||||
print_info "Running quick start..."
|
||||
./scripts/quick_start.sh
|
||||
print_warning "Legacy quick start script was removed; using deployments instead"
|
||||
print_info "Try: ./deployments/deploy.sh dev up"
|
||||
;;
|
||||
"deps")
|
||||
print_info "Installing dependencies..."
|
||||
make install-deps
|
||||
if make_target_exists install-deps; then
|
||||
make install-deps
|
||||
else
|
||||
print_warning "No 'install-deps' Make target found"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 dev {setup|quick|deps}"
|
||||
|
|
@ -309,7 +351,7 @@ cleanup() {
|
|||
echo ""
|
||||
|
||||
print_info "Cleaning project artifacts..."
|
||||
make clean-all
|
||||
make clean
|
||||
|
||||
print_info "Stopping services..."
|
||||
stop_services
|
||||
|
|
@ -330,7 +372,7 @@ show_help() {
|
|||
echo " start - Start all services"
|
||||
echo " stop - Stop all services"
|
||||
echo " health - Check API health endpoint"
|
||||
echo " security - Security management (check|monitor|deploy|audit)"
|
||||
echo " security - Security management (check|monitor|deploy|audit)"
|
||||
echo " dev - Development environment (setup|quick|deps)"
|
||||
echo " logs - Show application logs"
|
||||
echo " cleanup - Clean project artifacts and stop services"
|
||||
|
|
|
|||
|
|
@ -47,7 +47,10 @@ type Improvement struct {
|
|||
}
|
||||
|
||||
// NewPerformanceRegressionDetector creates a new detector instance
|
||||
func NewPerformanceRegressionDetector(baselineFile string, threshold float64) *PerformanceRegressionDetector {
|
||||
func NewPerformanceRegressionDetector(
|
||||
baselineFile string,
|
||||
threshold float64,
|
||||
) *PerformanceRegressionDetector {
|
||||
return &PerformanceRegressionDetector{
|
||||
BaselineFile: baselineFile,
|
||||
Threshold: threshold,
|
||||
|
|
@ -74,7 +77,9 @@ func (prd *PerformanceRegressionDetector) LoadBaseline() ([]BenchmarkResult, err
|
|||
}
|
||||
|
||||
// AnalyzeResults analyzes current results against baseline
|
||||
func (prd *PerformanceRegressionDetector) AnalyzeResults(current []BenchmarkResult) (*RegressionReport, error) {
|
||||
func (prd *PerformanceRegressionDetector) AnalyzeResults(
|
||||
current []BenchmarkResult,
|
||||
) (*RegressionReport, error) {
|
||||
baseline, err := prd.LoadBaseline()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load baseline: %w", err)
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue