chore(ops): reorganize deployments/monitoring and remove legacy scripts

This commit is contained in:
Jeremie Fraeys 2026-01-05 12:31:26 -05:00
parent 5ef24e4c6d
commit f726806770
101 changed files with 3598 additions and 4982 deletions

56
configs/api/dev.yaml Normal file
View file

@ -0,0 +1,56 @@
base_path: "/data/experiments"
data_dir: "/data/active"
auth:
enabled: false
server:
address: "0.0.0.0:9101"
tls:
enabled: false
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: false
allowed_origins:
- "http://localhost:3000"
api_key_rotation_days: 90
audit_logging:
enabled: true
log_path: "/tmp/fetchml-audit.log"
rate_limit:
enabled: false
requests_per_minute: 60
burst_size: 10
ip_whitelist: []
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
addr: "redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/tmp/fetchml.sqlite"
logging:
level: "info"
file: ""
audit_log: ""
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -0,0 +1,71 @@
base_path: "/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
homelab_admin:
hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY"
admin: true
roles:
- admin
permissions:
"*": true
homelab_user:
hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY"
admin: false
roles:
- researcher
permissions:
experiments: true
datasets: true
jupyter: true
server:
address: ":9101"
tls:
enabled: false
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: true
allowed_origins:
- "https://ml-experiments.example.com"
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "192.168.0.0/16"
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/fetch_ml.log"
audit_log: ""
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -0,0 +1,74 @@
base_path: "/app/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
admin_user:
hash: "CHANGE_ME_SHA256_ADMIN_USER_KEY"
admin: true
roles: ["user", "admin"]
permissions:
"*": true
researcher1:
hash: "CHANGE_ME_SHA256_RESEARCHER1_KEY"
admin: false
roles: ["user", "researcher"]
permissions:
"jobs:read": true
"jobs:create": true
"jobs:update": true
"jobs:delete": false
analyst1:
hash: "CHANGE_ME_SHA256_ANALYST1_KEY"
admin: false
roles: ["user", "analyst"]
permissions:
"jobs:read": true
"jobs:create": false
"jobs:update": false
"jobs:delete": false
server:
address: ":9101"
tls:
enabled: false
security:
production_mode: false
allowed_origins: []
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 20
ip_whitelist: []
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
url: "redis://redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/app.log"
audit_log: ""
resources:
max_workers: 3
desired_rps_per_worker: 3
podman_cpus: "2"
podman_memory: "4Gi"

59
configs/api/prod.yaml Normal file
View file

@ -0,0 +1,59 @@
base_path: "/app/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
admin:
hash: "replace-with-sha256-of-your-api-key"
admin: true
roles:
- admin
permissions:
"*": true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: false
allowed_origins: []
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist: []
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
addr: "redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/fetch_ml.log"
audit_log: ""
resources:
max_workers: 2
desired_rps_per_worker: 5
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -1,8 +0,0 @@
# Local development config (TOML)
# Used by both CLI and TUI when no overrides are set
worker_host = "127.0.0.1"
worker_user = "dev_user"
worker_base = "/tmp/ml-experiments"
worker_port = 9101
api_key = "your-api-key-here"

View file

@ -1,26 +0,0 @@
auth:
enabled: true
api_keys:
dev_user:
hash: "replace-with-sha256-of-your-api-key"
admin: true
roles:
- admin
permissions:
'*': true
server:
address: ":9101"
tls:
enabled: false
security:
rate_limit:
enabled: false
redis:
url: "redis://redis:6379"
logging:
level: info
console: true

View file

@ -1,17 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: false
server:
address: ":9101"
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.db"
redis:
url: "redis://redis:6379"
logging:
level: "debug"

View file

@ -1,46 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: true
api_keys:
homelab_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["user", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 30
ip_whitelist: []
# SQLite database for persistence
database:
type: "sqlite"
connection: "/app/data/fetch_ml.db"
redis:
url: "redis://redis:6379"
max_connections: 10
logging:
level: "info"
file: "/app/logs/app.log"
audit_file: "/app/logs/audit.log"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "8g"

View file

@ -1,39 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: true
api_keys:
homelab_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["user", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 30
ip_whitelist:
- "127.0.0.1"
- "::1"
- "192.168.0.0/16"
- "10.0.0.0/8"
redis:
url: "redis://redis:6379"
max_connections: 10
logging:
level: "info"
file: "/app/logs/app.log"
audit_file: "/app/logs/audit.log"

View file

@ -1,58 +0,0 @@
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!
redis:
url: "redis://redis:6379"
max_connections: 10
auth:
enabled: true
api_keys:
homelab_admin:
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
admin: true
roles:
- admin
permissions:
'*': true
homelab_user:
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
admin: false
roles:
- researcher
permissions:
'experiments': true
'datasets': true
'jupyter': true
server:
address: ":9101"
tls:
enabled: true
key_file: "/app/ssl/key.pem"
cert_file: "/app/ssl/cert.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist: []
logging:
level: "info"
file: "logs/fetch_ml.log"
console: true
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 0
disk_limit: "10Gi"
# Prometheus metrics
metrics:
enabled: true
listen_addr: ":9100"
tls:
enabled: false

View file

@ -1,49 +0,0 @@
redis:
url: "redis://redis:6379"
max_connections: 10
auth:
enabled: true
api_keys:
homelab_admin:
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
admin: true
roles:
- admin
permissions:
'*': true
homelab_user:
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
admin: false
roles:
- researcher
permissions:
'experiments': true
'datasets': true
'jupyter': true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "172.21.0.1" # Docker gateway
# Prometheus metrics
metrics:
enabled: true
listen_addr: ":9100"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"

View file

@ -1,78 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: true
api_keys:
admin_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["user", "admin"]
permissions:
read: true
write: true
delete: true
researcher1:
hash: "ef92b778ba7a6c8f2150019a5678047b6a9a2b95cef8189518f9b35c54d2e3ae" # "research123"
admin: false
roles: ["user", "researcher"]
permissions:
jobs:read: true
jobs:create: true
jobs:update: true
jobs:delete: false
analyst1:
hash: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" # "analyst123"
admin: false
roles: ["user", "analyst"]
permissions:
jobs:read: true
jobs:create: false
jobs:update: false
jobs:delete: false
server:
address: ":9101"
tls:
enabled: false
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 20
ip_whitelist: []
cors:
enabled: true
allowed_origins: ["https://localhost:9103", "https://localhost:3000"]
allowed_methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
allowed_headers: ["Content-Type", "Authorization"]
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.db"
max_connections: 20
connection_timeout: "30s"
redis:
url: "redis://redis:6379"
max_connections: 15
connection_timeout: "10s"
logging:
level: "info"
file: "/app/logs/app.log"
max_size: "100MB"
max_backups: 5
compress: true
resources:
max_workers: 3
desired_rps_per_worker: 3
podman_cpus: "2"
podman_memory: "4g"
job_timeout: "30m"
monitoring:
enabled: true
metrics_path: "/metrics"
health_check_interval: "30s"

View file

@ -1,59 +0,0 @@
base_path: "./data/ml-experiments"
auth:
enabled: true
apikeys:
homelab_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: false # Disabled for local testing
cert_file: "./ssl/cert.pem"
key_file: "./ssl/key.pem"
min_version: "1.3"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "localhost"
- "10.0.0.0/8"
- "192.168.0.0/16"
- "172.16.0.0/12"
failed_login_lockout:
enabled: true
max_attempts: 5
lockout_duration: "15m"
# SQLite database for production
database:
type: "sqlite"
connection: "data/fetch_ml.db"
redis:
url: "redis://localhost:6379"
addr: "localhost:6379"
password: "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
logging:
level: "info"
file: "logs/fetch_ml.log"
audit_log: "logs/audit.log"
resources:
max_workers: 2
desired_rps_per_worker: 5
podman_cpus: "8"
podman_memory: "32g"

View file

@ -1,13 +1,17 @@
# Fetch ML Configuration Example for PostgreSQL # Fetch ML Configuration Example for PostgreSQL
# This example shows how to configure Fetch ML to use PostgreSQL as the database # This example shows how to configure Fetch ML to use PostgreSQL as the database
base_path: "./data/experiments"
auth: auth:
enabled: true enabled: true
apikeys: api_keys:
admin: admin:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password" hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password"
admin: true admin: true
roles: ["admin"] roles: ["admin"]
permissions:
"*": true
server: server:
address: ":9101" address: ":9101"
@ -25,40 +29,34 @@ database:
# connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable" # connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
redis: redis:
host: "localhost" addr: "localhost:6379"
port: 6379
password: "" password: ""
db: 0 db: 0
pool_size: 10
max_retries: 3
logging: logging:
level: "info" level: "info"
console: true file: ""
format: "text" audit_log: ""
security: security:
secret_key: "your-secret-key-here-at-least-16-characters" production_mode: false
jwt_expiry: "24h"
rate_limit: rate_limit:
enabled: false enabled: false
requests_per_minute: 60 requests_per_minute: 60
burst_size: 10 burst_size: 10
ip_whitelist: []
containers: monitoring:
runtime: "podman" prometheus:
registry: "docker.io"
pull_policy: "missing"
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 1
storage:
data_path: "data"
results_path: "results"
temp_path: "/tmp/fetch_ml"
cleanup:
enabled: true enabled: true
max_age_hours: 168 port: 9101
max_size_gb: 10 path: "/metrics"
health_checks:
enabled: true
interval: "30s"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -1,6 +1,8 @@
# Fetch ML Configuration Example # Fetch ML Configuration Example
# Copy this file to config.yaml and customize for your environment # Copy this file to config.yaml and customize for your environment
base_path: "./data/experiments"
auth: auth:
enabled: true enabled: true
api_keys: api_keys:
@ -13,54 +15,43 @@ auth:
"*": true "*": true
server: server:
host: "localhost" address: ":9101"
port: 8080 tls:
enabled: false
database: database:
type: "sqlite" type: "sqlite"
connection: "data/fetch_ml.db" connection: "data/fetch_ml.db"
host: ""
port: 5432
username: ""
password: ""
database: "fetch_ml"
redis: redis:
url: "redis://localhost:6379" addr: "localhost:6379"
host: "localhost"
port: 6379
password: "" password: ""
db: 0 db: 0
pool_size: 10
max_retries: 3
logging: logging:
level: "info" level: "info"
file: "logs/fetch_ml.log" file: "logs/fetch_ml.log"
format: "text" audit_log: "logs/audit.log"
console: true
security: security:
secret_key: "your-secret-key-at-least-16-chars"
jwt_expiry: "24h"
rate_limit: rate_limit:
enabled: false enabled: false
requests_per_minute: 60 requests_per_minute: 60
burst_size: 10
ip_whitelist: []
production_mode: false
containers: monitoring:
runtime: "podman" prometheus:
registry: "docker.io"
pull_policy: "missing"
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 1
storage:
data_path: "data"
results_path: "results"
temp_path: "/tmp/fetch_ml"
cleanup:
enabled: true enabled: true
max_age_hours: 168 port: 9101
max_size_gb: 10 path: "/metrics"
health_checks:
enabled: true
interval: "30s"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -12,6 +12,10 @@ properties:
type: string type: string
description: Base path for experiment data description: Base path for experiment data
default: "/tmp/ml-experiments" default: "/tmp/ml-experiments"
data_dir:
type: string
description: Data directory (datasets/snapshots) for integrity validation
default: "/data/active"
auth: auth:
type: object type: object
additionalProperties: false additionalProperties: false
@ -40,7 +44,6 @@ properties:
type: array type: array
items: items:
type: string type: string
enum: [admin, data_scientist, data_engineer, viewer, operator]
permissions: permissions:
type: object type: object
additionalProperties: additionalProperties:
@ -64,9 +67,30 @@ properties:
type: string type: string
key_file: key_file:
type: string type: string
min_version: monitoring:
type: object
additionalProperties: false
properties:
prometheus:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
port:
type: integer
minimum: 1
maximum: 65535
path:
type: string
health_checks:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
interval:
type: string type: string
description: Minimum TLS version (e.g. "1.3")
database: database:
type: object type: object
additionalProperties: false additionalProperties: false
@ -99,58 +123,56 @@ properties:
addr: addr:
type: string type: string
description: Optional host:port shorthand for Redis description: Optional host:port shorthand for Redis
host:
type: string
default: "localhost"
port:
type: integer
minimum: 1
maximum: 65535
default: 6379
password: password:
type: string type: string
db: db:
type: integer type: integer
minimum: 0 minimum: 0
default: 0 default: 0
pool_size: queue:
type: integer type: object
minimum: 1 additionalProperties: false
default: 10 properties:
max_retries: backend:
type: integer type: string
minimum: 0 enum: [redis, sqlite]
default: 3 default: redis
sqlite_path:
type: string
logging: logging:
type: object type: object
additionalProperties: false additionalProperties: false
properties: properties:
level: level:
type: string type: string
enum: [debug, info, warn, error, fatal] enum: [debug, info, warn, error]
default: "info" default: "info"
file: file:
type: string type: string
audit_log: audit_log:
type: string type: string
format:
type: string
enum: [text, json]
default: "text"
console:
type: boolean
default: true
security: security:
type: object type: object
additionalProperties: false additionalProperties: false
properties: properties:
secret_key: production_mode:
type: string type: boolean
minLength: 16 default: false
jwt_expiry: allowed_origins:
type: string type: array
pattern: "^\\d+[smhd]$" items:
default: "24h" type: string
api_key_rotation_days:
type: integer
minimum: 0
audit_logging:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
log_path:
type: string
ip_whitelist: ip_whitelist:
type: array type: array
items: items:
@ -183,23 +205,23 @@ properties:
minimum: 1 minimum: 1
resources: resources:
type: object type: object
description: Resource configuration defaults description: Resource configuration
additionalProperties: false additionalProperties: false
properties: properties:
cpu_limit: max_workers:
type: string
description: Default CPU limit (e.g., "2" or "500m")
default: "2"
memory_limit:
type: string
description: Default memory limit (e.g., "1Gi" or "512Mi")
default: "4Gi"
gpu_limit:
type: integer type: integer
description: Default GPU limit minimum: 1
minimum: 0 default: 1
default: 0 desired_rps_per_worker:
disk_limit: type: integer
minimum: 1
requests_per_sec:
type: integer
minimum: 1
podman_cpus:
type: string type: string
description: Default disk limit podman_memory:
default: "10Gi" type: string
request_burst:
type: integer
minimum: 0

View file

@ -2,10 +2,28 @@ $schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML Worker Configuration" title: "Fetch ML Worker Configuration"
type: object type: object
additionalProperties: false additionalProperties: false
allOf:
# forbid both index and UUID at once (allow zero or one)
- not:
required: [gpu_visible_devices, gpu_visible_device_ids]
- if:
properties:
queue:
properties:
backend:
const: sqlite
required: [queue]
then:
properties:
queue:
required: [sqlite_path]
else:
anyOf:
- required: [redis_addr]
- required: [redis_url]
required: required:
- base_path - base_path
- worker_id - worker_id
- redis_addr
- podman_image - podman_image
- container_workspace - container_workspace
- container_results - container_results
@ -31,6 +49,9 @@ properties:
train_script: train_script:
type: string type: string
description: Path to training script description: Path to training script
redis_url:
type: string
description: Legacy Redis URL (if set, redis_addr/password/db are derived)
redis_addr: redis_addr:
type: string type: string
description: Redis server address description: Redis server address
@ -42,6 +63,18 @@ properties:
minimum: 0 minimum: 0
default: 0 default: 0
description: Redis database number description: Redis database number
queue:
type: object
description: Queue backend configuration (optional; defaults to redis)
additionalProperties: false
properties:
backend:
type: string
enum: [redis, sqlite]
default: redis
sqlite_path:
type: string
description: Path to queue.db (sqlite backend only)
known_hosts: known_hosts:
type: string type: string
description: Path to SSH known hosts file description: Path to SSH known hosts file
@ -116,6 +149,48 @@ properties:
type: string type: string
description: Dataset cache TTL duration description: Dataset cache TTL duration
default: "30m" default: "30m"
snapshot_store:
type: object
description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id)
additionalProperties: false
properties:
enabled:
type: boolean
default: false
endpoint:
type: string
description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
secure:
type: boolean
default: true
region:
type: string
bucket:
type: string
prefix:
type: string
description: Object key prefix where snapshots are stored
access_key:
type: string
description: Optional static access key (otherwise uses env credentials)
secret_key:
type: string
description: Optional static secret key (otherwise uses env credentials)
session_token:
type: string
description: Optional session token for temporary credentials
timeout:
type: string
description: Duration string (e.g., "10m")
default: "10m"
max_retries:
type: integer
minimum: 0
default: 3
prewarm_enabled:
type: boolean
description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off.
default: false
podman_image: podman_image:
type: string type: string
minLength: 1 minLength: 1
@ -126,10 +201,40 @@ properties:
container_results: container_results:
type: string type: string
description: Container results path description: Container results path
gpu_access: gpu_devices:
type: boolean type: array
default: false description: GPU device paths to expose to the container (e.g. ["/dev/dri"]).
description: Enable GPU access items:
type: string
gpu_vendor:
type: string
enum: [nvidia, amd, apple, none]
description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none).
default: "none"
gpu_visible_devices:
type: array
description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
items:
type: integer
gpu_visible_device_ids:
type: array
description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
items:
type: string
apple_gpu:
type: object
description: Apple M-series GPU configuration
additionalProperties: false
properties:
enabled:
type: boolean
default: false
metal_device:
type: string
description: Path to Metal device node (e.g. /dev/metal)
mps_runtime:
type: string
description: Path to MPS runtime device node (e.g. /dev/mps)
task_lease_duration: task_lease_duration:
type: string type: string
description: Task lease duration description: Task lease duration

View file

@ -0,0 +1,58 @@
worker_id: "docker-worker"
base_path: "/data/experiments"
train_script: "train.py"
redis_url: "redis://redis:6379/0"
local_mode: true
prewarm_enabled: true
max_workers: 1
poll_interval_seconds: 2
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "2m"
max_retries: 3
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_devices:
- "/dev/dri"
gpu_vendor: "apple"
gpu_visible_devices: []
# Apple M-series GPU configuration
apple_gpu:
enabled: true
metal_device: "/dev/metal"
mps_runtime: "/dev/mps"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -0,0 +1,50 @@
worker_id: "docker-worker"
base_path: "/tmp/fetchml-jobs"
train_script: "train.py"
redis_url: "redis://redis:6379/0"
local_mode: true
max_workers: 1
poll_interval_seconds: 2
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "5m"
max_retries: 3
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_vendor: "nvidia"
gpu_visible_devices: [0]
gpu_devices: ["/dev/nvidia0"]
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -0,0 +1,43 @@
worker_id: "docker-worker"
base_path: "/tmp/fetchml-jobs"
train_script: "train.py"
redis_addr: "redis:6379"
redis_password: ""
redis_db: 0
local_mode: true
max_workers: 1
poll_interval_seconds: 5
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
snapshot_store:
enabled: false
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_devices: []
gpu_vendor: "none"
gpu_visible_devices: []
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -0,0 +1,27 @@
worker_id: "test-prewarm-worker"
host: "localhost"
port: 8081
base_path: "/tmp/fetch-ml-test"
data_dir: "/tmp/fetch-ml-test/data"
max_workers: 2
local_mode: true
auto_fetch_data: true
prewarm_enabled: true
metrics:
enabled: true
listen_addr: ":9102"
train_script: "train.py"
snapshot_store:
enabled: false
endpoint: ""
secure: false
region: ""
bucket: ""
prefix: ""
access_key: ""
secret_key: ""
session_token: ""
max_retries: 3
  timeout: "0s"
gpu_devices: []
gpu_vendor: "none"

View file

@ -0,0 +1,47 @@
worker_id: "homelab-worker"
base_path: "/tmp/fetchml-jobs"
train_script: "train.py"
redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0"
local_mode: true
max_workers: 1
poll_interval_seconds: 2
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "5m"
max_retries: 3
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_devices: []
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -1,51 +0,0 @@
# Worker configuration for Docker production-like testing
worker_id: "docker-test-worker-1"
# Redis configuration
redis:
url: "redis://redis:6379"
max_connections: 10
# Local mode settings
local_mode: false # Use Podman for containerized job execution
# Job paths
base_path: "/tmp/fetchml-jobs"
# Container workspace (not used in local mode)
container_workspace: "/workspace"
container_results: "/results"
# Podman settings (not used in local mode)
podman_image: "python:3.9-slim"
podman_cpus: "2"
podman_memory: "4g"
# Worker configuration
heartbeat_interval: "30s"
lease_duration: "5m"
max_concurrent_tasks: 1
# Data manager settings
data_manager:
enabled: false
base_path: "/data"
# SSH settings for Podman communication
ssh:
enabled: true
host: "localhost"
port: 2222
user: "worker"
password: "SecureWorkerPass2024!"
key_path: "/home/worker/.ssh/id_rsa"
# Logging
logging:
level: "info"
file: "/logs/worker.log"
# Metrics
metrics:
enabled: true
endpoint: ":9100"

View file

@ -1,79 +0,0 @@
# Worker configuration for Homelab secure environment
worker_id: "homelab-secure-worker-1"
# Redis configuration with connection pooling
redis:
url: "redis://redis:6379"
max_connections: 10
connection_timeout: "10s"
read_timeout: "5s"
write_timeout: "5s"
# Local mode disabled for containerized execution
local_mode: false
# Job paths with security considerations
base_path: "/tmp/fetchml-jobs"
container_workspace: "/workspace"
container_results: "/results"
# Podman settings with resource limits
podman_image: "python:3.11-slim"
podman_cpus: "2"
podman_memory: "4g"
podman_network: "ml-job-network"
podman_timeout: "30m"
# Worker configuration with security
heartbeat_interval: "30s"
lease_duration: "5m"
max_concurrent_tasks: 2
task_timeout: "30m"
# Data manager settings
data_manager:
enabled: true
base_path: "/data"
encryption_enabled: true
backup_enabled: true
# SSH settings with secure configuration
ssh:
enabled: true
host: "localhost"
port: 2222
user: "worker"
password: "HomelabWorker2024!"
key_path: "/home/worker/.ssh/id_rsa"
max_retries: 3
connection_timeout: "30s"
strict_host_key_checking: false
# Logging with rotation and security
logging:
level: "info"
file: "/logs/worker.log"
max_size: "50MB"
max_backups: 5
compress: true
audit_enabled: true
# Metrics and monitoring
metrics:
enabled: true
endpoint: ":9100"
path: "/metrics"
# Security settings
security:
enable_job_isolation: true
sandbox_enabled: true
resource_monitoring: true
audit_commands: true
# Health check configuration
health_check:
enabled: true
interval: "30s"
timeout: "10s"
failure_threshold: 3

View file

@ -4,7 +4,7 @@ max_workers = 4
# Redis connection # Redis connection
redis_addr = "localhost:6379" redis_addr = "localhost:6379"
redis_password = "your-redis-password" redis_password = "CHANGE_ME_REDIS_PASSWORD"
redis_db = 0 redis_db = 0
# SSH connection (for remote operations) # SSH connection (for remote operations)
@ -15,17 +15,13 @@ ssh_key = "~/.ssh/id_rsa"
# Podman configuration # Podman configuration
podman_image = "ml-training:latest" podman_image = "ml-training:latest"
gpu_access = true gpu_vendor = "none"
gpu_visible_devices = []
gpu_devices = []
container_workspace = "/workspace" container_workspace = "/workspace"
container_results = "/results" container_results = "/results"
train_script = "train.py" train_script = "train.py"
[resources]
max_workers = 4
desired_rps_per_worker = 2
podman_cpus = "4"
podman_memory = "16g"
# Dataset management # Dataset management
auto_fetch_data = true auto_fetch_data = true
data_dir = "/data/datasets" data_dir = "/data/datasets"
@ -36,10 +32,16 @@ dataset_cache_ttl = "24h"
task_lease_duration = "1h" task_lease_duration = "1h"
heartbeat_interval = "30s" heartbeat_interval = "30s"
graceful_timeout = "5m" graceful_timeout = "5m"
poll_interval = "100ms" poll_interval_seconds = 1
metrics_flush_interval = "10s" metrics_flush_interval = "10s"
[resources]
max_workers = 4
desired_rps_per_worker = 2
podman_cpus = "4"
podman_memory = "16g"
# Metrics exporter # Metrics exporter
[metrics] [metrics]
enabled = true enabled = true
listen_addr = ":9090" listen_addr = ":9100"

45
deployments/Caddyfile.dev Normal file
View file

@ -0,0 +1,45 @@
{
auto_https off
admin off
servers {
protocols h1 h2
}
}
http://localhost {
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
handle {
respond 404
}
}
https://localhost {
tls internal
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
handle {
respond 404
}
}

View file

@ -0,0 +1,44 @@
{
admin off
servers {
protocols h1 h2
}
}
{$FETCHML_DOMAIN} {
encode gzip
tls /etc/caddy/ssl/cert.pem /etc/caddy/ssl/key.pem
header {
-Server
X-Frame-Options "DENY"
X-Content-Type-Options "nosniff"
Referrer-Policy "strict-origin-when-cross-origin"
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
}
@admin path /admin/*
@admin_private remote_ip private_ranges
handle @admin {
respond @admin_private 404
respond 404
}
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
handle {
respond 404
}
}

View file

@ -0,0 +1,47 @@
# Global options for the production proxy.
{
	# ACME account email used for automatic certificate issuance.
	email {$CADDY_EMAIL}
	admin off
	servers {
		protocols h1 h2
	}
}
# Production site (FETCHML_DOMAIN environment variable).
{$FETCHML_DOMAIN} {
	encode gzip
	# Cap request bodies to protect the API from oversized uploads.
	request_body {
		max_size 10MB
	}
	# Hardening headers; -Server strips the Server response banner.
	header {
		-Server
		X-Frame-Options "DENY"
		X-Content-Type-Options "nosniff"
		Referrer-Policy "strict-origin-when-cross-origin"
		Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
		Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
	}
	@admin path /admin/*
	@admin_private remote_ip private_ranges
	handle @admin {
		# NOTE(review): both respond directives return 404, so the
		# @admin_private matcher is currently a no-op -- /admin/* is hidden
		# from everyone. Confirm whether private-range clients were meant
		# to be proxied through instead.
		respond @admin_private 404
		respond 404
	}
	handle /health {
		reverse_proxy api-server:9101
	}
	# TLS/WSS terminates here; the API server stays on internal ws://.
	handle /ws* {
		reverse_proxy api-server:9101
	}
	handle /api/* {
		reverse_proxy api-server:9101
	}
	# Anything not matched above is not served.
	handle {
		respond 404
	}
}

View file

@ -0,0 +1,23 @@
# Minimal Caddyfile used by the smoke test: HTTPS on localhost with an
# internally-issued certificate, proxying the API paths to api-server.
{
	# NOTE(review): auto_https off disables redirects/auto-provisioning;
	# the explicit `tls internal` below presumably still enables HTTPS for
	# the localhost site -- confirm intended behavior.
	auto_https off
}
localhost {
	# Self-signed certificate from Caddy's internal CA.
	tls internal
	handle /health {
		reverse_proxy api-server:9101
	}
	handle /ws* {
		reverse_proxy api-server:9101
	}
	handle /api/* {
		reverse_proxy api-server:9101
	}
	# Anything not matched above is not served.
	handle {
		respond 404
	}
}

76
deployments/Makefile Normal file
View file

@ -0,0 +1,76 @@
# Docker Compose Deployment Management

# Every target in this file is a command, not a file. The alias targets
# (up/down/logs/restart) are listed too: without them, a stray file named
# e.g. "up" in the repo root would shadow the target and make would say
# "'up' is up to date".
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean up down logs restart

# Default target
help: ## Show this help message
	@echo "Available commands:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

# Development environment
dev-up: ## Start development environment
	@echo "Starting development environment..."
	docker-compose -f deployments/docker-compose.dev.yml up -d
	@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"

dev-down: ## Stop development environment
	@echo "Stopping development environment..."
	docker-compose -f deployments/docker-compose.dev.yml down

dev-logs: ## Show development logs
	docker-compose -f deployments/docker-compose.dev.yml logs -f

dev-restart: ## Restart development environment
	@echo "Restarting development environment..."
	docker-compose -f deployments/docker-compose.dev.yml restart

# Homelab environment
homelab-secure-up: ## Start secure homelab environment
	@echo "Starting secure homelab environment..."
	docker-compose -f deployments/docker-compose.homelab-secure.yml up -d

homelab-secure-down: ## Stop secure homelab environment
	@echo "Stopping secure homelab environment..."
	docker-compose -f deployments/docker-compose.homelab-secure.yml down

# Production environment
prod-up: ## Start production environment
	@echo "Starting production environment..."
	docker-compose -f deployments/docker-compose.prod.yml up -d

prod-down: ## Stop production environment
	@echo "Stopping production environment..."
	docker-compose -f deployments/docker-compose.prod.yml down

# Utility commands
status: ## Show status of all environments
	@echo "=== Development Status ==="
	@if [ -f deployments/docker-compose.dev.yml ]; then \
		docker-compose -f deployments/docker-compose.dev.yml ps; \
	fi
	@echo ""
	@echo "=== Homelab Secure Status ==="
	@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
		docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
	fi
	@echo ""
	@echo "=== Production Status ==="
	@if [ -f deployments/docker-compose.prod.yml ]; then \
		docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
	fi

clean: ## Clean up all containers and volumes
	@echo "Cleaning up all Docker resources..."
	@echo "This will remove all containers and volumes. Continue? [y/N]"
	@read -r confirm && [ "$$confirm" = "y" ] || exit 1
	docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
	docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
	docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
	docker system prune -f
	@echo "Cleanup complete."

# Quick aliases
up: dev-up ## Alias for dev-up
down: dev-down ## Alias for dev-down
logs: dev-logs ## Alias for dev-logs
restart: dev-restart ## Alias for dev-restart

View file

@ -2,33 +2,123 @@
This directory contains Docker Compose configurations for different deployment environments. This directory contains Docker Compose configurations for different deployment environments.
## Environment Configurations

### Development (`docker-compose.dev.yml`)
- Full development stack with monitoring
- Includes: API, Worker, Redis, MinIO (snapshots), Prometheus, Grafana, Loki, Promtail
- Optimized for local development and testing
- **Usage**: `docker-compose -f deployments/docker-compose.dev.yml up -d`
## Usage ### Homelab - Secure (`docker-compose.homelab-secure.yml`)
- Secure homelab deployment with authentication and a Caddy reverse proxy
- TLS is terminated at the reverse proxy (Approach A)
- Includes: API, Redis (password protected), Caddy reverse proxy
- **Usage**: `docker-compose -f deployments/docker-compose.homelab-secure.yml up -d`
### Production (`docker-compose.prod.yml`)
- Production deployment configuration
- Optimized for performance and security
- External services assumed (Redis, monitoring)
- **Usage**: `docker-compose -f deployments/docker-compose.prod.yml up -d`
Note: `docker-compose.prod.yml` is a reproducible staging/testing harness. Real production deployments do not require Docker; you can run the Go services directly (systemd) and use Caddy for TLS/WSS termination.
## TLS / WSS Policy
- The Zig CLI currently supports `ws://` only (native `wss://` is not implemented).
- Production deployments terminate TLS/WSS at a reverse proxy (Caddy in `docker-compose.prod.yml`) and keep the API server on internal `ws://`.
- Homelab deployments terminate TLS/WSS at a reverse proxy (Caddy) and keep the API server on internal `ws://`.
- Health checks in compose files should use `http://localhost:9101/health` when `server.tls.enabled: false`.
## Required Volume Mounts
- `base_path` (experiments) must be writable by the API server.
- `data_dir` should be mounted if you want snapshot/dataset integrity validation via `ml validate`.
For the default configs:
- `base_path`: `/data/experiments` (dev/homelab configs) or `/app/data/experiments` (prod configs)
- `data_dir`: `/data/active`
## Quick Start
### Development
```bash ```bash
# Use the main docker-compose.yml in project root # Development (most common)
docker-compose up -d docker-compose -f deployments/docker-compose.dev.yml up -d
# Check status
docker-compose -f deployments/docker-compose.dev.yml ps
# View logs
docker-compose -f deployments/docker-compose.dev.yml logs -f api-server
# Stop services
docker-compose -f deployments/docker-compose.dev.yml down
``` ```
### Homelab (Secure) ## Dev: MinIO-backed snapshots (smoke test)
```bash
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
```
### Production The dev compose file provisions a MinIO bucket and uploads a small example snapshot object at:
```bash
docker-compose -f deployments/docker-compose.prod.yml up -d `s3://fetchml-snapshots/snapshots/snap-1.tar.gz`
```
To queue a task that forces the worker to pull the snapshot from MinIO:
1. Start the dev stack:
`docker-compose -f deployments/docker-compose.dev.yml up -d`
2. Read the `snapshot_sha256` printed by the init job:
`docker-compose -f deployments/docker-compose.dev.yml logs minio-init`
3. Queue a job using the snapshot fields:
`ml queue <job-name> --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
## Smoke tests
- `make dev-smoke` runs the development stack smoke test.
- `make prod-smoke` runs a Docker-based staging smoke test for the production stack, using a localhost-only Caddy configuration.
Note: `ml queue` by itself will generate a random commit ID. For full provenance enforcement (manifest + dependency manifest), use `ml sync ./your-project --queue` so the server has real code + dependency files.
Examples:
- `ml queue train-mnist --priority 3 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
- `ml queue train-a train-b train-c --priority 5 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
## Environment Variables

Create a `.env` file in the project root:
```bash
# Grafana
GRAFANA_ADMIN_PASSWORD=your_secure_password
# API Configuration
LOG_LEVEL=info
# TLS (for secure deployments)
TLS_CERT_PATH=/app/ssl/cert.pem
TLS_KEY_PATH=/app/ssl/key.pem
```
## Service Ports
| Service | Development | Homelab | Production |
|---------|-------------|---------|------------|
| API Server | 9101 | 9101 | 9101 |
| Redis | 6379 | 6379 | - |
| Prometheus | 9090 | - | - |
| Grafana | 3000 | - | - |
| Loki | 3100 | - | - |
## Monitoring ## Monitoring
Performance monitoring configurations are in `monitoring/docker-compose.performance.yml` - **Development**: Full monitoring stack included
- **Homelab**: Basic monitoring (configurable)
- **Production**: External monitoring assumed
## Security Notes
- If you need HTTPS externally, terminate TLS at a reverse proxy.
- API keys should be managed via environment variables
- Database credentials should use secrets management in production

162
deployments/deploy.sh Executable file
View file

@ -0,0 +1,162 @@
#!/bin/bash
# Quick deployment script for fetch_ml
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored output
# Emit an informational line tagged [INFO] in blue.
print_status() {
    local msg="${BLUE}[INFO]${NC} $1"
    echo -e "$msg"
}
# Emit a success line tagged [SUCCESS] in green.
print_success() {
    local msg="${GREEN}[SUCCESS]${NC} $1"
    echo -e "$msg"
}
# Emit a warning line tagged [WARNING] in yellow.
# Writes to stderr so warnings stay visible even when a caller's stdout
# is being captured by command substitution (see check_compose_file).
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}
# Emit an error line tagged [ERROR] in red.
# Writes to stderr: main() calls `compose_file=$(check_compose_file ...)`,
# and with the original stdout echo the error text was captured into the
# variable instead of being shown to the user.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Function to show usage
# Function to show usage
# Prints the accepted environments (dev|secure|prod) and actions
# (up|down|restart|logs|status), plus invocation examples.
show_usage() {
    echo "Usage: $0 [ENVIRONMENT] [ACTION]"
    echo ""
    echo "Environments:"
    echo "  dev     Development environment"
    echo "  secure  Secure homelab environment"
    echo "  prod    Production environment"
    echo ""
    echo "Actions:"
    echo "  up      Start services"
    echo "  down    Stop services"
    echo "  restart Restart services"
    echo "  logs    Show logs"
    echo "  status  Show status"
    echo ""
    echo "Examples:"
    echo "  $0 dev up       # Start development environment"
    echo "  $0 prod down    # Stop production environment"
    echo "  $0 secure logs  # Show secure environment logs"
}
# Function to check if docker-compose file exists
# Map an environment name (dev|secure|prod) to its docker-compose file and
# verify the file exists.
# Prints the compose file path on stdout (the ONLY stdout output, since
# main() captures it with $(...)); all diagnostics are redirected to
# stderr so they are not swallowed by the command substitution.
check_compose_file() {
    local env=$1
    local compose_file=""

    case $env in
        "dev")
            compose_file="deployments/docker-compose.dev.yml"
            ;;
        "secure")
            compose_file="deployments/docker-compose.homelab-secure.yml"
            ;;
        "prod")
            compose_file="deployments/docker-compose.prod.yml"
            ;;
        *)
            print_error "Unknown environment: $env" >&2
            show_usage >&2
            exit 1
            ;;
    esac

    if [ ! -f "$compose_file" ]; then
        print_error "Docker Compose file not found: $compose_file" >&2
        exit 1
    fi

    echo "$compose_file"
}
# Function to check if .env file exists
# Ensure a .env file exists in the working directory, seeding it from the
# matching example file when missing.
# $1: environment name; "prod" uses env.prod.example, everything else
#     falls back to env.dev.example (same mapping as before).
# Fails with a clear message if the example file itself is absent, instead
# of letting `cp` abort cryptically under `set -e`.
check_env_file() {
    local env=$1
    local example

    # Nothing to do if the user already has a .env.
    if [ -f ".env" ]; then
        return 0
    fi

    case $env in
        "prod") example="deployments/env.prod.example" ;;
        *)      example="deployments/env.dev.example" ;;
    esac

    if [ ! -f "$example" ]; then
        print_error "Example env file not found: $example" >&2
        exit 1
    fi

    print_warning ".env file not found. Creating from example..."
    cp "$example" .env
    print_warning "Please edit .env file with your configuration"
}
# Main script
# Entry point: requires exactly two arguments (ENVIRONMENT ACTION).
# Resolves the compose file for the environment, ensures a .env exists,
# then dispatches the requested docker-compose action.
main() {
    if [ $# -ne 2 ]; then
        show_usage
        exit 1
    fi

    local environment=$1
    local action=$2

    print_status "Environment: $environment"
    print_status "Action: $action"

    # Check compose file (path is echoed on stdout by check_compose_file)
    compose_file=$(check_compose_file "$environment")
    print_status "Using: $compose_file"

    # Check .env file (seeds one from the example if missing)
    check_env_file "$environment"

    # Execute action
    case $action in
        "up")
            print_status "Starting $environment environment..."
            docker-compose -f "$compose_file" up -d
            print_success "$environment environment started successfully!"

            # Show service URLs
            echo ""
            print_status "Service URLs:"
            echo "  API Server: http://localhost:9101"
            # Dev is the only stack that bundles Grafana/Prometheus.
            if [ "$environment" = "dev" ]; then
                echo "  Grafana: http://localhost:3000 (admin/admin123)"
                echo "  Prometheus: http://localhost:9090"
            fi
            ;;
        "down")
            print_status "Stopping $environment environment..."
            docker-compose -f "$compose_file" down
            print_success "$environment environment stopped successfully!"
            ;;
        "restart")
            print_status "Restarting $environment environment..."
            docker-compose -f "$compose_file" restart
            print_success "$environment environment restarted successfully!"
            ;;
        "logs")
            # Follows logs until interrupted (docker-compose logs -f).
            print_status "Showing logs for $environment environment..."
            docker-compose -f "$compose_file" logs -f
            ;;
        "status")
            print_status "Status of $environment environment:"
            docker-compose -f "$compose_file" ps
            ;;
        *)
            print_error "Unknown action: $action"
            show_usage
            exit 1
            ;;
    esac
}
# Run main function
main "$@"

View file

@ -0,0 +1,225 @@
# Homelab Docker Compose with Centralized Monitoring
# Includes: API, Redis, Prometheus, Grafana, Loki
services:
caddy:
image: caddy:2-alpine
container_name: ml-dev-caddy
restart: unless-stopped
ports:
- "8080:80"
- "8443:443"
volumes:
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config
depends_on:
api-server:
condition: service_healthy
redis:
image: redis:7-alpine
container_name: ml-experiments-redis
user: "999:999"
ports:
- "6379:6379"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data
restart: unless-stopped
command: redis-server --appendonly yes
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
interval: 30s
timeout: 10s
retries: 3
  # FetchML API server (dev). Only exposed on the internal network;
  # external traffic reaches it through the Caddy reverse proxy.
  api-server:
    build:
      context: ${FETCHML_REPO_ROOT:-.}
      dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
    container_name: ml-experiments-api
    # Runs as root so the startup command can create the data directories.
    user: "0:0"
    expose:
      - "9101" # API and health endpoints (internal; external access via Caddy)
    volumes:
      - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
      - ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
      - ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
      - ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
      - ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml
      - ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl
    # NOTE(review): plain depends_on does not wait for redis to be healthy
    # (the worker service uses service_healthy) -- confirm the API server
    # tolerates redis starting late.
    depends_on:
      - redis
    restart: unless-stopped
    # Create the expected data dirs, then exec the server binary.
    command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
    environment:
      - LOG_LEVEL=info
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    # Labels consumed by promtail for log scraping.
    labels:
      logging: "promtail"
      job: "api-server"
minio:
image: minio/minio:latest
container_name: ml-experiments-minio
ports:
- "9000:9000"
- "9001:9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data
environment:
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin123
command: ["server", "/data", "--console-address", ":9001"]
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 10s
timeout: 5s
retries: 10
restart: unless-stopped
  # One-shot bootstrap job: installs the MinIO client (mc), waits for the
  # minio service, creates the fetchml-snapshots bucket, and uploads a
  # small example snapshot object (snap-1) for dev smoke tests. It prints
  # "snapshot_id=... snapshot_sha256=..." on success; the digest it prints
  # is sha256(sha256(hello.txt)) as computed in the script below.
  minio-init:
    image: alpine:3.19
    container_name: ml-experiments-minio-init
    depends_on:
      minio:
        condition: service_healthy
    entrypoint: ["/bin/sh", "-c"]
    # The block scalar below is the literal shell program passed to sh -c;
    # $$ is compose's escape for a literal $.
    command:
      - |
        set -eu
        apk add --no-cache ca-certificates curl tar gzip
        ARCH=$$(uname -m)
        MC_ARCH=amd64
        if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
          MC_ARCH=arm64
        fi
        curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
        chmod +x /usr/local/bin/mc
        i=0
        while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
          i=$$((i+1))
          if [ $$i -ge 30 ]; then
            echo "minio not ready after 30 attempts" >&2
            exit 1
          fi
          echo "waiting for minio... ($$i/30)"
          sleep 1
        done
        mc mb -p local/fetchml-snapshots || true
        mkdir -p /tmp/snapshots/snap-1
        echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
        tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
        mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
        FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
        SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
        echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
    # Run once; do not restart after the job completes.
    restart: "no"
worker:
build:
context: ${FETCHML_REPO_ROOT:-.}
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-experiments-worker
user: "0:0"
ports:
- "8888:8888"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
- /sys/fs/cgroup:/sys/fs/cgroup:rw
depends_on:
redis:
condition: service_healthy
api-server:
condition: service_healthy
minio-init:
condition: service_completed_successfully
restart: unless-stopped
environment:
- LOG_LEVEL=info
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin123
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest
- FETCHML_JUPYTER_CONDA_ENV=base
- FETCHML_JUPYTER_KERNEL_NAME=python
- FETCHML_PODMAN_CGROUPS=disabled
privileged: true
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
# Prometheus - Metrics collection
prometheus:
image: prom/prometheus:latest
container_name: ml-experiments-prometheus
ports:
- "9090:9090"
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# Grafana - Visualization
grafana:
image: grafana/grafana:latest
container_name: ml-experiments-grafana
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
- GF_USERS_ALLOW_SIGN_UP=false
restart: unless-stopped
depends_on:
- prometheus
- loki
# Loki - Log aggregation
loki:
image: grafana/loki:latest
container_name: ml-experiments-loki
ports:
- "3100:3100"
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
- loki_data:/loki
command: -config.file=/etc/loki/local-config.yaml
restart: unless-stopped
# Promtail - Log collector
promtail:
image: grafana/promtail:latest
container_name: ml-experiments-promtail
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock
command: -config.file=/etc/promtail/config.yml
restart: unless-stopped
depends_on:
- loki
volumes:
prometheus_data:
driver: local
grafana_data:
driver: local
loki_data:
driver: local

View file

@ -1,104 +1,152 @@
# Homelab Secure Docker Environment # Secure Homelab Docker Compose Configuration
services: # Use with: docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
redis:
image: redis:7-alpine
container_name: ml-homelab-redis
ports:
- "6379:6379"
volumes:
- redis_homelab_data:/data
restart: unless-stopped
command: >
redis-server
--appendonly yes
--requirepass "HomelabRedis2024!"
--maxmemory 512mb
--maxmemory-policy allkeys-lru
healthcheck:
test: ["CMD", "redis-cli", "-a", "HomelabRedis2024!", "ping"]
interval: 30s
timeout: 10s
retries: 3
networks:
- ml-homelab-network
services:
api-server: api-server:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/homelab-secure.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-homelab-api container_name: ml-experiments-api
ports: ports:
- "9104:9101" # API server port - "9101:9101"
- "2223:2222" # Secure SSH port - "9100:9100" # Prometheus metrics endpoint
- "9101:9100" # Prometheus metrics
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
- ./configs/config-homelab-secure.yaml:/app/configs/config.yaml - ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl:ro
- ${FETCHML_REPO_ROOT:-.}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
- ${FETCHML_REPO_ROOT:-.}/.env.secure:/app/.env.secure:ro
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
- TZ=America/New_York # Load secure environment variables
- JWT_SECRET_FILE=/app/.env.secure
healthcheck: healthcheck:
test: ["CMD", "curl", "-k", "-f", "https://localhost:9101/health"] test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s
command: > labels:
sh -c " logging: "promtail"
sudo /app/start-security.sh & job: "api-server"
/usr/local/bin/api-server -config /app/configs/config.yaml command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
"
networks: networks:
- ml-homelab-network - ml-experiments-network
# Add internal network for secure communication
- ml-backend-network
minio:
image: minio/minio:latest
container_name: ml-experiments-minio
ports:
- "9000:9000"
- "9001:9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/homelab/minio:/data
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
command: ["server", "/data", "--console-address", ":9001"]
restart: unless-stopped
networks:
- ml-backend-network
minio-init:
image: alpine:3.19
container_name: ml-experiments-minio-init
depends_on:
- minio
entrypoint: ["/bin/sh", "-c"]
command:
- |
apk add --no-cache ca-certificates curl >/dev/null
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
mc mb -p local/fetchml-snapshots || true
restart: "no"
networks:
- ml-backend-network
worker: worker:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/homelab-secure.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-homelab-worker container_name: ml-experiments-worker
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/app/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
- ./configs/worker-homelab-secure.yaml:/app/configs/worker.yaml - ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
api-server: api-server:
condition: service_healthy condition: service_healthy
minio-init:
condition: service_started
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
- TZ=America/New_York - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
privileged: true # Required for Podman - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
security_opt: - REDIS_PASSWORD=${REDIS_PASSWORD}
- no-new-privileges:true privileged: true
cap_drop: command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
- ALL
cap_add:
- NET_ADMIN
- SYS_ADMIN
command: >
sh -c "
sudo /app/start-security.sh &
/usr/local/bin/worker -config /app/configs/worker.yaml
"
networks: networks:
- ml-homelab-network - ml-backend-network
volumes: caddy:
redis_homelab_data: image: caddy:2-alpine
driver: local container_name: ml-experiments-caddy
restart: unless-stopped
ports:
- "80:80"
- "443:443"
volumes:
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/ssl:/etc/caddy/ssl:ro
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/config:/config
environment:
- FETCHML_DOMAIN=${FETCHML_DOMAIN:-ml.local}
depends_on:
api-server:
condition: service_healthy
networks:
- ml-experiments-network
# Redis with authentication
redis:
image: redis:7-alpine
container_name: ml-experiments-redis
user: "999:999"
ports:
- "127.0.0.1:6379:6379" # Bind to localhost only
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/homelab/redis:/data
- ${FETCHML_REPO_ROOT:-.}/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
restart: unless-stopped
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
healthcheck:
test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]
interval: 30s
timeout: 10s
retries: 3
networks:
- ml-backend-network
environment:
- REDIS_PASSWORD=${REDIS_PASSWORD}
volumes: {}
networks: networks:
ml-homelab-network: ml-experiments-network:
driver: bridge
ml-backend-network:
driver: bridge driver: bridge
ipam:
config:
- subnet: 172.25.0.0/16

View file

@ -0,0 +1,75 @@
services:
caddy:
image: caddy:2-alpine
environment:
- FETCHML_DOMAIN=localhost
- CADDY_EMAIL=smoke@example.invalid
ports:
- "8080:80"
- "8443:443"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/config:/config
command:
- /bin/sh
- -c
- |
cat > /etc/caddy/Caddyfile <<'EOF'
{
debug
servers {
protocols h1 h2
}
}
https://localhost {
tls internal {
protocols tls1.2 tls1.3
}
handle {
reverse_proxy api-server:9101
}
}
EOF
exec caddy run --config /etc/caddy/Caddyfile
redis:
image: redis:7-alpine
user: "999:999"
restart: unless-stopped
expose:
- "6379"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/redis:/data
command: redis-server --appendonly yes
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
interval: 10s
timeout: 5s
retries: 10
api-server:
build:
context: ${FETCHML_REPO_ROOT:-.}
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
user: "0:0"
restart: unless-stopped
expose:
- "9101"
depends_on:
redis:
condition: service_healthy
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/experiments:/data/experiments
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/active:/data/active
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
interval: 10s
timeout: 5s
retries: 10
volumes: {}

View file

@ -1,12 +1,31 @@
# Full Production Docker Environment with Podman and SQLite # Full Production Docker Environment with Podman and SQLite
services: services:
caddy:
image: caddy:2-alpine
container_name: ml-prod-caddy
restart: unless-stopped
ports:
- "80:80"
- "443:443"
volumes:
- ./Caddyfile.prod:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config
environment:
- FETCHML_DOMAIN=${FETCHML_DOMAIN}
- CADDY_EMAIL=${CADDY_EMAIL}
depends_on:
api-server:
condition: service_healthy
redis: redis:
image: redis:7-alpine image: redis:7-alpine
container_name: ml-prod-redis container_name: ml-prod-redis
ports: user: "999:999"
- "6379:6379" expose:
- "6379"
volumes: volumes:
- redis_prod_data:/data - ${FETCHML_REPO_ROOT:-.}/data/prod/redis:/data
restart: unless-stopped restart: unless-stopped
command: redis-server --appendonly yes command: redis-server --appendonly yes
healthcheck: healthcheck:
@ -17,57 +36,87 @@ services:
api-server: api-server:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/secure-prod.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile
container_name: ml-prod-api container_name: ml-prod-api
ports: expose:
- "9103:9101" # API server port - "9101" # API server port (internal; external access via Caddy)
- "2222:2222" # Secure SSH port for Podman communication - "2222" # Secure SSH port for Podman communication (internal)
- "9100:9100" # Prometheus metrics
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
- ./configs/config-multi-user.yaml:/app/configs/config.yaml - ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
healthcheck: healthcheck:
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ] test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s
# Start SSH daemon for Podman communication # Start API server (ensure data_dir exists for snapshot/dataset validation)
command: ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
minio:
image: minio/minio:latest
container_name: ml-prod-minio
expose:
- "9000"
- "9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
command: ["server", "/data", "--console-address", ":9001"]
restart: unless-stopped
minio-init:
image: alpine:3.19
container_name: ml-prod-minio-init
depends_on:
- minio
entrypoint: ["/bin/sh", "-c"]
command:
- |
apk add --no-cache ca-certificates curl >/dev/null
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
mc mb -p local/fetchml-snapshots || true
restart: "no"
worker: worker:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/secure-prod.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-prod-worker container_name: ml-prod-worker
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
- ./configs/worker-docker.yaml:/app/configs/worker.yaml - ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-prod.yaml:/app/configs/worker.yaml
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
api-server: api-server:
condition: service_healthy condition: service_healthy
minio-init:
condition: service_started
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
privileged: true # Required for Podman to work in Docker privileged: true # Required for Podman to work in Docker
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"] command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
volumes: volumes: {}
redis_prod_data:
driver: local
networks: networks:
default: default:

View file

@ -0,0 +1,17 @@
# Development Environment Variables
# Copy this file to .env and modify as needed
# Grafana
GRAFANA_ADMIN_PASSWORD=admin123
# API Configuration
LOG_LEVEL=info
# TLS (development uses self-signed certs)
TLS_CERT_PATH=/app/ssl/cert.pem
TLS_KEY_PATH=/app/ssl/key.pem
# Development-specific
ENVIRONMENT=development
DEBUG=true
API_KEY=development_key_only

View file

@ -0,0 +1,28 @@
# Production Environment Variables
# Copy this file to .env and modify as needed
# Grafana (if using)
GRAFANA_ADMIN_PASSWORD=CHANGE_ME_SECURE_PASSWORD
# API Configuration
LOG_LEVEL=warn
# TLS (production should use CA-signed certs)
TLS_CERT_PATH=/app/ssl/cert.pem
TLS_KEY_PATH=/app/ssl/key.pem
# Caddy (TLS/WSS termination)
FETCHML_DOMAIN=ml.example.com
CADDY_EMAIL=admin@example.com
# Production-specific
ENVIRONMENT=production
DEBUG=false
# Security
API_KEY=CHANGE_ME_SECURE_API_KEY
ALLOWED_ORIGINS=https://ml.example.com
# External services (if applicable)
EXTERNAL_REDIS_URL=redis://external-redis:6379
EXTERNAL_PROMETHEUS_URL=http://external-prometheus:9090

112
deployments/setup.sh Normal file
View file

@ -0,0 +1,112 @@
#!/usr/bin/env bash
set -euo pipefail

# Print invocation help. This script is purely informational and never
# modifies the system.
usage() {
cat <<'EOF'
Usage: ./deployments/setup.sh
This script DOES NOT install dependencies.
It prints the manual steps and required/optional dependencies for a real (non-Docker) production deployment.
EOF
}

# Emit the full manual-deployment guide for a real (non-Docker) production
# install: build, user/dirs, API config, Caddy TLS termination, Redis,
# systemd supervision, and a final smoke check.
print_guide() {
cat <<'EOF'
== FetchML production setup (non-Docker) ==
Required (core):
- Go-built binaries: api-server, worker
- Redis (reachable from api-server + worker)
- A writable base_path for experiments
- A writable data_dir if you want snapshot/dataset staging + integrity validation
Required (TLS/WSS):
- Caddy (recommended) OR another reverse proxy that can terminate TLS and proxy WebSockets
Optional:
- systemd (recommended) for service supervision
- MinIO / S3-compatible storage (only if you use remote snapshot_store)
- Podman (only if your worker executes jobs in Podman)
Notes:
- The Zig CLI currently supports ws:// only. In production, keep the API server internal on ws:// and terminate TLS/WSS at Caddy.
- This script is informational; it will not modify your system.
---
1) Build binaries
make prod
Artifacts:
./bin/api-server
./bin/worker
---
2) Create a dedicated user (recommended)
useradd --system --create-home --shell /usr/sbin/nologin fetchml
---
3) Create directories (example paths)
mkdir -p /var/lib/fetchml/experiments
mkdir -p /var/lib/fetchml/active/datasets /var/lib/fetchml/active/snapshots
mkdir -p /var/log/fetchml
Ensure ownership:
chown -R fetchml:fetchml /var/lib/fetchml /var/log/fetchml
---
4) Configure the API server
- Start from: configs/api/prod.yaml (or your multi-user config)
- For real production, keep server.tls.enabled: false
- Ensure monitoring.health_checks.enabled is set appropriately
Example flags:
./bin/api-server -config /etc/fetchml/api.yaml
---
5) Configure Caddy (TLS/WSS termination)
- Recommended: use deployments/Caddyfile.prod as a baseline.
- Caddy should listen on 443 and reverse proxy to the API server (internal) on 9101.
Example layout:
/etc/caddy/Caddyfile
/var/lib/caddy
---
6) Configure Redis
- Use Redis AUTH in production.
- Ensure the api-server + worker can reach it.
---
7) Run under systemd (recommended)
Create unit files (example):
/etc/systemd/system/fetchml-api.service
/etc/systemd/system/fetchml-worker.service
/etc/systemd/system/caddy.service (if not already provided)
Then:
systemctl daemon-reload
systemctl enable --now fetchml-api
systemctl enable --now fetchml-worker
systemctl enable --now caddy
---
8) Smoke check
Internal health (no TLS):
curl -f http://127.0.0.1:9101/health
External health (through Caddy TLS termination):
curl -f https://YOUR_DOMAIN/health
EOF
}

# Only recognized flag is help; anything else (including no args) prints
# the guide, matching the original single-purpose behavior.
case "${1:-}" in
-h|--help)
usage
exit 0
;;
esac

print_guide

View file

@ -1,13 +1,52 @@
# Centralized Monitoring Stack # Monitoring Stack
## Directory Structure (Canonical)
All monitoring configuration lives under `monitoring/`.
```text
monitoring/
prometheus/
prometheus.yml # Prometheus scrape configuration
grafana/
dashboards/ # Grafana dashboards (JSON)
provisioning/
datasources/ # Grafana data sources (Prometheus/Loki)
dashboards/ # Grafana dashboard provider (points at dashboards/)
loki-config.yml # Loki configuration
promtail-config.yml # Promtail configuration
```
### What is "Grafana provisioning"?
Grafana provisioning is how Grafana auto-configures itself on startup (no clicking in the UI):
- **`grafana/provisioning/datasources/*.yml`**
- Defines where Grafana reads data from (e.g. Prometheus at `http://prometheus:9090`, Loki at `http://loki:3100`).
- **`grafana/provisioning/dashboards/*.yml`**
- Tells Grafana to load dashboard JSON files from `/var/lib/grafana/dashboards`.
- **`grafana/dashboards/*.json`**
- The dashboards themselves.
### Source of truth
- **Dashboards**: edit/add JSON in `monitoring/grafana/dashboards/`.
- **Grafana provisioning**: edit files in `monitoring/grafana/provisioning/`.
- **Prometheus scrape config**: edit `monitoring/prometheus/prometheus.yml`.
`scripts/setup_monitoring.py` is intentionally **provisioning-only**:
- It (re)writes Grafana **datasources** and the **dashboard provider**.
- It does **not** create or overwrite any dashboard JSON files.
## Quick Start

```bash
# Start deployment
make deploy-up

# Access services
open http://localhost:3000  # Grafana (admin/admin123)
open http://localhost:9090  # Prometheus
```
@ -15,137 +54,80 @@ open http://localhost:9090 # Prometheus
### Grafana (Port 3000)
**Main monitoring dashboard**
- Username: `admin`
- Password: `admin123`
- Data source: Prometheus (http://localhost:9090)
- Pre-loaded ML Queue dashboard
### Prometheus (Port 9090) ### Prometheus (Port 9090)
**Metrics collection** **Metrics collection and storage**
- Scrapes metrics from API server (`:9100/metrics`)
- 15s scrape interval
- Data retention: 15 days (default)
### Loki (Port 3100) ### Loki (Port 3100)
**Log aggregation** **Log aggregation**
- Collects logs from all containers
- Collects application logs from `./logs/`
- Retention: 7 days
### Promtail ## Dashboards
**Log shipping**
- Watches Docker container logs
- Watches `./logs/*.log`
- Sends to Loki
## Viewing Data Available dashboard configurations in `grafana/dashboards/`:
### Metrics - `load-test-performance.json` - Load test metrics
1. Open Grafana: http://localhost:3000 - `websocket-performance.json` - WebSocket performance
2. Go to "ML Task Queue Monitoring" dashboard - `system-health.json` - System health monitoring
3. See: queue depth, task duration, error rates, etc. - `rsync-performance.json` - Rsync performance metrics
### Logs ### Importing Dashboards
1. Open Grafana → Explore
2. Select "Loki" datasource
3. Query examples:
```logql
{job="app_logs"} # All app logs
{job="docker",service="api-server"} # API server logs
{job="docker"} |= "error" # All errors
```
## Architecture 1. Go to Grafana → "+" → "Import"
2. Upload JSON files from `grafana/dashboards/` directory
``` 3. Select Prometheus data source
┌─────────────┐
│ API Server │──┐
└─────────────┘ │
├──► Prometheus ──► Grafana
┌─────────────┐ │ ▲
│ Worker │──┘ │
└─────────────┘ │
┌─────────────┐ │
│ App Logs │──┐ │
└─────────────┘ │ │
├──► Promtail ──► Loki ┘
┌─────────────┐ │
│Docker Logs │──┘
└─────────────┘
```
## Configuration Files ## Configuration Files
- `prometheus.yml` - Metrics scraping config - `prometheus/prometheus.yml` - Prometheus configuration
- `loki-config.yml` - Log storage config - `loki-config.yml` - Loki configuration
- `promtail-config.yml` - Log collection config - `promtail-config.yml` - Promtail configuration
- `grafana/provisioning/` - Auto-configuration - `security_rules.yml` - Security rules
## Customization ## Usage
### Add More Scrapers 1. Start monitoring stack: `make deploy-up`
Edit `monitoring/prometheus.yml`: 2. Access Grafana: http://localhost:3000 (admin/admin123)
```yaml 3. Import dashboards from `grafana/dashboards/` directory
scrape_configs: 4. View metrics and test results in real-time
- job_name: 'my-service'
static_configs:
- targets: ['my-service:9100']
```
### Change Retention ## Health Endpoints
**Prometheus:** Add to command in docker-compose:
```yaml
- '--storage.tsdb.retention.time=30d'
```
**Loki:** Edit `loki-config.yml`: The API server provides health check endpoints for monitoring:
```yaml
limits_config:
retention_period: 720h # 30 days
```
## Troubleshooting - **`/health`** - Overall service health (for Docker healthcheck)
- **`/health/live`** - Liveness probe (is the service running?)
- **`/health/ready`** - Readiness probe (can the service accept traffic?)
**No metrics showing:** ### Testing Health Endpoints
```bash
# Check if Prometheus can reach targets
curl http://localhost:9090/api/v1/targets
# Check if API exposes metrics
curl http://localhost:9100/metrics
```
**No logs showing:**
```bash
# Check Promtail status
docker logs ml-experiments-promtail
# Verify Loki is receiving logs
curl http://localhost:3100/ready
```
**Grafana can't connect to datasources:**
```bash
# Restart Grafana
docker-compose restart grafana
```
## Profiling Quick Start
To capture CPU profiles while exercising real workloads:
```bash ```bash
# HTTP LoadTestSuite (MediumLoad scenario) # Basic health check
make profile-load curl -k https://localhost:9101/health
# WebSocket → Redis queue → worker integration # Liveness check (for K8s or monitoring)
make profile-ws-queue curl -k https://localhost:9101/health/live
# Readiness check (verifies dependencies)
curl -k https://localhost:9101/health/ready
``` ```
Then inspect profiles with: See `health-testing.md` for detailed testing procedures.
```bash ## Prometheus Integration
go tool pprof cpu_load.out # HTTP load
go tool pprof cpu_ws.out # WebSocket/queue/worker Prometheus scrapes the following endpoints:
``` - `api-server:9101/metrics` - Application metrics (future)
- `api-server:9101/health` - Health status monitoring
- `host.docker.internal:9100/metrics` - Worker metrics (when the worker runs on the host)
- `worker:9100/metrics` - Worker metrics (when the worker runs as a container in the compose network)
## Cleanup (deprecated paths)
These legacy paths may still exist in the repo but are **not used** by the current dev compose config:
- `monitoring/dashboards/` (old dashboards location)
- `monitoring/prometheus.yml` (old Prometheus config location)
- `monitoring/grafana/provisioning/dashboards/dashboard.yml` (duplicate of `dashboards.yml`)

View file

@ -1,147 +0,0 @@
{
"dashboard": {
"title": "ML Task Queue Monitoring",
"tags": [
"ml",
"queue",
"fetch_ml"
],
"timezone": "browser",
"panels": [
{
"title": "Queue Depth",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "fetch_ml_queue_depth",
"legendFormat": "Queue Depth"
}
]
},
{
"title": "Active Tasks",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(fetch_ml_active_tasks) by (worker_id)",
"legendFormat": "{{worker_id}}"
}
]
},
{
"title": "Task Duration (p50, p95, p99)",
"type": "graph",
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
"legendFormat": "p99"
}
]
},
{
"title": "Task Completion Rate",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"targets": [
{
"expr": "rate(fetch_ml_tasks_completed_total[5m])",
"legendFormat": "{{status}}"
}
]
},
{
"title": "Failure Rate by Error Category",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"targets": [
{
"expr": "rate(fetch_ml_task_failures_total[5m])",
"legendFormat": "{{error_category}}"
}
]
},
{
"title": "Retry Rate",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"targets": [
{
"expr": "rate(fetch_ml_task_retries_total[5m])",
"legendFormat": "{{error_category}}"
}
]
},
{
"title": "Dead Letter Queue Size",
"type": "stat",
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 24
},
"targets": [
{
"expr": "fetch_ml_dlq_size"
}
]
},
{
"title": "Lease Expirations",
"type": "stat",
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 24
},
"targets": [
{
"expr": "fetch_ml_lease_expirations_total"
}
]
}
]
}
}

View file

@ -1,278 +0,0 @@
{
"dashboard": {
"title": "Application Logs",
"tags": [
"logs",
"loki",
"fetch_ml"
],
"timezone": "browser",
"editable": true,
"graphTooltip": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"panels": [
{
"title": "Log Stream",
"type": "logs",
"gridPos": {
"x": 0,
"y": 0,
"w": 24,
"h": 12
},
"id": 1,
"targets": [
{
"expr": "{job=\"app_logs\"}",
"refId": "A",
"datasource": "Loki"
}
],
"options": {
"showTime": true,
"showLabels": true,
"showCommonLabels": false,
"wrapLogMessage": false,
"prettifyLogMessage": false,
"enableLogDetails": true,
"dedupStrategy": "none",
"sortOrder": "Descending"
}
},
{
"title": "Log Level Distribution",
"type": "bargauge",
"gridPos": {
"x": 0,
"y": 12,
"w": 8,
"h": 8
},
"id": 2,
"targets": [
{
"expr": "sum by (level) (count_over_time({job=\"app_logs\"} | logfmt | level != \"\" [5m]))",
"refId": "A",
"datasource": "Loki",
"legendFormat": "{{level}}"
}
],
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": true
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "INFO"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "green"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "WARN"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "ERROR"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
}
},
{
"title": "Error Logs (Last Hour)",
"type": "table",
"gridPos": {
"x": 8,
"y": 12,
"w": 16,
"h": 8
},
"id": 3,
"targets": [
{
"expr": "{job=\"app_logs\"} | logfmt | level=\"ERROR\"",
"refId": "A",
"datasource": "Loki"
}
],
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"title": "Logs by Component",
"type": "timeseries",
"gridPos": {
"x": 0,
"y": 20,
"w": 12,
"h": 8
},
"id": 4,
"targets": [
{
"expr": "sum by (component) (rate({job=\"app_logs\"} | logfmt [1m]))",
"refId": "A",
"datasource": "Loki",
"legendFormat": "{{component}}"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10,
"spanNulls": false,
"showPoints": "never",
"stacking": {
"mode": "none"
}
},
"unit": "reqps"
}
}
},
{
"title": "Warning Logs Timeline",
"type": "timeseries",
"gridPos": {
"x": 12,
"y": 20,
"w": 12,
"h": 8
},
"id": 5,
"targets": [
{
"expr": "sum(count_over_time({job=\"app_logs\"} | logfmt | level=\"WARN\" [1m]))",
"refId": "A",
"datasource": "Loki",
"legendFormat": "Warnings"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "bars",
"fillOpacity": 50
},
"color": {
"mode": "fixed",
"fixedColor": "yellow"
}
}
}
},
{
"title": "Search Logs",
"type": "logs",
"gridPos": {
"x": 0,
"y": 28,
"w": 24,
"h": 10
},
"id": 6,
"targets": [
{
"expr": "{job=\"app_logs\"} |= \"$search_term\"",
"refId": "A",
"datasource": "Loki"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true
}
}
],
"templating": {
"list": [
{
"name": "search_term",
"type": "textbox",
"label": "Search Term",
"current": {
"value": "",
"text": ""
}
}
]
},
"refresh": "30s"
}
}

View file

@ -1,157 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "loki",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkAPIServerCreateJobSimple\"",
"legendFormat": "API Job Creation",
"refId": "A"
},
{
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkMLExperimentExecution/SmallExperiment\"",
"legendFormat": "ML Small Experiment",
"refId": "B"
},
{
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkDatasetOperations/DatasetCreation\"",
"legendFormat": "Dataset Creation",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "API Performance Trends",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "Time (ns/op)",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"datasource": "loki",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"showLabels": true
},
"targets": [
{
"expr": "{job=\"fetchml-performance\"} |= \"Performance Summary\"",
"legendFormat": "{{timestamp}}",
"refId": "A"
}
],
"title": "Latest Performance Summary",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 27,
"style": "dark",
"tags": ["fetchml", "performance"],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Fetch ML Performance Dashboard",
"uid": "fetchml-performance",
"version": 1
}

View file

@ -1,64 +0,0 @@
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus-data:/prometheus
networks:
- monitoring
loki:
image: grafana/loki:2.9.0
ports:
- "3100:3100"
command: -config.file=/etc/loki/local-config.yaml
volumes:
- ./loki-performance-config.yaml:/etc/loki/local-config.yaml
networks:
- monitoring
promtail:
image: grafana/promtail:latest
volumes:
- ./promtail-performance-config.yaml:/etc/promtail/config.yml
- /var/log:/var/log:ro
command: -config.file=/etc/promtail/config.yml
networks:
- monitoring
pushgateway:
image: prom/pushgateway:latest
ports:
- "9091:9091"
networks:
- monitoring
grafana:
image: grafana/grafana:latest
ports:
- "3001:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
networks:
- monitoring
volumes:
loki-data:
grafana-data:
prometheus-data:
networks:
monitoring:
driver: bridge

View file

@ -0,0 +1,51 @@
{
"dashboard": {
"id": null,
"title": "Load Test Performance",
"tags": [
"load-test",
"performance"
],
"panels": [
{
"id": 1,
"title": "Service Health",
"type": "stat",
"targets": [
{
"expr": "up",
"legendFormat": "{{job}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "RPS"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

View file

@ -0,0 +1 @@
{"dashboard": {"id": null, "title": "Load Test Performance", "tags": ["load-test", "performance"], "panels": [{"id": 1, "title": "Service Status", "type": "stat", "targets": [{"expr": "up", "legendFormat": "{{job}}"}]}]}}

View file

@ -0,0 +1,51 @@
{
"dashboard": {
"id": null,
"title": "Log Analysis",
"tags": [
"loki",
"logs"
],
"panels": [
{
"id": 1,
"title": "Error Logs",
"type": "logs",
"targets": [
{
"expr": "{job=~\".+\"} |= \"error\"",
"legendFormat": "Errors"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "All Logs",
"type": "logs",
"targets": [
{
"expr": "{job=~\".+\"}",
"legendFormat": "All logs"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
}
],
"time": {
"from": "now-30m",
"to": "now"
},
"refresh": "30s"
}
}

View file

@ -0,0 +1,135 @@
{
"dashboard": {
"id": null,
"title": "Prewarm Performance",
"tags": ["prewarm", "performance", "worker"],
"panels": [
{
"id": 1,
"title": "Environment Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 2,
"title": "Snapshot Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 0},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 3,
"title": "Environment Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_env_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
{"expr": "rate(fetchml_prewarm_env_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"yAxes": [{"unit": "reqps"}]
},
{
"id": 4,
"title": "Snapshot Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
{"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"yAxes": [{"unit": "reqps"}]
},
{
"id": 5,
"title": "Environment Build Time",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", "legendFormat": "build time {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"yAxes": [{"unit": "seconds"}]
},
{
"id": 6,
"title": "Snapshot Prewarm Time",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", "legendFormat": "prewarm time {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"yAxes": [{"unit": "seconds"}]
},
{
"id": 7,
"title": "Environment Images Built",
"type": "graph",
"targets": [
{"expr": "increase(fetchml_prewarm_env_built_total[1h])", "legendFormat": "built {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
"yAxes": [{"unit": "short"}]
},
{
"id": 8,
"title": "Snapshots Prewarmed",
"type": "graph",
"targets": [
{"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", "legendFormat": "prewarmed {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
"yAxes": [{"unit": "short"}]
},
{
"id": 9,
"title": "Prewarm Efficiency",
"type": "graph",
"targets": [
{"expr": "fetchml_prewarm_env_hit_total + fetchml_prewarm_snapshot_hit_total", "legendFormat": "total hits {{worker_id}}"},
{"expr": "fetchml_prewarm_env_miss_total + fetchml_prewarm_snapshot_miss_total", "legendFormat": "total misses {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
"yAxes": [{"unit": "short"}]
}
],
"time": {"from": "now-1h", "to": "now"},
"refresh": "5s"
}
}

View file

@ -0,0 +1,86 @@
{
"dashboard": {
"id": null,
"title": "Rsync Performance",
"tags": [
"rsync",
"sync",
"performance"
],
"panels": [
{
"id": 1,
"title": "Rsync Operations",
"type": "graph",
"targets": [
{
"expr": "rate(rsync_operations_total[5m])",
"legendFormat": "Operations/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Data Transfer Rate",
"type": "graph",
"targets": [
{
"expr": "rate(rsync_bytes_transferred_total[5m])",
"legendFormat": "Bytes/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "Sync Duration",
"type": "graph",
"targets": [
{
"expr": "rsync_sync_duration_seconds",
"legendFormat": "Duration"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
}
},
{
"id": 4,
"title": "Sync Errors",
"type": "graph",
"targets": [
{
"expr": "rate(rsync_errors_total[5m])",
"legendFormat": "Errors/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

View file

@ -0,0 +1,51 @@
{
"dashboard": {
"id": null,
"title": "System Health",
"tags": [
"system",
"health"
],
"panels": [
{
"id": 1,
"title": "Service Status",
"type": "stat",
"targets": [
{
"expr": "up",
"legendFormat": "{{job}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "process_resident_memory_bytes",
"legendFormat": "Memory"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "10s"
}
}

View file

@ -0,0 +1,68 @@
{
"dashboard": {
"id": null,
"title": "WebSocket Performance",
"tags": [
"websocket",
"performance"
],
"panels": [
{
"id": 1,
"title": "WebSocket Connections",
"type": "graph",
"targets": [
{
"expr": "websocket_connections_active",
"legendFormat": "Active Connections"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "WebSocket Messages",
"type": "graph",
"targets": [
{
"expr": "rate(websocket_messages_total[5m])",
"legendFormat": "Messages/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "Connection Errors",
"type": "graph",
"targets": [
{
"expr": "rate(websocket_connection_errors_total[5m])",
"legendFormat": "Errors/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

View file

@ -0,0 +1,280 @@
{
"id": null,
"title": "Worker Resources",
"tags": [
"worker",
"resources"
],
"panels": [
{
"id": 1,
"title": "CPU Free",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_cpu_free",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "CPU Total",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_cpu_total",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 0
}
},
{
"id": 3,
"title": "CPU Utilization (%)",
"type": "graph",
"targets": [
{
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 4,
"title": "GPU Slots Free",
"type": "graph",
"targets": [
{
"expr": "fetchml_resources_gpu_slots_free",
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
}
},
{
"id": 5,
"title": "Acquire Wait / Timeout (Totals)",
"type": "graph",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_total",
"legendFormat": "wait {{worker_id}}"
},
{
"expr": "fetchml_resources_acquire_timeout_total",
"legendFormat": "timeout {{worker_id}}"
},
{
"expr": "fetchml_resources_acquire_total",
"legendFormat": "total {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
}
},
{
"id": 6,
"title": "Avg Acquire Wait (seconds)",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 14
}
},
{
"id": 7,
"title": "Acquire Wait Ratio",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 14
}
},
{
"id": 8,
"title": "Environment Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 9,
"title": "Snapshot Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 10,
"title": "Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
"legendFormat": "env hits {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
"legendFormat": "env misses {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
"legendFormat": "snapshot hits {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
"legendFormat": "snapshot misses {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 20
},
"yAxes": [
{"unit": "reqps"}
]
},
{
"id": 11,
"title": "Prewarm Build Time",
"type": "graph",
"targets": [
{
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
"legendFormat": "env build {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
"legendFormat": "snapshot prewarm {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 28
},
"yAxes": [
{"unit": "seconds"}
]
},
{
"id": 12,
"title": "Prewarm Builds",
"type": "graph",
"targets": [
{
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
"legendFormat": "env built {{worker_id}}"
},
{
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
"legendFormat": "snapshots prewarmed {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 28
},
"yAxes": [
{"unit": "short"}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}

View file

@ -1,5 +1,4 @@
apiVersion: 1 apiVersion: 1
providers: providers:
- name: 'default' - name: 'default'
orgId: 1 orgId: 1

View file

@ -0,0 +1,9 @@
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki:3100
editable: true
jsonData:
maxLines: 1000

View file

@ -1,16 +1,10 @@
apiVersion: 1 apiVersion: 1
datasources: datasources:
- name: Prometheus - name: Prometheus
type: prometheus type: prometheus
access: proxy access: proxy
url: http://prometheus:9090 url: http://prometheus:9090
isDefault: false
editable: false
- name: Loki
type: loki
access: proxy
url: http://loki:3100
isDefault: true isDefault: true
editable: false editable: true
jsonData:
timeInterval: "5s"

View file

@ -0,0 +1,100 @@
# Testing Health Endpoints with Monitoring Stack
## Verify Health Endpoints
```bash
# 1. Start the monitoring stack
cd deployments
docker-compose -f docker-compose.dev.yml up -d
# 2. Wait for services to start (30 seconds)
sleep 30
# 3. Test health endpoints
curl -k https://localhost:9101/health
# Expected: {"status":"healthy","timestamp":"...","checks":{}}
curl -k https://localhost:9101/health/live
# Expected: {"status":"alive","timestamp":"..."}
curl -k https://localhost:9101/health/ready
# Expected: {"status":"ready","timestamp":"...","checks":{"queue":"ok","experiments":"ok"}}
# 4. Check Docker health status
docker ps | grep api-server
# Should show: (healthy)
# 5. Access Grafana
open http://localhost:3000
# Login: admin / admin123
# 6. Access Prometheus
open http://localhost:9090
# Check targets: Status > Targets
# Should see: api-server (plus the worker, benchmark, and loki jobs)
# 7. Query health metrics in Prometheus
# Go to Graph and enter: up{job="api-server"}
# Should show: value=1 (service is up)
```
## Health Check Integration
### Docker Compose
The health check is configured in `deployments/docker-compose.dev.yml`:
```yaml
healthcheck:
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
```
### Prometheus Monitoring
Prometheus scrapes health status every 30s from:
- `/health` - Overall service health
- `/metrics` - Future Prometheus metrics (when implemented)
### Kubernetes (Future)
Health endpoints ready for K8s probes:
```yaml
livenessProbe:
httpGet:
path: /health/live
port: 9101
scheme: HTTPS
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health/ready
port: 9101
scheme: HTTPS
initialDelaySeconds: 10
periodSeconds: 5
```
## Monitoring Stack Services
- **Grafana** (port 3000): Dashboards and visualization
- **Prometheus** (port 9090): Metrics collection
- **Loki** (port 3100): Log aggregation
- **Promtail**: Log shipping
## Troubleshooting
```bash
# Check API server logs
docker logs ml-experiments-api
# Check Prometheus targets
curl http://localhost:9090/api/v1/targets
# Check health endpoint directly
docker exec ml-experiments-api curl -k https://localhost:9101/health
# Restart services
docker-compose -f deployments/docker-compose.dev.yml restart api-server
```

View file

@ -12,7 +12,7 @@ common:
rules_directory: /loki/rules rules_directory: /loki/rules
replication_factor: 1 replication_factor: 1
ring: ring:
instance_addr: 127.0.0.1 instance_addr: 0.0.0.0
kvstore: kvstore:
store: inmemory store: inmemory

View file

@ -1,40 +0,0 @@
# Loki single-binary configuration for local/dev use:
# in-memory ring, filesystem storage, no authentication.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      # Single instance: no replication.
      replication_factor: 1
    final_sleep: 0s
    # NOTE(review): indentation was lost in extraction; in upstream Loki
    # examples min_ready_duration belongs under lifecycler — confirm.
    min_ready_duration: 0s
  # Flush chunks that have been idle for an hour.
  chunk_idle_period: 1h
  max_chunk_age: 1h
  # Target chunk size in bytes (1 MiB) before cutting a new chunk.
  chunk_target_size: 1048576
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
  filesystem:
    # Chunk data is written to the local filesystem.
    directory: /loki/chunks

limits_config:
  reject_old_samples: true
  # 168h = 7 days.
  reject_old_samples_max_age: 168h
  allow_structured_metadata: false

View file

@ -5,39 +5,35 @@ global:
evaluation_interval: 15s evaluation_interval: 15s
scrape_configs: scrape_configs:
# API Server metrics # API Server metrics and health
- job_name: 'api-server' - job_name: 'api-server'
scheme: http
static_configs: static_configs:
- targets: ['api-server:9100'] - targets: ['api-server:9101']
labels: labels:
service: 'api-server' service: 'api-server'
metrics_path: /metrics # Future: Prometheus metrics endpoint
# Worker metrics (if running in docker) # Benchmark metrics from Pushgateway
- job_name: 'benchmark'
static_configs: []
# Worker metrics (ResourceManager + task execution)
# For docker-compose dev on macOS/Windows, Prometheus can reach a locally running worker
# via host.docker.internal.
- job_name: 'worker' - job_name: 'worker'
scrape_interval: 15s
static_configs: static_configs:
- targets: ['worker:9100'] - targets: ['worker:9100']
labels: labels:
service: 'worker' service: 'worker'
# Allow failures if worker not running target_type: 'container'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
# Benchmark metrics from Pushgateway
- job_name: 'benchmark'
static_configs:
- targets: ['localhost:9091']
labels:
service: 'benchmark'
metrics_path: /metrics metrics_path: /metrics
honor_labels: true
# Loki metrics # Loki metrics
- job_name: 'loki' - job_name: 'loki'
static_configs: static_configs:
- targets: ['ml-experiments-loki:3100'] - targets: ['loki:3100']
labels: labels:
service: 'loki' service: 'loki'
metrics_path: /metrics metrics_path: /metrics

View file

@ -1,50 +0,0 @@
# Promtail configuration: ships FetchML benchmark/performance logs to Loki.
server:
  http_listen_port: 9080
  # 0 disables the gRPC listener.
  grpc_listen_port: 0

positions:
  # Tracks read offsets so tailing resumes after restarts.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Structured (JSON) per-benchmark performance records.
  - job_name: fetchml-performance
    static_configs:
      - targets:
          - localhost
        labels:
          job: fetchml-performance
          __path__: /reports/performance.log
    pipeline_stages:
      # Extract fields from each JSON log line.
      - json:
          expressions:
            timestamp: timestamp
            git_commit: git_commit
            benchmark_name: name
            time_per_op: time_per_op_ns
            memory_per_op: memory_per_op_b
            allocs_per_op: allocs_per_op
      # Promote selected extracted fields to Loki labels for querying.
      - labels:
          benchmark_name:
          git_commit:
      # NOTE(review): 'output' is not among the extracted expressions above —
      # confirm this stage is intentional.
      - output:
          source: output
  # Free-form performance summary log (shares the same Loki job label).
  - job_name: fetchml-performance-summary
    static_configs:
      - targets:
          - localhost
        labels:
          job: fetchml-performance
          __path__: /reports/performance_summary.log
    pipeline_stages:
      - regex:
          expression: "=== Performance Summary ==="
      - output:
          source: output

View file

@ -1,112 +0,0 @@
# Prometheus alerting rules covering FetchML security signals and basic
# host health (memory, CPU, disk, service availability).
#
# NOTE: rate() yields a per-second average over the given window, so the
# numeric thresholds below are events *per second*, not per minute. The
# annotation texts previously claimed "per minute", which overstated the
# thresholds by 60x; they are corrected here to match the expressions.
groups:
  - name: security.rules
    rules:
      # Sustained elevated authentication failures (credential probing).
      - alert: HighFailedAuthRate
        expr: rate(failed_auth_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High rate of failed authentication attempts"
          description: "More than 10 failed auth attempts per second (5m average) sustained for 2 minutes"

      # Short, sharp spike in auth failures (possible brute force attack).
      - alert: BruteForceAttack
        expr: rate(failed_auth_total[1m]) > 30
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Potential brute force attack detected"
          description: "More than 30 failed auth attempts per second (1m average)"

      # Unusual WebSocket connection patterns.
      - alert: UnusualWebSocketActivity
        expr: rate(websocket_connections_total[5m]) > 100
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Unusual WebSocket connection activity"
          description: "WebSocket connection rate is unusually high"

      # Clients repeatedly hitting the API rate limiter.
      - alert: RateLimitBreached
        expr: rate(rate_limit_exceeded_total[5m]) > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Rate limits being exceeded"
          description: "Rate limit exceeded more than 5 times per second (5m average)"

      # SSL certificate expiration warning.
      - alert: SSLCertificateExpiring
        expr: ssl_certificate_expiry_days < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate will expire in less than 30 days"

      # High memory usage (node_exporter metrics).
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90%"

      # High CPU usage, averaged across all cores per instance.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80%"

      # Disk space running low.
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space"
          description: "Disk space is below 10%"

      # Any scraped target reporting down.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.instance }} service has been down for more than 1 minute"

      # Elevated ratio of HTTP 5xx responses.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 10%"

      # Single client generating an abnormal request volume.
      # NOTE(review): 'requests_by_ip' looks like a counter but lacks the
      # conventional '_total' suffix, and the expression does not aggregate
      # per IP — confirm the metric name and add 'by (ip)' if appropriate.
      - alert: SuspiciousIPActivity
        expr: rate(requests_by_ip[5m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious IP activity"
          description: "IP address making unusually many requests"

View file

@ -118,7 +118,7 @@ jupyter>=1.0.0
"allow_network": false, "allow_network": false,
"blocked_packages": ["requests", "urllib3", "httpx"], "blocked_packages": ["requests", "urllib3", "httpx"],
"max_execution_time": 3600, "max_execution_time": 3600,
"gpu_access": true, "gpu_devices": ["/dev/dri"],
"ml_env": "ml_env", "ml_env": "ml_env",
"package_manager": "mamba" "package_manager": "mamba"
} }

View file

@ -32,6 +32,10 @@ RUN mamba install -n ml_env \
-c pytorch -c conda-forge -y && \ -c pytorch -c conda-forge -y && \
conda clean -afy conda clean -afy
# Poetry (for pyproject.toml + poetry.lock projects)
RUN mamba install -n ml_env poetry -c conda-forge -y && \
conda clean -afy
# Copy security wrapper # Copy security wrapper
COPY secure_runner.py /usr/local/bin/secure_runner.py COPY secure_runner.py /usr/local/bin/secure_runner.py
COPY security_policy.json /etc/ml_runner/security_policy.json COPY security_policy.json /etc/ml_runner/security_policy.json

View file

@ -45,7 +45,7 @@ class SecurityPolicy:
], ],
"max_execution_time": 3600, "max_execution_time": 3600,
"max_memory_gb": 16, "max_memory_gb": 16,
"gpu_access": True, "gpu_devices": ["/dev/dri"],
"allow_file_writes": True, "allow_file_writes": True,
"resource_limits": { "resource_limits": {
"cpu_count": 4, "cpu_count": 4,
@ -106,97 +106,197 @@ class CondaRunner:
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda") self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}" self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
def setup_environment(self, requirements_file: Path) -> bool: self.gpu_devices = self.security_policy.policy.get("gpu_devices", [])
"""Setup Conda environment with mamba"""
def setup_environment(self, deps_file: Path) -> bool:
"""Setup Conda environment based on a dependency manifest."""
try: try:
# Read requirements name = deps_file.name
with open(requirements_file, "r") as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
# Check each package for security print(f"[MANIFEST] Using dependency manifest: {name}")
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
if not self.security_policy.check_package_safety(package_name):
print(
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
)
return False
# Install packages with mamba (super fast!) if name in ("environment.yml", "environment.yaml"):
for req in requirements: print(f"[SETUP] Applying conda environment file: {deps_file}")
package_name = ( cmd = [
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
# Check if already installed with conda
check_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"python",
"-c",
f"import {package_name.replace('-', '_')}",
]
result = subprocess.run(
check_cmd, capture_output=True, text=True
)
if result.returncode == 0:
print(f"[OK] {package_name} already installed in conda env")
continue
# Try conda-forge first (faster and more reliable)
print(
f"[INSTALL] Installing {req} with {self.package_manager}..."
)
install_cmd = [
self.package_manager, self.package_manager,
"install", "env",
"update",
"-n", "-n",
self.conda_env, self.conda_env,
req, "-f",
"-c", str(deps_file),
"conda-forge",
"-y", "-y",
] ]
result = subprocess.run( result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
install_cmd, capture_output=True, text=True, timeout=300 if result.returncode != 0:
print(f"[ERROR] Failed to apply environment file: {result.stderr}")
return False
return True
if name == "poetry.lock":
pyproject = self.workspace_dir / "pyproject.toml"
if not pyproject.exists():
print("[ERROR] poetry.lock provided but pyproject.toml is missing")
return False
print(f"[SETUP] Installing dependencies from Poetry lockfile: {deps_file}")
env = os.environ.copy()
env.update(
{
"POETRY_VIRTUALENVS_CREATE": "false",
"POETRY_NO_INTERACTION": "1",
}
) )
if result.returncode == 0: # Ensure Poetry is available in the conda env.
print(f"[OK] Installed {req} with {self.package_manager}") check = subprocess.run(
continue ["conda", "run", "-n", self.conda_env, "poetry", "--version"],
capture_output=True,
text=True,
env=env,
)
if check.returncode != 0:
print("[ERROR] Poetry is not available in the container environment")
print(check.stderr)
return False
# Fallback to pip if conda fails # Install into the conda env (no separate venv).
print(f"[FALLBACK] Trying pip for {req}...") install = subprocess.run(
pip_cmd = [ [
"conda",
"run",
"-n",
self.conda_env,
"poetry",
"install",
"--no-ansi",
],
capture_output=True,
text=True,
timeout=900,
cwd=str(self.workspace_dir),
env=env,
)
if install.returncode != 0:
print("[ERROR] Poetry install failed")
print(install.stderr)
return False
return True
if name == "pyproject.toml":
# Use pip's PEP517/pyproject support (no Poetry required).
# This installs the project itself; dependencies may be fetched as needed.
print(f"[SETUP] Installing project from pyproject.toml: {deps_file}")
cmd = [
"conda", "conda",
"run", "run",
"-n", "-n",
self.conda_env, self.conda_env,
"pip", "pip",
"install", "install",
req, str(self.workspace_dir),
"--no-cache-dir", "--no-cache-dir",
] ]
result = subprocess.run( result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
pip_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode != 0: if result.returncode != 0:
print(f"[ERROR] Failed to install {req}: {result.stderr}") print(f"[ERROR] Failed to install project from pyproject.toml: {result.stderr}")
return False return False
return True
print(f"[OK] Installed {req} with pip") if name == "requirements.txt":
# Read requirements
with open(deps_file, "r") as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
return True # Check each package for security
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
if not self.security_policy.check_package_safety(package_name):
print(
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
)
return False
# Install packages with mamba (super fast!)
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
# Check if already installed with conda
check_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"python",
"-c",
f"import {package_name.replace('-', '_')}",
]
result = subprocess.run(
check_cmd, capture_output=True, text=True
)
if result.returncode == 0:
print(f"[OK] {package_name} already installed in conda env")
continue
# Try conda-forge first (faster and more reliable)
print(
f"[INSTALL] Installing {req} with {self.package_manager}..."
)
install_cmd = [
self.package_manager,
"install",
"-n",
self.conda_env,
req,
"-c",
"conda-forge",
"-y",
]
result = subprocess.run(
install_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode == 0:
print(f"[OK] Installed {req} with {self.package_manager}")
continue
# Fallback to pip if conda fails
print(f"[FALLBACK] Trying pip for {req}...")
pip_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"pip",
"install",
req,
"--no-cache-dir",
]
result = subprocess.run(
pip_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode != 0:
print(f"[ERROR] Failed to install {req}: {result.stderr}")
return False
print(f"[OK] Installed {req} with pip")
return True
print(f"[ERROR] Unsupported dependency manifest: {deps_file}")
print("Supported: environment.yml, environment.yaml, poetry.lock (requires pyproject.toml), pyproject.toml, requirements.txt")
return False
except Exception as e: except Exception as e:
print(f"[ERROR] Environment setup failed: {e}") print(f"[ERROR] Environment setup failed: {e}")
@ -217,7 +317,7 @@ class CondaRunner:
env.update( env.update(
{ {
"CONDA_DEFAULT_ENV": self.conda_env, "CONDA_DEFAULT_ENV": self.conda_env,
"CUDA_VISIBLE_DEVICES": "0", # Allow GPU access "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", ""), # Allow GPU access
"SECURE_MODE": "1", "SECURE_MODE": "1",
"NETWORK_ACCESS": ( "NETWORK_ACCESS": (
"1" "1"
@ -280,7 +380,7 @@ class CondaRunner:
"stdout": stdout, "stdout": stdout,
"stderr": stderr, "stderr": stderr,
"return_code": process.returncode, "return_code": process.returncode,
"gpu_accessible": True, "gpu_accessible": len(self.gpu_devices) > 0,
"security_mode": "enabled", "security_mode": "enabled",
"container_type": "conda", "container_type": "conda",
"conda_env": self.conda_env, "conda_env": self.conda_env,
@ -338,8 +438,12 @@ def main():
parser.add_argument( parser.add_argument(
"--workspace", default="/workspace", help="Workspace directory" "--workspace", default="/workspace", help="Workspace directory"
) )
parser.add_argument("--requirements", help="Requirements file path") parser.add_argument("--deps", help="Dependency manifest path (environment.yml | poetry.lock | pyproject.toml | requirements.txt)")
parser.add_argument("--requirements", help="Deprecated alias for --deps")
parser.add_argument("--script", help="Training script path") parser.add_argument("--script", help="Training script path")
parser.add_argument(
"--prepare-only", action="store_true", help="Only prepare dependencies and exit"
)
parser.add_argument( parser.add_argument(
"--args", "--args",
nargs=argparse.REMAINDER, nargs=argparse.REMAINDER,
@ -383,17 +487,26 @@ def main():
if args.check_gpu: if args.check_gpu:
return 0 return 0
deps_arg = args.deps or args.requirements
if not deps_arg:
print("[ERROR] Missing dependency manifest. Provide --deps.")
return 1
# Setup environment # Setup environment
requirements_path = Path(args.requirements) deps_path = Path(deps_arg)
if not requirements_path.exists(): if not deps_path.exists():
print(f"[ERROR] Requirements file not found: {requirements_path}") print(f"[ERROR] Dependency manifest not found: {deps_path}")
return 1 return 1
print("[SETUP] Setting up secure environment...") print("[SETUP] Setting up secure environment...")
if not runner.setup_environment(requirements_path): if not runner.setup_environment(deps_path):
print("[ERROR] Failed to setup secure environment") print("[ERROR] Failed to setup secure environment")
return 1 return 1
if args.prepare_only:
print("[DONE] Environment prepared successfully")
return 0
# Run experiment # Run experiment
script_path = Path(args.script) script_path = Path(args.script)
if not script_path.exists(): if not script_path.exists():

View file

@ -24,7 +24,7 @@
], ],
"max_execution_time": 3600, "max_execution_time": 3600,
"max_memory_gb": 16, "max_memory_gb": 16,
"gpu_access": true, "gpu_devices": ["/dev/dri"],
"allow_file_writes": true, "allow_file_writes": true,
"resource_limits": { "resource_limits": {
"cpu_count": 4, "cpu_count": 4,

View file

@ -20,19 +20,12 @@ This directory contains setup and utility scripts for FetchML.
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group
``` ```
### `validate-prod-config.sh` ### Configuration validation
**Purpose**: Validates production configuration files Validate configs using the built-in config lint targets:
**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]`
**What it does**:
- Checks config file syntax
- Verifies base_path consistency
- Tests Redis connectivity
- Validates Podman setup
- Checks directory permissions
**Example**:
```bash ```bash
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml make configlint
make worker-configlint
``` ```
## Legacy Setup Scripts (Deprecated) ## Legacy Setup Scripts (Deprecated)
@ -44,12 +37,11 @@ The following scripts are from earlier iterations and are **deprecated** in favo
- `auto_setup.sh` - Old automated setup (superseded) - `auto_setup.sh` - Old automated setup (superseded)
- `setup_common.sh` - Common functions (integrated into setup-prod.sh) - `setup_common.sh` - Common functions (integrated into setup-prod.sh)
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead) - `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh)
### Cleanup Recommendation ### Cleanup Recommendation
These legacy scripts can be removed or archived. The current production setup only needs: These legacy scripts can be removed or archived. The current production setup only needs:
- `setup-prod.sh` - `setup-prod.sh`
- `validate-prod-config.sh`
## Usage Workflow ## Usage Workflow
@ -59,8 +51,8 @@ These legacy scripts can be removed or archived. The current production setup on
sudo ./scripts/setup-prod.sh sudo ./scripts/setup-prod.sh
# 2. Copy and configure # 2. Copy and configure
sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml sudo cp configs/api/prod.yaml /etc/fetch_ml/config.yaml
sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml sudo cp configs/workers/worker-prod.toml /etc/fetch_ml/worker.toml
sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc. sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc.
# 3. Build and install # 3. Build and install
@ -68,7 +60,8 @@ make prod
sudo make install sudo make install
# 4. Validate # 4. Validate
./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml ./bin/configlint --schema configs/schema/api_server_config.yaml /etc/fetch_ml/config.yaml
./bin/configlint --schema configs/schema/worker_config_schema.yaml /etc/fetch_ml/worker.toml
# 5. Start services # 5. Start services
sudo systemctl start fetchml-api fetchml-worker sudo systemctl start fetchml-api fetchml-worker
@ -82,7 +75,7 @@ docker-compose up -d
# Or run components directly # Or run components directly
make dev make dev
./bin/api-server -config configs/config-local.yaml ./bin/api-server -config configs/api/dev.yaml
``` ```
## Script Maintenance ## Script Maintenance

View file

@ -8,6 +8,7 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S") TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP" RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP"
@ -168,14 +169,25 @@ if [ -f "$SCRIPT_DIR/cleanup-benchmarks.sh" ]; then
"$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks "$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks
else else
# Fallback cleanup if script not available # Fallback cleanup if script not available
echo "Cleaning old benchmark runs (keeping last 10)..." echo "Archiving old benchmark runs (keeping last 10)..."
stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || echo "No old runs to clean" ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
[ -n "$run" ] || continue
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done
# Clean temporary files # Clean temporary files
echo "Cleaning temporary files..." echo "Archiving temporary files..."
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true mkdir -p "$tmp_archive_dir"
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
# Clean Go build cache # Clean Go build cache
echo "Cleaning Go build cache..." echo "Cleaning Go build cache..."

View file

View file

@ -1,49 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

# Create a Bitwarden item for a FetchML API user.
#
# Usage:
#   ./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash>
#
# Requirements:
#   - Bitwarden CLI (bw) installed
#   - You are logged in and unlocked (bw login; bw unlock)
#   - jq installed
#
# This script does NOT run on the homelab server. Run it from your
# own machine where you manage Bitwarden.

# Fail fast with a clear message if a required tool is missing
# (the script's stated requirements were previously unchecked).
for tool in bw jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required tool '$tool' is not installed or not in PATH" >&2
    exit 1
  fi
done

if [[ $# -ne 3 ]]; then
  echo "Usage: $0 <username> <api_key> <api_key_hash>" >&2
  exit 1
fi

USER_NAME="$1"
API_KEY="$2"
API_KEY_HASH="$3"

# Fixed: the separator here was a mis-encoded (mojibake) dash character.
ITEM_NAME="FetchML API - $USER_NAME"

# Get base item template
TEMPLATE_JSON=$(bw get template item)

# Build item JSON with jq. The API key is stored as the login password and
# its hash as a custom field (field type 1 = hidden).
ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \
  --arg name "$ITEM_NAME" \
  --arg username "$USER_NAME" \
  --arg password "$API_KEY" \
  --arg hash "$API_KEY_HASH" \
  '.name = $name
   | .login.username = $username
   | .login.password = $password
   | .notes = "FetchML API key for user " + $username
   | .fields = [{"name":"api_key_hash","value":$hash,"type":1}]')

# Create item in Bitwarden.
# If you ever want to edit instead, you can capture the ID from this call
# and use: bw edit item <id> <json>
echo "$ITEM_JSON" | bw encode | bw create item

echo "Created Bitwarden item: $ITEM_NAME"

View file

@ -1,90 +0,0 @@
#!/bin/bash
# Setup auto-cleanup service for fetch_ml.
# On Linux this installs a systemd timer that runs cleanup daily;
# on macOS it installs an equivalent launchd agent.
set -euo pipefail

# Resolve the directory containing this script and the project root above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Colors
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'

# Print an informational message with a blue [INFO] prefix.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

# Print a success message with a green [SUCCESS] prefix.
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_info "Setting up auto-cleanup service..."

# Check if running on macOS or Linux
if [[ "$OSTYPE" == "darwin"* ]]; then
    log_info "Detected macOS - setting up launchd agent"

    # Create launchd plist.
    # The heredoc delimiter is deliberately unquoted so that $PROJECT_DIR is
    # expanded into the generated plist. StartInterval 86400 = every 24 hours.
    cat > ~/Library/LaunchAgents/com.fetchml.cleanup.plist << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.fetchml.cleanup</string>
    <key>ProgramArguments</key>
    <array>
        <string>$PROJECT_DIR/scripts/cleanup.sh</string>
        <string>--force</string>
    </array>
    <key>StartInterval</key>
    <integer>86400</integer>
    <key>RunAtLoad</key>
    <false/>
    <key>StandardOutPath</key>
    <string>/tmp/fetchml-cleanup.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/fetchml-cleanup.error.log</string>
</dict>
</plist>
EOF

    # Load the launchd agent so the daily schedule takes effect immediately.
    # NOTE(review): 'launchctl load' is the legacy subcommand on newer macOS
    # ('launchctl bootstrap' is preferred) — confirm target macOS versions.
    launchctl load ~/Library/LaunchAgents/com.fetchml.cleanup.plist

    log_success "Auto-cleanup service installed for macOS"
    log_info "Logs will be in /tmp/fetchml-cleanup.log"

elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
    log_info "Detected Linux - setting up systemd timer"

    # Copy service files (unit + timer are expected to live next to this script).
    sudo cp "$SCRIPT_DIR/auto-cleanup.service" /etc/systemd/system/
    sudo cp "$SCRIPT_DIR/auto-cleanup.timer" /etc/systemd/system/

    # Reload systemd and enable timer
    sudo systemctl daemon-reload
    sudo systemctl enable auto-cleanup.timer
    sudo systemctl start auto-cleanup.timer

    log_success "Auto-cleanup service installed for Linux"
    log_info "Check status with: systemctl status auto-cleanup.timer"
else
    echo "Unsupported OS: $OSTYPE"
    exit 1
fi

log_info "Auto-cleanup will run daily"
log_info "To uninstall:"
if [[ "$OSTYPE" == "darwin"* ]]; then
    echo "  launchctl unload ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
    echo "  rm ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
else
    echo "  sudo systemctl stop auto-cleanup.timer"
    echo "  sudo systemctl disable auto-cleanup.timer"
    echo "  sudo rm /etc/systemd/system/auto-cleanup.*"
fi

View file

@ -1,275 +0,0 @@
#!/bin/bash
# Production Monitoring Stack Setup for Linux
# Deploys Prometheus/Grafana/Loki/Promtail as Podman containers with systemd
# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc.
set -e
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[0;33m'
NC='\033[0m'
echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n"
# Detect Linux distribution and package manager
detect_distro() {
if [ -f /etc/os-release ]; then
. /etc/os-release
DISTRO=$ID
DISTRO_VERSION=$VERSION_ID
elif [ -f /etc/redhat-release ]; then
DISTRO="rhel"
else
DISTRO="unknown"
fi
# Detect package manager
if command -v dnf &>/dev/null; then
PKG_MANAGER="dnf"
elif command -v yum &>/dev/null; then
PKG_MANAGER="yum"
elif command -v apt-get &>/dev/null; then
PKG_MANAGER="apt"
elif command -v pacman &>/dev/null; then
PKG_MANAGER="pacman"
elif command -v zypper &>/dev/null; then
PKG_MANAGER="zypper"
else
echo -e "${YELLOW}Warning: No known package manager found${NC}"
PKG_MANAGER="unknown"
fi
echo "Detected distribution: $DISTRO (using $PKG_MANAGER)"
}
detect_distro
# Configuration
DATA_PATH="${1:-/data/monitoring}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"
echo "Configuration:"
echo " Monitoring data path: $DATA_PATH"
echo " User: $ML_USER"
echo " Group: $ML_GROUP"
echo ""
# Create pod for monitoring stack
POD_NAME="monitoring"
# 1. Create directories
echo -e "${BLUE}[1/6]${NC} Creating directory structure..."
sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config}
sudo mkdir -p /etc/fetch_ml/monitoring
sudo mkdir -p /var/lib/grafana/dashboards
sudo chown -R $ML_USER:$ML_GROUP $DATA_PATH
sudo chmod 755 $DATA_PATH
echo -e "${GREEN}${NC} Directories created"
# 2. Copy configuration files
echo -e "${BLUE}[2/6]${NC} Copying configuration files..."
sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/grafana/provisioning /etc/fetch_ml/monitoring/ -r
sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json
sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json
sudo chown -R $ML_USER:$ML_GROUP /etc/fetch_ml/monitoring
sudo chown -R $ML_USER:$ML_GROUP /var/lib/grafana
echo -e "${GREEN}${NC} Configuration copied"
# 3. Create Podman pod
echo -e "${BLUE}[3/6]${NC} Creating Podman pod..."
sudo -u $ML_USER podman pod create \\
--name $POD_NAME \\
-p 3000:3000 \\
-p 9090:9090 \\
-p 3100:3100 \\
|| echo "Pod may already exist"
echo -e "${GREEN}${NC} Pod created"
# 4. Create systemd service for monitoring pod
echo -e "${BLUE}[4/6]${NC} Creating systemd services..."
# Prometheus service
sudo tee /etc/systemd/system/prometheus.service >/dev/null <<EOF
[Unit]
Description=Prometheus Monitoring
After=network.target
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 9090:9090
ExecStart=/usr/bin/podman run --rm --name prometheus \\
--pod $POD_NAME \\
-v /etc/fetch_ml/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro \\
-v ${DATA_PATH}/prometheus:/prometheus \\
docker.io/prom/prometheus:latest \\
--config.file=/etc/prometheus/prometheus.yml \\
--storage.tsdb.path=/prometheus \\
--web.enable-lifecycle
ExecStop=/usr/bin/podman stop -t 10 prometheus
[Install]
WantedBy=multi-user.target
EOF
# Loki service.
# systemd Exec* lines do not understand "||"; "ExecStartPre=-..." ignores
# the pre-command's failure when the pod already exists.
sudo tee /etc/systemd/system/loki.service >/dev/null <<EOF
[Unit]
Description=Loki Log Aggregation
After=network.target
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=-/usr/bin/podman pod create --name $POD_NAME -p 3100:3100
ExecStart=/usr/bin/podman run --rm --name loki \\
--pod $POD_NAME \\
-v /etc/fetch_ml/monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro \\
-v ${DATA_PATH}/loki:/loki \\
docker.io/grafana/loki:latest \\
-config.file=/etc/loki/local-config.yaml
ExecStop=/usr/bin/podman stop -t 10 loki
[Install]
WantedBy=multi-user.target
EOF
# Grafana service.
# Two fixes vs. the shell-style original:
#  - systemd Exec* lines are not shell: "||" is unsupported, so the
#    pre-command uses the "ExecStartPre=-..." ignore-failure prefix.
#  - systemd does not perform ${VAR:-default} expansion in unit files,
#    so the admin password default is expanded here, at file-creation
#    time (unquoted heredoc). The default "admin" matches the summary
#    printed at the end of this script ("change on first login");
#    export GRAFANA_ADMIN_PASSWORD before running to override.
sudo tee /etc/systemd/system/grafana.service >/dev/null <<EOF
[Unit]
Description=Grafana Visualization
After=network.target prometheus.service loki.service
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=-/usr/bin/podman pod create --name $POD_NAME -p 3000:3000
ExecStart=/usr/bin/podman run --rm --name grafana \\
--pod $POD_NAME \\
-v ${DATA_PATH}/grafana:/var/lib/grafana \\
-v /etc/fetch_ml/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro \\
-v /var/lib/grafana/dashboards:/var/lib/grafana/dashboards:ro \\
-e GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} \\
-e GF_USERS_ALLOW_SIGN_UP=false \\
-e GF_AUTH_ANONYMOUS_ENABLED=false \\
docker.io/grafana/grafana:latest
ExecStop=/usr/bin/podman stop -t 10 grafana
[Install]
WantedBy=multi-user.target
EOF
# Promtail service.
# systemd Exec* lines do not understand "||"; "ExecStartPre=-..." ignores
# the pre-command's failure when the pod already exists. Promtail joins
# the pod but publishes no ports of its own.
sudo tee /etc/systemd/system/promtail.service >/dev/null <<EOF
[Unit]
Description=Promtail Log Collector
After=network.target loki.service
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=-/usr/bin/podman pod create --name $POD_NAME
ExecStart=/usr/bin/podman run --rm --name promtail \\
--pod $POD_NAME \\
-v /etc/fetch_ml/monitoring/promtail-config.yml:/etc/promtail/config.yml:ro \\
-v /var/log/fetch_ml:/var/log/app:ro \\
docker.io/grafana/promtail:latest \\
-config.file=/etc/promtail/config.yml
ExecStop=/usr/bin/podman stop -t 10 promtail
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
echo -e "${GREEN}${NC} Systemd services created"
# 5. Create monitoring pod service
# Generate a systemd unit that manages the pod lifecycle itself, so the
# per-container services above (PartOf=$POD_NAME-pod.service) stop with it.
# NOTE(review): "podman generate systemd" requires the pod to already exist
# (created in step 3) and is deprecated in newer Podman in favor of Quadlet
# — verify against the target Podman version.
# NOTE(review): the trailing "\\" escapes the backslash rather than the
# newline, which would cut the pipeline in two — verify this should be a
# single "\" continuation.
echo -e "${BLUE}[5/6]${NC} Creating pod management service..."
sudo -u $ML_USER podman generate systemd --new --name $POD_NAME \\
| sudo tee /etc/systemd/system/$POD_NAME-pod.service >/dev/null
sudo systemctl daemon-reload
echo -e "${GREEN}${NC} Pod service created"
# 6. Setup firewall rules
# Opens only the externally-facing UI/API ports (Grafana 3000, Prometheus
# 9090); Loki's 3100 stays internal to the pod/host.
echo -e "${BLUE}[6/6]${NC} Configuring firewall..."
if command -v firewall-cmd &>/dev/null; then
# RHEL/Rocky/Fedora (firewalld)
sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana
sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus
sudo firewall-cmd --reload
echo -e "${GREEN}${NC} Firewall configured (firewalld)"
elif command -v ufw &>/dev/null; then
# Ubuntu/Debian (ufw)
sudo ufw allow 3000/tcp comment 'Grafana'
sudo ufw allow 9090/tcp comment 'Prometheus'
echo -e "${GREEN}${NC} Firewall configured (ufw)"
else
echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090"
fi
# Summary
# Printed instructions for the operator; start order matters (promtail
# ships to loki, grafana reads from prometheus/loki).
echo ""
echo -e "${BOLD}=== Monitoring Stack Setup Complete! ===${NC}"
echo ""
echo "Services created:"
echo " - prometheus.service (Metrics collection)"
echo " - loki.service (Log aggregation)"
echo " - grafana.service (Visualization)"
echo " - promtail.service (Log shipping)"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Start services:"
echo " sudo systemctl start prometheus"
echo " sudo systemctl start loki"
echo " sudo systemctl start promtail"
echo " sudo systemctl start grafana"
echo ""
echo "2. Enable on boot:"
echo " sudo systemctl enable prometheus loki promtail grafana"
echo ""
echo "3. Access Grafana:"
echo " http://YOUR_SERVER_IP:3000"
echo " Username: admin"
echo " Password: admin (change on first login)"
echo ""
echo "4. Check logs:"
echo " sudo journalctl -u prometheus -f"
echo " sudo journalctl -u grafana -f"
echo ""

View file

@ -1,229 +0,0 @@
#!/bin/bash
# Production Setup Script for Rocky Linux (Bare Metal)
# This script sets up the complete FetchML environment on bare metal
#
# Usage: setup.sh [BASE_PATH] [ML_USER] [ML_GROUP]
# Requires sudo privileges; runs 8 sequential steps (user, directories,
# permissions, packages, podman/GPU, redis, systemd units, logrotate).
set -e
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'
echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n"
# Configuration — all three settings are overridable positional arguments.
BASE_PATH="${1:-/data/ml-experiments}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"
echo "Configuration:"
echo " Base path: $BASE_PATH"
echo " ML user: $ML_USER"
echo " ML group: $ML_GROUP"
echo ""
# 1. Create system user if it doesn't exist
# -r: system account, with a home dir so interactive debugging is possible.
echo -e "${BLUE}[1/8]${NC} Creating system user..."
if id "$ML_USER" &>/dev/null; then
echo " User $ML_USER already exists"
else
sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER
echo -e "${GREEN}${NC} Created user: $ML_USER"
fi
# 2. Create directory structure
# Brace expansion creates the per-state experiment queues in one call.
echo -e "${BLUE}[2/8]${NC} Creating directory structure..."
sudo mkdir -p "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets}
sudo mkdir -p /var/log/fetch_ml
sudo mkdir -p /etc/fetch_ml
echo -e "${GREEN}${NC} Created directories:"
echo " $BASE_PATH/experiments/"
echo " $BASE_PATH/pending/"
echo " $BASE_PATH/running/"
echo " $BASE_PATH/finished/"
echo " $BASE_PATH/failed/"
echo " $BASE_PATH/datasets/"
echo " /var/log/fetch_ml/"
echo " /etc/fetch_ml/"
# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."
sudo chown -R $ML_USER:$ML_GROUP $BASE_PATH
sudo chmod 755 $BASE_PATH
sudo chmod 700 $BASE_PATH/experiments # Restrict experiment data
sudo chown -R $ML_USER:$ML_GROUP /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml
echo -e "${GREEN}${NC} Permissions set"
# 4. Install system dependencies (Rocky Linux)
# The trailing "|| echo" is deliberate best-effort: dnf exits non-zero on
# already-installed conflicts and set -e would otherwise abort the script.
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."
sudo dnf install -y \
golang \
podman \
redis \
git \
make \
gcc \
|| echo "Some packages may already be installed"
echo -e "${GREEN}${NC} Dependencies installed"
# 5. Configure Podman for GPU access (if NVIDIA GPU present)
# Uses CDI (Container Device Interface) so containers can request
# --device nvidia.com/gpu=... without the legacy hook mechanism.
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if lspci | grep -i nvidia &>/dev/null; then
echo " NVIDIA GPU detected, configuring GPU access..."
# Install nvidia-container-toolkit if not present
if ! command -v nvidia-container-toolkit &>/dev/null; then
echo " Installing nvidia-container-toolkit..."
sudo dnf config-manager --add-repo \
https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
sudo dnf install -y nvidia-container-toolkit
fi
# Configure Podman CDI
sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
echo -e "${GREEN}${NC} GPU support configured"
else
echo " No NVIDIA GPU detected, skipping GPU setup"
fi
# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
sudo systemctl enable redis
sudo systemctl start redis || echo "Redis may already be running"
# Set Redis password if not already configured
# NOTE(review): the config path is /etc/redis/redis.conf here, but
# Rocky 8's redis package uses /etc/redis.conf (the sibling Rocky setup
# script edits that path) — verify which layout the target release uses;
# tee -a will fail if the parent directory does not exist.
if ! sudo grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then
REDIS_PASSWORD=$(openssl rand -base64 32)
echo "requirepass $REDIS_PASSWORD" | sudo tee -a /etc/redis/redis.conf >/dev/null
sudo systemctl restart redis
echo " Generated Redis password: $REDIS_PASSWORD"
echo " Save this password for your configuration!"
else
echo " Redis password already configured"
fi
echo -e "${GREEN}${NC} Redis configured"
# 7. Setup systemd services
# Both units are hardened (NoNewPrivileges/ProtectSystem=strict) and may
# only write under $BASE_PATH and /var/log/fetch_ml.
echo -e "${BLUE}[7/8]${NC} Creating systemd services..."
# API Server service
sudo tee /etc/systemd/system/fetchml-api.service >/dev/null <<EOF
[Unit]
Description=FetchML API Server
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-api -config /etc/fetch_ml/config.yaml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/api.log
StandardError=append:/var/log/fetch_ml/api-error.log
# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml
[Install]
WantedBy=multi-user.target
EOF
# Worker service
sudo tee /etc/systemd/system/fetchml-worker.service >/dev/null <<EOF
[Unit]
Description=FetchML Worker
After=network.target redis.service fetchml-api.service
Wants=redis.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetch_ml/worker.toml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/worker.log
StandardError=append:/var/log/fetch_ml/worker-error.log
# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
echo -e "${GREEN}${NC} Systemd services created"
# 8. Setup logrotate
# Daily rotation, 14 kept; postrotate reloads both services best-effort.
echo -e "${BLUE}[8/8]${NC} Configuring log rotation..."
sudo tee /etc/logrotate.d/fetchml >/dev/null <<EOF
/var/log/fetch_ml/*.log {
daily
rotate 14
compress
delaycompress
notifempty
missingok
create 0640 $ML_USER $ML_GROUP
sharedscripts
postrotate
systemctl reload fetchml-api >/dev/null 2>&1 || true
systemctl reload fetchml-worker >/dev/null 2>&1 || true
endscript
}
EOF
echo -e "${GREEN}${NC} Log rotation configured"
# Summary
echo ""
echo -e "${BOLD}=== Setup Complete! ===${NC}"
echo ""
echo "Directory structure created at: $BASE_PATH"
echo "Logs will be written to: /var/log/fetch_ml/"
echo "Configuration directory: /etc/fetch_ml/"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Copy your config files:"
echo " sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml"
echo " sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml"
echo ""
echo "2. Build and install binaries:"
echo " make build"
echo " sudo cp bin/api-server /usr/local/bin/fetchml-api"
echo " sudo cp bin/worker /usr/local/bin/fetchml-worker"
echo ""
echo "3. Update config files with your settings (Redis password, API keys, etc.)"
echo ""
echo "4. Start services:"
echo " sudo systemctl start fetchml-api"
echo " sudo systemctl start fetchml-worker"
echo ""
echo "5. Enable services to start on boot:"
echo " sudo systemctl enable fetchml-api"
echo " sudo systemctl enable fetchml-worker"
echo ""
echo "6. Check status:"
echo " sudo systemctl status fetchml-api"
echo " sudo systemctl status fetchml-worker"
echo " sudo journalctl -u fetchml-api -f"
echo ""

View file

@ -1,455 +0,0 @@
#!/bin/bash
# Automatic Setup Script for ML Experiment Manager
# Handles complete environment setup with security features
set -euo pipefail

# ANSI color codes used by the logging helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Leveled logging helpers: colored tag followed by the caller's message.
print_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error()   { echo -e "${RED}[ERROR]${NC} $1"; }

# Classify the host OS from bash's $OSTYPE: "macos", "linux", or "unknown".
detect_os() {
    case "$OSTYPE" in
        darwin*)    echo "macos" ;;
        linux-gnu*) echo "linux" ;;
        *)          echo "unknown" ;;
    esac
}
# Install Go via Homebrew (macOS) or the official tarball (Linux).
# NOTE(review): the Linux path downloads without checksum verification,
# unlike secure_download in setup_common.sh — consider unifying.
install_go() {
print_info "Installing Go..."
local os=$(detect_os)
local go_version="1.23.0"
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install go
else
print_error "Homebrew not found. Please install Go manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
wget -q "https://go.dev/dl/go${go_version}.linux-amd64.tar.gz"
sudo rm -rf /usr/local/go
sudo tar -C /usr/local -xzf "go${go_version}.linux-amd64.tar.gz"
rm "go${go_version}.linux-amd64.tar.gz"
# Add to PATH
echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
export PATH=$PATH:/usr/local/go/bin
fi
print_success "Go installed"
}
# Install Zig via Homebrew (macOS) or the upstream binary tarball (Linux).
install_zig() {
print_info "Installing Zig..."
local os=$(detect_os)
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install zig
else
print_error "Homebrew not found. Please install Zig manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
# Download Zig binary
local zig_version="0.13.0"
wget -q "https://ziglang.org/download/${zig_version}/zig-linux-x86_64-${zig_version}.tar.xz"
tar -xf "zig-linux-x86_64-${zig_version}.tar.xz"
sudo mv "zig-linux-x86_64-${zig_version}/zig" /usr/local/bin/
rm -rf "zig-linux-x86_64-${zig_version}.tar.xz" "zig-linux-x86_64-${zig_version}"
fi
print_success "Zig installed"
}
# Install Docker: Docker Desktop cask on macOS, get.docker.com script on
# Linux (adds $USER to the docker group; takes effect on next login).
install_docker() {
print_info "Installing Docker..."
local os=$(detect_os)
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install --cask docker
print_warning "Docker Desktop installed. Please start it manually."
else
print_error "Homebrew not found. Please install Docker manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
# Install Docker using official script
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh
sudo usermod -aG docker $USER
rm get-docker.sh
# Start Docker
sudo systemctl enable docker
sudo systemctl start docker
print_success "Docker installed. You may need to log out and log back in."
fi
}
# Install and start Redis.
# NOTE(review): the Linux branch assumes apt-get (Debian/Ubuntu); this
# will fail on dnf/yum systems — verify the intended target distros.
install_redis() {
print_info "Installing Redis..."
local os=$(detect_os)
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install redis
brew services start redis
else
print_error "Homebrew not found. Please install Redis manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y redis-server
sudo systemctl enable redis-server
sudo systemctl start redis-server
fi
print_success "Redis installed and started"
}
# Install auxiliary CLI tools plus Go linters/formatters (if Go is present).
install_dependencies() {
print_info "Installing dependencies..."
local os=$(detect_os)
# Install basic tools
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install openssl curl jq
fi
elif [[ "$os" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y openssl curl jq build-essential
fi
# Install Go tools
if command -v go &> /dev/null; then
go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
go install golang.org/x/tools/cmd/goimports@latest
fi
print_success "Dependencies installed"
}
# Create the project directory layout and build all binaries.
# Prefers the Makefile; falls back to invoking go/zig directly.
setup_project() {
    print_info "Setting up project..."
    # Working directories (idempotent).
    mkdir -p bin data logs db ssl configs scripts
    # Build: Makefile path first, manual builds otherwise.
    if command -v make &> /dev/null; then
        make build
        if command -v zig &> /dev/null; then
            make cli-build
        fi
    else
        print_warning "Make not found, building manually..."
        go build -o bin/worker ./cmd/worker
        go build -o bin/tui ./cmd/tui
        go build -o bin/data_manager ./cmd/data_manager
        go build -o bin/user_manager ./cmd/user_manager
        go build -o bin/api-server ./cmd/api-server
        if command -v zig &> /dev/null; then
            cd cli && zig build && cd ..
        fi
    fi
    print_success "Project setup completed"
}
# Generate development TLS certs, random secrets, and a secure config
# file. Secrets fall back to fixed dev strings if openssl is missing.
setup_security() {
print_info "Setting up security features..."
# Generate SSL certificates
# Self-signed, 365 days, SAN for localhost/127.0.0.1 (dev use only).
if command -v openssl &> /dev/null; then
openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
-days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
-addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
print_warning "Failed to generate SSL certificates"
}
print_success "SSL certificates generated"
fi
# Generate secure configuration
local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")
# Unquoted heredoc: $(...) and ${...} below expand NOW, baking the
# generated hash/password into the written YAML.
cat > configs/security-config.yaml << EOF
base_path: "/data/ml-experiments"
auth:
enabled: true
api_keys:
test_user:
hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
admin: true
roles: ["data_scientist", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "./ssl/cert.pem"
key_file: "./ssl/key.pem"
min_version: "1.3"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "10.0.0.0/8"
- "192.168.0.0/16"
- "172.16.0.0/12"
failed_login_lockout:
enabled: true
max_attempts: 5
lockout_duration: "15m"
redis:
url: "redis://localhost:6379"
password: "${redis_password}"
logging:
level: "info"
file: "logs/fetch_ml.log"
audit_log: "logs/audit.log"
EOF
# Environment file consumed via "source .env.dev" (see show_next_steps).
cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF
print_success "Security configuration created"
}
# Smoke-test the toolchain: counts required tools as pass/fail and
# removes optional tools (zig/docker/redis) from the total when absent.
test_installation() {
print_info "Testing installation..."
local tests_passed=0
local tests_total=0
# Test Go
tests_total=$((tests_total + 1))
if command -v go &> /dev/null; then
print_success "Go: Installed"
tests_passed=$((tests_passed + 1))
else
print_error "Go: Not found"
fi
# Test Zig
tests_total=$((tests_total + 1))
if command -v zig &> /dev/null; then
print_success "Zig: Installed"
tests_passed=$((tests_passed + 1))
else
print_warning "Zig: Not found (optional)"
tests_total=$((tests_total - 1))
fi
# Test Docker
tests_total=$((tests_total + 1))
if command -v docker &> /dev/null; then
print_success "Docker: Installed"
tests_passed=$((tests_passed + 1))
else
print_warning "Docker: Not found (optional)"
tests_total=$((tests_total - 1))
fi
# Test Redis
tests_total=$((tests_total + 1))
if command -v redis-cli &> /dev/null; then
if redis-cli ping | grep -q "PONG"; then
print_success "Redis: Running"
tests_passed=$((tests_passed + 1))
else
print_warning "Redis: Not running"
fi
else
print_warning "Redis: Not found (optional)"
tests_total=$((tests_total - 1))
fi
# Test binaries
# Only counted when the binary exists; --help must exit 0.
if [[ -f "bin/api-server" ]]; then
tests_total=$((tests_total + 1))
if ./bin/api-server --help > /dev/null 2>&1; then
print_success "API Server: Built"
tests_passed=$((tests_passed + 1))
else
print_error "API Server: Build failed"
fi
fi
# Integer percentage; guarded against division by zero.
if [[ $tests_total -gt 0 ]]; then
local success_rate=$((tests_passed * 100 / tests_total))
print_info "Tests: $tests_passed/$tests_total passed ($success_rate%)"
fi
print_success "Installation testing completed"
}
# Print the post-setup usage instructions for the operator.
show_next_steps() {
print_success "Automatic setup completed!"
echo
echo "Next Steps:"
echo "==========="
echo ""
echo "1. Load environment variables:"
echo " source .env.dev"
echo ""
echo "2. Start the API server:"
echo " ./bin/api-server -config configs/config.yaml"
echo ""
echo "3. Test the Zig CLI (if installed):"
echo " ./cli/zig-out/bin/ml --help"
echo ""
echo "4. Deploy with Docker (optional):"
echo " make docker-run"
echo ""
echo "5. Docker Compose deployment:"
echo " docker-compose up -d"
echo ""
echo "Configuration Files:"
echo " configs/config.yaml # Main configuration"
echo " configs/config_local.yaml # Local development"
echo " ssl/cert.pem, ssl/key.pem # TLS certificates"
echo ""
echo "Documentation:"
echo " docs/DEPLOYMENT.md # Deployment guide"
echo ""
echo "Quick Commands:"
echo " make help # Show all commands"
echo " make test # Run tests"
echo " docker-compose up -d # Start services"
echo ""
print_success "Ready to use ML Experiment Manager!"
}
# Main setup function
# Installs each missing tool, then runs project/security setup and the
# installation smoke test, in dependency order.
main() {
echo "ML Experiment Manager Automatic Setup"
echo "====================================="
echo ""
print_info "Starting automatic setup..."
echo ""
# Check and install dependencies
if ! command -v go &> /dev/null; then
print_info "Go not found, installing..."
install_go
fi
if ! command -v zig &> /dev/null; then
print_info "Zig not found, installing..."
install_zig
fi
if ! command -v docker &> /dev/null; then
print_info "Docker not found, installing..."
install_docker
fi
if ! command -v redis-cli &> /dev/null; then
print_info "Redis not found, installing..."
install_redis
fi
# Install additional dependencies
install_dependencies
# Setup project
setup_project
# Setup security
setup_security
# Test installation
test_installation
# Show next steps
show_next_steps
}
# Handle command line arguments
# Default subcommand is "setup" when no argument is given.
case "${1:-setup}" in
"setup")
main
;;
"deps")
install_dependencies
;;
"test")
test_installation
;;
"help"|"-h"|"--help")
echo "Automatic Setup Script"
echo "Usage: $0 {setup|deps|test|help}"
echo ""
echo "Commands:"
echo " setup - Run full automatic setup"
echo " deps - Install dependencies only"
echo " test - Test installation"
echo " help - Show this help"
;;
*)
print_error "Unknown command: $1"
echo "Use '$0 help' for usage information"
exit 1
;;
esac

View file

@ -1,314 +0,0 @@
#!/usr/bin/env bash
# Fetch ML Quick Start Script with Security
# Sets up development environment with security features and creates test user
set -euo pipefail

# ANSI color codes for the leveled logging helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Logging helpers: colored level tag followed by the caller's message.
print_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error()   { echo -e "${RED}[ERROR]${NC} $1"; }
# Verify the toolchain: Go is mandatory (exits 1 if missing); Zig,
# Docker, Redis, and OpenSSL are optional and only produce warnings.
check_prerequisites() {
print_info "Checking prerequisites..."
# Check Go
if ! command -v go &> /dev/null; then
print_error "Go is not installed. Please install Go 1.25 or later."
exit 1
fi
# Version is reported only; no minimum-version enforcement is done here.
local go_version=$(go version | awk '{print $3}' | sed 's/go//')
print_info "Go version: $go_version"
# Check Zig
if ! command -v zig &> /dev/null; then
print_warning "Zig is not installed. CLI features will not be available."
else
local zig_version=$(zig version)
print_info "Zig version: $zig_version"
fi
# Check Docker
if ! command -v docker &> /dev/null; then
print_warning "Docker is not installed. Container features will not work."
fi
# Check Redis
if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then
print_warning "Redis is not installed. Starting local Redis..."
fi
# Check OpenSSL for certificates
if ! command -v openssl &> /dev/null; then
print_warning "OpenSSL is not installed. TLS certificates will not be generated."
fi
print_success "Prerequisites checked"
}
# Create the working-directory layout for the quick-start environment.
setup_project() {
    print_info "Setting up Fetch ML project..."
    # All directories in one idempotent call.
    mkdir -p bin data logs db ssl configs
    print_success "Project directories created"
}
# Build Go binaries via make; additionally build the Zig CLI when zig is
# available.
build_project() {
print_info "Building Fetch ML..."
# Build Go binaries
make build
# Build Zig CLI if available
if command -v zig &> /dev/null; then
make cli-build
print_success "Zig CLI built"
fi
print_success "Build completed"
}
# Generate a self-signed development certificate (365 days, SAN for
# localhost/127.0.0.1) into ssl/. Returns 1 when generation fails.
generate_ssl_certificates() {
print_info "Generating SSL certificates..."
if command -v openssl &> /dev/null; then
# Generate self-signed certificate for development
openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
-days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
-addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
print_warning "Failed to generate SSL certificates"
return 1
}
print_success "SSL certificates generated in ssl/"
print_info "Certificates are self-signed (development only)"
else
print_warning "OpenSSL not available, skipping SSL certificates"
fi
}
# Start a local Redis daemon on 6379 if one is not already running.
setup_redis() {
print_info "Setting up Redis..."
if command -v redis-server &> /dev/null; then
if ! pgrep -f "redis-server" > /dev/null; then
redis-server --daemonize yes --port 6379
print_success "Redis started"
else
print_info "Redis already running"
fi
else
print_warning "Redis not available, some features may be limited"
fi
}
# Write configs/config.yaml and .env.dev with freshly generated secrets.
# Secrets fall back to fixed dev strings when openssl is unavailable.
create_secure_config() {
print_info "Creating secure development configuration..."
# Generate secure passwords and secrets
local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")
# Create development config
# Unquoted heredoc: $(...) and ${...} expand at write time, so the
# API-key hash and redis password are baked into the YAML file.
cat > configs/config.yaml << EOF
base_path: "/data/ml-experiments"
auth:
enabled: true
api_keys:
test_user:
hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
admin: true
roles: ["data_scientist", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "./ssl/cert.pem"
key_file: "./ssl/key.pem"
min_version: "1.3"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "10.0.0.0/8"
- "192.168.0.0/16"
- "172.16.0.0/12"
failed_login_lockout:
enabled: true
max_attempts: 5
lockout_duration: "15m"
redis:
url: "redis://localhost:6379"
password: "${redis_password}"
logging:
level: "info"
file: "logs/fetch_ml.log"
audit_log: "logs/audit.log"
EOF
# Create environment file
cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF
print_success "Secure configuration created"
print_warning "Using development certificates and passwords"
}
# Print the fixed development API key and its SHA-256 hash.
# NOTE(review): the key matches the hash baked into configs/config.yaml
# by create_secure_config; this function only reports it — no user store
# is modified here.
create_test_user() {
print_info "Creating test user..."
# Generate API key for test user
local api_key="dev_test_api_key_12345"
local api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1)
print_success "Test user created successfully"
echo "Username: test_user"
echo "API Key: $api_key"
echo "API Key Hash: $api_key_hash"
echo "Store this key safely!"
echo ""
echo "Environment variables in .env.dev"
echo "Run: source .env.dev"
}
# Smoke-test the built artifacts and local services; every check is
# best-effort ("|| true") so a failure never aborts the quick start.
test_setup() {
print_info "Testing setup..."
# Test Go binaries
if [[ -f "bin/api-server" ]]; then
./bin/api-server --help > /dev/null 2>&1 || true
print_success "API server binary OK"
fi
if [[ -f "bin/worker" ]]; then
./bin/worker --help > /dev/null 2>&1 || true
print_success "Worker binary OK"
fi
# Test Zig CLI
if [[ -f "cli/zig-out/bin/ml" ]]; then
./cli/zig-out/bin/ml --help > /dev/null 2>&1 || true
print_success "Zig CLI binary OK"
fi
# Test Redis connection
if command -v redis-cli &> /dev/null; then
if redis-cli ping > /dev/null 2>&1; then
print_success "Redis connection OK"
else
print_warning "Redis not responding"
fi
fi
# Test SSL certificates
# -checkend 86400: fail if the cert expires within the next 24 hours.
if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then
if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then
print_success "SSL certificates valid"
else
print_warning "SSL certificates expired or invalid"
fi
fi
}
# Print post-setup usage instructions for the operator.
show_next_steps() {
print_success "Secure quick start completed!"
echo
echo "Next steps:"
echo "1. Load environment variables:"
echo " source .env.dev"
echo
echo "2. Start API server:"
echo " ./bin/api-server -config configs/config.yaml"
echo
echo "3. Test Zig CLI:"
echo " ./cli/zig-out/bin/ml --help"
echo
echo "4. Test with curl (HTTPS):"
echo " curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health"
echo
echo "5. Deploy with Docker:"
echo " docker-compose up -d"
echo
echo "Features Enabled:"
echo " ✅ HTTPS/TLS encryption"
echo " ✅ API key authentication"
echo " ✅ Rate limiting"
echo " ✅ IP whitelisting"
echo " ✅ Security headers"
echo " ✅ Audit logging"
echo
echo "Configuration Files:"
echo " configs/config.yaml # Main configuration"
echo " .env.dev # Environment variables"
echo " ssl/cert.pem, ssl/key.pem # TLS certificates"
echo
echo "Documentation:"
echo " docs/DEPLOYMENT.md # Deployment guide"
echo ""
print_success "Ready to run ML experiments!"
}
# Main function
# Runs the full quick-start pipeline in dependency order.
main() {
echo "Fetch ML Quick Start Script (with Security & Zig CLI)"
echo "===================================================="
echo ""
check_prerequisites
setup_project
build_project
generate_ssl_certificates
setup_redis
create_secure_config
create_test_user
test_setup
show_next_steps
}
# Run main function
main "$@"

View file

@ -1,124 +0,0 @@
#!/usr/bin/env bash
# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky)
set -euo pipefail

# ANSI color codes used by the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration defaults shared by every setup script that sources this file.
FETCH_ML_USER="fetchml"
FETCH_ML_HOME="/opt/fetchml"
SERVICE_DIR="/etc/systemd/system"
LOG_DIR="/var/log/fetchml"
DATA_DIR="/var/lib/fetchml"
CONFIG_DIR="$FETCH_ML_HOME/configs"

# Leveled logging helpers: colored tag followed by the caller's message.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
# Download file with checksum verification
# Args: url, checksum, dest
# Downloads $url to $dest and verifies its SHA-256; on mismatch the file
# is removed and the whole script exits non-zero.
secure_download() {
    local url="$1" checksum="$2" dest="$3"
    curl -fsSL "$url" -o "$dest"
    # GNU sha256sum --check requires TWO spaces (or " *") between the
    # hash and the filename; a single space is rejected as "no properly
    # formatted checksum lines".
    echo "$checksum  $dest" | sha256sum --check --status || {
        log_error "Checksum verification failed for $dest"
        rm -f "$dest"
        exit 1
    }
}
# Remove any temp files the caller registered in $TMP_FILES.
# $TMP_FILES is intentionally unquoted: it may hold a space-separated
# list of paths that must word-split into separate rm arguments.
cleanup_temp() {
if [[ -n "${TMP_FILES:-}" ]]; then
rm -f $TMP_FILES || true
fi
}
trap cleanup_temp EXIT
# Create the service account (idempotent) and add it to the podman group.
# "|| true": the podman group may not exist on all distros.
ensure_user() {
if ! id "$FETCH_ML_USER" &>/dev/null; then
useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
fi
usermod -aG podman "$FETCH_ML_USER" || true
}
# Create the install/log/data directory tree and hand it to the service user.
create_directories() {
mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" "$FETCH_ML_HOME/bin" "$CONFIG_DIR"
chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"
}
# Write a systemd unit for a Fetch ML service.
# Args: name (unit basename), exec (full ExecStart command line).
# "${name^}" (capitalize first letter) is a bash 4+ feature.
setup_systemd_service() {
local name="$1" exec="$2"
cat > "$SERVICE_DIR/${name}.service" <<EOF
[Unit]
Description=Fetch ML ${name^} Service
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$exec
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_${name}
[Install]
WantedBy=multi-user.target
EOF
}
# Install daily 14-generation log rotation for /var/log/fetchml.
# The quoted 'EOF' heredoc writes the body verbatim (no expansion).
setup_logrotate() {
cat > /etc/logrotate.d/fetch_ml <<'EOF'
/var/log/fetchml/*.log {
daily
missingok
rotate 14
compress
delaycompress
notifempty
create 0640 fetchml fetchml
}
EOF
}
# Raise file-descriptor limits for the service user and enable automatic
# security updates where a known package manager is present (best-effort).
hardening_steps() {
# Increase file limits
if ! grep -q fetchml /etc/security/limits.conf; then
cat >> /etc/security/limits.conf <<'EOF'
fetchml soft nofile 65536
fetchml hard nofile 65536
EOF
fi
# Enable unattended security upgrades if available
if command -v apt-get &>/dev/null; then
apt-get install -y unattended-upgrades >/dev/null || true
elif command -v dnf &>/dev/null; then
dnf install -y dnf-automatic >/dev/null || true
fi
}
# Report the SELinux mode and, when Enforcing, print the semanage /
# restorecon commands an operator will likely need for the binaries.
selinux_guidance() {
if command -v getenforce &>/dev/null; then
local mode=$(getenforce)
log_info "SELinux mode: $mode"
if [[ "$mode" == "Enforcing" ]]; then
log_info "Ensure systemd units and directories have proper contexts. Example:"
echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'"
echo " restorecon -Rv $FETCH_ML_HOME/bin"
fi
fi
}

View file

@ -1,417 +0,0 @@
#!/usr/bin/env bash
# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9
set -euo pipefail
# Resolve this script's directory so the shared helpers can be sourced
# regardless of the caller's working directory.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
# Abort unless the script is running with root privileges.
check_root() {
    if (( EUID != 0 )); then
        log_error "This script must be run as root"
        exit 1
    fi
}
# Verify we are on an RPM-based system, report the Rocky release, and
# pick the package manager (dnf on Rocky 9+, yum on Rocky 8).
# Sets the global PKG_MANAGER used by every install function.
check_rocky() {
    if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
        log_error "This script is designed for Rocky Linux systems"
        exit 1
    fi
    # Read the release directly from the file (no cat|grep pipeline) and
    # tolerate a missing /etc/rocky-release (e.g. RHEL/Alma clones), so
    # an informational check cannot print a stray error or abort.
    local rocky_version
    rocky_version=$(grep -oE '[0-9]+\.[0-9]+' /etc/rocky-release 2>/dev/null || echo "unknown")
    log_info "Rocky Linux version: $rocky_version"
    # Use dnf for Rocky 9+, yum for Rocky 8
    if command -v dnf &> /dev/null; then
        PKG_MANAGER="dnf"
    else
        PKG_MANAGER="yum"
    fi
}
# Bring the system fully up to date and install bootstrap tools.
update_system() {
log_info "Updating system packages..."
$PKG_MANAGER update -y
$PKG_MANAGER upgrade -y
$PKG_MANAGER install -y curl wget gnupg2
}
# Enable the EPEL repository (idempotent).
# NOTE(review): "powertools" is the Rocky 8 repo name; Rocky 9 renamed it
# to "crb" — verify against the supported releases.
enable_epel() {
log_info "Enabling EPEL repository..."
if $PKG_MANAGER repolist | grep -q "epel"; then
log_info "EPEL already enabled"
return
fi
$PKG_MANAGER install -y epel-release
$PKG_MANAGER config-manager --set-enabled powertools
log_success "EPEL repository enabled"
}
# Install Go from the official tarball with SHA-256 verification
# (secure_download from setup_common.sh); skipped if go is already present.
install_go() {
log_info "Installing Go 1.25..."
if command -v go &> /dev/null; then
local go_version=$(go version | awk '{print $3}' | sed 's/go//')
log_info "Go already installed: $go_version"
return
fi
cd /tmp
# Register the tarball for the EXIT-trap cleanup in setup_common.sh.
TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz"
secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "/tmp/go1.25.0.linux-amd64.tar.gz"
tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz
# Add to PATH
echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
export PATH=$PATH:/usr/local/go/bin
log_success "Go 1.25 installed"
}
# Install Podman with netavark networking and systemd cgroups, and enable
# user namespaces for rootless containers.
install_podman() {
    log_info "Installing Podman..."
    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi
    # Install Podman and related tools
    $PKG_MANAGER install -y podman podman-compose containernetworking-plugins
    # Configure Podman
    mkdir -p /etc/containers
    cat > /etc/containers/containers.conf << EOF
[containers]
user_namespace_enable = 1
runtime = "crun"
[network]
network_backend = "netavark"
[engine]
cgroup_manager = "systemd"
EOF
    # Enable user namespaces. Guard the append so re-running the script does
    # not duplicate the line, and apply the value with `sysctl -w`: the
    # original `sysctl -p user.max_user_namespaces=15000` treated the
    # key=value as a *filename* argument to -p, so nothing was applied.
    grep -q '^user.max_user_namespaces=' /etc/sysctl.conf 2>/dev/null ||
        echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf
    sysctl -w user.max_user_namespaces=15000
    log_success "Podman installed"
}
# Install Redis from distro packages, hand supervision to systemd, and keep
# it bound to localhost only.
install_redis() {
    log_info "Installing Redis..."
    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi
    "$PKG_MANAGER" install -y redis
    # systemd supervision + localhost-only bind
    sed -i 's/supervised no/supervised systemd/' /etc/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis.conf
    systemctl enable --now redis
    log_success "Redis installed and configured"
}
# Detect an NVIDIA GPU and, when present, install the vendor driver stack
# from NVIDIA's CUDA repository. No-op when drivers are already installed or
# no GPU is found. A reboot is required before the new driver is usable.
install_nvidia_drivers() {
log_info "Checking for NVIDIA GPU..."
if command -v nvidia-smi &> /dev/null; then
log_info "NVIDIA drivers already installed"
nvidia-smi
return
fi
if lspci | grep -i nvidia &> /dev/null; then
log_info "NVIDIA GPU detected, installing drivers..."
# Enable NVIDIA repository
# `rpm -E %rhel` expands to the RHEL major version of this host.
$PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo
# Clean and install
$PKG_MANAGER clean all
$PKG_MANAGER module enable -y nvidia-driver:latest-dkms
$PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit
# Configure Podman for NVIDIA (only if needed)
# NOTE(review): this CDI-style device test normally requires
# nvidia-container-toolkit and a generated CDI spec — confirm it can ever
# succeed before the post-install reboot.
if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
log_warning "NVIDIA GPU access test failed, you may need to reboot"
else
log_success "NVIDIA drivers installed and GPU access verified"
fi
# Reboot required
log_warning "System reboot required for NVIDIA drivers"
log_info "Run: reboot"
else
log_info "No NVIDIA GPU detected, skipping driver installation"
fi
}
# Install the Python/ML userland: interpreter, native build dependencies for
# common ML wheels, and a CPU-only PyTorch stack via pip.
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."
    # Python toolchain
    "$PKG_MANAGER" install -y python3 python3-pip python3-devel
    # Native build dependencies
    "$PKG_MANAGER" groupinstall -y "Development Tools"
    "$PKG_MANAGER" install -y \
        cmake git pkgconfig \
        libjpeg-turbo-devel libpng-devel libtiff-devel \
        mesa-libGL-devel mesa-libGLU-devel \
        gtk3-devel \
        atlas-devel blas-devel lapack-devel
    # Python libraries (CPU-only torch wheel index)
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    log_success "ML tools installed"
}
# Create the fetchml service account and its directory tree, then hand
# ownership over. Idempotent: returns early when the user already exists.
create_user() {
    log_info "Creating fetchml user..."
    if id "$FETCH_ML_USER" &>/dev/null; then
        log_info "User $FETCH_ML_USER already exists"
        return
    fi
    # Quote every expansion so values containing spaces cannot word-split.
    useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
    # The "podman" group does not exist by default on Rocky; do not let a
    # missing group abort the whole setup under `set -e`.
    usermod -aG podman "$FETCH_ML_USER" || log_warning "podman group not found; skipping group membership"
    # Create directories
    mkdir -p "$FETCH_ML_HOME/.config/containers"
    mkdir -p "$FETCH_ML_HOME/go/bin"
    mkdir -p "$LOG_DIR"
    mkdir -p "$DATA_DIR"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$LOG_DIR"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$DATA_DIR"
    log_success "User $FETCH_ML_USER created"
}
# Open the service ports through firewalld when it is available; otherwise
# warn and continue.
setup_firewall() {
    log_info "Configuring firewall..."
    if ! command -v firewall-cmd &> /dev/null; then
        log_warning "Firewalld not available, skipping firewall configuration"
        return
    fi
    systemctl enable firewalld
    systemctl start firewalld
    firewall-cmd --permanent --add-service=ssh
    local port
    for port in 8080 8081 6379; do  # worker API, data-manager API, Redis
        firewall-cmd --permanent --add-port="${port}/tcp"
    done
    firewall-cmd --reload
    firewall-cmd --list-all
}
# Write systemd units for the worker and data-manager daemons and enable
# them. Both run as $FETCH_ML_USER, are ordered after Redis, restart on
# failure, and log to the journal. The heredocs expand $FETCH_ML_* and
# $SERVICE_DIR at generation time, so the units contain concrete paths.
setup_systemd_services() {
log_info "Setting up systemd services..."
# Fetch ML Worker service
cat > $SERVICE_DIR/fetch_ml_worker.service << EOF
[Unit]
Description=Fetch ML Worker Service
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_worker
[Install]
WantedBy=multi-user.target
EOF
# Fetch ML Data Manager service
cat > $SERVICE_DIR/fetch_ml_data_manager.service << EOF
[Unit]
Description=Fetch ML Data Manager Service
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_data_manager
[Install]
WantedBy=multi-user.target
EOF
# Enable services (started later by the operator; see main's "Next steps")
systemctl daemon-reload
systemctl enable fetch_ml_worker
systemctl enable fetch_ml_data_manager
log_success "Systemd services configured"
}
# Install a logrotate policy for $LOG_DIR: daily rotation, 30 rotations
# kept, compressed (delayed one cycle), files recreated owned by the
# service user.
setup_log_rotation() {
log_info "Setting up log rotation..."
cat > /etc/logrotate.d/fetch_ml << EOF
$LOG_DIR/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 0644 $FETCH_ML_USER $FETCH_ML_USER
postrotate
systemctl reload fetch_ml_worker || true
systemctl reload fetch_ml_data_manager || true
endscript
}
EOF
# NOTE(review): the units above are Type=simple without ExecReload, so
# `systemctl reload` likely fails (harmless thanks to `|| true`) — confirm
# whether the daemons reopen log files on SIGHUP or need `restart` here.
log_success "Log rotation configured"
}
# Kernel, ulimit and GPU tweaks for ML workloads. Every append is guarded so
# re-running the script does not stack duplicate entries in limits.conf and
# sysctl.conf (the original appended unconditionally on each run).
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    # Increase file limits (once)
    if ! grep -q '^\* soft nofile 65536' /etc/security/limits.conf; then
        echo "* soft nofile 65536" >> /etc/security/limits.conf
        echo "* hard nofile 65536" >> /etc/security/limits.conf
    fi
    # Optimize kernel parameters for ML (once)
    if ! grep -q '# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi
    sysctl -p
    # Keep the GPU initialized between jobs when NVIDIA tooling is present
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi
    # SELinux guidance only — enforcement is never changed automatically
    if [[ -f /etc/selinux/config ]]; then
        log_warning "Consider setting SELinux to permissive mode for better container compatibility"
        log_info "Edit /etc/selinux/config and set SELINUX=permissive"
    fi
    log_success "System optimized for ML workloads"
}
# Build Fetch ML from a pre-cloned checkout under $FETCH_ML_HOME and install
# binaries plus a default config. Returns early (with instructions) when the
# checkout is missing.
install_fetch_ml() {
    log_info "Installing Fetch ML..."
    # Quote all path expansions against word-splitting.
    cd "$FETCH_ML_HOME"
    if [[ ! -d "fetch_ml" ]]; then
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi
    cd fetch_ml
    # Build with the Go toolchain installed earlier
    export PATH=$PATH:/usr/local/go/bin
    make build
    # Copy binaries — $FETCH_ML_HOME/bin is not created anywhere else
    # (create_user only makes go/bin), so create it here first.
    mkdir -p "$FETCH_ML_HOME/bin"
    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME/bin/"*
    # Copy configs
    mkdir -p "$FETCH_ML_HOME/configs"
    cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"
    # Hand ownership to the service user
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
    log_success "Fetch ML installed"
}
# Orchestrate the full Rocky Linux setup, then print operator follow-ups.
# NOTE(review): ensure_user/create_directories/setup_logrotate/
# hardening_steps/selinux_guidance come from setup_common.sh, while the
# locally defined create_user, setup_log_rotation and optimize_system are
# never invoked — confirm whether that dead code is intentional.
main() {
log_info "Starting Fetch ML Rocky Linux server setup..."
check_root
check_rocky
update_system
enable_epel
install_go
install_podman
install_redis
install_nvidia_drivers
install_ml_tools
ensure_user
create_directories
setup_firewall
setup_systemd_services
setup_logrotate
hardening_steps
selinux_guidance
install_fetch_ml
log_success "Fetch ML setup complete!"
echo
log_info "Next steps:"
echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
echo "5. View logs: journalctl -u fetch_ml_worker -f"
echo
log_info "Services will be available at:"
echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}
# Run main function
main "$@"

View file

@ -1,294 +0,0 @@
#!/usr/bin/env bash
# Fetch ML Ubuntu Server Setup Script
# Optimized for ML experiments on Ubuntu 20.04/22.04
# Fail fast: abort on any error, unset variable, or failed pipe stage.
set -euo pipefail
# shellcheck source=scripts/setup_common.sh
# Resolve this script's own directory so the shared helpers can be sourced
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# NOTE(review): setup_common.sh presumably provides log_*, secure_download,
# ensure_user, create_directories, setup_systemd_service, setup_logrotate
# and hardening_steps used below — confirm against that file.
source "$SCRIPT_DIR/setup_common.sh"
# Abort unless the script is running with root privileges.
check_root() {
    [[ $EUID -eq 0 ]] && return
    log_error "This script must be run as root"
    exit 1
}
# Verify an apt-based system and warn when the release is older than 20.04.
check_ubuntu() {
    if ! command -v apt-get &> /dev/null; then
        log_error "This script is designed for Ubuntu systems"
        exit 1
    fi
    local ubuntu_version
    ubuntu_version=$(lsb_release -rs)
    log_info "Ubuntu version: $ubuntu_version"
    # Compare with dpkg instead of piping through `bc`: bc is not part of a
    # minimal Ubuntu install and is never installed by this script, so the
    # original comparison could fail under `set -o pipefail`.
    if dpkg --compare-versions "$ubuntu_version" lt "20.04"; then
        log_warning "Ubuntu version < 20.04 may not support all features"
    fi
}
# Refresh the package index, upgrade installed packages, and install the
# baseline tooling used by later steps.
update_system() {
    log_info "Updating system packages..."
    apt-get update -y
    apt-get upgrade -y
    apt-get install -y curl wget gnupg lsb-release software-properties-common
}
# Install Go 1.25 from the official tarball (checksum-verified) and add it
# to PATH for future shells and for the remainder of this script.
install_go() {
    log_info "Installing Go 1.25..."
    if command -v go &> /dev/null; then
        local go_version
        go_version=$(go version | awk '{print $3}' | sed 's/go//')
        log_info "Go already installed: $go_version"
        return
    fi
    cd /tmp
    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    TMP_FILES="$tarball"
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "$tarball"
    # Per the official install instructions, remove any previous tree first:
    # untarring over an existing /usr/local/go leaves a broken mix of old
    # and new files.
    rm -rf /usr/local/go
    tar -C /usr/local -xzf "$tarball"
    # Add to PATH (persistent for login shells, and for this process)
    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin
    log_success "Go 1.25 installed"
}
# Install Podman from the Kubic repository and enable rootless defaults.
# The repository key is installed as a keyring file referenced via
# [signed-by=...]: `apt-key` is deprecated and removed on Ubuntu 22.04+, so
# the original `apt-key add -` call breaks on current releases.
install_podman() {
    log_info "Installing Podman..."
    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi
    local release
    release=$(lsb_release -rs)
    mkdir -p /etc/apt/keyrings
    curl -fsSL "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}/Release.key" \
        | gpg --dearmor -o /etc/apt/keyrings/libcontainers.gpg
    echo "deb [signed-by=/etc/apt/keyrings/libcontainers.gpg] https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}/ /" \
        > /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list
    apt-get update -y
    apt-get install -y podman podman-compose
    # Configure Podman for rootless operation. /etc/containers does not
    # exist on a fresh Ubuntu system, so create it before appending.
    mkdir -p /etc/containers
    echo "user_namespace_enable = 1" >> /etc/containers/containers.conf
    echo "runtime = \"crun\"" >> /etc/containers/containers.conf
    log_success "Podman installed"
}
# Install Redis from Ubuntu packages, hand supervision to systemd, and keep
# it bound to localhost only.
install_redis() {
    log_info "Installing Redis..."
    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi
    apt-get install -y redis-server
    # systemd supervision + localhost-only bind
    sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf
    systemctl enable --now redis-server
    log_success "Redis installed and configured"
}
# Detect an NVIDIA GPU and, when present, install the driver stack from
# NVIDIA's CUDA apt repository (keyring package is checksum-verified).
# No-op when drivers are already installed or no GPU is found.
install_nvidia_drivers() {
log_info "Checking for NVIDIA GPU..."
if command -v nvidia-smi &> /dev/null; then
log_info "NVIDIA drivers already installed"
nvidia-smi
return
fi
if lspci | grep -i nvidia &> /dev/null; then
log_info "NVIDIA GPU detected, installing drivers..."
# Add NVIDIA repository (keyring .deb pinned by SHA-256)
TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb"
secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. -f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb"
dpkg -i /tmp/cuda-keyring_1.1-1_all.deb
apt-get update -y
# Install drivers
apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit
# Configure Podman for NVIDIA (only if needed)
# NOTE(review): this CDI-style device test normally requires
# nvidia-container-toolkit and a generated CDI spec — confirm it can ever
# succeed before a reboot.
if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
log_warning "NVIDIA GPU access test failed, you may need to reboot"
else
log_success "NVIDIA drivers installed and GPU access verified"
fi
else
log_info "No NVIDIA GPU detected, skipping driver installation"
fi
}
# Install the Python/ML userland: interpreter, native build dependencies for
# common ML wheels, and a CPU-only PyTorch stack via pip.
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."
    # Python toolchain
    apt-get install -y python3 python3-pip python3-venv
    # Native build dependencies
    apt-get install -y \
        build-essential cmake git pkg-config \
        libjpeg-dev libpng-dev libtiff-dev \
        libavcodec-dev libavformat-dev libswscale-dev \
        libgtk2.0-dev libcanberra-gtk-module \
        libxvidcore-dev libx264-dev \
        libatlas-base-dev gfortran
    # Python libraries (CPU-only torch wheel index)
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    log_success "ML tools installed"
}
# Delegate account and directory creation to the shared setup helpers.
create_user() {
    log_info "Creating fetchml user..."
    ensure_user
    create_directories
    log_success "User $FETCH_ML_USER and directories created"
}
# Open the service ports through UFW when it is installed; otherwise warn
# and continue.
setup_firewall() {
    log_info "Configuring firewall..."
    if ! command -v ufw &> /dev/null; then
        log_warning "UFW not available, skipping firewall configuration"
        return
    fi
    ufw --force enable
    ufw allow ssh
    local port
    for port in 8080 8081 6379; do  # worker API, data-manager API, Redis
        ufw allow "${port}/tcp"
    done
    ufw status
}
# Register the worker and data-manager units via the shared
# setup_systemd_service helper, then enable both.
setup_systemd_services() {
    log_info "Setting up systemd services..."
    local svc
    for svc in worker data_manager; do
        setup_systemd_service "fetch_ml_${svc}" "$FETCH_ML_HOME/bin/${svc} --config $FETCH_ML_HOME/configs/config-local.yaml"
    done
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager
    log_success "Systemd services configured"
}
# Thin wrapper around the shared logrotate helper.
setup_log_rotation() {
    log_info "Setting up log rotation..."
    setup_logrotate
    log_success "Log rotation configured"
}
# Kernel and GPU tweaks for ML workloads. The sysctl block is guarded so a
# re-run of the script does not append duplicate entries to sysctl.conf
# (the original appended unconditionally on each run).
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    hardening_steps
    # Optimize kernel parameters for ML (once)
    if ! grep -q '# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi
    sysctl -p
    # Keep the GPU initialized between jobs when NVIDIA tooling is present
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi
    log_success "System optimized for ML workloads"
}
# Build Fetch ML from a pre-cloned checkout under $FETCH_ML_HOME and install
# binaries plus a default config. Returns early (with instructions) when the
# checkout is missing.
install_fetch_ml() {
    log_info "Installing Fetch ML..."
    # Quote all path expansions against word-splitting.
    cd "$FETCH_ML_HOME"
    if [[ ! -d "fetch_ml" ]]; then
        # This would be replaced with actual repository URL
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi
    cd fetch_ml
    # Build with the Go toolchain installed earlier
    export PATH=$PATH:/usr/local/go/bin
    make build
    # Copy binaries — ensure the target directory exists first; nothing else
    # in this script creates $FETCH_ML_HOME/bin.
    mkdir -p "$FETCH_ML_HOME/bin"
    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME/bin/"*
    # Copy configs
    mkdir -p "$FETCH_ML_HOME/configs"
    cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"
    # Hand ownership to the service user
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
    log_success "Fetch ML installed"
}
# Orchestrate the full Ubuntu setup, then print operator follow-ups.
# NOTE(review): ensure_user/create_directories/setup_logrotate/
# hardening_steps come from setup_common.sh; the locally defined
# create_user, setup_log_rotation and optimize_system wrappers are never
# invoked — confirm whether that dead code is intentional.
main() {
log_info "Starting Fetch ML Ubuntu server setup..."
check_root
check_ubuntu
update_system
install_go
install_podman
install_redis
install_nvidia_drivers
install_ml_tools
ensure_user
create_directories
setup_firewall
setup_systemd_services
setup_logrotate
hardening_steps
install_fetch_ml
log_success "Fetch ML setup complete!"
echo
log_info "Next steps:"
echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
echo "5. View logs: journalctl -u fetch_ml_worker -f"
echo
log_info "Services will be available at:"
echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}
# Run main function
main "$@"

View file

@ -1,67 +0,0 @@
#!/bin/bash
# Test harness: builds the Go binaries and the Zig CLI, ensures Redis is
# available, runs unit/E2E tests, then smoke-tests the API server via the
# CLI. A single EXIT trap reaps everything this script spawns, so a failing
# step under `set -e` no longer leaks the background API server, and the
# Redis shutdown trap is no longer at risk of being overwritten.
set -e

API_PID=""
STARTED_REDIS=false

# Reap any background API server and any Redis instance we started.
cleanup() {
    if [ -n "$API_PID" ]; then
        kill "$API_PID" 2>/dev/null || true
    fi
    if [ "$STARTED_REDIS" = true ]; then
        echo "Stopping temporary Redis..."
        redis-cli shutdown || true
    fi
}
trap cleanup EXIT

echo "=== Test Tools Harness ==="

# Check if Redis is running; start a temporary instance if needed.
ensure_redis() {
    if redis-cli ping >/dev/null 2>&1; then
        echo "Redis is already running"
        return
    fi
    echo "Starting temporary Redis instance..."
    redis-server --daemonize yes --port 6379
    sleep 2
    if ! redis-cli ping >/dev/null 2>&1; then
        echo "Failed to start Redis"
        exit 1
    fi
    STARTED_REDIS=true
    echo "Redis started successfully"
}

# Step 1: Build Go binaries
echo "Building Go binaries..."
go build -o bin/api-server ./cmd/api-server
go build -o bin/worker ./cmd/worker
go build -o bin/data_manager ./cmd/data_manager
go build -o bin/user_manager ./cmd/user_manager

# Step 2: Build Zig CLI (subshell so the cwd change cannot leak)
echo "Building Zig CLI..."
(cd cli && zig build)

# Step 3: Ensure Redis is running
ensure_redis

# Step 4: Run Go tests
echo "Running Go tests..."
go test ./...

# Step 5: Run Zig tests. Bare `zig test` requires a root source file and
# always errored in the previous version; run the build system's test step.
echo "Running Zig CLI tests..."
(cd cli && zig build test)

# Step 6: Run Go E2E tests (Redis is already available)
echo "Running Go E2E tests..."
go test ./tests/e2e/...

# Step 7: Smoke test API server and CLI
echo "Running smoke test..."
# Start API server in background on a non-default port
./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 &
API_PID=$!
sleep 2
# Test CLI status (the EXIT trap kills the server even if this fails)
./cli/zig-out/bin/ml status -server http://localhost:19101
echo "=== All tests completed successfully ==="

View file

@ -5,7 +5,7 @@ Requires=docker.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/cleanup.sh --force ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/maintenance/cleanup.sh --dry-run
User=jfraeys User=jfraeys
Group=staff Group=staff
StandardOutput=journal StandardOutput=journal

View file

@ -8,6 +8,7 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
# Colors for output # Colors for output
RED='\033[0;31m' RED='\033[0;31m'
@ -43,22 +44,34 @@ cleanup_benchmark_artifacts() {
case "${1:-keep-10}" in case "${1:-keep-10}" in
"all") "all")
print_status "Removing ALL benchmark artifacts..." print_status "Archiving ALL benchmark artifacts..."
rm -rf "$LOCAL_ARTIFACTS_DIR" local stamp=$(date -u +%Y%m%d-%H%M%S)
print_success "Removed all artifacts (was $size_before)" mkdir -p "$ARCHIVE_DIR/$stamp"
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
print_success "Archived all artifacts (was $size_before)"
;; ;;
"keep-5") "keep-5")
print_status "Keeping last 5 runs, removing older ones..." print_status "Keeping last 5 runs, archiving older ones..."
local stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +6 | xargs rm -rf 2>/dev/null || true ls -1t run_* 2>/dev/null | tail -n +6 | while read -r run; do
[ -n "$run" ] || continue
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done
local count_after=$(ls -1d run_* 2>/dev/null | wc -l) local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B") local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)" print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)"
;; ;;
"keep-10") "keep-10")
print_status "Keeping last 10 runs, removing older ones..." print_status "Keeping last 10 runs, archiving older ones..."
local stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || true ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
[ -n "$run" ] || continue
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done
local count_after=$(ls -1d run_* 2>/dev/null | wc -l) local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B") local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)" print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)"
@ -80,12 +93,18 @@ cleanup_temp_files() {
# Clean temp directories # Clean temp directories
local temp_cleaned=0 local temp_cleaned=0
local stamp=$(date -u +%Y%m%d-%H%M%S)
local tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
mkdir -p "$tmp_archive_dir"
# /tmp cleanup # /tmp cleanup
if [ -d "/tmp" ]; then if [ -d "/tmp" ]; then
local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l) local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
if [ "$tmp_files" -gt 0 ]; then if [ "$tmp_files" -gt 0 ]; then
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
print_success "Cleaned $tmp_files temporary files from /tmp" mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
print_success "Archived $tmp_files temporary files from /tmp"
temp_cleaned=$((temp_cleaned + tmp_files)) temp_cleaned=$((temp_cleaned + tmp_files))
fi fi
fi fi
@ -94,8 +113,10 @@ cleanup_temp_files() {
if [ -d "/var/tmp" ]; then if [ -d "/var/tmp" ]; then
local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l) local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
if [ "$vartmp_files" -gt 0 ]; then if [ "$vartmp_files" -gt 0 ]; then
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
print_success "Cleaned $vartmp_files temporary files from /var/tmp" mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
print_success "Archived $vartmp_files temporary files from /var/tmp"
temp_cleaned=$((temp_cleaned + vartmp_files)) temp_cleaned=$((temp_cleaned + vartmp_files))
fi fi
fi fi
@ -104,8 +125,10 @@ cleanup_temp_files() {
if [ -d "$HOME/tmp" ]; then if [ -d "$HOME/tmp" ]; then
local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l) local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l)
if [ "$user_tmp_files" -gt 0 ]; then if [ "$user_tmp_files" -gt 0 ]; then
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
print_success "Cleaned $user_tmp_files temporary files from ~/tmp" mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
print_success "Archived $user_tmp_files temporary files from ~/tmp"
temp_cleaned=$((temp_cleaned + user_tmp_files)) temp_cleaned=$((temp_cleaned + user_tmp_files))
fi fi
fi fi
@ -177,9 +200,16 @@ cleanup_logs() {
for log_dir in "${log_dirs[@]}"; do for log_dir in "${log_dirs[@]}"; do
if [ -d "$log_dir" ]; then if [ -d "$log_dir" ]; then
local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B") local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
# Remove log files older than 7 days local stamp=$(date -u +%Y%m%d-%H%M%S)
find "$log_dir" -name "*.log" -type f -mtime +7 -delete 2>/dev/null || true local log_archive_dir="$log_dir/archive/$stamp"
find "$log_dir" -name "*.log.*" -type f -mtime +7 -delete 2>/dev/null || true mkdir -p "$log_archive_dir"
# Move log files older than 7 days to archive
find "$log_dir" -name "*.log" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$log_archive_dir/" 2>/dev/null || true
done
find "$log_dir" -name "*.log.*" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$log_archive_dir/" 2>/dev/null || true
done
local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B") local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
if [ "$log_size_before" != "$log_size_after" ]; then if [ "$log_size_before" != "$log_size_after" ]; then
print_success "Cleaned old logs in $log_dir: $log_size_before$log_size_after" print_success "Cleaned old logs in $log_dir: $log_size_before$log_size_after"

View file

@ -144,12 +144,12 @@ else
log_info "No running containers found" log_info "No running containers found"
fi fi
# Remove containers # Remove containers
log_info "Removing containers..." log_info "Removing containers..."
containers=$(docker ps -aq --filter "name=ml-") containers=$(docker ps -aq --filter "name=ml-")
if [ -n "$containers" ]; then if [ -n "$containers" ]; then
if [ "$DRY_RUN" = false ]; then if [ "$DRY_RUN" = false ]; then
echo "$containers" | xargs docker rm -f echo "$containers" | xargs docker rm
log_success "Containers removed" log_success "Containers removed"
fi fi
else else
@ -168,9 +168,9 @@ else
log_info "No networks found" log_info "No networks found"
fi fi
# Remove volumes (with caution) # Remove volumes (with caution)
log_warning "Removing volumes (this will delete data)..." log_warning "Skipping volumes by default (use --all to remove them)"
if [ "$FORCE" = true ] || [ "$ALL" = true ]; then if [ "$ALL" = true ]; then
volumes=$(docker volume ls -q --filter "name=ml-") volumes=$(docker volume ls -q --filter "name=ml-")
if [ -n "$volumes" ]; then if [ -n "$volumes" ]; then
if [ "$DRY_RUN" = false ]; then if [ "$DRY_RUN" = false ]; then
@ -181,16 +181,16 @@ if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
log_info "No volumes found" log_info "No volumes found"
fi fi
else else
log_info "Skipping volumes (use --force or --all to remove them)" log_info "Skipping volumes"
fi fi
# Remove images if requested # Remove images if requested
if [ "$ALL" = true ]; then if [ "$ALL" = true ]; then
log_info "Removing images..." log_info "Removing images..."
images=$(docker images -q --filter "reference=fetch_ml-*") images=$(docker images -q --filter "reference=fetch_ml-*")
if [ -n "$images" ]; then if [ -n "$images" ]; then
if [ "$DRY_RUN" = false ]; then if [ "$DRY_RUN" = false ]; then
echo "$images" | xargs docker rmi -f echo "$images" | xargs docker rmi
log_success "Images removed" log_success "Images removed"
fi fi
else else
@ -200,11 +200,15 @@ else
log_info "Skipping images (use --all to remove them)" log_info "Skipping images (use --all to remove them)"
fi fi
# General Docker cleanup # General Docker cleanup
log_info "Running general Docker cleanup..." if [ "$ALL" = true ]; then
if [ "$DRY_RUN" = false ]; then log_info "Running general Docker cleanup (docker system prune)..."
docker system prune -f if [ "$DRY_RUN" = false ]; then
log_success "General cleanup completed" docker system prune -f
log_success "General cleanup completed"
fi
else
log_info "Skipping docker system prune (use --all to enable)"
fi fi
# Show final state # Show final state

View file

@ -8,6 +8,7 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
# Create artifacts directory if it doesn't exist # Create artifacts directory if it doesn't exist
mkdir -p "$LOCAL_ARTIFACTS_DIR" mkdir -p "$LOCAL_ARTIFACTS_DIR"
@ -41,17 +42,21 @@ case "${1:-help}" in
echo "=== Cleaning Artifacts ===" echo "=== Cleaning Artifacts ==="
case "${2:-all}" in case "${2:-all}" in
"all") "all")
echo "Removing all artifacts..." echo "Archiving all artifacts..."
rm -rf "$LOCAL_ARTIFACTS_DIR" stamp=$(date -u +%Y%m%d-%H%M%S)
echo "All artifacts removed" mkdir -p "$ARCHIVE_DIR/$stamp"
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
echo "All artifacts archived"
;; ;;
"old") "old")
keep_count="${3:-10}" keep_count="${3:-10}"
echo "Keeping last $keep_count runs, removing older ones..." echo "Keeping last $keep_count runs, archiving older ones..."
stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do
echo "Removing: $run" echo "Archiving: $run"
rm -rf "$run" mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done done
;; ;;
"run") "run")
@ -64,8 +69,10 @@ case "${1:-help}" in
fi fi
run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id" run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id"
if [ -d "$run_dir" ]; then if [ -d "$run_dir" ]; then
echo "Removing run: $run_id" echo "Archiving run: $run_id"
rm -rf "$run_dir" stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
mv "$run_dir" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
else else
echo "Run not found: $run_id" echo "Run not found: $run_id"
fi fi

View file

@ -1,169 +0,0 @@
#!/bin/bash
# Secure Homelab Setup Script for Fetch ML
# This script generates secure API keys and TLS certificates
set -euo pipefail
# Resolve project layout relative to this script's location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
CONFIG_DIR="$PROJECT_ROOT/configs/environments"
SSL_DIR="$PROJECT_ROOT/ssl"
echo "🔒 Setting up secure homelab configuration..."
# Create SSL directory
mkdir -p "$SSL_DIR"
# Generate TLS certificates
# Self-signed, 1-year validity, RSA-4096; -nodes leaves the key unencrypted
# on disk (acceptable for a homelab; key file is chmod 600 below). SANs
# cover localhost, the machine's hostname, and 127.0.0.1. Skipped when both
# files already exist so re-runs keep the existing certificate.
echo "📜 Generating TLS certificates..."
if [[ ! -f "$SSL_DIR/cert.pem" ]] || [[ ! -f "$SSL_DIR/key.pem" ]]; then
openssl req -x509 -newkey rsa:4096 -keyout "$SSL_DIR/key.pem" -out "$SSL_DIR/cert.pem" -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=FetchML/OU=Homelab/CN=localhost" \
-addext "subjectAltName=DNS:localhost,DNS:$(hostname),IP:127.0.0.1"
chmod 600 "$SSL_DIR/key.pem"
chmod 644 "$SSL_DIR/cert.pem"
echo "✅ TLS certificates generated in $SSL_DIR/"
else
echo "ℹ️ TLS certificates already exist, skipping generation"
fi
# Generate secure API keys
echo "🔑 Generating secure API keys..."
# Emit a fresh 256-bit random key as 64 hex characters.
generate_api_key() {
    openssl rand -hex 32
}
# Hash function
hash_key() {
echo -n "$1" | sha256sum | cut -d' ' -f1
}
# Generate keys
# Two independent keys: an admin key with wildcard permissions and a
# restricted researcher key. Only the SHA-256 hashes go into the config;
# the plaintext keys are written to the chmod-600 .api-keys file below.
ADMIN_KEY=$(generate_api_key)
USER_KEY=$(generate_api_key)
ADMIN_HASH=$(hash_key "$ADMIN_KEY")
USER_HASH=$(hash_key "$USER_KEY")
# Create secure config
echo "⚙️ Creating secure configuration..."
cat > "$CONFIG_DIR/config-homelab-secure.yaml" << EOF
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!
redis:
url: "redis://localhost:6379"
max_connections: 10
auth:
enabled: true
api_keys:
homelab_admin:
hash: $ADMIN_HASH
admin: true
roles:
- admin
permissions:
'*': true
homelab_user:
hash: $USER_HASH
admin: false
roles:
- researcher
permissions:
'experiments': true
'datasets': true
'jupyter': true
server:
address: ":9101"
tls:
enabled: true
cert_file: "$SSL_DIR/cert.pem"
key_file: "$SSL_DIR/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "localhost"
- "192.168.1.0/24" # Adjust to your network
- "10.0.0.0/8"
logging:
level: "info"
file: "logs/fetch_ml.log"
console: true
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 0
disk_limit: "10Gi"
# Prometheus metrics
metrics:
enabled: true
listen_addr: ":9100"
tls:
enabled: false
EOF
# Save API keys to a secure file
# Plaintext keys land only in this owner-read-only file.
echo "🔐 Saving API keys..."
cat > "$PROJECT_ROOT/.api-keys" << EOF
# Fetch ML Homelab API Keys
# IMPORTANT: Keep this file secure and never commit to version control!
ADMIN_API_KEY: $ADMIN_KEY
USER_API_KEY: $USER_KEY
# Usage examples:
# curl -H "X-API-Key: $ADMIN_KEY" https://localhost:9101/health
# curl -H "X-API-Key: $USER_KEY" https://localhost:9101/api/jupyter/services
EOF
chmod 600 "$PROJECT_ROOT/.api-keys"
# Create environment file for JWT secret
JWT_SECRET=$(generate_api_key)
cat > "$PROJECT_ROOT/.env.secure" << EOF
# Secure environment variables for Fetch ML
# IMPORTANT: Keep this file secure and never commit to version control!
JWT_SECRET=$JWT_SECRET
# Source this file before running the server:
# source .env.secure
EOF
chmod 600 "$PROJECT_ROOT/.env.secure"
# Update .gitignore to exclude sensitive files
# If .gitignore is missing, grep fails and the append below creates it.
echo "📝 Updating .gitignore..."
if ! grep -q ".api-keys" "$PROJECT_ROOT/.gitignore"; then
echo -e "\n# Security files\n.api-keys\n.env.secure\nssl/\n*.pem\n*.key" >> "$PROJECT_ROOT/.gitignore"
fi
echo ""
echo "🎉 Secure homelab setup complete!"
echo ""
echo "📋 Next steps:"
echo "1. Review and adjust the IP whitelist in config-homelab-secure.yaml"
echo "2. Start the server with: ./api-server -config configs/environments/config-homelab-secure.yaml"
echo "3. Source the environment: source .env.secure"
echo "4. Your API keys are saved in .api-keys"
echo ""
echo "🔐 API Keys:"
echo "   Admin: $ADMIN_KEY"
echo "   User:  $USER_KEY"
echo ""
echo "⚠️  IMPORTANT:"
echo "   - Never share your API keys"
echo "   - Never commit .api-keys or .env.secure to version control"
echo "   - Backup your SSL certificates and API keys securely"
echo "   - Consider using a password manager for storing keys"

View file

@ -1,311 +0,0 @@
#!/bin/bash
# setup.sh: One-shot homelab setup (security + core services)
# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity
set -euo pipefail
# ANSI color codes used by the print_* helpers below.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
# print_info MESSAGE — informational status line (blue tag).
print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}
# print_success MESSAGE — success status line (green tag).
print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}
# print_warning MESSAGE — non-fatal warning (yellow tag).
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}
# print_error MESSAGE — error line (red tag); caller decides whether to exit.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
# Simple dependency check
# check_deps verifies that every required tool is on PATH. On any miss it
# prints install hints for all missing tools and exits non-zero.
check_deps() {
    print_info "Checking dependencies..."
    local missing=()
    local tool
    # Same tools, same order, as individual checks — just table-driven.
    for tool in go zig redis-server docker; do
        command -v "$tool" &> /dev/null || missing+=("$tool")
    done
    if [[ ${#missing[@]} -gt 0 ]]; then
        print_error "Missing dependencies: ${missing[*]}"
        echo ""
        echo "Install with:"
        echo "  macOS: brew install ${missing[*]}"
        echo "  Ubuntu: sudo apt-get install ${missing[*]}"
        exit 1
    fi
    print_success "Dependencies OK"
}
# Simple setup
# setup_project creates the working directory layout, generates a
# self-signed localhost TLS certificate (once), and writes the default
# server configuration. Idempotent: existing certs/config are overwritten
# only for the config file.
setup_project() {
    print_info "Setting up project..."
    # Create essential directories
    mkdir -p ssl logs configs data monitoring
    # Generate simple SSL cert
    # Only generated when missing, so re-running setup keeps existing certs.
    if [[ ! -f "ssl/cert.pem" ]]; then
        openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null
        print_success "SSL certificates generated"
    fi
    # Create balanced config
    # Quoted 'EOF' delimiter: the YAML below is written verbatim, nothing
    # is expanded by the shell. The api_keys hash is SHA-256 of "password".
    cat > configs/config.yaml << 'EOF'
base_path: "./data/experiments"
auth:
  enabled: true
  api_keys:
    homelab_user:
      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
      admin: true
      roles: ["user", "admin"]
      permissions:
        read: true
        write: true
        delete: true
server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
security:
  rate_limit:
    enabled: true
    requests_per_minute: 30
    burst_size: 5
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "192.168.0.0/16"
    - "10.0.0.0/8"
    - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 3
    lockout_duration: "15m"
redis:
  url: "redis://localhost:6379"
logging:
  level: "info"
  file: "./logs/app.log"
  audit_log: "./logs/audit.log"
  access_log: "./logs/access.log"
monitoring:
  enabled: true
  metrics_port: 9090
  health_check_interval: "30s"
EOF
    print_success "Configuration created"
}
# Simple build
# build_project compiles the Go binaries into ./bin and builds the Zig CLI.
# Fixes vs. the previous version:
#   - `go build -o bin/...` does not create the output directory, so a
#     fresh checkout failed; create it first.
#   - `cd cli && zig build && cd ..` never ran `cd ..` when the build
#     failed (set -e aborts first); a subshell keeps the caller's cwd safe.
build_project() {
    print_info "Building project..."
    mkdir -p bin
    # Build Go apps
    go build -o bin/api-server ./cmd/api-server
    go build -o bin/worker ./cmd/worker
    go build -o bin/tui ./cmd/tui
    # Build Zig CLI in a subshell so the working directory is restored
    # regardless of the build outcome.
    (cd cli && zig build)
    print_success "Build completed"
}
# Setup Fail2Ban
# setup_fail2ban writes jail + filter definitions for sshd and the ML API
# (fed by ./logs/audit.log), then installs them under /etc/fail2ban if
# sudo allows. Best-effort: skips when fail2ban or sudo is unavailable.
setup_fail2ban() {
    print_info "Setting up Fail2Ban..."
    if ! command -v fail2ban-server &> /dev/null; then
        print_warning "Fail2Ban not installed, skipping..."
        return
    fi
    # Create Fail2Ban configuration
    sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true
    # Jails: sshd plus two app jails. Quoted 'EOF' => written verbatim.
    cat > /tmp/ml-experiments-jail.conf << 'EOF'
[DEFAULT]
bantime = 3600
findtime = 600
maxretry = 3
backend = systemd
[sshd]
enabled = true
port = ssh
logpath = /var/log/auth.log
maxretry = 3
[ml-experiments-api]
enabled = true
port = 9101
filter = ml-experiments-api
logpath = ./logs/audit.log
maxretry = 5
bantime = 7200
[ml-experiments-auth]
enabled = true
filter = ml-experiments-auth
logpath = ./logs/audit.log
maxretry = 3
bantime = 3600
EOF
    # Create filter definitions
    # API filter: bans clients producing 401/403 status entries in the log.
    cat > /tmp/ml-experiments-api.conf << 'EOF'
[Definition]
failregex = ^.*<HOST>.*"status":40[13].*$
ignoreregex =
EOF
    # Auth filter: bans clients with repeated failed_login audit events.
    cat > /tmp/ml-experiments-auth.conf << 'EOF'
[Definition]
failregex = ^.*"event":"failed_login".*"client_ip":"<HOST>".*$
ignoreregex =
EOF
    # Try to install configurations
    # NOTE(review): the wildcard also copies the jail file into filter.d —
    # harmless to fail2ban, but confirm that is intended.
    if sudo cp /tmp/ml-experiments-jail.conf /etc/fail2ban/jail.d/ 2>/dev/null; then
        sudo cp /tmp/ml-experiments-*.conf /etc/fail2ban/filter.d/ 2>/dev/null || true
        sudo systemctl restart fail2ban 2>/dev/null || true
        print_success "Fail2Ban configured"
    else
        print_warning "Could not configure Fail2Ban (requires sudo)"
    fi
    rm -f /tmp/ml-experiments-*.conf
}
# Setup Redis
# setup_redis starts a daemonized local Redis on 6379 unless one is
# already running.
setup_redis() {
    print_info "Setting up Redis..."
    # Guard clause: nothing to do when a redis-server process exists.
    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
        return
    fi
    redis-server --daemonize yes --port 6379
    print_success "Redis started"
}
# Create simple management script
# create_manage_script writes ./manage.sh (start/stop/status/logs/test)
# into the current directory and marks it executable. The heredoc
# delimiter is quoted, so the script body is emitted verbatim with no
# expansion at generation time.
create_manage_script() {
    cat > manage.sh << 'EOF'
#!/bin/bash
# Simple management script
case "${1:-status}" in
"start")
echo "Starting services..."
redis-server --daemonize yes --port 6379 2>/dev/null || true
./bin/api-server -config configs/config.yaml &
echo "Services started"
;;
"stop")
echo "Stopping services..."
pkill -f "api-server" || true
redis-cli shutdown 2>/dev/null || true
echo "Services stopped"
;;
"status")
echo "=== Status ==="
if pgrep -f "redis-server" > /dev/null; then
echo "✅ Redis: Running"
else
echo "❌ Redis: Stopped"
fi
if pgrep -f "api-server" > /dev/null; then
echo "✅ API Server: Running"
else
echo "❌ API Server: Stopped"
fi
;;
"logs")
echo "=== Recent Logs ==="
tail -20 logs/app.log 2>/dev/null || echo "No logs yet"
;;
"test")
echo "=== Testing ==="
curl -k -s https://localhost:9101/health || echo "API server not responding"
;;
*)
echo "Usage: $0 {start|stop|status|logs|test}"
;;
esac
EOF
    chmod +x manage.sh
    print_success "Management script created"
}
# Show next steps
# show_next_steps prints the post-setup summary. The helper script is
# written to the project root by create_manage_script, so it is referenced
# as ./manage.sh — the previous text pointed at ./tools/manage.sh, a path
# this setup never creates.
show_next_steps() {
    print_success "Setup completed!"
    echo ""
    echo "🎉 Setup complete!"
    echo ""
    echo "Next steps:"
    echo " 1. Start services: ./manage.sh start"
    echo " 2. Check status: ./manage.sh status"
    echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health"
    echo ""
    echo "Configuration: configs/config.yaml"
    echo "Logs: logs/app.log and logs/audit.log"
    echo ""
    print_success "Ready for homelab use!"
}
# Main setup
# main runs the setup steps in dependency order; set -euo pipefail (at the
# top of the file) aborts on the first failing step.
# NOTE(review): setup_fail2ban is defined above but never invoked here —
# confirm whether it should be part of the default flow.
main() {
    echo "ML Experiment Manager - Homelab Setup"
    echo "====================================="
    echo ""
    check_deps
    setup_project
    build_project
    setup_redis
    create_manage_script
    show_next_steps
}
main "$@"

View file

@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""Bootstrap the Grafana provisioning tree under <repo>/monitoring.

Creates monitoring/grafana/provisioning/{datasources,dashboards} and writes
the Prometheus and Loki datasource definitions plus the dashboard provider
config. Safe to re-run: directories are created with exist_ok and the files
are simply overwritten with the same content.
"""
import os

# Repo root is one level above this script's directory.
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
GRAFANA_DIR = os.path.join(REPO_ROOT, 'monitoring', 'grafana')
DATASOURCES_DIR = os.path.join(GRAFANA_DIR, 'provisioning', 'datasources')
PROVIDERS_DIR = os.path.join(GRAFANA_DIR, 'provisioning', 'dashboards')

PROMETHEUS_YML = """apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "5s"
"""

LOKI_YML = """apiVersion: 1
datasources:
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true
    jsonData:
      maxLines: 1000
"""

DASHBOARDS_YML = """apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
"""

# Each entry carries its full target path, so no per-filename branching
# is needed when writing.
CONFIG_FILES = [
    (os.path.join(DATASOURCES_DIR, 'prometheus.yml'), PROMETHEUS_YML),
    (os.path.join(DATASOURCES_DIR, 'loki.yml'), LOKI_YML),
    (os.path.join(PROVIDERS_DIR, 'dashboards.yml'), DASHBOARDS_YML),
]

os.makedirs(DATASOURCES_DIR, exist_ok=True)
os.makedirs(PROVIDERS_DIR, exist_ok=True)

for target_path, content in CONFIG_FILES:
    with open(target_path, 'w') as handle:
        handle.write(content)

print("Monitoring setup completed!")

111
scripts/smoke-test.sh Normal file
View file

@ -0,0 +1,111 @@
#!/usr/bin/env bash
# smoke-test.sh — bring up the dev or prod docker-compose stack, wait for
# the API (and, for dev, the Prometheus scrape target) to become healthy,
# then tear the stack down again.
#
# Usage: scripts/smoke-test.sh [dev|prod]   (default: dev)
# Env:   KEEP_STACK=1  leaves the stack running on exit.
#
# The shebang was missing before; the script uses bash-only features
# (arrays, process substitution, [[ ]]) and must not run under plain sh.
set -euo pipefail

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
export FETCHML_REPO_ROOT="$repo_root"

env="${1:-dev}"
if [ "$env" != "dev" ] && [ "$env" != "prod" ]; then
    echo "usage: $0 [dev|prod]" >&2
    exit 2
fi

# Probe an HTTPS endpoint through openssl s_client (used for the prod
# stack, whose TLS certificate curl may refuse). Succeeds only when the
# first response line is an HTTP/1.x 200 status.
probe_https_health_openssl() {
    host="$1"
    port="$2"
    path="$3"
    req="GET ${path} HTTP/1.1\r\nHost: ${host}\r\nConnection: close\r\n\r\n"
    resp=$(printf "%b" "$req" | openssl s_client -connect "127.0.0.1:${port}" -servername "${host}" -tls1_2 -quiet 2>/dev/null || true)
    printf "%s" "$resp" | tr -d '\r' | head -n 1 | grep -Eq '^HTTP/1\.[01] 200'
}

# Prefer the standalone docker-compose binary, else the plugin form.
# compose_cmd is expanded UNQUOTED below so "docker compose" word-splits.
compose_cmd="docker-compose"
if ! command -v docker-compose >/dev/null 2>&1; then
    compose_cmd="docker compose"
fi

compose_files=()
compose_project_args=("--project-directory" "$repo_root")
api_base=""
prometheus_base=""
stack_name=""

if [ "$env" = "dev" ]; then
    # Pre-create bind-mount targets so containers do not create them as root.
    mkdir -p \
        "$repo_root/data/dev/redis" \
        "$repo_root/data/dev/minio" \
        "$repo_root/data/dev/prometheus" \
        "$repo_root/data/dev/grafana" \
        "$repo_root/data/dev/loki" \
        "$repo_root/data/dev/logs" \
        "$repo_root/data/dev/experiments" \
        "$repo_root/data/dev/active" \
        "$repo_root/data/dev/workspaces"
    stack_name="dev"
    compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml")
    # Dev may run with or without TLS; fall back to plain HTTP.
    api_base="https://localhost:9101"
    if ! curl -skf "$api_base/health" >/dev/null 2>&1; then
        api_base="http://localhost:9101"
    fi
    prometheus_base="http://localhost:9090"
else
    mkdir -p \
        "$repo_root/data/prod-smoke/caddy/data" \
        "$repo_root/data/prod-smoke/caddy/config" \
        "$repo_root/data/prod-smoke/redis" \
        "$repo_root/data/prod-smoke/logs" \
        "$repo_root/data/prod-smoke/experiments" \
        "$repo_root/data/prod-smoke/active"
    stack_name="prod"
    compose_files=("-f" "$repo_root/deployments/docker-compose.prod.smoke.yml")
    api_base="https://localhost:8443"
    export FETCHML_DOMAIN=localhost
    export CADDY_EMAIL=smoke@example.invalid
fi

# On exit: dump compose logs when failing, then (unless KEEP_STACK=1)
# tear the stack down. The saved status is re-raised so CI sees it.
cleanup() {
    status=$?
    if [ "$status" -ne 0 ]; then
        $compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" logs --no-color || true
    fi
    if [ "${KEEP_STACK:-0}" != "1" ]; then
        $compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" down -v >/dev/null 2>&1 || true
    fi
    exit "$status"
}
trap cleanup EXIT

echo "Starting $stack_name stack for smoke test..."
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null

echo "Waiting for API to become healthy..."
deadline=$(($(date +%s) + 90))
while true; do
    if [ "$env" = "dev" ]; then
        if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi
    else
        if probe_https_health_openssl "localhost" "8443" "/health"; then break; fi
    fi
    if [ "$(date +%s)" -ge "$deadline" ]; then echo "Timed out waiting for $api_base/health"; exit 1; fi
    sleep 2
done

if [ "$env" = "dev" ]; then
    echo "Checking metrics endpoint..."
    curl -skf "$api_base/metrics" >/dev/null
    echo "Waiting for Prometheus target api-server to be up..."
    deadline=$(($(date +%s) + 90))
    # PromQL: up{job="api-server"} (URL-encoded).
    query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D"
    while true; do
        resp=$(curl -sf "$query_url" || true)
        resp_compact=$(printf "%s" "$resp" | tr -d '\n' | tr -d '\r')
        # Target is "up" when its instance appears with a sample value of 1.
        if echo "$resp_compact" | grep -Fq '"instance":"api-server:9101"' && echo "$resp_compact" | grep -Fq ',"1"]'; then break; fi
        if [ "$(date +%s)" -ge "$deadline" ]; then echo "Timed out waiting for Prometheus api-server target to be up"; echo "$resp"; exit 1; fi
        sleep 2
    done
fi

View file

@ -1,80 +0,0 @@
#!/bin/bash
# Homelab Secure Test Environment Script
#
# Rebuilds the homelab-secure docker-compose stack from scratch, waits for
# it to settle, runs best-effort connectivity/security checks (each check
# prints a message on failure instead of aborting), then prints usage
# instructions. Fix: the "To view logs:" section no longer lists the
# `down` command, which belongs only under "To stop:".
set -e

echo "Starting Homelab Secure Production Environment..."

# Clean up any existing containers
echo "Cleaning up existing containers..."
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v

# Create necessary directories with proper permissions
echo "Creating directories..."
mkdir -p data logs
chmod 750 data logs

# Build and start services
echo "Building and starting services..."
docker-compose -f deployments/docker-compose.homelab-secure.yml up --build -d

# Wait for services to be healthy
# NOTE(review): fixed sleep; polling the health endpoint would be more
# reliable, but existing behavior is kept.
echo "Waiting for services to be healthy..."
sleep 20

# Check service health
echo "Checking service health..."
docker-compose -f deployments/docker-compose.homelab-secure.yml ps

# Test API server with TLS
echo "Testing API server..."
curl -k -s https://localhost:9104/health || echo "API health check failed"

# Test Redis with authentication
echo "Testing Redis with authentication..."
docker exec ml-homelab-redis redis-cli -a "HomelabRedis2024!" ping || echo "Redis health check failed"

# Test SSH connectivity with security
echo "Testing SSH connectivity..."
docker exec -u worker ml-homelab-worker ssh -o StrictHostKeyChecking=no -o Port=2222 worker@localhost "echo 'SSH OK'" || echo "SSH test failed"

# Test fail2ban status
echo "Testing fail2ban..."
docker exec ml-homelab-api fail2ban-client status sshd || echo "fail2ban check failed"

echo ""
echo "Homelab secure production environment is ready!"
echo ""
echo "Services:"
echo " - API Server: https://localhost:9104"
echo " - SSH: localhost:2223 (worker user)"
echo " - Redis: localhost:6379 (with password)"
echo " - Metrics: http://localhost:9101"
echo ""
echo "Security Features:"
echo " ✓ Strong TLS 1.3 with modern ciphers"
echo " ✓ SSH with fail2ban protection"
echo " ✓ Redis with password authentication"
echo " ✓ SQLite database with encryption"
echo " ✓ Container security hardening"
echo " ✓ Rate limiting and CORS protection"
echo " ✓ Security headers and CSRF protection"
echo " ✓ Podman sandboxed job execution"
echo " ✓ Audit logging and monitoring"
echo ""
echo "Credentials:"
echo " - API User: homelab_user / password"
echo " - SSH User: worker / HomelabWorker2024!"
echo " - Redis Password: HomelabRedis2024!"
echo ""
echo "To test with CLI:"
echo " ./cli/zig-out/bin/ml queue homelab-secure-test"
echo " ./cli/zig-out/bin/ml status"
echo ""
echo "To view logs:"
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f api-server"
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f worker"
echo ""
echo "To stop:"
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"

64
scripts/track_performance.sh Executable file
View file

@ -0,0 +1,64 @@
#!/bin/bash
# Simple performance tracking script
#
# Runs the medium load-test suite, scrapes key metrics out of the raw
# `go test` output, and stores them as JSON under test_results/performance
# so consecutive runs can be compared. Requires: go, jq, bc.
#
# Fixes vs. the previous version:
#   - Missing metrics now default to null instead of producing malformed
#     JSON like `"throughput_rps": ,`.
#   - $PREV_FILE/$RESULTS_FILE are quoted; the `ls` for previous runs no
#     longer prints errors when no older results exist.

RESULTS_DIR="test_results/performance"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
RAW_LOG="$RESULTS_DIR/raw_$TIMESTAMP.log"

mkdir -p "$RESULTS_DIR"

echo "Running load test performance tracking..."
echo "Timestamp: $TIMESTAMP"

# Run tests and capture results (raw log is kept for later inspection).
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RAW_LOG"

# extract_metric <test-name> <after-lines> <label> <awk-field>
# Pulls one metric value out of the raw log near the named test; prints
# "null" when the metric line is missing so the JSON stays parseable.
extract_metric() {
    local value
    value=$(grep -A"$2" "$1" "$RAW_LOG" | grep "$3" | awk -v f="$4" '{print $f}')
    echo "${value:-null}"
}

LIGHT_RPS=$(extract_metric "LightLoad" 1 "Throughput" 2)
LIGHT_ERROR=$(extract_metric "LightLoad" 2 "Error rate" 3)
LIGHT_P99=$(extract_metric "LightLoad" 4 "P99 latency" 3)
MEDIUM_RPS=$(extract_metric "MediumLoad" 1 "Throughput" 2)
MEDIUM_ERROR=$(extract_metric "MediumLoad" 2 "Error rate" 3)
MEDIUM_P99=$(extract_metric "MediumLoad" 4 "P99 latency" 3)

# Emit the results file.
{
    echo "{"
    echo "  \"timestamp\": \"$TIMESTAMP\","
    echo "  \"tests\": ["
    echo "    {"
    echo "      \"name\": \"LightLoad\","
    echo "      \"throughput_rps\": $LIGHT_RPS,"
    echo "      \"error_rate_percent\": $LIGHT_ERROR,"
    echo "      \"p99_latency_ms\": \"$LIGHT_P99\""
    echo "    },"
    echo "    {"
    echo "      \"name\": \"MediumLoad\","
    echo "      \"throughput_rps\": $MEDIUM_RPS,"
    echo "      \"error_rate_percent\": $MEDIUM_ERROR,"
    echo "      \"p99_latency_ms\": \"$MEDIUM_P99\""
    echo "    }"
    echo "  ]"
    echo "}"
} > "$RESULTS_FILE"

echo "Results saved to: $RESULTS_FILE"
echo "Raw logs: $RAW_LOG"

# Show comparison with previous run if one exists. In newest-first order,
# index 2 is the run before the file we just wrote.
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json 2>/dev/null | sed -n '2p')
if [ -n "$PREV_FILE" ]; then
    echo ""
    echo "=== Comparison with previous run ==="
    echo "Previous: $(basename "$PREV_FILE")"
    echo "Current: $(basename "$RESULTS_FILE")"
    echo ""
    echo "Light Load Throughput:"
    echo " Previous: $(jq -r '.tests[0].throughput_rps' "$PREV_FILE") RPS"
    echo " Current: $(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") RPS"
    echo " Change: $(echo "$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") - $(jq -r '.tests[0].throughput_rps' "$PREV_FILE")" | bc -l) RPS"
fi

View file

@ -1,204 +0,0 @@
#!/bin/bash
# Production Configuration Validator
# Verifies all paths and configs are consistent for experiment lifecycle
set -e
# ANSI colors for check output.
BOLD='\033[1m'
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${BOLD}=== FetchML Production Configuration Validator ===${NC}\n"
# Configuration file paths
# Positional overrides: $1 = API server YAML, $2 = worker TOML.
API_CONFIG="${1:-configs/config-prod.yaml}"
WORKER_CONFIG="${2:-configs/worker-prod.toml}"
# Running totals bumped by check_fail/check_warn; reported in the summary.
errors=0
warnings=0
# check_pass MESSAGE — prints a passing check line (green marker).
check_pass() {
    echo -e "${GREEN}${NC} $1"
}
# check_fail MESSAGE — prints a failing check line (red marker) and bumps
# the global error counter.
# Uses errors=$((errors + 1)) instead of ((errors++)): the arithmetic
# command's exit status reflects the PRE-increment value, so with `set -e`
# the first failure (errors == 0) aborted the entire script before any
# summary could be printed.
check_fail() {
    echo -e "${RED}${NC} $1"
    errors=$((errors + 1))
}
# check_warn MESSAGE — prints a warning check line (yellow marker) and
# bumps the global warning counter.
# Same fix as check_fail: ((warnings++)) returns non-zero when warnings
# is 0, which under `set -e` killed the script on the first warning.
check_warn() {
    echo -e "${YELLOW}${NC} $1"
    warnings=$((warnings + 1))
}
# 1. Check API server config exists
echo -e "${BOLD}Checking API Server Configuration${NC}"
if [ ! -f "$API_CONFIG" ]; then
    check_fail "API config not found: $API_CONFIG"
else
    check_pass "API config found: $API_CONFIG"
    # Extract base_path from API config (first match; quotes stripped).
    API_BASE_PATH=$(grep 'base_path:' "$API_CONFIG" | head -1 | awk '{print $2}' | tr -d '"')
    echo " Base path: $API_BASE_PATH"
    # Check if path is absolute
    if [[ "$API_BASE_PATH" != /* ]]; then
        check_fail "base_path must be absolute: $API_BASE_PATH"
    else
        check_pass "base_path is absolute"
    fi
    # Check Redis config
    if grep -q 'redis:' "$API_CONFIG"; then
        check_pass "Redis configuration present"
    else
        check_fail "Redis configuration missing"
    fi
    # Check auth enabled
    # NOTE(review): this matches ANY "enabled: true" line in the file
    # (tls, rate_limit, ...), not specifically auth.enabled — confirm.
    if grep -q 'enabled: true' "$API_CONFIG"; then
        check_pass "Authentication enabled"
    else
        check_warn "Authentication disabled (not recommended for production)"
    fi
fi
echo ""
# 2. Check Worker config (if provided)
if [ -f "$WORKER_CONFIG" ]; then
    echo -e "${BOLD}Checking Worker Configuration${NC}"
    check_pass "Worker config found: $WORKER_CONFIG"
    # Extract base_path from worker config (TOML "key = value" form).
    WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
    echo " Base path: $WORKER_BASE_PATH"
    # Compare paths — API and worker must share the experiments tree.
    if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then
        check_pass "API and Worker base_path match"
    else
        check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH"
    fi
    # Check podman_image configured
    if grep -q 'podman_image' "$WORKER_CONFIG"; then
        PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
        check_pass "Podman image configured: $PODMAN_IMAGE"
    else
        check_fail "podman_image not configured"
    fi
else
    check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)"
fi
echo ""
# 3. Check directory structure (if base_path exists)
if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then
    echo -e "${BOLD}Checking Directory Structure${NC}"
    check_pass "Base directory exists: $API_BASE_PATH"
    # Check the lifecycle subdirectories; missing ones are only warnings.
    for dir in experiments pending running finished failed; do
        if [ -d "$API_BASE_PATH/$dir" ]; then
            check_pass "$dir/ directory exists"
        else
            check_warn "$dir/ directory missing (will be created automatically)"
        fi
    done
    # Check permissions
    if [ -w "$API_BASE_PATH" ]; then
        check_pass "Base directory is writable"
    else
        check_fail "Base directory is not writable (check permissions)"
    fi
elif [ -n "$API_BASE_PATH" ]; then
    check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)"
fi
echo ""
# 4. Check Redis connectivity (if server is running)
echo -e "${BOLD}Checking Redis Connectivity${NC}"
if command -v redis-cli &> /dev/null; then
    if redis-cli ping &> /dev/null; then
        check_pass "Redis server is running and accessible"
        # Check queue depth (informational only).
        QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0")
        echo " Queue size: $QUEUE_SIZE tasks"
    else
        check_warn "Redis server not accessible (start with: redis-server)"
    fi
else
    check_warn "redis-cli not installed (cannot verify Redis connectivity)"
fi
echo ""
# 5. Check Podman (if worker config exists)
if [ -f "$WORKER_CONFIG" ]; then
    echo -e "${BOLD}Checking Podman${NC}"
    if command -v podman &> /dev/null; then
        check_pass "Podman is installed"
        # Check if the configured image has been built locally.
        if [ -n "$PODMAN_IMAGE" ]; then
            if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then
                check_pass "Podman image exists: $PODMAN_IMAGE"
            else
                check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)"
            fi
        fi
        # Check GPU access (if configured) by running nvidia-smi in a container.
        if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then
            if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then
                check_pass "GPU access working"
            else
                check_warn "GPU access configured but not working (check nvidia-container-toolkit)"
            fi
        fi
    else
        check_fail "Podman not installed (required for worker)"
    fi
fi
echo ""
# 6. Check CLI config consistency
echo -e "${BOLD}Checking CLI Configuration${NC}"
CLI_CONFIG="$HOME/.ml/config.toml"
if [ -f "$CLI_CONFIG" ]; then
    check_pass "CLI config found: $CLI_CONFIG"
    CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
    if [ "$CLI_BASE" = "$API_BASE_PATH" ]; then
        check_pass "CLI worker_base matches server base_path"
    else
        check_warn "CLI worker_base ($CLI_BASE) differs from server ($API_BASE_PATH)"
    fi
else
    check_warn "CLI config not found (run: ml init)"
fi
echo ""
# Summary — exit 0 when there are no errors (warnings allowed), 1 otherwise.
echo -e "${BOLD}=== Summary ===${NC}"
if [ $errors -eq 0 ] && [ $warnings -eq 0 ]; then
    echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}"
    exit 0
elif [ $errors -eq 0 ]; then
    echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}"
    exit 0
else
    echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}"
    exit 1
fi

148
scripts/verify_release.sh Normal file
View file

@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -euo pipefail
# usage prints the help text to stdout (error paths redirect it to stderr).
# Quoted 'EOF' delimiter: the text is emitted verbatim.
usage() {
cat <<'EOF'
Usage:
scripts/verify_release.sh --dir <release_dir> [--repo <org>/<repo>]
What it does:
- Verifies checksums.txt signature (keyless cosign) if cosign + checksums.txt.sig/.cert are present
- Verifies *.tar.gz files against checksums.txt
Notes:
- --repo enables strict Sigstore identity checking against the release workflow.
- Without cosign, the script still verifies SHA256 hashes.
Examples:
scripts/verify_release.sh --dir ./release --repo jfraeys/fetch_ml
scripts/verify_release.sh --dir .
EOF
}
release_dir=""
repo=""
# Parse flags: --dir (required), --repo (optional Sigstore identity pin).
while [[ $# -gt 0 ]]; do
    case "$1" in
        --dir)
            release_dir="${2:-}"
            shift 2
            ;;
        --repo)
            repo="${2:-}"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
done
if [[ -z "$release_dir" ]]; then
    echo "missing --dir" >&2
    usage >&2
    exit 2
fi
if [[ ! -d "$release_dir" ]]; then
    echo "directory not found: $release_dir" >&2
    exit 2
fi
# All subsequent paths (checksums.txt, signature files, artifacts) are
# resolved relative to the release directory.
cd "$release_dir"
if [[ ! -f checksums.txt ]]; then
    echo "missing checksums.txt in $release_dir" >&2
    exit 2
fi
# Signature verification is optional and only attempted when cosign exists.
has_cosign=false
if command -v cosign >/dev/null 2>&1; then
    has_cosign=true
fi
# verify_sigstore verifies the keyless cosign signature over checksums.txt.
# Skips (returns 0) when the .sig/.cert pair is absent. When --repo was
# given, the signing certificate identity is pinned to that repository's
# release workflow; otherwise only the OIDC issuer is checked.
verify_sigstore() {
    if [[ ! -f checksums.txt.sig ]] || [[ ! -f checksums.txt.cert ]]; then
        echo "[verify] cosign available, but checksums.txt.sig/.cert not found; skipping signature verification" >&2
        return 0
    fi
    if [[ -z "$repo" ]]; then
        echo "[verify] verifying signature (no repo identity pin; pass --repo to pin identity)" >&2
        COSIGN_YES=true cosign verify-blob \
            --certificate checksums.txt.cert \
            --signature checksums.txt.sig \
            --certificate-oidc-issuer https://token.actions.githubusercontent.com \
            checksums.txt >/dev/null
        echo "[ok] checksums.txt signature verified (un-pinned identity)"
        return 0
    fi
    # Pin the certificate identity to this repo's release workflow on a
    # v* tag (regexp matched by cosign).
    local identity
    identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$"
    COSIGN_YES=true cosign verify-blob \
        --certificate checksums.txt.cert \
        --signature checksums.txt.sig \
        --certificate-identity-regexp "$identity" \
        --certificate-oidc-issuer https://token.actions.githubusercontent.com \
        checksums.txt >/dev/null
    echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)"
}
# verify_hashes checks every artifact listed in checksums.txt that is
# present in the current directory against its recorded SHA256 hash.
# Missing files are skipped; any mismatch makes the function exit 1.
# Uses sha256sum when available, otherwise shasum -a 256 (macOS).
verify_hashes() {
    local failures=0
    local has_sha256sum=false
    if command -v sha256sum >/dev/null 2>&1; then
        has_sha256sum=true
    fi
    # Use the DEFAULT IFS so read splits each "<hash> <file>" line into
    # two fields. The previous `while IFS= read -r expected file` disabled
    # word splitting entirely: the whole line landed in $expected, $file
    # stayed empty, every entry hit the `continue`, and the function
    # always reported success without verifying a single artifact.
    while read -r expected file; do
        [[ -z "${expected}" ]] && continue
        [[ -z "${file}" ]] && continue
        if [[ ! -f "$file" ]]; then
            continue
        fi
        local actual
        if [[ "$has_sha256sum" == true ]]; then
            actual="$(sha256sum "$file" | awk '{print $1}')"
        else
            actual="$(shasum -a 256 "$file" | awk '{print $1}')"
        fi
        if [[ "$actual" != "$expected" ]]; then
            echo "[fail] $file" >&2
            echo " expected: $expected" >&2
            echo " actual: $actual" >&2
            failures=$((failures+1))
        fi
    done < <(awk '{print $1, $2}' checksums.txt)
    if [[ $failures -gt 0 ]]; then
        echo "[fail] checksum verification failed ($failures file(s))" >&2
        exit 1
    fi
    echo "[ok] all available artifacts match checksums.txt"
}
# Signature verification is best-effort (requires cosign); hash
# verification always runs and is the hard gate.
if [[ "$has_cosign" == true ]]; then
    verify_sigstore
else
    echo "[verify] cosign not installed; skipping signature verification" >&2
fi
verify_hashes
echo "[ok] release verification complete"

View file

@ -5,6 +5,10 @@
set -euo pipefail set -euo pipefail
make_target_exists() {
make -n "$1" >/dev/null 2>&1
}
# Colors # Colors
RED='\033[0;31m' RED='\033[0;31m'
GREEN='\033[0;32m' GREEN='\033[0;32m'
@ -45,7 +49,7 @@ show_status() {
# Check Go apps # Check Go apps
print_app "Go Applications:" print_app "Go Applications:"
local go_apps=("api-server" "worker" "tui" "data_manager" "user_manager") local go_apps=("api-server" "worker" "tui")
for app in "${go_apps[@]}"; do for app in "${go_apps[@]}"; do
if [[ -f "bin/$app" ]]; then if [[ -f "bin/$app" ]]; then
echo "$app: Built" echo "$app: Built"
@ -85,7 +89,7 @@ show_status() {
# Check configuration # Check configuration
print_app "Configuration:" print_app "Configuration:"
if [[ -f "configs/config-local.yaml" ]]; then if [[ -f "configs/api/dev.yaml" ]]; then
echo " ✅ Security config: Found" echo " ✅ Security config: Found"
else else
echo " ⚠️ Security config: Not found" echo " ⚠️ Security config: Not found"
@ -110,14 +114,14 @@ build_all() {
echo "=============================" echo "============================="
echo "" echo ""
print_info "Building Go applications..."
make build
if command -v zig &> /dev/null; then if command -v zig &> /dev/null; then
print_info "Building Zig CLI..." print_info "Building all components (Go + Zig CLI)..."
make cli-build make build
else else
print_warning "Zig not found, skipping CLI build" print_warning "Zig not found, building Go components only"
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go
go build -o bin/tui ./cmd/tui
fi fi
print_success "Build completed!" print_success "Build completed!"
@ -128,11 +132,13 @@ test_all() {
echo "====================" echo "===================="
echo "" echo ""
print_info "Running main test suite..." if make_target_exists test-full; then
make test print_info "Running full test suite..."
make test-full
print_info "Running comprehensive tests..." else
make test-all print_info "Running test suite..."
make test
fi
print_success "All tests completed!" print_success "All tests completed!"
} }
@ -156,8 +162,8 @@ start_services() {
# Start API server if built # Start API server if built
if [[ -f "bin/api-server" ]]; then if [[ -f "bin/api-server" ]]; then
print_info "Starting API server..." print_info "Starting API server..."
if [[ -f "configs/config-local.yaml" ]]; then if [[ -f "configs/api/dev.yaml" ]]; then
./bin/api-server --config configs/config-local.yaml & ./bin/api-server --config configs/api/dev.yaml &
else else
print_warning "No config found, using defaults" print_warning "No config found, using defaults"
./bin/api-server & ./bin/api-server &
@ -187,13 +193,25 @@ check_health() {
print_info "Port 9101 is open, checking API health endpoint..." print_info "Port 9101 is open, checking API health endpoint..."
# Try the health endpoint # Try the health endpoint
response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null) local api_key_header=""
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
api_key_header="-H X-API-Key: ${FETCH_ML_API_KEY}"
fi
response=$(curl -s --max-time 3 ${api_key_header} http://localhost:9101/health 2>/dev/null || true)
if [[ -z "$response" ]]; then
response=$(curl -k -s --max-time 3 ${api_key_header} https://localhost:9101/health 2>/dev/null || true)
fi
if [[ "$response" == "OK" ]]; then if [[ "$response" == "OK" ]]; then
print_success "API is healthy: $response" print_success "API is healthy: $response"
elif [[ "$response" == *"IP not whitelisted"* ]]; then elif [[ "$response" == *"IP not whitelisted"* ]]; then
print_warning "API running but IP not whitelisted (expected behavior)" print_warning "API running but IP not whitelisted (expected behavior)"
print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health" if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
print_info "Try: curl -k -H 'X-API-Key: $FETCH_ML_API_KEY' https://localhost:9101/health"
else
print_info "Try: curl -k https://localhost:9101/health"
fi
else else
print_error "Unexpected response: $response" print_error "Unexpected response: $response"
fi fi
@ -229,19 +247,36 @@ run_security() {
case "${1:-check}" in case "${1:-check}" in
"check") "check")
print_info "Running security checks..." print_info "Running security checks..."
make security-check if make_target_exists security-check; then
make security-check
else
print_warning "No 'security-check' Make target found"
print_info "Try: make ci-local"
fi
;; ;;
"monitor") "monitor")
print_info "Starting security monitoring..." print_info "Starting security monitoring..."
make security-monitor if make_target_exists security-monitor; then
make security-monitor
else
print_warning "No 'security-monitor' Make target found"
fi
;; ;;
"deploy") "deploy")
print_info "Deploying with security..." print_info "Deploying with security..."
make security-deploy if make_target_exists security-deploy; then
make security-deploy
else
print_warning "No 'security-deploy' Make target found"
fi
;; ;;
"audit") "audit")
print_info "Running security audit..." print_info "Running security audit..."
make security-audit if make_target_exists security-audit; then
make security-audit
else
print_warning "No 'security-audit' Make target found"
fi
;; ;;
*) *)
echo "Usage: $0 security {check|monitor|deploy|audit}" echo "Usage: $0 security {check|monitor|deploy|audit}"
@ -258,15 +293,22 @@ run_development() {
case "${1:-setup}" in case "${1:-setup}" in
"setup") "setup")
print_info "Setting up development environment..." print_info "Setting up development environment..."
./scripts/auto_setup.sh print_warning "Legacy setup scripts were removed; using Makefile/deployments instead"
print_info "Try: make dev"
print_info "Or: ./deployments/deploy.sh dev up"
;; ;;
"quick") "quick")
print_info "Running quick start..." print_info "Running quick start..."
./scripts/quick_start.sh print_warning "Legacy quick start script was removed; using deployments instead"
print_info "Try: ./deployments/deploy.sh dev up"
;; ;;
"deps") "deps")
print_info "Installing dependencies..." print_info "Installing dependencies..."
make install-deps if make_target_exists install-deps; then
make install-deps
else
print_warning "No 'install-deps' Make target found"
fi
;; ;;
*) *)
echo "Usage: $0 dev {setup|quick|deps}" echo "Usage: $0 dev {setup|quick|deps}"
@ -309,7 +351,7 @@ cleanup() {
echo "" echo ""
print_info "Cleaning project artifacts..." print_info "Cleaning project artifacts..."
make clean-all make clean
print_info "Stopping services..." print_info "Stopping services..."
stop_services stop_services
@ -330,7 +372,7 @@ show_help() {
echo " start - Start all services" echo " start - Start all services"
echo " stop - Stop all services" echo " stop - Stop all services"
echo " health - Check API health endpoint" echo " health - Check API health endpoint"
echo " security - Security management (check|monitor|deploy|audit)" echo " security - Security management (check|monitor|deploy|audit)"
echo " dev - Development environment (setup|quick|deps)" echo " dev - Development environment (setup|quick|deps)"
echo " logs - Show application logs" echo " logs - Show application logs"
echo " cleanup - Clean project artifacts and stop services" echo " cleanup - Clean project artifacts and stop services"

View file

@ -47,7 +47,10 @@ type Improvement struct {
} }
// NewPerformanceRegressionDetector creates a new detector instance // NewPerformanceRegressionDetector creates a new detector instance
func NewPerformanceRegressionDetector(baselineFile string, threshold float64) *PerformanceRegressionDetector { func NewPerformanceRegressionDetector(
baselineFile string,
threshold float64,
) *PerformanceRegressionDetector {
return &PerformanceRegressionDetector{ return &PerformanceRegressionDetector{
BaselineFile: baselineFile, BaselineFile: baselineFile,
Threshold: threshold, Threshold: threshold,
@ -74,7 +77,9 @@ func (prd *PerformanceRegressionDetector) LoadBaseline() ([]BenchmarkResult, err
} }
// AnalyzeResults analyzes current results against baseline // AnalyzeResults analyzes current results against baseline
func (prd *PerformanceRegressionDetector) AnalyzeResults(current []BenchmarkResult) (*RegressionReport, error) { func (prd *PerformanceRegressionDetector) AnalyzeResults(
current []BenchmarkResult,
) (*RegressionReport, error) {
baseline, err := prd.LoadBaseline() baseline, err := prd.LoadBaseline()
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to load baseline: %w", err) return nil, fmt.Errorf("failed to load baseline: %w", err)

Some files were not shown because too many files have changed in this diff Show more