From f7268067703cfe9e37e468ccbd3829a39f1ef7d6 Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Mon, 5 Jan 2026 12:31:26 -0500 Subject: [PATCH] chore(ops): reorganize deployments/monitoring and remove legacy scripts --- configs/api/dev.yaml | 56 +++ configs/api/homelab-secure.yaml | 71 +++ configs/api/multi-user.yaml | 74 +++ configs/api/prod.yaml | 59 +++ configs/config-local.toml | 8 - configs/config-test.yaml | 26 - configs/deprecated/config-debug.yaml | 17 - configs/deprecated/config-docker-full.yaml | 46 -- configs/environments/config-docker.yaml | 39 -- .../environments/config-homelab-secure.yaml | 58 --- configs/environments/config-local.yaml | 49 -- configs/environments/config-multi-user.yaml | 78 --- configs/environments/config-prod.yaml | 59 --- configs/examples/config-postgres.yaml | 48 +- configs/examples/config.yaml.example | 55 +-- configs/schema/api_server_config.yaml | 122 +++-- configs/schema/config_schema.yaml | 0 configs/schema/worker_config_schema.yaml | 115 ++++- configs/workers/docker-dev.yaml | 58 +++ configs/workers/docker-prod.yaml | 50 ++ configs/workers/docker.yaml | 43 ++ configs/workers/examples/prewarm-worker.yaml | 27 ++ configs/workers/homelab-secure.yaml | 47 ++ configs/workers/worker-docker.yaml | 51 -- configs/workers/worker-homelab-secure.yaml | 79 --- configs/workers/worker-prod.toml | 22 +- deployments/Caddyfile.dev | 45 ++ deployments/Caddyfile.homelab-secure | 44 ++ deployments/Caddyfile.prod | 47 ++ deployments/Caddyfile.smoke | 23 + deployments/Makefile | 76 +++ deployments/README.md | 124 ++++- deployments/deploy.sh | 162 +++++++ deployments/docker-compose.dev.yml | 225 +++++++++ deployments/docker-compose.homelab-secure.yml | 188 +++++--- deployments/docker-compose.prod.smoke.yml | 75 +++ deployments/docker-compose.prod.yml | 99 +++- deployments/env.dev.example | 17 + deployments/env.prod.example | 28 ++ deployments/setup.sh | 112 +++++ monitoring/README.md | 208 ++++---- monitoring/dashboards/grafana-dashboard.json | 
147 ------ monitoring/dashboards/logs-dashboard.json | 278 ----------- .../dashboards/performance-dashboard.json | 157 ------ monitoring/docker-compose.performance.yml | 64 --- .../dashboards/load-test-performance.json | 51 ++ .../grafana/dashboards/load-test-simple.json | 1 + monitoring/grafana/dashboards/loki-logs.json | 51 ++ .../dashboards/prewarm-performance.txt | 135 ++++++ .../grafana/dashboards/rsync-performance.json | 86 ++++ .../grafana/dashboards/system-health.json | 51 ++ .../dashboards/websocket-performance.json | 68 +++ .../grafana/dashboards/worker-resources.json | 280 +++++++++++ .../provisioning/dashboards/dashboards.yml | 1 - .../grafana/provisioning/datasources/loki.yml | 9 + .../{datasources.yml => prometheus.yml} | 12 +- monitoring/health-testing.md | 100 ++++ monitoring/loki-config.yml | 2 +- monitoring/loki-performance-config.yaml | 40 -- .../performance-dashboard.json | 0 monitoring/{ => prometheus}/prometheus.yml | 32 +- monitoring/promtail-performance-config.yaml | 50 -- monitoring/security_rules.yml | 112 ----- podman/README.md | 2 +- podman/secure-ml-runner.podfile | 4 + podman/secure_runner.py | 265 +++++++--- podman/security_policy.json | 2 +- scripts/README.md | 27 +- scripts/auto-cleanup.service | 0 scripts/auto-cleanup.timer | 0 scripts/benchmarks/run-benchmarks-local.sh | 22 +- scripts/cleanup-status.sh | 0 scripts/cleanup.sh | 0 scripts/create_bitwarden_fetchml_item.sh | 49 -- scripts/deployment/setup-auto-cleanup.sh | 90 ---- scripts/deployment/setup-monitoring-prod.sh | 275 ----------- scripts/deployment/setup-prod.sh | 229 --------- scripts/deployment/setup-production.sh | 0 scripts/legacy/auto_setup.sh | 455 ------------------ scripts/legacy/quick_start.sh | 314 ------------ scripts/legacy/setup_common.sh | 124 ----- scripts/legacy/setup_rocky.sh | 417 ---------------- scripts/legacy/setup_ubuntu.sh | 294 ----------- scripts/legacy/test_tools.sh | 67 --- scripts/maintenance/auto-cleanup.service | 2 +- 
scripts/maintenance/cleanup-benchmarks.sh | 62 ++- scripts/maintenance/cleanup.sh | 30 +- scripts/manage-artifacts.sh | 23 +- scripts/setup-auto-cleanup.sh | 0 scripts/setup-secure-homelab.sh | 169 ------- scripts/setup.sh | 311 ------------ scripts/setup_monitoring.py | 62 +++ scripts/smoke-test.sh | 111 +++++ scripts/testing/run-full-test-suite.sh | 0 scripts/testing/test-homelab-secure.sh | 80 --- scripts/track_performance.sh | 64 +++ scripts/validate-prod-config.sh | 204 -------- scripts/verify_release.sh | 148 ++++++ tools/manage.sh | 94 +++- tools/performance_regression_detector.go | 9 +- tools/profiler.go | 18 +- 101 files changed, 3598 insertions(+), 4982 deletions(-) create mode 100644 configs/api/dev.yaml create mode 100644 configs/api/homelab-secure.yaml create mode 100644 configs/api/multi-user.yaml create mode 100644 configs/api/prod.yaml delete mode 100644 configs/config-local.toml delete mode 100644 configs/config-test.yaml delete mode 100644 configs/deprecated/config-debug.yaml delete mode 100644 configs/deprecated/config-docker-full.yaml delete mode 100644 configs/environments/config-docker.yaml delete mode 100644 configs/environments/config-homelab-secure.yaml delete mode 100644 configs/environments/config-local.yaml delete mode 100644 configs/environments/config-multi-user.yaml delete mode 100644 configs/environments/config-prod.yaml delete mode 100644 configs/schema/config_schema.yaml create mode 100644 configs/workers/docker-dev.yaml create mode 100644 configs/workers/docker-prod.yaml create mode 100644 configs/workers/docker.yaml create mode 100644 configs/workers/examples/prewarm-worker.yaml create mode 100644 configs/workers/homelab-secure.yaml delete mode 100644 configs/workers/worker-docker.yaml delete mode 100644 configs/workers/worker-homelab-secure.yaml create mode 100644 deployments/Caddyfile.dev create mode 100644 deployments/Caddyfile.homelab-secure create mode 100644 deployments/Caddyfile.prod create mode 100644 
deployments/Caddyfile.smoke create mode 100644 deployments/Makefile create mode 100755 deployments/deploy.sh create mode 100644 deployments/docker-compose.dev.yml create mode 100644 deployments/docker-compose.prod.smoke.yml create mode 100644 deployments/env.dev.example create mode 100644 deployments/env.prod.example create mode 100644 deployments/setup.sh delete mode 100644 monitoring/dashboards/grafana-dashboard.json delete mode 100644 monitoring/dashboards/logs-dashboard.json delete mode 100644 monitoring/dashboards/performance-dashboard.json delete mode 100644 monitoring/docker-compose.performance.yml create mode 100644 monitoring/grafana/dashboards/load-test-performance.json create mode 100644 monitoring/grafana/dashboards/load-test-simple.json create mode 100644 monitoring/grafana/dashboards/loki-logs.json create mode 100644 monitoring/grafana/dashboards/prewarm-performance.txt create mode 100644 monitoring/grafana/dashboards/rsync-performance.json create mode 100644 monitoring/grafana/dashboards/system-health.json create mode 100644 monitoring/grafana/dashboards/websocket-performance.json create mode 100644 monitoring/grafana/dashboards/worker-resources.json create mode 100644 monitoring/grafana/provisioning/datasources/loki.yml rename monitoring/grafana/provisioning/datasources/{datasources.yml => prometheus.yml} (50%) create mode 100644 monitoring/health-testing.md delete mode 100644 monitoring/loki-performance-config.yaml delete mode 100644 monitoring/performance/grafana-dashboards/performance-dashboard.json rename monitoring/{ => prometheus}/prometheus.yml (58%) delete mode 100644 monitoring/promtail-performance-config.yaml delete mode 100644 monitoring/security_rules.yml delete mode 100644 scripts/auto-cleanup.service delete mode 100644 scripts/auto-cleanup.timer delete mode 100644 scripts/cleanup-status.sh delete mode 100644 scripts/cleanup.sh delete mode 100644 scripts/create_bitwarden_fetchml_item.sh delete mode 100755 
scripts/deployment/setup-auto-cleanup.sh delete mode 100755 scripts/deployment/setup-monitoring-prod.sh delete mode 100755 scripts/deployment/setup-prod.sh delete mode 100644 scripts/deployment/setup-production.sh delete mode 100755 scripts/legacy/auto_setup.sh delete mode 100755 scripts/legacy/quick_start.sh delete mode 100755 scripts/legacy/setup_common.sh delete mode 100755 scripts/legacy/setup_rocky.sh delete mode 100755 scripts/legacy/setup_ubuntu.sh delete mode 100755 scripts/legacy/test_tools.sh delete mode 100644 scripts/setup-auto-cleanup.sh delete mode 100755 scripts/setup-secure-homelab.sh delete mode 100755 scripts/setup.sh create mode 100644 scripts/setup_monitoring.py create mode 100644 scripts/smoke-test.sh delete mode 100755 scripts/testing/run-full-test-suite.sh delete mode 100755 scripts/testing/test-homelab-secure.sh create mode 100755 scripts/track_performance.sh delete mode 100755 scripts/validate-prod-config.sh create mode 100644 scripts/verify_release.sh diff --git a/configs/api/dev.yaml b/configs/api/dev.yaml new file mode 100644 index 0000000..634adb2 --- /dev/null +++ b/configs/api/dev.yaml @@ -0,0 +1,56 @@ +base_path: "/data/experiments" + +data_dir: "/data/active" + +auth: + enabled: false + +server: + address: "0.0.0.0:9101" + tls: + enabled: false + cert_file: "/app/ssl/cert.pem" + key_file: "/app/ssl/key.pem" + +security: + production_mode: false + allowed_origins: + - "http://localhost:3000" + api_key_rotation_days: 90 + audit_logging: + enabled: true + log_path: "/tmp/fetchml-audit.log" + rate_limit: + enabled: false + requests_per_minute: 60 + burst_size: 10 + ip_whitelist: [] + +monitoring: + prometheus: + enabled: true + port: 9101 + path: "/metrics" + health_checks: + enabled: true + interval: "30s" + +redis: + addr: "redis:6379" + password: "" + db: 0 + +database: + type: "sqlite" + connection: "/tmp/fetchml.sqlite" + +logging: + level: "info" + file: "" + audit_log: "" + +resources: + max_workers: 1 + desired_rps_per_worker: 2 
+ podman_cpus: "2" + podman_memory: "4Gi" diff --git a/configs/api/homelab-secure.yaml b/configs/api/homelab-secure.yaml new file mode 100644 index 0000000..2e66039 --- /dev/null +++ b/configs/api/homelab-secure.yaml @@ -0,0 +1,71 @@ +base_path: "/data/experiments" + +data_dir: "/data/active" + +auth: + enabled: true + api_keys: + homelab_admin: + hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY" + admin: true + roles: + - admin + permissions: + "*": true + homelab_user: + hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY" + admin: false + roles: + - researcher + permissions: + experiments: true + datasets: true + jupyter: true + +server: + address: ":9101" + tls: + enabled: false + cert_file: "/app/ssl/cert.pem" + key_file: "/app/ssl/key.pem" + +security: + production_mode: true + allowed_origins: + - "https://ml-experiments.example.com" + rate_limit: + enabled: true + requests_per_minute: 60 + burst_size: 10 + ip_whitelist: + - "127.0.0.1" + - "192.168.0.0/16" + +monitoring: + prometheus: + enabled: true + port: 9101 + path: "/metrics" + health_checks: + enabled: true + interval: "30s" + +redis: + url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379" + password: "" + db: 0 + +database: + type: "sqlite" + connection: "/data/experiments/fetch_ml.sqlite" + +logging: + level: "info" + file: "/logs/fetch_ml.log" + audit_log: "" + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" diff --git a/configs/api/multi-user.yaml b/configs/api/multi-user.yaml new file mode 100644 index 0000000..74d0bbc --- /dev/null +++ b/configs/api/multi-user.yaml @@ -0,0 +1,74 @@ +base_path: "/app/data/experiments" + +data_dir: "/data/active" + +auth: + enabled: true + api_keys: + admin_user: + hash: "CHANGE_ME_SHA256_ADMIN_USER_KEY" + admin: true + roles: ["user", "admin"] + permissions: + "*": true + researcher1: + hash: "CHANGE_ME_SHA256_RESEARCHER1_KEY" + admin: false + roles: ["user", "researcher"] + permissions: + "jobs:read": true + "jobs:create": true + 
"jobs:update": true + "jobs:delete": false + analyst1: + hash: "CHANGE_ME_SHA256_ANALYST1_KEY" + admin: false + roles: ["user", "analyst"] + permissions: + "jobs:read": true + "jobs:create": false + "jobs:update": false + "jobs:delete": false + +server: + address: ":9101" + tls: + enabled: false + +security: + production_mode: false + allowed_origins: [] + rate_limit: + enabled: true + requests_per_minute: 60 + burst_size: 20 + ip_whitelist: [] + +monitoring: + prometheus: + enabled: true + port: 9101 + path: "/metrics" + health_checks: + enabled: true + interval: "30s" + +redis: + url: "redis://redis:6379" + password: "" + db: 0 + +database: + type: "sqlite" + connection: "/app/data/experiments/fetch_ml.sqlite" + +logging: + level: "info" + file: "/logs/app.log" + audit_log: "" + +resources: + max_workers: 3 + desired_rps_per_worker: 3 + podman_cpus: "2" + podman_memory: "4Gi" diff --git a/configs/api/prod.yaml b/configs/api/prod.yaml new file mode 100644 index 0000000..726d1d2 --- /dev/null +++ b/configs/api/prod.yaml @@ -0,0 +1,59 @@ +base_path: "/app/data/experiments" + +data_dir: "/data/active" + +auth: + enabled: true + api_keys: + admin: + hash: "replace-with-sha256-of-your-api-key" + admin: true + roles: + - admin + permissions: + "*": true + +server: + address: ":9101" + tls: + enabled: true + cert_file: "/app/ssl/cert.pem" + key_file: "/app/ssl/key.pem" + +security: + production_mode: false + allowed_origins: [] + rate_limit: + enabled: true + requests_per_minute: 60 + burst_size: 10 + ip_whitelist: [] + +monitoring: + prometheus: + enabled: true + port: 9101 + path: "/metrics" + health_checks: + enabled: true + interval: "30s" + +redis: + addr: "redis:6379" + password: "" + db: 0 + +database: + type: "sqlite" + connection: "/app/data/experiments/fetch_ml.sqlite" + +logging: + level: "info" + file: "/logs/fetch_ml.log" + audit_log: "" + +resources: + max_workers: 2 + desired_rps_per_worker: 5 + podman_cpus: "2" + podman_memory: "4Gi" diff --git 
a/configs/config-local.toml b/configs/config-local.toml deleted file mode 100644 index f037b29..0000000 --- a/configs/config-local.toml +++ /dev/null @@ -1,8 +0,0 @@ -# Local development config (TOML) -# Used by both CLI and TUI when no overrides are set - -worker_host = "127.0.0.1" -worker_user = "dev_user" -worker_base = "/tmp/ml-experiments" -worker_port = 9101 -api_key = "your-api-key-here" diff --git a/configs/config-test.yaml b/configs/config-test.yaml deleted file mode 100644 index 0b006fb..0000000 --- a/configs/config-test.yaml +++ /dev/null @@ -1,26 +0,0 @@ -auth: - enabled: true - api_keys: - dev_user: - hash: "replace-with-sha256-of-your-api-key" - admin: true - roles: - - admin - permissions: - '*': true - -server: - address: ":9101" - tls: - enabled: false - -security: - rate_limit: - enabled: false - -redis: - url: "redis://redis:6379" - -logging: - level: info - console: true diff --git a/configs/deprecated/config-debug.yaml b/configs/deprecated/config-debug.yaml deleted file mode 100644 index f737c3a..0000000 --- a/configs/deprecated/config-debug.yaml +++ /dev/null @@ -1,17 +0,0 @@ -base_path: "/app/data/experiments" - -auth: - enabled: false - -server: - address: ":9101" - -database: - type: "sqlite" - connection: "/app/data/experiments/fetch_ml.db" - -redis: - url: "redis://redis:6379" - -logging: - level: "debug" diff --git a/configs/deprecated/config-docker-full.yaml b/configs/deprecated/config-docker-full.yaml deleted file mode 100644 index 651b8b2..0000000 --- a/configs/deprecated/config-docker-full.yaml +++ /dev/null @@ -1,46 +0,0 @@ -base_path: "/app/data/experiments" - -auth: - enabled: true - api_keys: - homelab_user: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password" - admin: true - roles: ["user", "admin"] - permissions: - read: true - write: true - delete: true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "/app/ssl/cert.pem" - key_file: "/app/ssl/key.pem" - -security: - 
rate_limit: - enabled: true - requests_per_minute: 30 - ip_whitelist: [] - -# SQLite database for persistence -database: - type: "sqlite" - connection: "/app/data/fetch_ml.db" - -redis: - url: "redis://redis:6379" - max_connections: 10 - -logging: - level: "info" - file: "/app/logs/app.log" - audit_file: "/app/logs/audit.log" - -resources: - max_workers: 1 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "8g" diff --git a/configs/environments/config-docker.yaml b/configs/environments/config-docker.yaml deleted file mode 100644 index 5583d29..0000000 --- a/configs/environments/config-docker.yaml +++ /dev/null @@ -1,39 +0,0 @@ -base_path: "/app/data/experiments" - -auth: - enabled: true - api_keys: - homelab_user: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password" - admin: true - roles: ["user", "admin"] - permissions: - read: true - write: true - delete: true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "/app/ssl/cert.pem" - key_file: "/app/ssl/key.pem" - -security: - rate_limit: - enabled: true - requests_per_minute: 30 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "192.168.0.0/16" - - "10.0.0.0/8" - -redis: - url: "redis://redis:6379" - max_connections: 10 - -logging: - level: "info" - file: "/app/logs/app.log" - audit_file: "/app/logs/audit.log" diff --git a/configs/environments/config-homelab-secure.yaml b/configs/environments/config-homelab-secure.yaml deleted file mode 100644 index aaeaf11..0000000 --- a/configs/environments/config-homelab-secure.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Secure Homelab Configuration -# IMPORTANT: Keep your API keys safe and never share them! 
- -redis: - url: "redis://redis:6379" - max_connections: 10 - -auth: - enabled: true - api_keys: - homelab_admin: - hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f - admin: true - roles: - - admin - permissions: - '*': true - homelab_user: - hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c - admin: false - roles: - - researcher - permissions: - 'experiments': true - 'datasets': true - 'jupyter': true - -server: - address: ":9101" - tls: - enabled: true - key_file: "/app/ssl/key.pem" - cert_file: "/app/ssl/cert.pem" - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: [] - -logging: - level: "info" - file: "logs/fetch_ml.log" - console: true - -resources: - cpu_limit: "2" - memory_limit: "4Gi" - gpu_limit: 0 - disk_limit: "10Gi" - -# Prometheus metrics -metrics: - enabled: true - listen_addr: ":9100" - tls: - enabled: false diff --git a/configs/environments/config-local.yaml b/configs/environments/config-local.yaml deleted file mode 100644 index 17f85b6..0000000 --- a/configs/environments/config-local.yaml +++ /dev/null @@ -1,49 +0,0 @@ -redis: - url: "redis://redis:6379" - max_connections: 10 - -auth: - enabled: true - api_keys: - homelab_admin: - hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f - admin: true - roles: - - admin - permissions: - '*': true - homelab_user: - hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c - admin: false - roles: - - researcher - permissions: - 'experiments': true - 'datasets': true - 'jupyter': true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "/app/ssl/cert.pem" - key_file: "/app/ssl/key.pem" - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "172.21.0.1" # Docker gateway - -# Prometheus metrics -metrics: - enabled: true - listen_addr: ":9100" - tls: - enabled: true - cert_file: 
"/app/ssl/cert.pem" - key_file: "/app/ssl/key.pem" diff --git a/configs/environments/config-multi-user.yaml b/configs/environments/config-multi-user.yaml deleted file mode 100644 index 6fdcdc3..0000000 --- a/configs/environments/config-multi-user.yaml +++ /dev/null @@ -1,78 +0,0 @@ -base_path: "/app/data/experiments" - -auth: - enabled: true - api_keys: - admin_user: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password" - admin: true - roles: ["user", "admin"] - permissions: - read: true - write: true - delete: true - researcher1: - hash: "ef92b778ba7a6c8f2150019a5678047b6a9a2b95cef8189518f9b35c54d2e3ae" # "research123" - admin: false - roles: ["user", "researcher"] - permissions: - jobs:read: true - jobs:create: true - jobs:update: true - jobs:delete: false - analyst1: - hash: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" # "analyst123" - admin: false - roles: ["user", "analyst"] - permissions: - jobs:read: true - jobs:create: false - jobs:update: false - jobs:delete: false - -server: - address: ":9101" - tls: - enabled: false - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 20 - ip_whitelist: [] - cors: - enabled: true - allowed_origins: ["https://localhost:9103", "https://localhost:3000"] - allowed_methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"] - allowed_headers: ["Content-Type", "Authorization"] - -database: - type: "sqlite" - connection: "/app/data/experiments/fetch_ml.db" - max_connections: 20 - connection_timeout: "30s" - -redis: - url: "redis://redis:6379" - max_connections: 15 - connection_timeout: "10s" - -logging: - level: "info" - file: "/app/logs/app.log" - max_size: "100MB" - max_backups: 5 - compress: true - -resources: - max_workers: 3 - desired_rps_per_worker: 3 - podman_cpus: "2" - podman_memory: "4g" - job_timeout: "30m" - -monitoring: - enabled: true - metrics_path: "/metrics" - health_check_interval: "30s" diff --git 
a/configs/environments/config-prod.yaml b/configs/environments/config-prod.yaml deleted file mode 100644 index 8a5693c..0000000 --- a/configs/environments/config-prod.yaml +++ /dev/null @@ -1,59 +0,0 @@ -base_path: "./data/ml-experiments" - -auth: - enabled: true - apikeys: - homelab_user: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password" - admin: true - roles: ["admin"] - permissions: - read: true - write: true - delete: true - -server: - address: ":9101" - tls: - enabled: false # Disabled for local testing - cert_file: "./ssl/cert.pem" - key_file: "./ssl/key.pem" - min_version: "1.3" - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "localhost" - - "10.0.0.0/8" - - "192.168.0.0/16" - - "172.16.0.0/12" - failed_login_lockout: - enabled: true - max_attempts: 5 - lockout_duration: "15m" - -# SQLite database for production -database: - type: "sqlite" - connection: "data/fetch_ml.db" - -redis: - url: "redis://localhost:6379" - addr: "localhost:6379" - password: "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k=" - -logging: - level: "info" - file: "logs/fetch_ml.log" - audit_log: "logs/audit.log" - -resources: - max_workers: 2 - desired_rps_per_worker: 5 - podman_cpus: "8" - podman_memory: "32g" diff --git a/configs/examples/config-postgres.yaml b/configs/examples/config-postgres.yaml index 35cf082..6d11b7f 100644 --- a/configs/examples/config-postgres.yaml +++ b/configs/examples/config-postgres.yaml @@ -1,13 +1,17 @@ # Fetch ML Configuration Example for PostgreSQL # This example shows how to configure Fetch ML to use PostgreSQL as the database +base_path: "./data/experiments" + auth: enabled: true - apikeys: + api_keys: admin: hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password" admin: true roles: ["admin"] + permissions: + "*": true server: address: ":9101" @@ -25,40 +29,34 @@ database: # connection: 
"postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable" redis: - host: "localhost" - port: 6379 + addr: "localhost:6379" password: "" db: 0 - pool_size: 10 - max_retries: 3 logging: level: "info" - console: true - format: "text" + file: "" + audit_log: "" security: - secret_key: "your-secret-key-here-at-least-16-characters" - jwt_expiry: "24h" + production_mode: false rate_limit: enabled: false requests_per_minute: 60 burst_size: 10 + ip_whitelist: [] -containers: - runtime: "podman" - registry: "docker.io" - pull_policy: "missing" - resources: - cpu_limit: "2" - memory_limit: "4Gi" - gpu_limit: 1 - -storage: - data_path: "data" - results_path: "results" - temp_path: "/tmp/fetch_ml" - cleanup: +monitoring: + prometheus: enabled: true - max_age_hours: 168 - max_size_gb: 10 + port: 9101 + path: "/metrics" + health_checks: + enabled: true + interval: "30s" + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" diff --git a/configs/examples/config.yaml.example b/configs/examples/config.yaml.example index c467c5a..6928efd 100644 --- a/configs/examples/config.yaml.example +++ b/configs/examples/config.yaml.example @@ -1,6 +1,8 @@ # Fetch ML Configuration Example # Copy this file to config.yaml and customize for your environment +base_path: "./data/experiments" + auth: enabled: true api_keys: @@ -13,54 +15,43 @@ auth: "*": true server: - host: "localhost" - port: 8080 + address: ":9101" + tls: + enabled: false database: type: "sqlite" connection: "data/fetch_ml.db" - host: "" - port: 5432 - username: "" - password: "" - database: "fetch_ml" redis: - url: "redis://localhost:6379" - host: "localhost" - port: 6379 + addr: "localhost:6379" password: "" db: 0 - pool_size: 10 - max_retries: 3 logging: level: "info" file: "logs/fetch_ml.log" - format: "text" - console: true + audit_log: "logs/audit.log" security: - secret_key: "your-secret-key-at-least-16-chars" - jwt_expiry: "24h" rate_limit: enabled: false 
requests_per_minute: 60 + burst_size: 10 + ip_whitelist: [] + production_mode: false -containers: - runtime: "podman" - registry: "docker.io" - pull_policy: "missing" - resources: - cpu_limit: "2" - memory_limit: "4Gi" - gpu_limit: 1 - -storage: - data_path: "data" - results_path: "results" - temp_path: "/tmp/fetch_ml" - cleanup: +monitoring: + prometheus: enabled: true - max_age_hours: 168 - max_size_gb: 10 + port: 9101 + path: "/metrics" + health_checks: + enabled: true + interval: "30s" + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" diff --git a/configs/schema/api_server_config.yaml b/configs/schema/api_server_config.yaml index f9385fc..6b0c586 100644 --- a/configs/schema/api_server_config.yaml +++ b/configs/schema/api_server_config.yaml @@ -12,6 +12,10 @@ properties: type: string description: Base path for experiment data default: "/tmp/ml-experiments" + data_dir: + type: string + description: Data directory (datasets/snapshots) for integrity validation + default: "/data/active" auth: type: object additionalProperties: false @@ -40,7 +44,6 @@ properties: type: array items: type: string - enum: [admin, data_scientist, data_engineer, viewer, operator] permissions: type: object additionalProperties: @@ -64,9 +67,30 @@ properties: type: string key_file: type: string - min_version: + monitoring: + type: object + additionalProperties: false + properties: + prometheus: + type: object + additionalProperties: false + properties: + enabled: + type: boolean + port: + type: integer + minimum: 1 + maximum: 65535 + path: + type: string + health_checks: + type: object + additionalProperties: false + properties: + enabled: + type: boolean + interval: type: string - description: Minimum TLS version (e.g. 
"1.3") database: type: object additionalProperties: false @@ -99,58 +123,56 @@ properties: addr: type: string description: Optional host:port shorthand for Redis - host: - type: string - default: "localhost" - port: - type: integer - minimum: 1 - maximum: 65535 - default: 6379 password: type: string db: type: integer minimum: 0 default: 0 - pool_size: - type: integer - minimum: 1 - default: 10 - max_retries: - type: integer - minimum: 0 - default: 3 + queue: + type: object + additionalProperties: false + properties: + backend: + type: string + enum: [redis, sqlite] + default: redis + sqlite_path: + type: string logging: type: object additionalProperties: false properties: level: type: string - enum: [debug, info, warn, error, fatal] + enum: [debug, info, warn, error] default: "info" file: type: string audit_log: type: string - format: - type: string - enum: [text, json] - default: "text" - console: - type: boolean - default: true security: type: object additionalProperties: false properties: - secret_key: - type: string - minLength: 16 - jwt_expiry: - type: string - pattern: "^\\d+[smhd]$" - default: "24h" + production_mode: + type: boolean + default: false + allowed_origins: + type: array + items: + type: string + api_key_rotation_days: + type: integer + minimum: 0 + audit_logging: + type: object + additionalProperties: false + properties: + enabled: + type: boolean + log_path: + type: string ip_whitelist: type: array items: @@ -183,23 +205,23 @@ properties: minimum: 1 resources: type: object - description: Resource configuration defaults + description: Resource configuration additionalProperties: false properties: - cpu_limit: - type: string - description: Default CPU limit (e.g., "2" or "500m") - default: "2" - memory_limit: - type: string - description: Default memory limit (e.g., "1Gi" or "512Mi") - default: "4Gi" - gpu_limit: + max_workers: type: integer - description: Default GPU limit - minimum: 0 - default: 0 - disk_limit: + minimum: 1 + default: 1 + 
desired_rps_per_worker: + type: integer + minimum: 1 + requests_per_sec: + type: integer + minimum: 1 + podman_cpus: type: string - description: Default disk limit - default: "10Gi" + podman_memory: + type: string + request_burst: + type: integer + minimum: 0 diff --git a/configs/schema/config_schema.yaml b/configs/schema/config_schema.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/configs/schema/worker_config_schema.yaml b/configs/schema/worker_config_schema.yaml index b197b62..6ba48a8 100644 --- a/configs/schema/worker_config_schema.yaml +++ b/configs/schema/worker_config_schema.yaml @@ -2,10 +2,28 @@ $schema: "http://json-schema.org/draft-07/schema#" title: "Fetch ML Worker Configuration" type: object additionalProperties: false +allOf: + # forbid both index and UUID at once (allow zero or one) + - not: + required: [gpu_visible_devices, gpu_visible_device_ids] + - if: + properties: + queue: + properties: + backend: + const: sqlite + required: [queue] + then: + properties: + queue: + required: [sqlite_path] + else: + anyOf: + - required: [redis_addr] + - required: [redis_url] required: - base_path - worker_id - - redis_addr - podman_image - container_workspace - container_results @@ -31,6 +49,9 @@ properties: train_script: type: string description: Path to training script + redis_url: + type: string + description: Legacy Redis URL (if set, redis_addr/password/db are derived) redis_addr: type: string description: Redis server address @@ -42,6 +63,18 @@ properties: minimum: 0 default: 0 description: Redis database number + queue: + type: object + description: Queue backend configuration (optional; defaults to redis) + additionalProperties: false + properties: + backend: + type: string + enum: [redis, sqlite] + default: redis + sqlite_path: + type: string + description: Path to queue.db (sqlite backend only) known_hosts: type: string description: Path to SSH known hosts file @@ -116,6 +149,48 @@ properties: type: string description: Dataset cache 
TTL duration default: "30m" + snapshot_store: + type: object + description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id) + additionalProperties: false + properties: + enabled: + type: boolean + default: false + endpoint: + type: string + description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000") + secure: + type: boolean + default: true + region: + type: string + bucket: + type: string + prefix: + type: string + description: Object key prefix where snapshots are stored + access_key: + type: string + description: Optional static access key (otherwise uses env credentials) + secret_key: + type: string + description: Optional static secret key (otherwise uses env credentials) + session_token: + type: string + description: Optional session token for temporary credentials + timeout: + type: string + description: Duration string (e.g., "10m") + default: "10m" + max_retries: + type: integer + minimum: 0 + default: 3 + prewarm_enabled: + type: boolean + description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off. + default: false podman_image: type: string minLength: 1 @@ -126,10 +201,40 @@ properties: container_results: type: string description: Container results path - gpu_access: - type: boolean - default: false - description: Enable GPU access + gpu_devices: + type: array + description: GPU device paths to expose to the container (e.g. ["/dev/dri"]). + items: + type: string + gpu_vendor: + type: string + enum: [nvidia, amd, apple, none] + description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none). + default: "none" + gpu_visible_devices: + type: array + description: GPU indices to expose via vendor-specific env (e.g. [0,1]). + items: + type: integer + gpu_visible_device_ids: + type: array + description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices. 
+ items: + type: string + apple_gpu: + type: object + description: Apple M-series GPU configuration + additionalProperties: false + properties: + enabled: + type: boolean + default: false + metal_device: + type: string + description: Path to Metal device node (e.g. /dev/metal) + mps_runtime: + type: string + description: Path to MPS runtime device node (e.g. /dev/mps) task_lease_duration: type: string description: Task lease duration diff --git a/configs/workers/docker-dev.yaml b/configs/workers/docker-dev.yaml new file mode 100644 index 0000000..e688ccd --- /dev/null +++ b/configs/workers/docker-dev.yaml @@ -0,0 +1,58 @@ +worker_id: "docker-worker" +base_path: "/data/experiments" +train_script: "train.py" + +redis_url: "redis://redis:6379/0" + +local_mode: true + +prewarm_enabled: true + +max_workers: 1 +poll_interval_seconds: 2 + +auto_fetch_data: false + +data_manager_path: "./data_manager" +dataset_cache_ttl: "30m" + +data_dir: "/data/active" + +snapshot_store: + enabled: true + endpoint: "minio:9000" + secure: false + bucket: "fetchml-snapshots" + prefix: "snapshots" + timeout: "2m" + max_retries: 3 + +podman_image: "python:3.9-slim" +container_workspace: "/workspace" +container_results: "/results" +gpu_devices: + - "/dev/dri" +gpu_vendor: "apple" +gpu_visible_devices: [] + +# Apple M-series GPU configuration +apple_gpu: + enabled: true + metal_device: "/dev/metal" + mps_runtime: "/dev/mps" + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" + +metrics: + enabled: true + listen_addr: ":9100" +metrics_flush_interval: "500ms" + +task_lease_duration: "30m" +heartbeat_interval: "1m" +max_retries: 3 +graceful_timeout: "5m" diff --git a/configs/workers/docker-prod.yaml b/configs/workers/docker-prod.yaml new file mode 100644 index 0000000..684fd0e --- /dev/null +++ b/configs/workers/docker-prod.yaml @@ -0,0 +1,50 @@ +worker_id: "docker-worker" +base_path: "/tmp/fetchml-jobs" +train_script: "train.py" + +redis_url: 
"redis://redis:6379/0" + +local_mode: true + +max_workers: 1 +poll_interval_seconds: 2 + +auto_fetch_data: false + +data_manager_path: "./data_manager" +dataset_cache_ttl: "30m" + +data_dir: "/data/active" + +snapshot_store: + enabled: true + endpoint: "minio:9000" + secure: false + bucket: "fetchml-snapshots" + prefix: "snapshots" + timeout: "5m" + max_retries: 3 + +podman_image: "python:3.9-slim" +container_workspace: "/workspace" +container_results: "/results" +gpu_vendor: "nvidia" +gpu_visible_devices: [0] +gpu_devices: ["/dev/nvidia0"] + + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" + +metrics: + enabled: true + listen_addr: ":9100" +metrics_flush_interval: "500ms" + +task_lease_duration: "30m" +heartbeat_interval: "1m" +max_retries: 3 +graceful_timeout: "5m" diff --git a/configs/workers/docker.yaml b/configs/workers/docker.yaml new file mode 100644 index 0000000..15cb93f --- /dev/null +++ b/configs/workers/docker.yaml @@ -0,0 +1,43 @@ +worker_id: "docker-worker" +base_path: "/tmp/fetchml-jobs" +train_script: "train.py" + +redis_addr: "redis:6379" +redis_password: "" +redis_db: 0 + +local_mode: true + +max_workers: 1 +poll_interval_seconds: 5 + +auto_fetch_data: false + +data_manager_path: "./data_manager" +dataset_cache_ttl: "30m" + +snapshot_store: + enabled: false + +podman_image: "python:3.9-slim" +container_workspace: "/workspace" +container_results: "/results" +gpu_devices: [] +gpu_vendor: "none" +gpu_visible_devices: [] + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" + +metrics: + enabled: true + listen_addr: ":9100" +metrics_flush_interval: "500ms" + +task_lease_duration: "30m" +heartbeat_interval: "1m" +max_retries: 3 +graceful_timeout: "5m" diff --git a/configs/workers/examples/prewarm-worker.yaml b/configs/workers/examples/prewarm-worker.yaml new file mode 100644 index 0000000..2d8c8b4 --- /dev/null +++ 
b/configs/workers/examples/prewarm-worker.yaml @@ -0,0 +1,27 @@ +worker_id: "test-prewarm-worker" +host: "localhost" +port: 8081 +base_path: "/tmp/fetch-ml-test" +data_dir: "/tmp/fetch-ml-test/data" +max_workers: 2 +local_mode: true +auto_fetch_data: true +prewarm_enabled: true +metrics: + enabled: true + listen_addr: ":9102" +train_script: "train.py" +snapshot_store: + enabled: false + endpoint: "" + secure: false + region: "" + bucket: "" + prefix: "" + access_key: "" + secret_key: "" + session_token: "" + max_retries: 3 + timeout: "0s" +gpu_devices: [] +gpu_vendor: "none" diff --git a/configs/workers/homelab-secure.yaml b/configs/workers/homelab-secure.yaml new file mode 100644 index 0000000..e03445c --- /dev/null +++ b/configs/workers/homelab-secure.yaml @@ -0,0 +1,47 @@ +worker_id: "homelab-worker" +base_path: "/tmp/fetchml-jobs" +train_script: "train.py" + +redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0" + +local_mode: true + +max_workers: 1 +poll_interval_seconds: 2 + +auto_fetch_data: false + +data_manager_path: "./data_manager" +dataset_cache_ttl: "30m" + +data_dir: "/data/active" + +snapshot_store: + enabled: true + endpoint: "minio:9000" + secure: false + bucket: "fetchml-snapshots" + prefix: "snapshots" + timeout: "5m" + max_retries: 3 + +podman_image: "python:3.9-slim" +container_workspace: "/workspace" +container_results: "/results" +gpu_devices: [] + +resources: + max_workers: 1 + desired_rps_per_worker: 2 + podman_cpus: "2" + podman_memory: "4Gi" + +metrics: + enabled: true + listen_addr: ":9100" +metrics_flush_interval: "500ms" + +task_lease_duration: "30m" +heartbeat_interval: "1m" +max_retries: 3 +graceful_timeout: "5m" diff --git a/configs/workers/worker-docker.yaml b/configs/workers/worker-docker.yaml deleted file mode 100644 index 64f3936..0000000 --- a/configs/workers/worker-docker.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Worker configuration for Docker production-like testing -worker_id: "docker-test-worker-1" - -# Redis configuration
-redis: - url: "redis://redis:6379" - max_connections: 10 - -# Local mode settings -local_mode: false # Use Podman for containerized job execution - -# Job paths -base_path: "/tmp/fetchml-jobs" - -# Container workspace (not used in local mode) -container_workspace: "/workspace" -container_results: "/results" - -# Podman settings (not used in local mode) -podman_image: "python:3.9-slim" -podman_cpus: "2" -podman_memory: "4g" - -# Worker configuration -heartbeat_interval: "30s" -lease_duration: "5m" -max_concurrent_tasks: 1 - -# Data manager settings -data_manager: - enabled: false - base_path: "/data" - -# SSH settings for Podman communication -ssh: - enabled: true - host: "localhost" - port: 2222 - user: "worker" - password: "SecureWorkerPass2024!" - key_path: "/home/worker/.ssh/id_rsa" - -# Logging -logging: - level: "info" - file: "/logs/worker.log" - -# Metrics -metrics: - enabled: true - endpoint: ":9100" diff --git a/configs/workers/worker-homelab-secure.yaml b/configs/workers/worker-homelab-secure.yaml deleted file mode 100644 index ccc3877..0000000 --- a/configs/workers/worker-homelab-secure.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# Worker configuration for Homelab secure environment -worker_id: "homelab-secure-worker-1" - -# Redis configuration with connection pooling -redis: - url: "redis://redis:6379" - max_connections: 10 - connection_timeout: "10s" - read_timeout: "5s" - write_timeout: "5s" - -# Local mode disabled for containerized execution -local_mode: false - -# Job paths with security considerations -base_path: "/tmp/fetchml-jobs" -container_workspace: "/workspace" -container_results: "/results" - -# Podman settings with resource limits -podman_image: "python:3.11-slim" -podman_cpus: "2" -podman_memory: "4g" -podman_network: "ml-job-network" -podman_timeout: "30m" - -# Worker configuration with security -heartbeat_interval: "30s" -lease_duration: "5m" -max_concurrent_tasks: 2 -task_timeout: "30m" - -# Data manager settings -data_manager: - enabled: 
true - base_path: "/data" - encryption_enabled: true - backup_enabled: true - -# SSH settings with secure configuration -ssh: - enabled: true - host: "localhost" - port: 2222 - user: "worker" - password: "HomelabWorker2024!" - key_path: "/home/worker/.ssh/id_rsa" - max_retries: 3 - connection_timeout: "30s" - strict_host_key_checking: false - -# Logging with rotation and security -logging: - level: "info" - file: "/logs/worker.log" - max_size: "50MB" - max_backups: 5 - compress: true - audit_enabled: true - -# Metrics and monitoring -metrics: - enabled: true - endpoint: ":9100" - path: "/metrics" - -# Security settings -security: - enable_job_isolation: true - sandbox_enabled: true - resource_monitoring: true - audit_commands: true - -# Health check configuration -health_check: - enabled: true - interval: "30s" - timeout: "10s" - failure_threshold: 3 diff --git a/configs/workers/worker-prod.toml b/configs/workers/worker-prod.toml index 62c5305..bc5f208 100644 --- a/configs/workers/worker-prod.toml +++ b/configs/workers/worker-prod.toml @@ -4,7 +4,7 @@ max_workers = 4 # Redis connection redis_addr = "localhost:6379" -redis_password = "your-redis-password" +redis_password = "CHANGE_ME_REDIS_PASSWORD" redis_db = 0 # SSH connection (for remote operations) @@ -15,17 +15,13 @@ ssh_key = "~/.ssh/id_rsa" # Podman configuration podman_image = "ml-training:latest" -gpu_access = true +gpu_vendor = "none" +gpu_visible_devices = [] +gpu_devices = [] container_workspace = "/workspace" container_results = "/results" train_script = "train.py" -[resources] -max_workers = 4 -desired_rps_per_worker = 2 -podman_cpus = "4" -podman_memory = "16g" - # Dataset management auto_fetch_data = true data_dir = "/data/datasets" @@ -36,10 +32,16 @@ dataset_cache_ttl = "24h" task_lease_duration = "1h" heartbeat_interval = "30s" graceful_timeout = "5m" -poll_interval = "100ms" +poll_interval_seconds = 1 metrics_flush_interval = "10s" +[resources] +max_workers = 4 +desired_rps_per_worker = 2 
+podman_cpus = "4" +podman_memory = "16g" + # Metrics exporter [metrics] enabled = true -listen_addr = ":9090" +listen_addr = ":9100" diff --git a/deployments/Caddyfile.dev b/deployments/Caddyfile.dev new file mode 100644 index 0000000..a84013b --- /dev/null +++ b/deployments/Caddyfile.dev @@ -0,0 +1,45 @@ +{ + auto_https off + admin off + servers { + protocols h1 h2 + } +} + +http://localhost { + handle /health { + reverse_proxy api-server:9101 + } + + handle /ws* { + reverse_proxy api-server:9101 + } + + handle /api/* { + reverse_proxy api-server:9101 + } + + handle { + respond 404 + } +} + +https://localhost { + tls internal + + handle /health { + reverse_proxy api-server:9101 + } + + handle /ws* { + reverse_proxy api-server:9101 + } + + handle /api/* { + reverse_proxy api-server:9101 + } + + handle { + respond 404 + } +} diff --git a/deployments/Caddyfile.homelab-secure b/deployments/Caddyfile.homelab-secure new file mode 100644 index 0000000..5268806 --- /dev/null +++ b/deployments/Caddyfile.homelab-secure @@ -0,0 +1,44 @@ +{ + admin off + servers { + protocols h1 h2 + } +} + +{$FETCHML_DOMAIN} { + encode gzip + + tls /etc/caddy/ssl/cert.pem /etc/caddy/ssl/key.pem + + header { + -Server + X-Frame-Options "DENY" + X-Content-Type-Options "nosniff" + Referrer-Policy "strict-origin-when-cross-origin" + Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" + Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'" + } + + @admin path /admin/* + @admin_private remote_ip private_ranges + handle @admin { + respond @admin_private 404 + respond 404 + } + + handle /health { + reverse_proxy api-server:9101 + } + + handle /ws* { + reverse_proxy api-server:9101 + } + + handle /api/* { + reverse_proxy api-server:9101 + } + + handle { + respond 404 + } +} diff --git a/deployments/Caddyfile.prod b/deployments/Caddyfile.prod new file mode 100644 index 0000000..e9d3e4f --- /dev/null +++ b/deployments/Caddyfile.prod @@ -0,0 +1,47 @@ 
+{ + email {$CADDY_EMAIL} + admin off + servers { + protocols h1 h2 + } +} + +{$FETCHML_DOMAIN} { + encode gzip + + request_body { + max_size 10MB + } + + header { + -Server + X-Frame-Options "DENY" + X-Content-Type-Options "nosniff" + Referrer-Policy "strict-origin-when-cross-origin" + Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" + Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'" + } + + @admin path /admin/* + @admin_private remote_ip private_ranges + handle @admin { + respond @admin_private 404 + respond 404 + } + + handle /health { + reverse_proxy api-server:9101 + } + + handle /ws* { + reverse_proxy api-server:9101 + } + + handle /api/* { + reverse_proxy api-server:9101 + } + + handle { + respond 404 + } +} diff --git a/deployments/Caddyfile.smoke b/deployments/Caddyfile.smoke new file mode 100644 index 0000000..a1492d3 --- /dev/null +++ b/deployments/Caddyfile.smoke @@ -0,0 +1,23 @@ +{ + auto_https off +} + +localhost { + tls internal + + handle /health { + reverse_proxy api-server:9101 + } + + handle /ws* { + reverse_proxy api-server:9101 + } + + handle /api/* { + reverse_proxy api-server:9101 + } + + handle { + respond 404 + } +} diff --git a/deployments/Makefile b/deployments/Makefile new file mode 100644 index 0000000..f684d96 --- /dev/null +++ b/deployments/Makefile @@ -0,0 +1,76 @@ +# Docker Compose Deployment Management +.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean + +# Default target +help: ## Show this help message + @echo "Available commands:" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' + +# Development environment +dev-up: ## Start development environment + @echo "Starting development environment..." 
+ docker-compose -f deployments/docker-compose.dev.yml up -d + @echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)" + +dev-down: ## Stop development environment + @echo "Stopping development environment..." + docker-compose -f deployments/docker-compose.dev.yml down + +dev-logs: ## Show development logs + docker-compose -f deployments/docker-compose.dev.yml logs -f + +dev-restart: ## Restart development environment + @echo "Restarting development environment..." + docker-compose -f deployments/docker-compose.dev.yml restart + + +# Homelab environment +homelab-secure-up: ## Start secure homelab environment + @echo "Starting secure homelab environment..." + docker-compose -f deployments/docker-compose.homelab-secure.yml up -d + +homelab-secure-down: ## Stop secure homelab environment + @echo "Stopping secure homelab environment..." + docker-compose -f deployments/docker-compose.homelab-secure.yml down + +# Production environment +prod-up: ## Start production environment + @echo "Starting production environment..." + docker-compose -f deployments/docker-compose.prod.yml up -d + +prod-down: ## Stop production environment + @echo "Stopping production environment..." 
+ docker-compose -f deployments/docker-compose.prod.yml down + +# Utility commands +status: ## Show status of all environments + @echo "=== Development Status ===" + @if [ -f deployments/docker-compose.dev.yml ]; then \ + docker-compose -f deployments/docker-compose.dev.yml ps; \ + fi + @echo "" + @echo "=== Homelab Secure Status ===" + @if [ -f deployments/docker-compose.homelab-secure.yml ]; then \ + docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \ + fi + @echo "" + @echo "=== Production Status ===" + @if [ -f deployments/docker-compose.prod.yml ]; then \ + docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \ + fi + +clean: ## Clean up all containers and volumes + @echo "Cleaning up all Docker resources..." + @echo "This will remove all containers and volumes. Continue? [y/N]" + @read -r confirm && [ "$$confirm" = "y" ] || exit 1 + docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true + docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true + docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true + docker system prune -f + @echo "Cleanup complete." + +# Quick aliases +up: dev-up ## Alias for dev-up +down: dev-down ## Alias for dev-down +logs: dev-logs ## Alias for dev-logs +restart: dev-restart ## Alias for dev-restart diff --git a/deployments/README.md b/deployments/README.md index e30dda0..26a5ba0 100644 --- a/deployments/README.md +++ b/deployments/README.md @@ -2,33 +2,123 @@ This directory contains Docker Compose configurations for different deployment environments. 
-## Files +## Environment Configurations -- `docker-compose.homelab-secure.yml` - Secure homelab deployment with TLS and authentication -- `docker-compose.prod.yml` - Production deployment configuration +### Development (`docker-compose.dev.yml`) +- Full development stack with monitoring +- Includes: API, Worker, Redis, MinIO (snapshots), Prometheus, Grafana, Loki, Promtail +- Optimized for local development and testing +- **Usage**: `docker-compose -f deployments/docker-compose.dev.yml up -d` -## Usage +### Homelab - Secure (`docker-compose.homelab-secure.yml`) +- Secure homelab deployment with authentication and a Caddy reverse proxy +- TLS is terminated at the reverse proxy (Approach A) +- Includes: API, Redis (password protected), Caddy reverse proxy +- **Usage**: `docker-compose -f deployments/docker-compose.homelab-secure.yml up -d` + +### Production (`docker-compose.prod.yml`) +- Production deployment configuration +- Optimized for performance and security +- External services assumed (Redis, monitoring) +- **Usage**: `docker-compose -f deployments/docker-compose.prod.yml up -d` + +Note: `docker-compose.prod.yml` is a reproducible staging/testing harness. Real production deployments do not require Docker; you can run the Go services directly (systemd) and use Caddy for TLS/WSS termination. + +## TLS / WSS Policy + +- The Zig CLI currently supports `ws://` only (native `wss://` is not implemented). +- Production deployments terminate TLS/WSS at a reverse proxy (Caddy in `docker-compose.prod.yml`) and keep the API server on internal `ws://`. +- Homelab deployments terminate TLS/WSS at a reverse proxy (Caddy) and keep the API server on internal `ws://`. +- Health checks in compose files should use `http://localhost:9101/health` when `server.tls.enabled: false`. + +## Required Volume Mounts + +- `base_path` (experiments) must be writable by the API server. +- `data_dir` should be mounted if you want snapshot/dataset integrity validation via `ml validate`. 
+ +For the default configs: + +- `base_path`: `/data/experiments` (dev/homelab configs) or `/app/data/experiments` (prod configs) +- `data_dir`: `/data/active` + +## Quick Start -### Development ```bash -# Use the main docker-compose.yml in project root -docker-compose up -d +# Development (most common) +docker-compose -f deployments/docker-compose.dev.yml up -d + +# Check status +docker-compose -f deployments/docker-compose.dev.yml ps + +# View logs +docker-compose -f deployments/docker-compose.dev.yml logs -f api-server + +# Stop services +docker-compose -f deployments/docker-compose.dev.yml down ``` -### Homelab (Secure) -```bash -docker-compose -f deployments/docker-compose.homelab-secure.yml up -d -``` +## Dev: MinIO-backed snapshots (smoke test) -### Production -```bash -docker-compose -f deployments/docker-compose.prod.yml up -d -``` +The dev compose file provisions a MinIO bucket and uploads a small example snapshot object at: + +`s3://fetchml-snapshots/snapshots/snap-1.tar.gz` + +To queue a task that forces the worker to pull the snapshot from MinIO: + +1. Start the dev stack: + `docker-compose -f deployments/docker-compose.dev.yml up -d` + +2. Read the `snapshot_sha256` printed by the init job: + `docker-compose -f deployments/docker-compose.dev.yml logs minio-init` + +3. Queue a job using the snapshot fields: + `ml queue --snapshot-id snap-1 --snapshot-sha256 <sha256>` + +## Smoke tests + +- `make dev-smoke` runs the development stack smoke test. + - `make prod-smoke` runs a Docker-based staging smoke test for the production stack, using a localhost-only Caddy configuration. + + Note: `ml queue` by itself will generate a random commit ID. For full provenance enforcement (manifest + dependency manifest), use `ml sync ./your-project --queue` so the server has real code + dependency files.
+ + Examples: + - `ml queue train-mnist --priority 3 --snapshot-id snap-1 --snapshot-sha256 <sha256>` + - `ml queue train-a train-b train-c --priority 5 --snapshot-id snap-1 --snapshot-sha256 <sha256>` ## Environment Variables -Each deployment may require specific environment variables. Refer to the individual compose files for requirements. +Create a `.env` file in the project root: + +```bash +# Grafana +GRAFANA_ADMIN_PASSWORD=your_secure_password + +# API Configuration +LOG_LEVEL=info + +# TLS (for secure deployments) +TLS_CERT_PATH=/app/ssl/cert.pem +TLS_KEY_PATH=/app/ssl/key.pem +``` + +## Service Ports + +| Service | Development | Homelab | Production | +|---------|-------------|---------|------------| +| API Server | 9101 | 9101 | 9101 | +| Redis | 6379 | 6379 | - | +| Prometheus | 9090 | - | - | +| Grafana | 3000 | - | - | +| Loki | 3100 | - | - | ## Monitoring -Performance monitoring configurations are in `monitoring/docker-compose.performance.yml` +- **Development**: Full monitoring stack included +- **Homelab**: Basic monitoring (configurable) +- **Production**: External monitoring assumed + +## Security Notes + +- If you need HTTPS externally, terminate TLS at a reverse proxy.
+- API keys should be managed via environment variables +- Database credentials should use secrets management in production diff --git a/deployments/deploy.sh b/deployments/deploy.sh new file mode 100755 index 0000000..6a5aecd --- /dev/null +++ b/deployments/deploy.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# Quick deployment script for fetch_ml + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to show usage +show_usage() { + echo "Usage: $0 [ENVIRONMENT] [ACTION]" + echo "" + echo "Environments:" + echo " dev Development environment" + echo " secure Secure homelab environment" + echo " prod Production environment" + echo "" + echo "Actions:" + echo " up Start services" + echo " down Stop services" + echo " restart Restart services" + echo " logs Show logs" + echo " status Show status" + echo "" + echo "Examples:" + echo " $0 dev up # Start development environment" + echo " $0 prod down # Stop production environment" + echo " $0 secure logs # Show secure environment logs" +} + +# Function to check if docker-compose file exists +check_compose_file() { + local env=$1 + local compose_file="" + + case $env in + "dev") + compose_file="deployments/docker-compose.dev.yml" + ;; + "secure") + compose_file="deployments/docker-compose.homelab-secure.yml" + ;; + "prod") + compose_file="deployments/docker-compose.prod.yml" + ;; + *) + print_error "Unknown environment: $env" + show_usage + exit 1 + ;; + esac + + if [ ! 
-f "$compose_file" ]; then + print_error "Docker Compose file not found: $compose_file" + exit 1 + fi + + echo "$compose_file" +} + +# Function to check if .env file exists +check_env_file() { + local env=$1 + + if [ ! -f ".env" ]; then + print_warning ".env file not found. Creating from example..." + if [ "$env" = "dev" ]; then + cp deployments/env.dev.example .env + elif [ "$env" = "prod" ]; then + cp deployments/env.prod.example .env + else + cp deployments/env.dev.example .env + fi + print_warning "Please edit .env file with your configuration" + fi +} + +# Main script +main() { + if [ $# -ne 2 ]; then + show_usage + exit 1 + fi + + local environment=$1 + local action=$2 + + print_status "Environment: $environment" + print_status "Action: $action" + + # Check compose file + compose_file=$(check_compose_file "$environment") + print_status "Using: $compose_file" + + # Check .env file + check_env_file "$environment" + + # Execute action + case $action in + "up") + print_status "Starting $environment environment..." + docker-compose -f "$compose_file" up -d + print_success "$environment environment started successfully!" + + # Show service URLs + echo "" + print_status "Service URLs:" + echo " API Server: http://localhost:9101" + if [ "$environment" = "dev" ]; then + echo " Grafana: http://localhost:3000 (admin/admin123)" + echo " Prometheus: http://localhost:9090" + fi + ;; + "down") + print_status "Stopping $environment environment..." + docker-compose -f "$compose_file" down + print_success "$environment environment stopped successfully!" + ;; + "restart") + print_status "Restarting $environment environment..." + docker-compose -f "$compose_file" restart + print_success "$environment environment restarted successfully!" + ;; + "logs") + print_status "Showing logs for $environment environment..." 
+ docker-compose -f "$compose_file" logs -f + ;; + "status") + print_status "Status of $environment environment:" + docker-compose -f "$compose_file" ps + ;; + *) + print_error "Unknown action: $action" + show_usage + exit 1 + ;; + esac +} + +# Run main function +main "$@" diff --git a/deployments/docker-compose.dev.yml b/deployments/docker-compose.dev.yml new file mode 100644 index 0000000..8f3185f --- /dev/null +++ b/deployments/docker-compose.dev.yml @@ -0,0 +1,225 @@ +# Homelab Docker Compose with Centralized Monitoring +# Includes: API, Redis, Prometheus, Grafana, Loki + +services: + caddy: + image: caddy:2-alpine + container_name: ml-dev-caddy + restart: unless-stopped + ports: + - "8080:80" + - "8443:443" + volumes: + - ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro + - ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data + - ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config + depends_on: + api-server: + condition: service_healthy + + redis: + image: redis:7-alpine + container_name: ml-experiments-redis + user: "999:999" + ports: + - "6379:6379" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data + restart: unless-stopped + command: redis-server --appendonly yes + healthcheck: + test: [ "CMD", "redis-cli", "ping" ] + interval: 30s + timeout: 10s + retries: 3 + + api-server: + build: + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile + container_name: ml-experiments-api + user: "0:0" + expose: + - "9101" # API and health endpoints (internal; external access via Caddy) + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated + - ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml + - ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl + depends_on: + - redis + 
restart: unless-stopped + command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"] + environment: + - LOG_LEVEL=info + healthcheck: + test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + labels: + logging: "promtail" + job: "api-server" + + minio: + image: minio/minio:latest + container_name: ml-experiments-minio + ports: + - "9000:9000" + - "9001:9001" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data + environment: + - MINIO_ROOT_USER=minioadmin + - MINIO_ROOT_PASSWORD=minioadmin123 + command: ["server", "/data", "--console-address", ":9001"] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 10s + timeout: 5s + retries: 10 + restart: unless-stopped + + minio-init: + image: alpine:3.19 + container_name: ml-experiments-minio-init + depends_on: + minio: + condition: service_healthy + entrypoint: ["/bin/sh", "-c"] + command: + - | + set -eu + apk add --no-cache ca-certificates curl tar gzip + ARCH=$$(uname -m) + MC_ARCH=amd64 + if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then + MC_ARCH=arm64 + fi + curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc" + chmod +x /usr/local/bin/mc + i=0 + while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do + i=$$((i+1)) + if [ $$i -ge 30 ]; then + echo "minio not ready after 30 attempts" >&2 + exit 1 + fi + echo "waiting for minio... ($$i/30)" + sleep 1 + done + mc mb -p local/fetchml-snapshots || true + mkdir -p /tmp/snapshots/snap-1 + echo -n "hello" > /tmp/snapshots/snap-1/hello.txt + tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz . 
+ mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz + FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1) + SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1) + echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA" + restart: "no" + + worker: + build: + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile + container_name: ml-experiments-worker + user: "0:0" + ports: + - "8888:8888" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated + - ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml + - /sys/fs/cgroup:/sys/fs/cgroup:rw + depends_on: + redis: + condition: service_healthy + api-server: + condition: service_healthy + minio-init: + condition: service_completed_successfully + restart: unless-stopped + environment: + - LOG_LEVEL=info + - MINIO_ROOT_USER=minioadmin + - MINIO_ROOT_PASSWORD=minioadmin123 + - FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest + - FETCHML_JUPYTER_CONDA_ENV=base + - FETCHML_JUPYTER_KERNEL_NAME=python + - FETCHML_PODMAN_CGROUPS=disabled + privileged: true + command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"] + + # Prometheus - Metrics collection + prometheus: + image: prom/prometheus:latest + container_name: ml-experiments-prometheus + ports: + - "9090:9090" + volumes: + - ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + restart: unless-stopped + 
+ # Grafana - Visualization + grafana: + image: grafana/grafana:latest + container_name: ml-experiments-grafana + ports: + - "3000:3000" + volumes: + - grafana_data:/var/lib/grafana + - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning + - ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin123 + - GF_USERS_ALLOW_SIGN_UP=false + restart: unless-stopped + depends_on: + - prometheus + - loki + + # Loki - Log aggregation + loki: + image: grafana/loki:latest + container_name: ml-experiments-loki + ports: + - "3100:3100" + volumes: + - ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml + - loki_data:/loki + command: -config.file=/etc/loki/local-config.yaml + restart: unless-stopped + + # Promtail - Log collector + promtail: + image: grafana/promtail:latest + container_name: ml-experiments-promtail + volumes: + - ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml + - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock + command: -config.file=/etc/promtail/config.yml + restart: unless-stopped + depends_on: + - loki + +volumes: + prometheus_data: + driver: local + grafana_data: + driver: local + loki_data: + driver: local diff --git a/deployments/docker-compose.homelab-secure.yml b/deployments/docker-compose.homelab-secure.yml index 52a2eb4..4332c51 100644 --- a/deployments/docker-compose.homelab-secure.yml +++ b/deployments/docker-compose.homelab-secure.yml @@ -1,104 +1,152 @@ -# Homelab Secure Docker Environment -services: - redis: - image: redis:7-alpine - container_name: ml-homelab-redis - ports: - - "6379:6379" - volumes: - - redis_homelab_data:/data - restart: unless-stopped - command: > - redis-server - --appendonly yes - --requirepass "HomelabRedis2024!" 
- --maxmemory 512mb - --maxmemory-policy allkeys-lru - healthcheck: - test: ["CMD", "redis-cli", "-a", "HomelabRedis2024!", "ping"] - interval: 30s - timeout: 10s - retries: 3 - networks: - - ml-homelab-network +# Secure Homelab Docker Compose Configuration +# Use with: docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d +services: api-server: build: - context: . - dockerfile: build/docker/homelab-secure.Dockerfile - container_name: ml-homelab-api + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile + container_name: ml-experiments-api ports: - - "9104:9101" # API server port - - "2223:2222" # Secure SSH port - - "9101:9100" # Prometheus metrics + - "9101:9101" + - "9100:9100" # Prometheus metrics endpoint volumes: - - ./data:/app/data/experiments - - ./logs:/logs - - ./configs/config-homelab-secure.yaml:/app/configs/config.yaml + - ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl:ro + - ${FETCHML_REPO_ROOT:-.}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro + - ${FETCHML_REPO_ROOT:-.}/.env.secure:/app/.env.secure:ro depends_on: redis: condition: service_healthy restart: unless-stopped environment: - - REDIS_URL=redis://:HomelabRedis2024!@redis:6379 - LOG_LEVEL=info - - TZ=America/New_York + # Load secure environment variables + - JWT_SECRET_FILE=/app/.env.secure healthcheck: - test: ["CMD", "curl", "-k", "-f", "https://localhost:9101/health"] + test: ["CMD", "curl", "-f", "http://localhost:9101/health"] interval: 30s timeout: 10s retries: 3 start_period: 40s - command: > - sh -c " - sudo /app/start-security.sh & - /usr/local/bin/api-server -config /app/configs/config.yaml - " + labels: + logging: "promtail" + job: "api-server" + command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets 
/data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"] networks: - - ml-homelab-network + - ml-experiments-network + # Add internal network for secure communication + - ml-backend-network + + minio: + image: minio/minio:latest + container_name: ml-experiments-minio + ports: + - "9000:9000" + - "9001:9001" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/homelab/minio:/data + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123} + command: ["server", "/data", "--console-address", ":9001"] + restart: unless-stopped + networks: + - ml-backend-network + + minio-init: + image: alpine:3.19 + container_name: ml-experiments-minio-init + depends_on: + - minio + entrypoint: ["/bin/sh", "-c"] + command: + - | + apk add --no-cache ca-certificates curl >/dev/null + curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc + chmod +x /usr/local/bin/mc + mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123} + mc mb -p local/fetchml-snapshots || true + restart: "no" + networks: + - ml-backend-network worker: build: - context: . 
- dockerfile: build/docker/homelab-secure.Dockerfile - container_name: ml-homelab-worker + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile + container_name: ml-experiments-worker volumes: - - ./data:/app/data/experiments - - ./logs:/logs - - ./configs/worker-homelab-secure.yaml:/app/configs/worker.yaml + - ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/app/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/configs/workers/homelab-secure.yaml:/app/configs/worker.yaml depends_on: redis: condition: service_healthy api-server: condition: service_healthy + minio-init: + condition: service_started restart: unless-stopped environment: - - REDIS_URL=redis://:HomelabRedis2024!@redis:6379 - LOG_LEVEL=info - - TZ=America/New_York - privileged: true # Required for Podman - security_opt: - - no-new-privileges:true - cap_drop: - - ALL - cap_add: - - NET_ADMIN - - SYS_ADMIN - command: > - sh -c " - sudo /app/start-security.sh & - /usr/local/bin/worker -config /app/configs/worker.yaml - " + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123} + - REDIS_PASSWORD=${REDIS_PASSWORD} + privileged: true + command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"] networks: - - ml-homelab-network + - ml-backend-network -volumes: - redis_homelab_data: - driver: local + caddy: + image: caddy:2-alpine + container_name: ml-experiments-caddy + restart: unless-stopped + ports: + - "80:80" + - "443:443" + volumes: + - ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro + - ${FETCHML_REPO_ROOT:-.}/ssl:/etc/caddy/ssl:ro + - ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/data:/data + - ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/config:/config + environment: + - FETCHML_DOMAIN=${FETCHML_DOMAIN:-ml.local} + depends_on: + api-server: + 
condition: service_healthy + networks: + - ml-experiments-network + + # Redis with authentication + redis: + image: redis:7-alpine + container_name: ml-experiments-redis + user: "999:999" + ports: + - "127.0.0.1:6379:6379" # Bind to localhost only + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/homelab/redis:/data + - ${FETCHML_REPO_ROOT:-.}/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro + restart: unless-stopped + command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD} + healthcheck: + test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"] + interval: 30s + timeout: 10s + retries: 3 + networks: + - ml-backend-network + environment: + - REDIS_PASSWORD=${REDIS_PASSWORD} + +volumes: {} networks: - ml-homelab-network: + ml-experiments-network: + driver: bridge + ml-backend-network: driver: bridge - ipam: - config: - - subnet: 172.25.0.0/16 diff --git a/deployments/docker-compose.prod.smoke.yml b/deployments/docker-compose.prod.smoke.yml new file mode 100644 index 0000000..134af2d --- /dev/null +++ b/deployments/docker-compose.prod.smoke.yml @@ -0,0 +1,75 @@ +services: + caddy: + image: caddy:2-alpine + environment: + - FETCHML_DOMAIN=localhost + - CADDY_EMAIL=smoke@example.invalid + ports: + - "8080:80" + - "8443:443" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/data:/data + - ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/config:/config + command: + - /bin/sh + - -c + - | + cat > /etc/caddy/Caddyfile <<'EOF' + { + debug + servers { + protocols h1 h2 + } + } + + https://localhost { + tls internal { + protocols tls1.2 tls1.3 + } + + handle { + reverse_proxy api-server:9101 + } + } + EOF + exec caddy run --config /etc/caddy/Caddyfile + + redis: + image: redis:7-alpine + user: "999:999" + restart: unless-stopped + expose: + - "6379" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/redis:/data + command: redis-server --appendonly yes + healthcheck: + test: [ "CMD", "redis-cli", "ping" ] 
+ interval: 10s + timeout: 5s + retries: 10 + + api-server: + build: + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile + user: "0:0" + restart: unless-stopped + expose: + - "9101" + depends_on: + redis: + condition: service_healthy + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/experiments:/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro + command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"] + healthcheck: + test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ] + interval: 10s + timeout: 5s + retries: 10 + +volumes: {} diff --git a/deployments/docker-compose.prod.yml b/deployments/docker-compose.prod.yml index aac50d0..61fbf91 100644 --- a/deployments/docker-compose.prod.yml +++ b/deployments/docker-compose.prod.yml @@ -1,12 +1,31 @@ # Full Production Docker Environment with Podman and SQLite services: + caddy: + image: caddy:2-alpine + container_name: ml-prod-caddy + restart: unless-stopped + ports: + - "80:80" + - "443:443" + volumes: + - ./Caddyfile.prod:/etc/caddy/Caddyfile:ro + - ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data + - ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config + environment: + - FETCHML_DOMAIN=${FETCHML_DOMAIN} + - CADDY_EMAIL=${CADDY_EMAIL} + depends_on: + api-server: + condition: service_healthy + redis: image: redis:7-alpine container_name: ml-prod-redis - ports: - - "6379:6379" + user: "999:999" + expose: + - "6379" volumes: - - redis_prod_data:/data + - ${FETCHML_REPO_ROOT:-.}/data/prod/redis:/data restart: unless-stopped command: redis-server --appendonly yes healthcheck: @@ -17,57 +36,87 @@ services: api-server: build: - context: . 
- dockerfile: build/docker/secure-prod.Dockerfile + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile container_name: ml-prod-api - ports: - - "9103:9101" # API server port - - "2222:2222" # Secure SSH port for Podman communication - - "9100:9100" # Prometheus metrics + expose: + - "9101" # API server port (internal; external access via Caddy) + - "2222" # Secure SSH port for Podman communication (internal) volumes: - - ./data:/app/data/experiments - - ./logs:/logs - - ./configs/config-multi-user.yaml:/app/configs/config.yaml + - ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml depends_on: redis: condition: service_healthy restart: unless-stopped environment: - - REDIS_URL=redis://redis:6379 - LOG_LEVEL=info healthcheck: - test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ] + test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ] interval: 30s timeout: 10s retries: 3 start_period: 40s - # Start SSH daemon for Podman communication - command: ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] + # Start API server (ensure data_dir exists for snapshot/dataset validation) + command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"] + + minio: + image: minio/minio:latest + container_name: ml-prod-minio + expose: + - "9000" + - "9001" + volumes: + - ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data + environment: + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123} + command: ["server", "/data", "--console-address", ":9001"] + restart: unless-stopped + + minio-init: + image: alpine:3.19 + container_name: ml-prod-minio-init + 
depends_on: + - minio + entrypoint: ["/bin/sh", "-c"] + command: + - | + apk add --no-cache ca-certificates curl >/dev/null + curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc + chmod +x /usr/local/bin/mc + mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123} + mc mb -p local/fetchml-snapshots || true + restart: "no" worker: build: - context: . - dockerfile: build/docker/secure-prod.Dockerfile + context: ${FETCHML_REPO_ROOT:-.} + dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile container_name: ml-prod-worker volumes: - - ./data:/app/data/experiments - - ./logs:/logs - - ./configs/worker-docker.yaml:/app/configs/worker.yaml + - ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments + - ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active + - ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs + - ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-prod.yaml:/app/configs/worker.yaml depends_on: redis: condition: service_healthy api-server: condition: service_healthy + minio-init: + condition: service_started restart: unless-stopped environment: - - REDIS_URL=redis://redis:6379 - LOG_LEVEL=info + - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin} + - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123} privileged: true # Required for Podman to work in Docker command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"] -volumes: - redis_prod_data: - driver: local +volumes: {} networks: default: diff --git a/deployments/env.dev.example b/deployments/env.dev.example new file mode 100644 index 0000000..0305983 --- /dev/null +++ b/deployments/env.dev.example @@ -0,0 +1,17 @@ +# Development Environment Variables +# Copy this file to .env and modify as needed + +# Grafana +GRAFANA_ADMIN_PASSWORD=admin123 + +# API Configuration +LOG_LEVEL=info + +# TLS (development uses self-signed certs) +TLS_CERT_PATH=/app/ssl/cert.pem +TLS_KEY_PATH=/app/ssl/key.pem + +# 
Development-specific +ENVIRONMENT=development +DEBUG=true +API_KEY=development_key_only diff --git a/deployments/env.prod.example b/deployments/env.prod.example new file mode 100644 index 0000000..f6e725d --- /dev/null +++ b/deployments/env.prod.example @@ -0,0 +1,28 @@ +# Production Environment Variables +# Copy this file to .env and modify as needed + +# Grafana (if using) +GRAFANA_ADMIN_PASSWORD=CHANGE_ME_SECURE_PASSWORD + +# API Configuration +LOG_LEVEL=warn + +# TLS (production should use CA-signed certs) +TLS_CERT_PATH=/app/ssl/cert.pem +TLS_KEY_PATH=/app/ssl/key.pem + +# Caddy (TLS/WSS termination) +FETCHML_DOMAIN=ml.example.com +CADDY_EMAIL=admin@example.com + +# Production-specific +ENVIRONMENT=production +DEBUG=false + +# Security +API_KEY=CHANGE_ME_SECURE_API_KEY +ALLOWED_ORIGINS=https://yourdomain.com + +# External services (if applicable) +EXTERNAL_REDIS_URL=redis://external-redis:6379 +EXTERNAL_PROMETHEUS_URL=http://external-prometheus:9090 diff --git a/deployments/setup.sh b/deployments/setup.sh new file mode 100644 index 0000000..6dbae1c --- /dev/null +++ b/deployments/setup.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: ./deployments/setup.sh + +This script DOES NOT install dependencies. +It prints the manual steps and required/optional dependencies for a real (non-Docker) production deployment. 
+EOF +} + +if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then + usage + exit 0 +fi + +cat <<'EOF' +== FetchML production setup (non-Docker) == + +Required (core): +- Go-built binaries: api-server, worker +- Redis (reachable from api-server + worker) +- A writable base_path for experiments +- A writable data_dir if you want snapshot/dataset staging + integrity validation + +Required (TLS/WSS): +- Caddy (recommended) OR another reverse proxy that can terminate TLS and proxy WebSockets + +Optional: +- systemd (recommended) for service supervision +- MinIO / S3-compatible storage (only if you use remote snapshot_store) +- Podman (only if your worker executes jobs in Podman) + +Notes: +- The Zig CLI currently supports ws:// only. In production, keep the API server internal on ws:// and terminate TLS/WSS at Caddy. +- This script is informational; it will not modify your system. + +--- +1) Build binaries + + make prod + +Artifacts: + ./bin/api-server + ./bin/worker + +--- +2) Create a dedicated user (recommended) + + useradd --system --create-home --shell /usr/sbin/nologin fetchml + +--- +3) Create directories (example paths) + + mkdir -p /var/lib/fetchml/experiments + mkdir -p /var/lib/fetchml/active/datasets /var/lib/fetchml/active/snapshots + mkdir -p /var/log/fetchml + +Ensure ownership: + chown -R fetchml:fetchml /var/lib/fetchml /var/log/fetchml + +--- +4) Configure the API server + +- Start from: configs/api/prod.yaml (or your multi-user config) +- For real production, keep server.tls.enabled: false +- Ensure monitoring.health_checks.enabled is set appropriately + +Example flags: + ./bin/api-server -config /etc/fetchml/api.yaml + +--- +5) Configure Caddy (TLS/WSS termination) + +- Recommended: use deployments/Caddyfile.prod as a baseline. +- Caddy should listen on 443 and reverse proxy to the API server (internal) on 9101. + +Example layout: + /etc/caddy/Caddyfile + /var/lib/caddy + +--- +6) Configure Redis + +- Use Redis AUTH in production. 
+- Ensure the api-server + worker can reach it. + +--- +7) Run under systemd (recommended) + +Create unit files (example): + /etc/systemd/system/fetchml-api.service + /etc/systemd/system/fetchml-worker.service + /etc/systemd/system/caddy.service (if not already provided) + +Then: + systemctl daemon-reload + systemctl enable --now fetchml-api + systemctl enable --now fetchml-worker + systemctl enable --now caddy + +--- +8) Smoke check + +Internal health (no TLS): + curl -f http://127.0.0.1:9101/health + +External health (through Caddy TLS termination): + curl -f https://YOUR_DOMAIN/health + +EOF diff --git a/monitoring/README.md b/monitoring/README.md index 15ebcf7..a73c19c 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -1,13 +1,52 @@ -# Centralized Monitoring Stack +# Monitoring Stack + +## Directory Structure (Canonical) + +All monitoring configuration lives under `monitoring/`. + +```text +monitoring/ + prometheus/ + prometheus.yml # Prometheus scrape configuration + grafana/ + dashboards/ # Grafana dashboards (JSON) + provisioning/ + datasources/ # Grafana data sources (Prometheus/Loki) + dashboards/ # Grafana dashboard provider (points at dashboards/) + loki-config.yml # Loki configuration + promtail-config.yml # Promtail configuration +``` + +### What is "Grafana provisioning"? + +Grafana provisioning is how Grafana auto-configures itself on startup (no clicking in the UI): + +- **`grafana/provisioning/datasources/*.yml`** + - Defines where Grafana reads data from (e.g. Prometheus at `http://prometheus:9090`, Loki at `http://loki:3100`). +- **`grafana/provisioning/dashboards/*.yml`** + - Tells Grafana to load dashboard JSON files from `/var/lib/grafana/dashboards`. +- **`grafana/dashboards/*.json`** + - The dashboards themselves. + +### Source of truth + +- **Dashboards**: edit/add JSON in `monitoring/grafana/dashboards/`. +- **Grafana provisioning**: edit files in `monitoring/grafana/provisioning/`. 
+- **Prometheus scrape config**: edit `monitoring/prometheus/prometheus.yml`. + +`scripts/setup_monitoring.py` is intentionally **provisioning-only**: + +- It (re)writes Grafana **datasources** and the **dashboard provider**. +- It does **not** create or overwrite any dashboard JSON files. ## Quick Start ```bash -# Start everything -docker-compose up -d +# Start deployment +make deploy-up # Access services -open http://localhost:3000 # Grafana (admin/admin) +open http://localhost:3000 # Grafana (admin/admin123) open http://localhost:9090 # Prometheus ``` @@ -15,137 +54,80 @@ open http://localhost:9090 # Prometheus ### Grafana (Port 3000) **Main monitoring dashboard** -- Username: `admin` -- Password: `admin` -- Pre-configured datasources: Prometheus + Loki -- Pre-loaded ML Queue dashboard +- Username: `admin` +- Password: `admin123` +- Data source: Prometheus (http://localhost:9090) ### Prometheus (Port 9090) -**Metrics collection** -- Scrapes metrics from API server (`:9100/metrics`) -- 15s scrape interval -- Data retention: 15 days (default) +**Metrics collection and storage** ### Loki (Port 3100) **Log aggregation** -- Collects logs from all containers -- Collects application logs from `./logs/` -- Retention: 7 days -### Promtail -**Log shipping** -- Watches Docker container logs -- Watches `./logs/*.log` -- Sends to Loki +## Dashboards -## Viewing Data +Available dashboard configurations in `grafana/dashboards/`: -### Metrics -1. Open Grafana: http://localhost:3000 -2. Go to "ML Task Queue Monitoring" dashboard -3. See: queue depth, task duration, error rates, etc. +- `load-test-performance.json` - Load test metrics +- `websocket-performance.json` - WebSocket performance +- `system-health.json` - System health monitoring +- `rsync-performance.json` - Rsync performance metrics -### Logs -1. Open Grafana → Explore -2. Select "Loki" datasource -3. 
Query examples: - ```logql - {job="app_logs"} # All app logs - {job="docker",service="api-server"} # API server logs - {job="docker"} |= "error" # All errors - ``` +### Importing Dashboards -## Architecture - -``` -┌─────────────┐ -│ API Server │──┐ -└─────────────┘ │ - ├──► Prometheus ──► Grafana -┌─────────────┐ │ ▲ -│ Worker │──┘ │ -└─────────────┘ │ - │ -┌─────────────┐ │ -│ App Logs │──┐ │ -└─────────────┘ │ │ - ├──► Promtail ──► Loki ┘ -┌─────────────┐ │ -│Docker Logs │──┘ -└─────────────┘ -``` +1. Go to Grafana → "+" → "Import" +2. Upload JSON files from `grafana/dashboards/` directory +3. Select Prometheus data source ## Configuration Files -- `prometheus.yml` - Metrics scraping config -- `loki-config.yml` - Log storage config -- `promtail-config.yml` - Log collection config -- `grafana/provisioning/` - Auto-configuration +- `prometheus/prometheus.yml` - Prometheus configuration +- `loki-config.yml` - Loki configuration +- `promtail-config.yml` - Promtail configuration +- `security_rules.yml` - Security rules -## Customization +## Usage -### Add More Scrapers -Edit `monitoring/prometheus.yml`: -```yaml -scrape_configs: - - job_name: 'my-service' - static_configs: - - targets: ['my-service:9100'] -``` +1. Start monitoring stack: `make deploy-up` +2. Access Grafana: http://localhost:3000 (admin/admin123) +3. Import dashboards from `grafana/dashboards/` directory +4. View metrics and test results in real-time -### Change Retention -**Prometheus:** Add to command in docker-compose: -```yaml -- '--storage.tsdb.retention.time=30d' -``` +## Health Endpoints -**Loki:** Edit `loki-config.yml`: -```yaml -limits_config: - retention_period: 720h # 30 days -``` +The API server provides health check endpoints for monitoring: -## Troubleshooting +- **`/health`** - Overall service health (for Docker healthcheck) +- **`/health/live`** - Liveness probe (is the service running?) +- **`/health/ready`** - Readiness probe (can the service accept traffic?) 
-**No metrics showing:** -```bash -# Check if Prometheus can reach targets -curl http://localhost:9090/api/v1/targets - -# Check if API exposes metrics -curl http://localhost:9100/metrics -``` - -**No logs showing:** -```bash -# Check Promtail status -docker logs ml-experiments-promtail - -# Verify Loki is receiving logs -curl http://localhost:3100/ready -``` - -**Grafana can't connect to datasources:** -```bash -# Restart Grafana -docker-compose restart grafana -``` - -## Profiling Quick Start - -To capture CPU profiles while exercising real workloads: +### Testing Health Endpoints ```bash -# HTTP LoadTestSuite (MediumLoad scenario) -make profile-load +# Basic health check +curl -k https://localhost:9101/health -# WebSocket → Redis queue → worker integration -make profile-ws-queue +# Liveness check (for K8s or monitoring) +curl -k https://localhost:9101/health/live + +# Readiness check (verifies dependencies) +curl -k https://localhost:9101/health/ready ``` -Then inspect profiles with: +See `health-testing.md` for detailed testing procedures. 
-```bash -go tool pprof cpu_load.out # HTTP load -go tool pprof cpu_ws.out # WebSocket/queue/worker -``` +## Prometheus Integration + +Prometheus scrapes the following endpoints: +- `api-server:9101/metrics` - Application metrics (future) +- `api-server:9101/health` - Health status monitoring +- `host.docker.internal:9100/metrics` - Worker metrics (when the worker runs on the host) +- `worker:9100/metrics` - Worker metrics (when the worker runs as a container in the compose network) + +## Cleanup (deprecated paths) + +These legacy paths may still exist in the repo but are **not used** by the current dev compose config: + +- `monitoring/dashboards/` (old dashboards location) +- `monitoring/prometheus.yml` (old Prometheus config location) +- `monitoring/grafana/provisioning/dashboards/dashboard.yml` (duplicate of `dashboards.yml`) \ No newline at end of file diff --git a/monitoring/dashboards/grafana-dashboard.json b/monitoring/dashboards/grafana-dashboard.json deleted file mode 100644 index 517fdf3..0000000 --- a/monitoring/dashboards/grafana-dashboard.json +++ /dev/null @@ -1,147 +0,0 @@ -{ - "dashboard": { - "title": "ML Task Queue Monitoring", - "tags": [ - "ml", - "queue", - "fetch_ml" - ], - "timezone": "browser", - "panels": [ - { - "title": "Queue Depth", - "type": "graph", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "fetch_ml_queue_depth", - "legendFormat": "Queue Depth" - } - ] - }, - { - "title": "Active Tasks", - "type": "graph", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "sum(fetch_ml_active_tasks) by (worker_id)", - "legendFormat": "{{worker_id}}" - } - ] - }, - { - "title": "Task Duration (p50, p95, p99)", - "type": "graph", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 8 - }, - "targets": [ - { - "expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))", - "legendFormat": "p50" - }, - { - "expr": "histogram_quantile(0.95, 
rate(fetch_ml_task_duration_seconds_bucket[5m]))", - "legendFormat": "p95" - }, - { - "expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))", - "legendFormat": "p99" - } - ] - }, - { - "title": "Task Completion Rate", - "type": "graph", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "targets": [ - { - "expr": "rate(fetch_ml_tasks_completed_total[5m])", - "legendFormat": "{{status}}" - } - ] - }, - { - "title": "Failure Rate by Error Category", - "type": "graph", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "targets": [ - { - "expr": "rate(fetch_ml_task_failures_total[5m])", - "legendFormat": "{{error_category}}" - } - ] - }, - { - "title": "Retry Rate", - "type": "graph", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "targets": [ - { - "expr": "rate(fetch_ml_task_retries_total[5m])", - "legendFormat": "{{error_category}}" - } - ] - }, - { - "title": "Dead Letter Queue Size", - "type": "stat", - "gridPos": { - "h": 8, - "w": 6, - "x": 12, - "y": 24 - }, - "targets": [ - { - "expr": "fetch_ml_dlq_size" - } - ] - }, - { - "title": "Lease Expirations", - "type": "stat", - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 24 - }, - "targets": [ - { - "expr": "fetch_ml_lease_expirations_total" - } - ] - } - ] - } -} \ No newline at end of file diff --git a/monitoring/dashboards/logs-dashboard.json b/monitoring/dashboards/logs-dashboard.json deleted file mode 100644 index c73726c..0000000 --- a/monitoring/dashboards/logs-dashboard.json +++ /dev/null @@ -1,278 +0,0 @@ -{ - "dashboard": { - "title": "Application Logs", - "tags": [ - "logs", - "loki", - "fetch_ml" - ], - "timezone": "browser", - "editable": true, - "graphTooltip": 1, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] 
- }, - "panels": [ - { - "title": "Log Stream", - "type": "logs", - "gridPos": { - "x": 0, - "y": 0, - "w": 24, - "h": 12 - }, - "id": 1, - "targets": [ - { - "expr": "{job=\"app_logs\"}", - "refId": "A", - "datasource": "Loki" - } - ], - "options": { - "showTime": true, - "showLabels": true, - "showCommonLabels": false, - "wrapLogMessage": false, - "prettifyLogMessage": false, - "enableLogDetails": true, - "dedupStrategy": "none", - "sortOrder": "Descending" - } - }, - { - "title": "Log Level Distribution", - "type": "bargauge", - "gridPos": { - "x": 0, - "y": 12, - "w": 8, - "h": 8 - }, - "id": 2, - "targets": [ - { - "expr": "sum by (level) (count_over_time({job=\"app_logs\"} | logfmt | level != \"\" [5m]))", - "refId": "A", - "datasource": "Loki", - "legendFormat": "{{level}}" - } - ], - "options": { - "orientation": "horizontal", - "displayMode": "gradient", - "showUnfilled": true - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "INFO" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "green" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "WARN" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "yellow" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "ERROR" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - } - }, - { - "title": "Error Logs (Last Hour)", - "type": "table", - "gridPos": { - "x": 8, - "y": 12, - "w": 16, - "h": 8 - }, - "id": 3, - "targets": [ - { - "expr": "{job=\"app_logs\"} | logfmt | level=\"ERROR\"", - "refId": "A", - "datasource": "Loki" - } - ], - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "title": "Logs by Component", - "type": "timeseries", - "gridPos": { - 
"x": 0, - "y": 20, - "w": 12, - "h": 8 - }, - "id": 4, - "targets": [ - { - "expr": "sum by (component) (rate({job=\"app_logs\"} | logfmt [1m]))", - "refId": "A", - "datasource": "Loki", - "legendFormat": "{{component}}" - } - ], - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "lineInterpolation": "smooth", - "fillOpacity": 10, - "spanNulls": false, - "showPoints": "never", - "stacking": { - "mode": "none" - } - }, - "unit": "reqps" - } - } - }, - { - "title": "Warning Logs Timeline", - "type": "timeseries", - "gridPos": { - "x": 12, - "y": 20, - "w": 12, - "h": 8 - }, - "id": 5, - "targets": [ - { - "expr": "sum(count_over_time({job=\"app_logs\"} | logfmt | level=\"WARN\" [1m]))", - "refId": "A", - "datasource": "Loki", - "legendFormat": "Warnings" - } - ], - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "bars", - "fillOpacity": 50 - }, - "color": { - "mode": "fixed", - "fixedColor": "yellow" - } - } - } - }, - { - "title": "Search Logs", - "type": "logs", - "gridPos": { - "x": 0, - "y": 28, - "w": 24, - "h": 10 - }, - "id": 6, - "targets": [ - { - "expr": "{job=\"app_logs\"} |= \"$search_term\"", - "refId": "A", - "datasource": "Loki" - } - ], - "options": { - "showTime": true, - "showLabels": true, - "wrapLogMessage": true, - "enableLogDetails": true - } - } - ], - "templating": { - "list": [ - { - "name": "search_term", - "type": "textbox", - "label": "Search Term", - "current": { - "value": "", - "text": "" - } - } - ] - }, - "refresh": "30s" - } -} \ No newline at end of file diff --git a/monitoring/dashboards/performance-dashboard.json b/monitoring/dashboards/performance-dashboard.json deleted file mode 100644 index eed212b..0000000 --- a/monitoring/dashboards/performance-dashboard.json +++ /dev/null @@ -1,157 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - 
"type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "loki", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "hiddenSeries": false, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "{job=\"fetchml-performance\"} |= \"BenchmarkAPIServerCreateJobSimple\"", - "legendFormat": "API Job Creation", - "refId": "A" - }, - { - "expr": "{job=\"fetchml-performance\"} |= \"BenchmarkMLExperimentExecution/SmallExperiment\"", - "legendFormat": "ML Small Experiment", - "refId": "B" - }, - { - "expr": "{job=\"fetchml-performance\"} |= \"BenchmarkDatasetOperations/DatasetCreation\"", - "legendFormat": "Dataset Creation", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "API Performance Trends", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "Time (ns/op)", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "datasource": "loki", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 2, - "options": 
{ - "showLabels": true - }, - "targets": [ - { - "expr": "{job=\"fetchml-performance\"} |= \"Performance Summary\"", - "legendFormat": "{{timestamp}}", - "refId": "A" - } - ], - "title": "Latest Performance Summary", - "type": "logs" - } - ], - "refresh": "30s", - "schemaVersion": 27, - "style": "dark", - "tags": ["fetchml", "performance"], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Fetch ML Performance Dashboard", - "uid": "fetchml-performance", - "version": 1 -} diff --git a/monitoring/docker-compose.performance.yml b/monitoring/docker-compose.performance.yml deleted file mode 100644 index 19ec8f7..0000000 --- a/monitoring/docker-compose.performance.yml +++ /dev/null @@ -1,64 +0,0 @@ -services: - prometheus: - image: prom/prometheus:latest - ports: - - "9090:9090" - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--web.enable-lifecycle' - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus-data:/prometheus - networks: - - monitoring - - loki: - image: grafana/loki:2.9.0 - ports: - - "3100:3100" - command: -config.file=/etc/loki/local-config.yaml - volumes: - - ./loki-performance-config.yaml:/etc/loki/local-config.yaml - networks: - - monitoring - - promtail: - image: grafana/promtail:latest - volumes: - - ./promtail-performance-config.yaml:/etc/promtail/config.yml - - /var/log:/var/log:ro - command: -config.file=/etc/promtail/config.yml - networks: - - monitoring - - pushgateway: - image: prom/pushgateway:latest - ports: - - "9091:9091" - networks: - - monitoring - - grafana: - image: grafana/grafana:latest - ports: - - "3001:3000" - environment: - - GF_SECURITY_ADMIN_PASSWORD=admin - volumes: - - grafana-data:/var/lib/grafana - - 
./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards - - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources - networks: - - monitoring - -volumes: - loki-data: - grafana-data: - prometheus-data: - -networks: - monitoring: - driver: bridge diff --git a/monitoring/grafana/dashboards/load-test-performance.json b/monitoring/grafana/dashboards/load-test-performance.json new file mode 100644 index 0000000..7c250eb --- /dev/null +++ b/monitoring/grafana/dashboards/load-test-performance.json @@ -0,0 +1,51 @@ +{ + "dashboard": { + "id": null, + "title": "Load Test Performance", + "tags": [ + "load-test", + "performance" + ], + "panels": [ + { + "id": 1, + "title": "Service Health", + "type": "stat", + "targets": [ + { + "expr": "up", + "legendFormat": "{{job}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "Request Rate", + "type": "graph", + "targets": [ + { + "expr": "rate(http_requests_total[5m])", + "legendFormat": "RPS" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "5s" + } +} diff --git a/monitoring/grafana/dashboards/load-test-simple.json b/monitoring/grafana/dashboards/load-test-simple.json new file mode 100644 index 0000000..0e7754f --- /dev/null +++ b/monitoring/grafana/dashboards/load-test-simple.json @@ -0,0 +1 @@ +{"dashboard": {"id": null, "title": "Load Test Performance", "tags": ["load-test", "performance"], "panels": [{"id": 1, "title": "Service Status", "type": "stat", "targets": [{"expr": "up", "legendFormat": "{{job}}"}]}]}} diff --git a/monitoring/grafana/dashboards/loki-logs.json b/monitoring/grafana/dashboards/loki-logs.json new file mode 100644 index 0000000..1dbd3bb --- /dev/null +++ b/monitoring/grafana/dashboards/loki-logs.json @@ -0,0 +1,51 @@ +{ + "dashboard": { + "id": null, + "title": "Log Analysis", + "tags": [ + "loki", + "logs" + ], + "panels": [ + { + 
"id": 1, + "title": "Error Logs", + "type": "logs", + "targets": [ + { + "expr": "{job=~\".+\"} |= \"error\"", + "legendFormat": "Errors" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "All Logs", + "type": "logs", + "targets": [ + { + "expr": "{job=~\".+\"}", + "legendFormat": "All logs" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + } + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "refresh": "30s" + } +} diff --git a/monitoring/grafana/dashboards/prewarm-performance.txt b/monitoring/grafana/dashboards/prewarm-performance.txt new file mode 100644 index 0000000..678fdcc --- /dev/null +++ b/monitoring/grafana/dashboards/prewarm-performance.txt @@ -0,0 +1,135 @@ +# Grafana Dashboard: Prewarm Performance +# Import this JSON into Grafana to create a prewarm monitoring dashboard + +{ + "dashboard": { + "id": null, + "title": "Prewarm Performance", + "tags": ["prewarm", "performance", "worker"], + "panels": [ + { + "id": 1, + "title": "Environment Prewarm Hit Rate (%)", + "type": "stat", + "targets": [ + { + "expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": {"h": 6, "w": 6, "x": 0, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 50}, + {"color": "green", "value": 80} + ] + } + } + } + }, + { + "id": 2, + "title": "Snapshot Prewarm Hit Rate (%)", + "type": "stat", + "targets": [ + { + "expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + {"color": "red", "value": 0}, + {"color": 
"yellow", "value": 50}, + {"color": "green", "value": 80} + ] + } + } + } + }, + { + "id": 3, + "title": "Environment Prewarm Hits vs Misses", + "type": "graph", + "targets": [ + {"expr": "rate(fetchml_prewarm_env_hit_total[5m])", "legendFormat": "hits {{worker_id}}"}, + {"expr": "rate(fetchml_prewarm_env_miss_total[5m])", "legendFormat": "misses {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "yAxes": [{"unit": "reqps"}] + }, + { + "id": 4, + "title": "Snapshot Prewarm Hits vs Misses", + "type": "graph", + "targets": [ + {"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", "legendFormat": "hits {{worker_id}}"}, + {"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", "legendFormat": "misses {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "yAxes": [{"unit": "reqps"}] + }, + { + "id": 5, + "title": "Environment Build Time", + "type": "graph", + "targets": [ + {"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", "legendFormat": "build time {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "yAxes": [{"unit": "seconds"}] + }, + { + "id": 6, + "title": "Snapshot Prewarm Time", + "type": "graph", + "targets": [ + {"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", "legendFormat": "prewarm time {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "yAxes": [{"unit": "seconds"}] + }, + { + "id": 7, + "title": "Environment Images Built", + "type": "graph", + "targets": [ + {"expr": "increase(fetchml_prewarm_env_built_total[1h])", "legendFormat": "built {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 16}, + "yAxes": [{"unit": "short"}] + }, + { + "id": 8, + "title": "Snapshots Prewarmed", + "type": "graph", + "targets": [ + {"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", "legendFormat": "prewarmed {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 6, "x": 18, "y": 16}, + "yAxes": [{"unit": "short"}] + }, + { + "id": 9, + "title": 
"Prewarm Efficiency", + "type": "graph", + "targets": [ + {"expr": "fetchml_prewarm_env_hit_total + fetchml_prewarm_snapshot_hit_total", "legendFormat": "total hits {{worker_id}}"}, + {"expr": "fetchml_prewarm_env_miss_total + fetchml_prewarm_snapshot_miss_total", "legendFormat": "total misses {{worker_id}}"} + ], + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}, + "yAxes": [{"unit": "short"}] + } + ], + "time": {"from": "now-1h", "to": "now"}, + "refresh": "5s" + } +} diff --git a/monitoring/grafana/dashboards/rsync-performance.json b/monitoring/grafana/dashboards/rsync-performance.json new file mode 100644 index 0000000..bb906fd --- /dev/null +++ b/monitoring/grafana/dashboards/rsync-performance.json @@ -0,0 +1,86 @@ +{ + "dashboard": { + "id": null, + "title": "Rsync Performance", + "tags": [ + "rsync", + "sync", + "performance" + ], + "panels": [ + { + "id": 1, + "title": "Rsync Operations", + "type": "graph", + "targets": [ + { + "expr": "rate(rsync_operations_total[5m])", + "legendFormat": "Operations/sec" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "Data Transfer Rate", + "type": "graph", + "targets": [ + { + "expr": "rate(rsync_bytes_transferred_total[5m])", + "legendFormat": "Bytes/sec" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + }, + { + "id": 3, + "title": "Sync Duration", + "type": "graph", + "targets": [ + { + "expr": "rsync_sync_duration_seconds", + "legendFormat": "Duration" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + } + }, + { + "id": 4, + "title": "Sync Errors", + "type": "graph", + "targets": [ + { + "expr": "rate(rsync_errors_total[5m])", + "legendFormat": "Errors/sec" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + } + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "5s" + } +} diff --git a/monitoring/grafana/dashboards/system-health.json b/monitoring/grafana/dashboards/system-health.json new file mode 
100644 index 0000000..6c3deac --- /dev/null +++ b/monitoring/grafana/dashboards/system-health.json @@ -0,0 +1,51 @@ +{ + "dashboard": { + "id": null, + "title": "System Health", + "tags": [ + "system", + "health" + ], + "panels": [ + { + "id": 1, + "title": "Service Status", + "type": "stat", + "targets": [ + { + "expr": "up", + "legendFormat": "{{job}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "Memory Usage", + "type": "graph", + "targets": [ + { + "expr": "process_resident_memory_bytes", + "legendFormat": "Memory" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "10s" + } +} diff --git a/monitoring/grafana/dashboards/websocket-performance.json b/monitoring/grafana/dashboards/websocket-performance.json new file mode 100644 index 0000000..a68319b --- /dev/null +++ b/monitoring/grafana/dashboards/websocket-performance.json @@ -0,0 +1,68 @@ +{ + "dashboard": { + "id": null, + "title": "WebSocket Performance", + "tags": [ + "websocket", + "performance" + ], + "panels": [ + { + "id": 1, + "title": "WebSocket Connections", + "type": "graph", + "targets": [ + { + "expr": "websocket_connections_active", + "legendFormat": "Active Connections" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "WebSocket Messages", + "type": "graph", + "targets": [ + { + "expr": "rate(websocket_messages_total[5m])", + "legendFormat": "Messages/sec" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + }, + { + "id": 3, + "title": "Connection Errors", + "type": "graph", + "targets": [ + { + "expr": "rate(websocket_connection_errors_total[5m])", + "legendFormat": "Errors/sec" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + } + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "5s" + } +} diff --git 
a/monitoring/grafana/dashboards/worker-resources.json b/monitoring/grafana/dashboards/worker-resources.json new file mode 100644 index 0000000..1ef314f --- /dev/null +++ b/monitoring/grafana/dashboards/worker-resources.json @@ -0,0 +1,280 @@ +{ + "id": null, + "title": "Worker Resources", + "tags": [ + "worker", + "resources" + ], + "panels": [ + { + "id": 1, + "title": "CPU Free", + "type": "stat", + "targets": [ + { + "expr": "fetchml_resources_cpu_free", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 0 + } + }, + { + "id": 2, + "title": "CPU Total", + "type": "stat", + "targets": [ + { + "expr": "fetchml_resources_cpu_total", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 0 + } + }, + { + "id": 3, + "title": "CPU Utilization (%)", + "type": "graph", + "targets": [ + { + "expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + } + }, + { + "id": 4, + "title": "GPU Slots Free", + "type": "graph", + "targets": [ + { + "expr": "fetchml_resources_gpu_slots_free", + "legendFormat": "{{worker_id}} gpu={{gpu_index}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + } + }, + { + "id": 5, + "title": "Acquire Wait / Timeout (Totals)", + "type": "graph", + "targets": [ + { + "expr": "fetchml_resources_acquire_wait_total", + "legendFormat": "wait {{worker_id}}" + }, + { + "expr": "fetchml_resources_acquire_timeout_total", + "legendFormat": "timeout {{worker_id}}" + }, + { + "expr": "fetchml_resources_acquire_total", + "legendFormat": "total {{worker_id}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + } + }, + { + "id": 6, + "title": "Avg Acquire Wait (seconds)", + "type": "stat", + "targets": [ + { + "expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 
1)", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + } + }, + { + "id": 7, + "title": "Acquire Wait Ratio", + "type": "stat", + "targets": [ + { + "expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + } + }, + { + "id": 8, + "title": "Environment Prewarm Hit Rate (%)", + "type": "stat", + "targets": [ + { + "expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 50}, + {"color": "green", "value": 80} + ] + } + } + } + }, + { + "id": 9, + "title": "Snapshot Prewarm Hit Rate (%)", + "type": "stat", + "targets": [ + { + "expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))", + "legendFormat": "{{worker_id}}" + } + ], + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 50}, + {"color": "green", "value": 80} + ] + } + } + } + }, + { + "id": 10, + "title": "Prewarm Hits vs Misses", + "type": "graph", + "targets": [ + { + "expr": "rate(fetchml_prewarm_env_hit_total[5m])", + "legendFormat": "env hits {{worker_id}}" + }, + { + "expr": "rate(fetchml_prewarm_env_miss_total[5m])", + "legendFormat": "env misses {{worker_id}}" + }, + { + "expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", + "legendFormat": "snapshot hits {{worker_id}}" + }, + { + "expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", + "legendFormat": 
"snapshot misses {{worker_id}}" + } + ], + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 20 + }, + "yAxes": [ + {"unit": "reqps"} + ] + }, + { + "id": 11, + "title": "Prewarm Build Time", + "type": "graph", + "targets": [ + { + "expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", + "legendFormat": "env build {{worker_id}}" + }, + { + "expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", + "legendFormat": "snapshot prewarm {{worker_id}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "yAxes": [ + {"unit": "seconds"} + ] + }, + { + "id": 12, + "title": "Prewarm Builds", + "type": "graph", + "targets": [ + { + "expr": "increase(fetchml_prewarm_env_built_total[1h])", + "legendFormat": "env built {{worker_id}}" + }, + { + "expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", + "legendFormat": "snapshots prewarmed {{worker_id}}" + } + ], + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "yAxes": [ + {"unit": "short"} + ] + } + ], + "time": { + "from": "now-1h", + "to": "now" + }, + "refresh": "5s" +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml index 7435f09..e456c4f 100644 --- a/monitoring/grafana/provisioning/dashboards/dashboards.yml +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -1,5 +1,4 @@ apiVersion: 1 - providers: - name: 'default' orgId: 1 diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000..e4818d0 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,9 @@ +apiVersion: 1 +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + editable: true + jsonData: + maxLines: 1000 diff --git a/monitoring/grafana/provisioning/datasources/datasources.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml similarity index 50% rename 
from monitoring/grafana/provisioning/datasources/datasources.yml rename to monitoring/grafana/provisioning/datasources/prometheus.yml index fcf0dff..2a621c8 100644 --- a/monitoring/grafana/provisioning/datasources/datasources.yml +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -1,16 +1,10 @@ apiVersion: 1 - datasources: - name: Prometheus type: prometheus access: proxy url: http://prometheus:9090 - isDefault: false - editable: false - - - name: Loki - type: loki - access: proxy - url: http://loki:3100 isDefault: true - editable: false + editable: true + jsonData: + timeInterval: "5s" diff --git a/monitoring/health-testing.md b/monitoring/health-testing.md new file mode 100644 index 0000000..40d54c1 --- /dev/null +++ b/monitoring/health-testing.md @@ -0,0 +1,100 @@ +# Testing Health Endpoints with Monitoring Stack + +## Verify Health Endpoints + +```bash +# 1. Start the monitoring stack +cd deployments +docker-compose -f docker-compose.dev.yml up -d + +# 2. Wait for services to start (30 seconds) +sleep 30 + +# 3. Test health endpoints +curl -k https://localhost:9101/health +# Expected: {"status":"healthy","timestamp":"...","checks":{}} + +curl -k https://localhost:9101/health/live +# Expected: {"status":"alive","timestamp":"..."} + +curl -k https://localhost:9101/health/ready +# Expected: {"status":"ready","timestamp":"...","checks":{"queue":"ok","experiments":"ok"}} + +# 4. Check Docker health status +docker ps | grep api-server +# Should show: (healthy) + +# 5. Access Grafana +open http://localhost:3000 +# Login: admin / admin123 + +# 6. Access Prometheus +open http://localhost:9090 +# Check targets: Status > Targets +# Should see: api-server, api-server-health + +# 7. 
Query health metrics in Prometheus +# Go to Graph and enter: up{job="api-server-health"} +# Should show: value=1 (service is up) +``` + +## Health Check Integration + +### Docker Compose +The health check is configured in `deployments/docker-compose.dev.yml`: +```yaml +healthcheck: + test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s +``` + +### Prometheus Monitoring +Prometheus scrapes health status every 30s from: +- `/health` - Overall service health +- `/metrics` - Future Prometheus metrics (when implemented) + +### Kubernetes (Future) +Health endpoints ready for K8s probes: +```yaml +livenessProbe: + httpGet: + path: /health/live + port: 9101 + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 10 + +readinessProbe: + httpGet: + path: /health/ready + port: 9101 + scheme: HTTPS + initialDelaySeconds: 10 + periodSeconds: 5 +``` + +## Monitoring Stack Services + +- **Grafana** (port 3000): Dashboards and visualization +- **Prometheus** (port 9090): Metrics collection +- **Loki** (port 3100): Log aggregation +- **Promtail**: Log shipping + +## Troubleshooting + +```bash +# Check API server logs +docker logs ml-experiments-api + +# Check Prometheus targets +curl http://localhost:9090/api/v1/targets + +# Check health endpoint directly +docker exec ml-experiments-api curl -k https://localhost:9101/health + +# Restart services +docker-compose -f deployments/docker-compose.dev.yml restart api-server +``` diff --git a/monitoring/loki-config.yml b/monitoring/loki-config.yml index 353066d..15eaaea 100644 --- a/monitoring/loki-config.yml +++ b/monitoring/loki-config.yml @@ -12,7 +12,7 @@ common: rules_directory: /loki/rules replication_factor: 1 ring: - instance_addr: 127.0.0.1 + instance_addr: 0.0.0.0 kvstore: store: inmemory diff --git a/monitoring/loki-performance-config.yaml b/monitoring/loki-performance-config.yaml deleted file mode 100644 index a38b0ff..0000000 --- 
a/monitoring/loki-performance-config.yaml +++ /dev/null @@ -1,40 +0,0 @@ -auth_enabled: false - -server: - http_listen_port: 3100 - -ingester: - lifecycler: - address: 127.0.0.1 - ring: - kvstore: - store: inmemory - replication_factor: 1 - final_sleep: 0s - min_ready_duration: 0s - chunk_idle_period: 1h - max_chunk_age: 1h - chunk_target_size: 1048576 - chunk_retain_period: 30s - -schema_config: - configs: - - from: 2020-10-24 - store: boltdb-shipper - object_store: filesystem - schema: v11 - index: - prefix: index_ - period: 24h - -storage_config: - boltdb_shipper: - active_index_directory: /loki/boltdb-shipper-active - cache_location: /loki/boltdb-shipper-cache - filesystem: - directory: /loki/chunks - -limits_config: - reject_old_samples: true - reject_old_samples_max_age: 168h - allow_structured_metadata: false diff --git a/monitoring/performance/grafana-dashboards/performance-dashboard.json b/monitoring/performance/grafana-dashboards/performance-dashboard.json deleted file mode 100644 index e69de29..0000000 diff --git a/monitoring/prometheus.yml b/monitoring/prometheus/prometheus.yml similarity index 58% rename from monitoring/prometheus.yml rename to monitoring/prometheus/prometheus.yml index 5c3f419..18c2e3f 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -5,39 +5,35 @@ global: evaluation_interval: 15s scrape_configs: - # API Server metrics + # API Server metrics and health - job_name: 'api-server' + scheme: http static_configs: - - targets: ['api-server:9100'] + - targets: ['api-server:9101'] labels: service: 'api-server' + metrics_path: /metrics # Future: Prometheus metrics endpoint - # Worker metrics (if running in docker) + # Benchmark metrics from Pushgateway + - job_name: 'benchmark' + static_configs: [] + + # Worker metrics (ResourceManager + task execution) + # For docker-compose dev on macOS/Windows, Prometheus can reach a locally running worker + # via host.docker.internal. 
- job_name: 'worker' + scrape_interval: 15s static_configs: - targets: ['worker:9100'] labels: service: 'worker' - # Allow failures if worker not running - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - # Benchmark metrics from Pushgateway - - job_name: 'benchmark' - static_configs: - - targets: ['localhost:9091'] - labels: - service: 'benchmark' + target_type: 'container' metrics_path: /metrics - honor_labels: true # Loki metrics - job_name: 'loki' static_configs: - - targets: ['ml-experiments-loki:3100'] + - targets: ['loki:3100'] labels: service: 'loki' metrics_path: /metrics diff --git a/monitoring/promtail-performance-config.yaml b/monitoring/promtail-performance-config.yaml deleted file mode 100644 index 4562f11..0000000 --- a/monitoring/promtail-performance-config.yaml +++ /dev/null @@ -1,50 +0,0 @@ -server: - http_listen_port: 9080 - grpc_listen_port: 0 - -positions: - filename: /tmp/positions.yaml - -clients: - - url: http://loki:3100/loki/api/v1/push - -scrape_configs: -- job_name: fetchml-performance - static_configs: - - targets: - - localhost - labels: - job: fetchml-performance - __path__: /reports/performance.log - - pipeline_stages: - - json: - expressions: - timestamp: timestamp - git_commit: git_commit - benchmark_name: name - time_per_op: time_per_op_ns - memory_per_op: memory_per_op_b - allocs_per_op: allocs_per_op - - - labels: - benchmark_name: - git_commit: - - - output: - source: output - -- job_name: fetchml-performance-summary - static_configs: - - targets: - - localhost - labels: - job: fetchml-performance - __path__: /reports/performance_summary.log - - pipeline_stages: - - regex: - expression: "=== Performance Summary ===" - - - output: - source: output diff --git a/monitoring/security_rules.yml b/monitoring/security_rules.yml deleted file mode 100644 index 64b03dd..0000000 --- a/monitoring/security_rules.yml +++ /dev/null @@ -1,112 +0,0 
@@ -groups: - - name: security.rules - rules: - # High rate of failed authentication attempts - - alert: HighFailedAuthRate - expr: rate(failed_auth_total[5m]) > 10 - for: 2m - labels: - severity: warning - annotations: - summary: "High rate of failed authentication attempts" - description: "More than 10 failed auth attempts per minute for the last 2 minutes" - - # Potential brute force attack - - alert: BruteForceAttack - expr: rate(failed_auth_total[1m]) > 30 - for: 1m - labels: - severity: critical - annotations: - summary: "Potential brute force attack detected" - description: "More than 30 failed auth attempts per minute" - - # Unusual WebSocket connection patterns - - alert: UnusualWebSocketActivity - expr: rate(websocket_connections_total[5m]) > 100 - for: 3m - labels: - severity: warning - annotations: - summary: "Unusual WebSocket connection activity" - description: "WebSocket connection rate is unusually high" - - # Rate limit breaches - - alert: RateLimitBreached - expr: rate(rate_limit_exceeded_total[5m]) > 5 - for: 1m - labels: - severity: warning - annotations: - summary: "Rate limits being exceeded" - description: "Rate limit exceeded more than 5 times per minute" - - # SSL certificate expiration warning - - alert: SSLCertificateExpiring - expr: ssl_certificate_expiry_days < 30 - for: 1h - labels: - severity: warning - annotations: - summary: "SSL certificate expiring soon" - description: "SSL certificate will expire in less than 30 days" - - # High memory usage - - alert: HighMemoryUsage - expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9 - for: 5m - labels: - severity: warning - annotations: - summary: "High memory usage detected" - description: "Memory usage is above 90%" - - # High CPU usage - - alert: HighCPUUsage - expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 - for: 5m - labels: - severity: warning - annotations: - summary: "High CPU usage 
detected" - description: "CPU usage is above 80%" - - # Disk space running low - - alert: LowDiskSpace - expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10 - for: 5m - labels: - severity: critical - annotations: - summary: "Low disk space" - description: "Disk space is below 10%" - - # Service down - - alert: ServiceDown - expr: up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Service is down" - description: "{{ $labels.instance }} service has been down for more than 1 minute" - - # Unexpected error rates - - alert: HighErrorRate - expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: "High error rate detected" - description: "Error rate is above 10%" - - # Suspicious IP activity - - alert: SuspiciousIPActivity - expr: rate(requests_by_ip[5m]) > 1000 - for: 2m - labels: - severity: warning - annotations: - summary: "Suspicious IP activity" - description: "IP address making unusually many requests" diff --git a/podman/README.md b/podman/README.md index 000d9e8..43f66da 100644 --- a/podman/README.md +++ b/podman/README.md @@ -118,7 +118,7 @@ jupyter>=1.0.0 "allow_network": false, "blocked_packages": ["requests", "urllib3", "httpx"], "max_execution_time": 3600, - "gpu_access": true, + "gpu_devices": ["/dev/dri"], "ml_env": "ml_env", "package_manager": "mamba" } diff --git a/podman/secure-ml-runner.podfile b/podman/secure-ml-runner.podfile index 6b0a356..4d5cebb 100644 --- a/podman/secure-ml-runner.podfile +++ b/podman/secure-ml-runner.podfile @@ -32,6 +32,10 @@ RUN mamba install -n ml_env \ -c pytorch -c conda-forge -y && \ conda clean -afy +# Poetry (for pyproject.toml + poetry.lock projects) +RUN mamba install -n ml_env poetry -c conda-forge -y && \ + conda clean -afy + # Copy security wrapper COPY secure_runner.py /usr/local/bin/secure_runner.py COPY security_policy.json /etc/ml_runner/security_policy.json diff 
--git a/podman/secure_runner.py b/podman/secure_runner.py index d78cc93..2aa85d0 100644 --- a/podman/secure_runner.py +++ b/podman/secure_runner.py @@ -45,7 +45,7 @@ class SecurityPolicy: ], "max_execution_time": 3600, "max_memory_gb": 16, - "gpu_access": True, + "gpu_devices": ["/dev/dri"], "allow_file_writes": True, "resource_limits": { "cpu_count": 4, @@ -106,97 +106,197 @@ class CondaRunner: self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda") self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}" - def setup_environment(self, requirements_file: Path) -> bool: - """Setup Conda environment with mamba""" + self.gpu_devices = self.security_policy.policy.get("gpu_devices", []) + + def setup_environment(self, deps_file: Path) -> bool: + """Setup Conda environment based on a dependency manifest.""" try: - # Read requirements - with open(requirements_file, "r") as f: - requirements = [ - line.strip() - for line in f - if line.strip() and not line.startswith("#") - ] + name = deps_file.name - # Check each package for security - for req in requirements: - package_name = ( - req.split("==")[0].split(">=")[0].split("<=")[0].strip() - ) - if not self.security_policy.check_package_safety(package_name): - print( - f"[SECURITY] Package '{package_name}' is blocked for security reasons" - ) - return False + print(f"[MANIFEST] Using dependency manifest: {name}") - # Install packages with mamba (super fast!) 
- for req in requirements: - package_name = ( - req.split("==")[0].split(">=")[0].split("<=")[0].strip() - ) - - # Check if already installed with conda - check_cmd = [ - "conda", - "run", - "-n", - self.conda_env, - "python", - "-c", - f"import {package_name.replace('-', '_')}", - ] - result = subprocess.run( - check_cmd, capture_output=True, text=True - ) - - if result.returncode == 0: - print(f"[OK] {package_name} already installed in conda env") - continue - - # Try conda-forge first (faster and more reliable) - print( - f"[INSTALL] Installing {req} with {self.package_manager}..." - ) - install_cmd = [ + if name in ("environment.yml", "environment.yaml"): + print(f"[SETUP] Applying conda environment file: {deps_file}") + cmd = [ self.package_manager, - "install", + "env", + "update", "-n", self.conda_env, - req, - "-c", - "conda-forge", + "-f", + str(deps_file), "-y", ] - result = subprocess.run( - install_cmd, capture_output=True, text=True, timeout=300 + result = subprocess.run(cmd, capture_output=True, text=True, timeout=900) + if result.returncode != 0: + print(f"[ERROR] Failed to apply environment file: {result.stderr}") + return False + return True + + if name == "poetry.lock": + pyproject = self.workspace_dir / "pyproject.toml" + if not pyproject.exists(): + print("[ERROR] poetry.lock provided but pyproject.toml is missing") + return False + + print(f"[SETUP] Installing dependencies from Poetry lockfile: {deps_file}") + env = os.environ.copy() + env.update( + { + "POETRY_VIRTUALENVS_CREATE": "false", + "POETRY_NO_INTERACTION": "1", + } ) - if result.returncode == 0: - print(f"[OK] Installed {req} with {self.package_manager}") - continue + # Ensure Poetry is available in the conda env. 
+ check = subprocess.run( + ["conda", "run", "-n", self.conda_env, "poetry", "--version"], + capture_output=True, + text=True, + env=env, + ) + if check.returncode != 0: + print("[ERROR] Poetry is not available in the container environment") + print(check.stderr) + return False - # Fallback to pip if conda fails - print(f"[FALLBACK] Trying pip for {req}...") - pip_cmd = [ + # Install into the conda env (no separate venv). + install = subprocess.run( + [ + "conda", + "run", + "-n", + self.conda_env, + "poetry", + "install", + "--no-ansi", + ], + capture_output=True, + text=True, + timeout=900, + cwd=str(self.workspace_dir), + env=env, + ) + if install.returncode != 0: + print("[ERROR] Poetry install failed") + print(install.stderr) + return False + + return True + + if name == "pyproject.toml": + # Use pip's PEP517/pyproject support (no Poetry required). + # This installs the project itself; dependencies may be fetched as needed. + print(f"[SETUP] Installing project from pyproject.toml: {deps_file}") + cmd = [ "conda", "run", "-n", self.conda_env, "pip", "install", - req, + str(self.workspace_dir), "--no-cache-dir", ] - result = subprocess.run( - pip_cmd, capture_output=True, text=True, timeout=300 - ) - + result = subprocess.run(cmd, capture_output=True, text=True, timeout=900) if result.returncode != 0: - print(f"[ERROR] Failed to install {req}: {result.stderr}") + print(f"[ERROR] Failed to install project from pyproject.toml: {result.stderr}") return False + return True - print(f"[OK] Installed {req} with pip") + if name == "requirements.txt": + # Read requirements + with open(deps_file, "r") as f: + requirements = [ + line.strip() + for line in f + if line.strip() and not line.startswith("#") + ] - return True + # Check each package for security + for req in requirements: + package_name = ( + req.split("==")[0].split(">=")[0].split("<=")[0].strip() + ) + if not self.security_policy.check_package_safety(package_name): + print( + f"[SECURITY] Package 
'{package_name}' is blocked for security reasons" + ) + return False + + # Install packages with mamba (super fast!) + for req in requirements: + package_name = ( + req.split("==")[0].split(">=")[0].split("<=")[0].strip() + ) + + # Check if already installed with conda + check_cmd = [ + "conda", + "run", + "-n", + self.conda_env, + "python", + "-c", + f"import {package_name.replace('-', '_')}", + ] + result = subprocess.run( + check_cmd, capture_output=True, text=True + ) + + if result.returncode == 0: + print(f"[OK] {package_name} already installed in conda env") + continue + + # Try conda-forge first (faster and more reliable) + print( + f"[INSTALL] Installing {req} with {self.package_manager}..." + ) + install_cmd = [ + self.package_manager, + "install", + "-n", + self.conda_env, + req, + "-c", + "conda-forge", + "-y", + ] + result = subprocess.run( + install_cmd, capture_output=True, text=True, timeout=300 + ) + + if result.returncode == 0: + print(f"[OK] Installed {req} with {self.package_manager}") + continue + + # Fallback to pip if conda fails + print(f"[FALLBACK] Trying pip for {req}...") + pip_cmd = [ + "conda", + "run", + "-n", + self.conda_env, + "pip", + "install", + req, + "--no-cache-dir", + ] + result = subprocess.run( + pip_cmd, capture_output=True, text=True, timeout=300 + ) + + if result.returncode != 0: + print(f"[ERROR] Failed to install {req}: {result.stderr}") + return False + + print(f"[OK] Installed {req} with pip") + + return True + + print(f"[ERROR] Unsupported dependency manifest: {deps_file}") + print("Supported: environment.yml, environment.yaml, poetry.lock (requires pyproject.toml), pyproject.toml, requirements.txt") + return False except Exception as e: print(f"[ERROR] Environment setup failed: {e}") @@ -217,7 +317,7 @@ class CondaRunner: env.update( { "CONDA_DEFAULT_ENV": self.conda_env, - "CUDA_VISIBLE_DEVICES": "0", # Allow GPU access + "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", ""), # Allow GPU access 
"SECURE_MODE": "1", "NETWORK_ACCESS": ( "1" @@ -280,7 +380,7 @@ class CondaRunner: "stdout": stdout, "stderr": stderr, "return_code": process.returncode, - "gpu_accessible": True, + "gpu_accessible": len(self.gpu_devices) > 0, "security_mode": "enabled", "container_type": "conda", "conda_env": self.conda_env, @@ -338,8 +438,12 @@ def main(): parser.add_argument( "--workspace", default="/workspace", help="Workspace directory" ) - parser.add_argument("--requirements", help="Requirements file path") + parser.add_argument("--deps", help="Dependency manifest path (environment.yml | poetry.lock | pyproject.toml | requirements.txt)") + parser.add_argument("--requirements", help="Deprecated alias for --deps") parser.add_argument("--script", help="Training script path") + parser.add_argument( + "--prepare-only", action="store_true", help="Only prepare dependencies and exit" + ) parser.add_argument( "--args", nargs=argparse.REMAINDER, @@ -383,17 +487,26 @@ def main(): if args.check_gpu: return 0 + deps_arg = args.deps or args.requirements + if not deps_arg: + print("[ERROR] Missing dependency manifest. 
Provide --deps.") + return 1 + # Setup environment - requirements_path = Path(args.requirements) - if not requirements_path.exists(): - print(f"[ERROR] Requirements file not found: {requirements_path}") + deps_path = Path(deps_arg) + if not deps_path.exists(): + print(f"[ERROR] Dependency manifest not found: {deps_path}") return 1 print("[SETUP] Setting up secure environment...") - if not runner.setup_environment(requirements_path): + if not runner.setup_environment(deps_path): print("[ERROR] Failed to setup secure environment") return 1 + if args.prepare_only: + print("[DONE] Environment prepared successfully") + return 0 + # Run experiment script_path = Path(args.script) if not script_path.exists(): diff --git a/podman/security_policy.json b/podman/security_policy.json index da9c3de..974f857 100644 --- a/podman/security_policy.json +++ b/podman/security_policy.json @@ -24,7 +24,7 @@ ], "max_execution_time": 3600, "max_memory_gb": 16, - "gpu_access": true, + "gpu_devices": ["/dev/dri"], "allow_file_writes": true, "resource_limits": { "cpu_count": 4, diff --git a/scripts/README.md b/scripts/README.md index d53a72f..8050027 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -20,19 +20,12 @@ This directory contains setup and utility scripts for FetchML. 
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group ``` -### `validate-prod-config.sh` -**Purpose**: Validates production configuration files -**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]` -**What it does**: -- Checks config file syntax -- Verifies base_path consistency -- Tests Redis connectivity -- Validates Podman setup -- Checks directory permissions +### Configuration validation +Validate configs using the built-in config lint targets: -**Example**: ```bash -./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml +make configlint +make worker-configlint ``` ## Legacy Setup Scripts (Deprecated) @@ -44,12 +37,11 @@ The following scripts are from earlier iterations and are **deprecated** in favo - `auto_setup.sh` - Old automated setup (superseded) - `setup_common.sh` - Common functions (integrated into setup-prod.sh) - `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead) -- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh) + ### Cleanup Recommendation These legacy scripts can be removed or archived. The current production setup only needs: - `setup-prod.sh` -- `validate-prod-config.sh` ## Usage Workflow @@ -59,8 +51,8 @@ These legacy scripts can be removed or archived. The current production setup on sudo ./scripts/setup-prod.sh # 2. Copy and configure -sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml -sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml +sudo cp configs/api/prod.yaml /etc/fetch_ml/config.yaml +sudo cp configs/workers/worker-prod.toml /etc/fetch_ml/worker.toml sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc. # 3. Build and install @@ -68,7 +60,8 @@ make prod sudo make install # 4. 
Validate -./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml +./bin/configlint --schema configs/schema/api_server_config.yaml /etc/fetch_ml/config.yaml +./bin/configlint --schema configs/schema/worker_config_schema.yaml /etc/fetch_ml/worker.toml # 5. Start services sudo systemctl start fetchml-api fetchml-worker @@ -82,7 +75,7 @@ docker-compose up -d # Or run components directly make dev -./bin/api-server -config configs/config-local.yaml +./bin/api-server -config configs/api/dev.yaml ``` ## Script Maintenance diff --git a/scripts/auto-cleanup.service b/scripts/auto-cleanup.service deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/auto-cleanup.timer b/scripts/auto-cleanup.timer deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/benchmarks/run-benchmarks-local.sh b/scripts/benchmarks/run-benchmarks-local.sh index 820c86d..a1a2d32 100755 --- a/scripts/benchmarks/run-benchmarks-local.sh +++ b/scripts/benchmarks/run-benchmarks-local.sh @@ -8,6 +8,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" +ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive" TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S") RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP" @@ -168,14 +169,25 @@ if [ -f "$SCRIPT_DIR/cleanup-benchmarks.sh" ]; then "$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks else # Fallback cleanup if script not available - echo "Cleaning old benchmark runs (keeping last 10)..." + echo "Archiving old benchmark runs (keeping last 10)..." 
+ stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" cd "$LOCAL_ARTIFACTS_DIR" - ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || echo "No old runs to clean" + ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do + [ -n "$run" ] || continue + mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true + done # Clean temporary files - echo "Cleaning temporary files..." - find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true - find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true + echo "Archiving temporary files..." + tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp" + mkdir -p "$tmp_archive_dir" + find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do + mv "$f" "$tmp_archive_dir/" 2>/dev/null || true + done + find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do + mv "$f" "$tmp_archive_dir/" 2>/dev/null || true + done # Clean Go build cache echo "Cleaning Go build cache..." diff --git a/scripts/cleanup-status.sh b/scripts/cleanup-status.sh deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/cleanup.sh b/scripts/cleanup.sh deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/create_bitwarden_fetchml_item.sh b/scripts/create_bitwarden_fetchml_item.sh deleted file mode 100644 index 66a7cd0..0000000 --- a/scripts/create_bitwarden_fetchml_item.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Create a Bitwarden item for a FetchML API user. -# -# Usage: -# ./scripts/create_bitwarden_fetchml_item.sh -# -# Requirements: -# - Bitwarden CLI (bw) installed -# - You are logged in and unlocked (bw login; bw unlock) -# - jq installed -# -# This script does NOT run on the homelab server. Run it from your -# own machine where you manage Bitwarden. 
- -if [[ $# -ne 3 ]]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -USER_NAME="$1" -API_KEY="$2" -API_KEY_HASH="$3" - -ITEM_NAME="FetchML API  $USER_NAME" - -# Get base item template -TEMPLATE_JSON=$(bw get template item) - -# Build item JSON with jq -ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \ - --arg name "$ITEM_NAME" \ - --arg username "$USER_NAME" \ - --arg password "$API_KEY" \ - --arg hash "$API_KEY_HASH" \ - '.name = $name - | .login.username = $username - | .login.password = $password - | .notes = "FetchML API key for user " + $username - | .fields = [{"name":"api_key_hash","value":$hash,"type":1}]') - -# Create item in Bitwarden -# If you ever want to edit instead, you can capture the ID from this call -# and use: bw edit item - -echo "$ITEM_JSON" | bw encode | bw create item - -echo "Created Bitwarden item: $ITEM_NAME" diff --git a/scripts/deployment/setup-auto-cleanup.sh b/scripts/deployment/setup-auto-cleanup.sh deleted file mode 100755 index 34bb285..0000000 --- a/scripts/deployment/setup-auto-cleanup.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# Setup auto-cleanup service for fetch_ml -# This creates a systemd timer that runs cleanup daily - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_DIR="$(dirname "$SCRIPT_DIR")" - -# Colors -GREEN='\033[0;32m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -log_info "Setting up auto-cleanup service..." 
- -# Check if running on macOS or Linux -if [[ "$OSTYPE" == "darwin"* ]]; then - log_info "Detected macOS - setting up launchd agent" - - # Create launchd plist - cat > ~/Library/LaunchAgents/com.fetchml.cleanup.plist << EOF - - - - - Label - com.fetchml.cleanup - ProgramArguments - - $PROJECT_DIR/scripts/cleanup.sh - --force - - StartInterval - 86400 - RunAtLoad - - StandardOutPath - /tmp/fetchml-cleanup.log - StandardErrorPath - /tmp/fetchml-cleanup.error.log - - -EOF - - # Load the launchd agent - launchctl load ~/Library/LaunchAgents/com.fetchml.cleanup.plist - - log_success "Auto-cleanup service installed for macOS" - log_info "Logs will be in /tmp/fetchml-cleanup.log" - -elif [[ "$OSTYPE" == "linux-gnu"* ]]; then - log_info "Detected Linux - setting up systemd timer" - - # Copy service files - sudo cp "$SCRIPT_DIR/auto-cleanup.service" /etc/systemd/system/ - sudo cp "$SCRIPT_DIR/auto-cleanup.timer" /etc/systemd/system/ - - # Reload systemd and enable timer - sudo systemctl daemon-reload - sudo systemctl enable auto-cleanup.timer - sudo systemctl start auto-cleanup.timer - - log_success "Auto-cleanup service installed for Linux" - log_info "Check status with: systemctl status auto-cleanup.timer" - -else - echo "Unsupported OS: $OSTYPE" - exit 1 -fi - -log_info "Auto-cleanup will run daily" -log_info "To uninstall:" -if [[ "$OSTYPE" == "darwin"* ]]; then - echo " launchctl unload ~/Library/LaunchAgents/com.fetchml.cleanup.plist" - echo " rm ~/Library/LaunchAgents/com.fetchml.cleanup.plist" -else - echo " sudo systemctl stop auto-cleanup.timer" - echo " sudo systemctl disable auto-cleanup.timer" - echo " sudo rm /etc/systemd/system/auto-cleanup.*" -fi diff --git a/scripts/deployment/setup-monitoring-prod.sh b/scripts/deployment/setup-monitoring-prod.sh deleted file mode 100755 index bc1319f..0000000 --- a/scripts/deployment/setup-monitoring-prod.sh +++ /dev/null @@ -1,275 +0,0 @@ -#!/bin/bash -# Production Monitoring Stack Setup for Linux -# Deploys 
Prometheus/Grafana/Loki/Promtail as Podman containers with systemd -# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc. - -set -e - -BOLD='\033[1m' -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[0;33m' -NC='\033[0m' - -echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n" - -# Detect Linux distribution and package manager -detect_distro() { - if [ -f /etc/os-release ]; then - . /etc/os-release - DISTRO=$ID - DISTRO_VERSION=$VERSION_ID - elif [ -f /etc/redhat-release ]; then - DISTRO="rhel" - else - DISTRO="unknown" - fi - - # Detect package manager - if command -v dnf &>/dev/null; then - PKG_MANAGER="dnf" - elif command -v yum &>/dev/null; then - PKG_MANAGER="yum" - elif command -v apt-get &>/dev/null; then - PKG_MANAGER="apt" - elif command -v pacman &>/dev/null; then - PKG_MANAGER="pacman" - elif command -v zypper &>/dev/null; then - PKG_MANAGER="zypper" - else - echo -e "${YELLOW}Warning: No known package manager found${NC}" - PKG_MANAGER="unknown" - fi - - echo "Detected distribution: $DISTRO (using $PKG_MANAGER)" -} - -detect_distro - -# Configuration -DATA_PATH="${1:-/data/monitoring}" -ML_USER="${2:-ml-user}" -ML_GROUP="${3:-ml-group}" - -echo "Configuration:" -echo " Monitoring data path: $DATA_PATH" -echo " User: $ML_USER" -echo " Group: $ML_GROUP" -echo "" - -# Create pod for monitoring stack -POD_NAME="monitoring" - -# 1. Create directories -echo -e "${BLUE}[1/6]${NC} Creating directory structure..." -sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config} -sudo mkdir -p /etc/fetch_ml/monitoring -sudo mkdir -p /var/lib/grafana/dashboards - -sudo chown -R $ML_USER:$ML_GROUP $DATA_PATH -sudo chmod 755 $DATA_PATH - -echo -e "${GREEN}✓${NC} Directories created" - -# 2. Copy configuration files -echo -e "${BLUE}[2/6]${NC} Copying configuration files..." 
-sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/ -sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/ -sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/ -sudo cp monitoring/grafana/provisioning /etc/fetch_ml/monitoring/ -r -sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json -sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json - -sudo chown -R $ML_USER:$ML_GROUP /etc/fetch_ml/monitoring -sudo chown -R $ML_USER:$ML_GROUP /var/lib/grafana - -echo -e "${GREEN}✓${NC} Configuration copied" - -# 3. Create Podman pod -echo -e "${BLUE}[3/6]${NC} Creating Podman pod..." -sudo -u $ML_USER podman pod create \\ - --name $POD_NAME \\ - -p 3000:3000 \\ - -p 9090:9090 \\ - -p 3100:3100 \\ - || echo "Pod may already exist" - -echo -e "${GREEN}✓${NC} Pod created" - -# 4. Create systemd service for monitoring pod -echo -e "${BLUE}[4/6]${NC} Creating systemd services..." - -# Prometheus service -sudo tee /etc/systemd/system/prometheus.service >/dev/null </dev/null </dev/null </dev/null </dev/null - -sudo systemctl daemon-reload -echo -e "${GREEN}✓${NC} Pod service created" - -# 6. Setup firewall rules -echo -e "${BLUE}[6/6]${NC} Configuring firewall..." -if command -v firewall-cmd &>/dev/null; then - # RHEL/Rocky/Fedora (firewalld) - sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana - sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus - sudo firewall-cmd --reload - echo -e "${GREEN}✓${NC} Firewall configured (firewalld)" -elif command -v ufw &>/dev/null; then - # Ubuntu/Debian (ufw) - sudo ufw allow 3000/tcp comment 'Grafana' - sudo ufw allow 9090/tcp comment 'Prometheus' - echo -e "${GREEN}✓${NC} Firewall configured (ufw)" -else - echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090" -fi - -# Summary -echo "" -echo -e "${BOLD}=== Monitoring Stack Setup Complete! 
===${NC}" -echo "" -echo "Services created:" -echo " - prometheus.service (Metrics collection)" -echo " - loki.service (Log aggregation)" -echo " - grafana.service (Visualization)" -echo " - promtail.service (Log shipping)" -echo "" -echo -e "${BOLD}Next steps:${NC}" -echo "1. Start services:" -echo " sudo systemctl start prometheus" -echo " sudo systemctl start loki" -echo " sudo systemctl start promtail" -echo " sudo systemctl start grafana" -echo "" -echo "2. Enable on boot:" -echo " sudo systemctl enable prometheus loki promtail grafana" -echo "" -echo "3. Access Grafana:" -echo " http://YOUR_SERVER_IP:3000" -echo " Username: admin" -echo " Password: admin (change on first login)" -echo "" -echo "4. Check logs:" -echo " sudo journalctl -u prometheus -f" -echo " sudo journalctl -u grafana -f" -echo "" diff --git a/scripts/deployment/setup-prod.sh b/scripts/deployment/setup-prod.sh deleted file mode 100755 index 56fceb5..0000000 --- a/scripts/deployment/setup-prod.sh +++ /dev/null @@ -1,229 +0,0 @@ -#!/bin/bash -# Production Setup Script for Rocky Linux (Bare Metal) -# This script sets up the complete FetchML environment on bare metal - -set -e - -BOLD='\033[1m' -GREEN='\033[0;32m' -BLUE='\033[0;34m' -NC='\033[0m' - -echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n" - -# Configuration -BASE_PATH="${1:-/data/ml-experiments}" -ML_USER="${2:-ml-user}" -ML_GROUP="${3:-ml-group}" - -echo "Configuration:" -echo " Base path: $BASE_PATH" -echo " ML user: $ML_USER" -echo " ML group: $ML_GROUP" -echo "" - -# 1. Create system user if it doesn't exist -echo -e "${BLUE}[1/8]${NC} Creating system user..." -if id "$ML_USER" &>/dev/null; then - echo " User $ML_USER already exists" -else - sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER - echo -e "${GREEN}✓${NC} Created user: $ML_USER" -fi - -# 2. Create directory structure -echo -e "${BLUE}[2/8]${NC} Creating directory structure..." 
-sudo mkdir -p "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets} -sudo mkdir -p /var/log/fetch_ml -sudo mkdir -p /etc/fetch_ml - -echo -e "${GREEN}✓${NC} Created directories:" -echo " $BASE_PATH/experiments/" -echo " $BASE_PATH/pending/" -echo " $BASE_PATH/running/" -echo " $BASE_PATH/finished/" -echo " $BASE_PATH/failed/" -echo " $BASE_PATH/datasets/" -echo " /var/log/fetch_ml/" -echo " /etc/fetch_ml/" - -# 3. Set ownership and permissions -echo -e "${BLUE}[3/8]${NC} Setting permissions..." -sudo chown -R $ML_USER:$ML_GROUP $BASE_PATH -sudo chmod 755 $BASE_PATH -sudo chmod 700 $BASE_PATH/experiments # Restrict experiment data - -sudo chown -R $ML_USER:$ML_GROUP /var/log/fetch_ml -sudo chmod 755 /var/log/fetch_ml - -echo -e "${GREEN}✓${NC} Permissions set" - -# 4. Install system dependencies (Rocky Linux) -echo -e "${BLUE}[4/8]${NC} Installing system dependencies..." -sudo dnf install -y \ - golang \ - podman \ - redis \ - git \ - make \ - gcc \ - || echo "Some packages may already be installed" - -echo -e "${GREEN}✓${NC} Dependencies installed" - -# 5. Configure Podman for GPU access (if NVIDIA GPU present) -echo -e "${BLUE}[5/8]${NC} Configuring Podman..." -if lspci | grep -i nvidia &>/dev/null; then - echo " NVIDIA GPU detected, configuring GPU access..." - - # Install nvidia-container-toolkit if not present - if ! command -v nvidia-container-toolkit &>/dev/null; then - echo " Installing nvidia-container-toolkit..." - sudo dnf config-manager --add-repo \ - https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo - sudo dnf install -y nvidia-container-toolkit - fi - - # Configure Podman CDI - sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml - echo -e "${GREEN}✓${NC} GPU support configured" -else - echo " No NVIDIA GPU detected, skipping GPU setup" -fi - -# 6. Configure Redis -echo -e "${BLUE}[6/8]${NC} Configuring Redis..." 
-sudo systemctl enable redis -sudo systemctl start redis || echo "Redis may already be running" - -# Set Redis password if not already configured -if ! sudo grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then - REDIS_PASSWORD=$(openssl rand -base64 32) - echo "requirepass $REDIS_PASSWORD" | sudo tee -a /etc/redis/redis.conf >/dev/null - sudo systemctl restart redis - echo " Generated Redis password: $REDIS_PASSWORD" - echo " Save this password for your configuration!" -else - echo " Redis password already configured" -fi - -echo -e "${GREEN}✓${NC} Redis configured" - -# 7. Setup systemd services -echo -e "${BLUE}[7/8]${NC} Creating systemd services..." - -# API Server service -sudo tee /etc/systemd/system/fetchml-api.service >/dev/null </dev/null </dev/null </dev/null 2>&1 || true - systemctl reload fetchml-worker >/dev/null 2>&1 || true - endscript -} -EOF - -echo -e "${GREEN}✓${NC} Log rotation configured" - -# Summary -echo "" -echo -e "${BOLD}=== Setup Complete! ===${NC}" -echo "" -echo "Directory structure created at: $BASE_PATH" -echo "Logs will be written to: /var/log/fetch_ml/" -echo "Configuration directory: /etc/fetch_ml/" -echo "" -echo -e "${BOLD}Next steps:${NC}" -echo "1. Copy your config files:" -echo " sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml" -echo " sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml" -echo "" -echo "2. Build and install binaries:" -echo " make build" -echo " sudo cp bin/api-server /usr/local/bin/fetchml-api" -echo " sudo cp bin/worker /usr/local/bin/fetchml-worker" -echo "" -echo "3. Update config files with your settings (Redis password, API keys, etc.)" -echo "" -echo "4. Start services:" -echo " sudo systemctl start fetchml-api" -echo " sudo systemctl start fetchml-worker" -echo "" -echo "5. Enable services to start on boot:" -echo " sudo systemctl enable fetchml-api" -echo " sudo systemctl enable fetchml-worker" -echo "" -echo "6. 
Check status:" -echo " sudo systemctl status fetchml-api" -echo " sudo systemctl status fetchml-worker" -echo " sudo journalctl -u fetchml-api -f" -echo "" diff --git a/scripts/deployment/setup-production.sh b/scripts/deployment/setup-production.sh deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/legacy/auto_setup.sh b/scripts/legacy/auto_setup.sh deleted file mode 100755 index 1801c74..0000000 --- a/scripts/legacy/auto_setup.sh +++ /dev/null @@ -1,455 +0,0 @@ -#!/bin/bash - -# Automatic Setup Script for ML Experiment Manager -# Handles complete environment setup with security features - -set -euo pipefail - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -print_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -print_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -print_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -print_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -detect_os() { - if [[ "$OSTYPE" == "darwin"* ]]; then - echo "macos" - elif [[ "$OSTYPE" == "linux-gnu"* ]]; then - echo "linux" - else - echo "unknown" - fi -} - -install_go() { - print_info "Installing Go..." - - local os=$(detect_os) - local go_version="1.23.0" - - if [[ "$os" == "macos" ]]; then - if command -v brew &> /dev/null; then - brew install go - else - print_error "Homebrew not found. Please install Go manually." - return 1 - fi - elif [[ "$os" == "linux" ]]; then - wget -q "https://go.dev/dl/go${go_version}.linux-amd64.tar.gz" - sudo rm -rf /usr/local/go - sudo tar -C /usr/local -xzf "go${go_version}.linux-amd64.tar.gz" - rm "go${go_version}.linux-amd64.tar.gz" - - # Add to PATH - echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc - export PATH=$PATH:/usr/local/go/bin - fi - - print_success "Go installed" -} - -install_zig() { - print_info "Installing Zig..." 
- - local os=$(detect_os) - - if [[ "$os" == "macos" ]]; then - if command -v brew &> /dev/null; then - brew install zig - else - print_error "Homebrew not found. Please install Zig manually." - return 1 - fi - elif [[ "$os" == "linux" ]]; then - # Download Zig binary - local zig_version="0.13.0" - wget -q "https://ziglang.org/download/${zig_version}/zig-linux-x86_64-${zig_version}.tar.xz" - tar -xf "zig-linux-x86_64-${zig_version}.tar.xz" - sudo mv "zig-linux-x86_64-${zig_version}/zig" /usr/local/bin/ - rm -rf "zig-linux-x86_64-${zig_version}.tar.xz" "zig-linux-x86_64-${zig_version}" - fi - - print_success "Zig installed" -} - -install_docker() { - print_info "Installing Docker..." - - local os=$(detect_os) - - if [[ "$os" == "macos" ]]; then - if command -v brew &> /dev/null; then - brew install --cask docker - print_warning "Docker Desktop installed. Please start it manually." - else - print_error "Homebrew not found. Please install Docker manually." - return 1 - fi - elif [[ "$os" == "linux" ]]; then - # Install Docker using official script - curl -fsSL https://get.docker.com -o get-docker.sh - sudo sh get-docker.sh - sudo usermod -aG docker $USER - rm get-docker.sh - - # Start Docker - sudo systemctl enable docker - sudo systemctl start docker - - print_success "Docker installed. You may need to log out and log back in." - fi -} - -install_redis() { - print_info "Installing Redis..." - - local os=$(detect_os) - - if [[ "$os" == "macos" ]]; then - if command -v brew &> /dev/null; then - brew install redis - brew services start redis - else - print_error "Homebrew not found. Please install Redis manually." - return 1 - fi - elif [[ "$os" == "linux" ]]; then - sudo apt-get update - sudo apt-get install -y redis-server - sudo systemctl enable redis-server - sudo systemctl start redis-server - fi - - print_success "Redis installed and started" -} - -install_dependencies() { - print_info "Installing dependencies..." 
- - local os=$(detect_os) - - # Install basic tools - if [[ "$os" == "macos" ]]; then - if command -v brew &> /dev/null; then - brew install openssl curl jq - fi - elif [[ "$os" == "linux" ]]; then - sudo apt-get update - sudo apt-get install -y openssl curl jq build-essential - fi - - # Install Go tools - if command -v go &> /dev/null; then - go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest - go install golang.org/x/tools/cmd/goimports@latest - fi - - print_success "Dependencies installed" -} - -setup_project() { - print_info "Setting up project..." - - # Create directories - mkdir -p bin - mkdir -p data - mkdir -p logs - mkdir -p db - mkdir -p ssl - mkdir -p configs - mkdir -p scripts - - # Build project - if command -v make &> /dev/null; then - make build - if command -v zig &> /dev/null; then - make cli-build - fi - else - print_warning "Make not found, building manually..." - go build -o bin/worker ./cmd/worker - go build -o bin/tui ./cmd/tui - go build -o bin/data_manager ./cmd/data_manager - go build -o bin/user_manager ./cmd/user_manager - go build -o bin/api-server ./cmd/api-server - - if command -v zig &> /dev/null; then - cd cli && zig build && cd .. - fi - fi - - print_success "Project setup completed" -} - -setup_security() { - print_info "Setting up security features..." 
- - # Generate SSL certificates - if command -v openssl &> /dev/null; then - openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \ - -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \ - -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || { - print_warning "Failed to generate SSL certificates" - } - print_success "SSL certificates generated" - fi - - # Generate secure configuration - local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123") - local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234") - - cat > configs/security-config.yaml << EOF -base_path: "/data/ml-experiments" - -auth: - enabled: true - api_keys: - test_user: - hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)" - admin: true - roles: ["data_scientist", "admin"] - permissions: - read: true - write: true - delete: true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "./ssl/cert.pem" - key_file: "./ssl/key.pem" - min_version: "1.3" - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "10.0.0.0/8" - - "192.168.0.0/16" - - "172.16.0.0/12" - failed_login_lockout: - enabled: true - max_attempts: 5 - lockout_duration: "15m" - -redis: - url: "redis://localhost:6379" - password: "${redis_password}" - -logging: - level: "info" - file: "logs/fetch_ml.log" - audit_log: "logs/audit.log" -EOF - - cat > .env.dev << EOF -# Development environment variables -REDIS_PASSWORD=${redis_password} -JWT_SECRET=${jwt_secret} -GRAFANA_USER=admin -GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password") -EOF - - print_success "Security configuration created" -} - -test_installation() { - print_info "Testing installation..." 
- - local tests_passed=0 - local tests_total=0 - - # Test Go - tests_total=$((tests_total + 1)) - if command -v go &> /dev/null; then - print_success "Go: Installed" - tests_passed=$((tests_passed + 1)) - else - print_error "Go: Not found" - fi - - # Test Zig - tests_total=$((tests_total + 1)) - if command -v zig &> /dev/null; then - print_success "Zig: Installed" - tests_passed=$((tests_passed + 1)) - else - print_warning "Zig: Not found (optional)" - tests_total=$((tests_total - 1)) - fi - - # Test Docker - tests_total=$((tests_total + 1)) - if command -v docker &> /dev/null; then - print_success "Docker: Installed" - tests_passed=$((tests_passed + 1)) - else - print_warning "Docker: Not found (optional)" - tests_total=$((tests_total - 1)) - fi - - # Test Redis - tests_total=$((tests_total + 1)) - if command -v redis-cli &> /dev/null; then - if redis-cli ping | grep -q "PONG"; then - print_success "Redis: Running" - tests_passed=$((tests_passed + 1)) - else - print_warning "Redis: Not running" - fi - else - print_warning "Redis: Not found (optional)" - tests_total=$((tests_total - 1)) - fi - - # Test binaries - if [[ -f "bin/api-server" ]]; then - tests_total=$((tests_total + 1)) - if ./bin/api-server --help > /dev/null 2>&1; then - print_success "API Server: Built" - tests_passed=$((tests_passed + 1)) - else - print_error "API Server: Build failed" - fi - fi - - if [[ $tests_total -gt 0 ]]; then - local success_rate=$((tests_passed * 100 / tests_total)) - print_info "Tests: $tests_passed/$tests_total passed ($success_rate%)" - fi - - print_success "Installation testing completed" -} - -show_next_steps() { - print_success "Automatic setup completed!" - echo - echo "Next Steps:" - echo "===========" - echo "" - echo "1. Load environment variables:" - echo " source .env.dev" - echo "" - echo "2. Start the API server:" - echo " ./bin/api-server -config configs/config.yaml" - echo "" - echo "3. 
Test the Zig CLI (if installed):" - echo " ./cli/zig-out/bin/ml --help" - echo "" - echo "4. Deploy with Docker (optional):" - echo " make docker-run" - echo "" - echo "5. Docker Compose deployment:" - echo " docker-compose up -d" - echo "" - echo "Configuration Files:" - echo " configs/config.yaml # Main configuration" - echo " configs/config_local.yaml # Local development" - echo " ssl/cert.pem, ssl/key.pem # TLS certificates" - echo "" - echo "Documentation:" - echo " docs/DEPLOYMENT.md # Deployment guide" - echo "" - echo "Quick Commands:" - echo " make help # Show all commands" - echo " make test # Run tests" - echo " docker-compose up -d # Start services" - echo "" - print_success "Ready to use ML Experiment Manager!" -} - -# Main setup function -main() { - echo "ML Experiment Manager Automatic Setup" - echo "=====================================" - echo "" - - print_info "Starting automatic setup..." - echo "" - - # Check and install dependencies - if ! command -v go &> /dev/null; then - print_info "Go not found, installing..." - install_go - fi - - if ! command -v zig &> /dev/null; then - print_info "Zig not found, installing..." - install_zig - fi - - if ! command -v docker &> /dev/null; then - print_info "Docker not found, installing..." - install_docker - fi - - if ! command -v redis-cli &> /dev/null; then - print_info "Redis not found, installing..." 
- install_redis - fi - - # Install additional dependencies - install_dependencies - - # Setup project - setup_project - - # Setup security - setup_security - - # Test installation - test_installation - - # Show next steps - show_next_steps -} - -# Handle command line arguments -case "${1:-setup}" in - "setup") - main - ;; - "deps") - install_dependencies - ;; - "test") - test_installation - ;; - "help"|"-h"|"--help") - echo "Automatic Setup Script" - echo "Usage: $0 {setup|deps|test|help}" - echo "" - echo "Commands:" - echo " setup - Run full automatic setup" - echo " deps - Install dependencies only" - echo " test - Test installation" - echo " help - Show this help" - ;; - *) - print_error "Unknown command: $1" - echo "Use '$0 help' for usage information" - exit 1 - ;; -esac diff --git a/scripts/legacy/quick_start.sh b/scripts/legacy/quick_start.sh deleted file mode 100755 index 700212c..0000000 --- a/scripts/legacy/quick_start.sh +++ /dev/null @@ -1,314 +0,0 @@ -#!/usr/bin/env bash - -# Fetch ML Quick Start Script with Security -# Sets up development environment with security features and creates test user - -set -euo pipefail - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -print_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -print_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -print_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -print_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -check_prerequisites() { - print_info "Checking prerequisites..." - - # Check Go - if ! command -v go &> /dev/null; then - print_error "Go is not installed. Please install Go 1.25 or later." - exit 1 - fi - - local go_version=$(go version | awk '{print $3}' | sed 's/go//') - print_info "Go version: $go_version" - - # Check Zig - if ! command -v zig &> /dev/null; then - print_warning "Zig is not installed. CLI features will not be available." 
- else - local zig_version=$(zig version) - print_info "Zig version: $zig_version" - fi - - # Check Docker - if ! command -v docker &> /dev/null; then - print_warning "Docker is not installed. Container features will not work." - fi - - # Check Redis - if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then - print_warning "Redis is not installed. Starting local Redis..." - fi - - # Check OpenSSL for certificates - if ! command -v openssl &> /dev/null; then - print_warning "OpenSSL is not installed. TLS certificates will not be generated." - fi - - print_success "Prerequisites checked" -} - -setup_project() { - print_info "Setting up Fetch ML project..." - - # Create directories - mkdir -p bin - mkdir -p data - mkdir -p logs - mkdir -p db - mkdir -p ssl - mkdir -p configs - - print_success "Project directories created" -} - -build_project() { - print_info "Building Fetch ML..." - - # Build Go binaries - make build - - # Build Zig CLI if available - if command -v zig &> /dev/null; then - make cli-build - print_success "Zig CLI built" - fi - - print_success "Build completed" -} - -generate_ssl_certificates() { - print_info "Generating SSL certificates..." - - if command -v openssl &> /dev/null; then - # Generate self-signed certificate for development - openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \ - -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \ - -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || { - print_warning "Failed to generate SSL certificates" - return 1 - } - - print_success "SSL certificates generated in ssl/" - print_info "Certificates are self-signed (development only)" - else - print_warning "OpenSSL not available, skipping SSL certificates" - fi -} - -setup_redis() { - print_info "Setting up Redis..." - - if command -v redis-server &> /dev/null; then - if ! 
pgrep -f "redis-server" > /dev/null; then - redis-server --daemonize yes --port 6379 - print_success "Redis started" - else - print_info "Redis already running" - fi - else - print_warning "Redis not available, some features may be limited" - fi -} - -create_secure_config() { - print_info "Creating secure development configuration..." - - # Generate secure passwords and secrets - local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123") - local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234") - - # Create development config - cat > configs/config.yaml << EOF -base_path: "/data/ml-experiments" - -auth: - enabled: true - api_keys: - test_user: - hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)" - admin: true - roles: ["data_scientist", "admin"] - permissions: - read: true - write: true - delete: true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "./ssl/cert.pem" - key_file: "./ssl/key.pem" - min_version: "1.3" - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "10.0.0.0/8" - - "192.168.0.0/16" - - "172.16.0.0/12" - failed_login_lockout: - enabled: true - max_attempts: 5 - lockout_duration: "15m" - -redis: - url: "redis://localhost:6379" - password: "${redis_password}" - -logging: - level: "info" - file: "logs/fetch_ml.log" - audit_log: "logs/audit.log" -EOF - - # Create environment file - cat > .env.dev << EOF -# Development environment variables -REDIS_PASSWORD=${redis_password} -JWT_SECRET=${jwt_secret} -GRAFANA_USER=admin -GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password") -EOF - - print_success "Secure configuration created" - print_warning "Using development certificates and passwords" -} - -create_test_user() { - print_info "Creating test user..." 
- - # Generate API key for test user - local api_key="dev_test_api_key_12345" - local api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1) - - print_success "Test user created successfully" - echo "Username: test_user" - echo "API Key: $api_key" - echo "API Key Hash: $api_key_hash" - echo "Store this key safely!" - echo "" - echo "Environment variables in .env.dev" - echo "Run: source .env.dev" -} - -test_setup() { - print_info "Testing setup..." - - # Test Go binaries - if [[ -f "bin/api-server" ]]; then - ./bin/api-server --help > /dev/null 2>&1 || true - print_success "API server binary OK" - fi - - if [[ -f "bin/worker" ]]; then - ./bin/worker --help > /dev/null 2>&1 || true - print_success "Worker binary OK" - fi - - # Test Zig CLI - if [[ -f "cli/zig-out/bin/ml" ]]; then - ./cli/zig-out/bin/ml --help > /dev/null 2>&1 || true - print_success "Zig CLI binary OK" - fi - - # Test Redis connection - if command -v redis-cli &> /dev/null; then - if redis-cli ping > /dev/null 2>&1; then - print_success "Redis connection OK" - else - print_warning "Redis not responding" - fi - fi - - # Test SSL certificates - if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then - if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then - print_success "SSL certificates valid" - else - print_warning "SSL certificates expired or invalid" - fi - fi -} - -show_next_steps() { - print_success "Secure quick start completed!" - echo - echo "Next steps:" - echo "1. Load environment variables:" - echo " source .env.dev" - echo - echo "2. Start API server:" - echo " ./bin/api-server -config configs/config.yaml" - echo - echo "3. Test Zig CLI:" - echo " ./cli/zig-out/bin/ml --help" - echo - echo "4. Test with curl (HTTPS):" - echo " curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health" - echo - echo "5. 
Deploy with Docker:" - echo " docker-compose up -d" - echo - echo "Features Enabled:" - echo " ✅ HTTPS/TLS encryption" - echo " ✅ API key authentication" - echo " ✅ Rate limiting" - echo " ✅ IP whitelisting" - echo " ✅ Security headers" - echo " ✅ Audit logging" - echo - echo "Configuration Files:" - echo " configs/config.yaml # Main configuration" - echo " .env.dev # Environment variables" - echo " ssl/cert.pem, ssl/key.pem # TLS certificates" - echo - echo "Documentation:" - echo " docs/DEPLOYMENT.md # Deployment guide" - echo "" - print_success "Ready to run ML experiments!" -} - -# Main function -main() { - echo "Fetch ML Quick Start Script (with Security & Zig CLI)" - echo "====================================================" - echo "" - - check_prerequisites - setup_project - build_project - generate_ssl_certificates - setup_redis - create_secure_config - create_test_user - test_setup - show_next_steps -} - -# Run main function -main "$@" diff --git a/scripts/legacy/setup_common.sh b/scripts/legacy/setup_common.sh deleted file mode 100755 index 54040ea..0000000 --- a/scripts/legacy/setup_common.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash - -# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky) -set -euo pipefail - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -# Configuration defaults -FETCH_ML_USER="fetchml" -FETCH_ML_HOME="/opt/fetchml" -SERVICE_DIR="/etc/systemd/system" -LOG_DIR="/var/log/fetchml" -DATA_DIR="/var/lib/fetchml" -CONFIG_DIR="$FETCH_ML_HOME/configs" - -log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } -log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } -log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -# Download file with checksum verification -# Args: url, checksum, dest -secure_download() { - local url="$1" checksum="$2" dest="$3" - curl -fsSL "$url" -o "$dest" - echo "$checksum $dest" | sha256sum 
--check --status || { - log_error "Checksum verification failed for $dest" - rm -f "$dest" - exit 1 - } -} - -cleanup_temp() { - if [[ -n "${TMP_FILES:-}" ]]; then - rm -f $TMP_FILES || true - fi -} -trap cleanup_temp EXIT - -ensure_user() { - if ! id "$FETCH_ML_USER" &>/dev/null; then - useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER" - fi - usermod -aG podman "$FETCH_ML_USER" || true -} - -create_directories() { - mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" "$FETCH_ML_HOME/bin" "$CONFIG_DIR" - chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" -} - -setup_systemd_service() { - local name="$1" exec="$2" - cat > "$SERVICE_DIR/${name}.service" < /etc/logrotate.d/fetch_ml <<'EOF' -/var/log/fetchml/*.log { - daily - missingok - rotate 14 - compress - delaycompress - notifempty - create 0640 fetchml fetchml -} -EOF -} - -hardening_steps() { - # Increase file limits - if ! grep -q fetchml /etc/security/limits.conf; then - cat >> /etc/security/limits.conf <<'EOF' -fetchml soft nofile 65536 -fetchml hard nofile 65536 -EOF - fi - - # Enable unattended security upgrades if available - if command -v apt-get &>/dev/null; then - apt-get install -y unattended-upgrades >/dev/null || true - elif command -v dnf &>/dev/null; then - dnf install -y dnf-automatic >/dev/null || true - fi -} - -selinux_guidance() { - if command -v getenforce &>/dev/null; then - local mode=$(getenforce) - log_info "SELinux mode: $mode" - if [[ "$mode" == "Enforcing" ]]; then - log_info "Ensure systemd units and directories have proper contexts. 
Example:" - echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'" - echo " restorecon -Rv $FETCH_ML_HOME/bin" - fi - fi -} diff --git a/scripts/legacy/setup_rocky.sh b/scripts/legacy/setup_rocky.sh deleted file mode 100755 index 6a5205b..0000000 --- a/scripts/legacy/setup_rocky.sh +++ /dev/null @@ -1,417 +0,0 @@ -#!/usr/bin/env bash - -# Fetch ML Rocky Linux Setup Script -# Optimized for ML experiments on Rocky Linux 8/9 - -set -euo pipefail - -# shellcheck source=scripts/setup_common.sh -SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -source "$SCRIPT_DIR/setup_common.sh" - -check_root() { - if [[ $EUID -ne 0 ]]; then - log_error "This script must be run as root" - exit 1 - fi -} - -check_rocky() { - if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then - log_error "This script is designed for Rocky Linux systems" - exit 1 - fi - - local rocky_version=$(cat /etc/rocky-release | grep -oE '[0-9]+\.[0-9]+') - log_info "Rocky Linux version: $rocky_version" - - # Use dnf for Rocky 9+, yum for Rocky 8 - if command -v dnf &> /dev/null; then - PKG_MANAGER="dnf" - else - PKG_MANAGER="yum" - fi -} - -update_system() { - log_info "Updating system packages..." - $PKG_MANAGER update -y - $PKG_MANAGER upgrade -y - $PKG_MANAGER install -y curl wget gnupg2 -} - -enable_epel() { - log_info "Enabling EPEL repository..." - - if $PKG_MANAGER repolist | grep -q "epel"; then - log_info "EPEL already enabled" - return - fi - - $PKG_MANAGER install -y epel-release - $PKG_MANAGER config-manager --set-enabled powertools - - log_success "EPEL repository enabled" -} - -install_go() { - log_info "Installing Go 1.25..." 
- - if command -v go &> /dev/null; then - local go_version=$(go version | awk '{print $3}' | sed 's/go//') - log_info "Go already installed: $go_version" - return - fi - - cd /tmp - TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz" - secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "/tmp/go1.25.0.linux-amd64.tar.gz" - tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz - - # Add to PATH - echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile - echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile - export PATH=$PATH:/usr/local/go/bin - - log_success "Go 1.25 installed" -} - -install_podman() { - log_info "Installing Podman..." - - if command -v podman &> /dev/null; then - log_info "Podman already installed" - return - fi - - # Install Podman and related tools - $PKG_MANAGER install -y podman podman-compose containernetworking-plugins - - # Configure Podman - mkdir -p /etc/containers - cat > /etc/containers/containers.conf << EOF -[containers] -user_namespace_enable = 1 -runtime = "crun" - -[network] -network_backend = "netavark" - -[engine] -cgroup_manager = "systemd" -EOF - - # Enable user namespaces - echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf - sysctl -p user.max_user_namespaces=15000 - - log_success "Podman installed" -} - -install_redis() { - log_info "Installing Redis..." - - if command -v redis-server &> /dev/null; then - log_info "Redis already installed" - return - fi - - $PKG_MANAGER install -y redis - - # Configure Redis for production - sed -i 's/supervised no/supervised systemd/' /etc/redis.conf - sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis.conf - - systemctl enable redis - systemctl start redis - - log_success "Redis installed and configured" -} - -install_nvidia_drivers() { - log_info "Checking for NVIDIA GPU..." 
- - if command -v nvidia-smi &> /dev/null; then - log_info "NVIDIA drivers already installed" - nvidia-smi - return - fi - - if lspci | grep -i nvidia &> /dev/null; then - log_info "NVIDIA GPU detected, installing drivers..." - - # Enable NVIDIA repository - $PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo - - # Clean and install - $PKG_MANAGER clean all - $PKG_MANAGER module enable -y nvidia-driver:latest-dkms - $PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit - - # Configure Podman for NVIDIA (only if needed) - if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then - log_warning "NVIDIA GPU access test failed, you may need to reboot" - else - log_success "NVIDIA drivers installed and GPU access verified" - fi - - # Reboot required - log_warning "System reboot required for NVIDIA drivers" - log_info "Run: reboot" - else - log_info "No NVIDIA GPU detected, skipping driver installation" - fi -} - -install_ml_tools() { - log_info "Installing ML tools and dependencies..." - - # Python and ML packages - $PKG_MANAGER install -y python3 python3-pip python3-devel - - # System dependencies for ML - $PKG_MANAGER groupinstall -y "Development Tools" - $PKG_MANAGER install -y cmake git pkgconfig - $PKG_MANAGER install -y libjpeg-turbo-devel libpng-devel libtiff-devel - $PKG_MANAGER install -y mesa-libGL-devel mesa-libGLU-devel - $PKG_MANAGER install -y gtk3-devel - $PKG_MANAGER install -y atlas-devel blas-devel lapack-devel - - # Install common ML libraries - pip3 install --upgrade pip - pip3 install numpy scipy scikit-learn pandas - pip3 install jupyter matplotlib seaborn - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - - log_success "ML tools installed" -} - -create_user() { - log_info "Creating fetchml user..." 
- - if id "$FETCH_ML_USER" &>/dev/null; then - log_info "User $FETCH_ML_USER already exists" - return - fi - - useradd -m -d $FETCH_ML_HOME -s /bin/bash $FETCH_ML_USER - usermod -aG podman $FETCH_ML_USER - - # Create directories - mkdir -p $FETCH_ML_HOME/.config/containers - mkdir -p $FETCH_ML_HOME/go/bin - mkdir -p $LOG_DIR - mkdir -p $DATA_DIR - - chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME - chown -R $FETCH_ML_USER:$FETCH_ML_USER $LOG_DIR - chown -R $FETCH_ML_USER:$FETCH_ML_USER $DATA_DIR - - log_success "User $FETCH_ML_USER created" -} - -setup_firewall() { - log_info "Configuring firewall..." - - if command -v firewall-cmd &> /dev/null; then - systemctl enable firewalld - systemctl start firewalld - - firewall-cmd --permanent --add-service=ssh - firewall-cmd --permanent --add-port=8080/tcp # Worker API - firewall-cmd --permanent --add-port=8081/tcp # Data manager API - firewall-cmd --permanent --add-port=6379/tcp # Redis - firewall-cmd --reload - - firewall-cmd --list-all - else - log_warning "Firewalld not available, skipping firewall configuration" - fi -} - -setup_systemd_services() { - log_info "Setting up systemd services..." 
- - # Fetch ML Worker service - cat > $SERVICE_DIR/fetch_ml_worker.service << EOF -[Unit] -Description=Fetch ML Worker Service -After=network.target redis.service -Wants=redis.service - -[Service] -Type=simple -User=$FETCH_ML_USER -Group=$FETCH_ML_USER -WorkingDirectory=$FETCH_ML_HOME -Environment=FETCH_ML_HOME=$FETCH_ML_HOME -Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin -ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml -Restart=always -RestartSec=5 -StandardOutput=journal -StandardError=journal -SyslogIdentifier=fetch_ml_worker - -[Install] -WantedBy=multi-user.target -EOF - - # Fetch ML Data Manager service - cat > $SERVICE_DIR/fetch_ml_data_manager.service << EOF -[Unit] -Description=Fetch ML Data Manager Service -After=network.target redis.service -Wants=redis.service - -[Service] -Type=simple -User=$FETCH_ML_USER -Group=$FETCH_ML_USER -WorkingDirectory=$FETCH_ML_HOME -Environment=FETCH_ML_HOME=$FETCH_ML_HOME -Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin -ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml -Restart=always -RestartSec=5 -StandardOutput=journal -StandardError=journal -SyslogIdentifier=fetch_ml_data_manager - -[Install] -WantedBy=multi-user.target -EOF - - # Enable services - systemctl daemon-reload - systemctl enable fetch_ml_worker - systemctl enable fetch_ml_data_manager - - log_success "Systemd services configured" -} - -setup_log_rotation() { - log_info "Setting up log rotation..." - - cat > /etc/logrotate.d/fetch_ml << EOF -$LOG_DIR/*.log { - daily - missingok - rotate 30 - compress - delaycompress - notifempty - create 0644 $FETCH_ML_USER $FETCH_ML_USER - postrotate - systemctl reload fetch_ml_worker || true - systemctl reload fetch_ml_data_manager || true - endscript -} -EOF - - log_success "Log rotation configured" -} - -optimize_system() { - log_info "Optimizing system for ML workloads..." 
- - # Increase file limits - echo "* soft nofile 65536" >> /etc/security/limits.conf - echo "* hard nofile 65536" >> /etc/security/limits.conf - - # Optimize kernel parameters for ML - cat >> /etc/sysctl.conf << EOF -# ML Optimization -net.core.rmem_max = 134217728 -net.core.wmem_max = 134217728 -vm.swappiness = 10 -vm.dirty_ratio = 15 -vm.dirty_background_ratio = 5 -EOF - - sysctl -p - - # Configure GPU persistence mode if NVIDIA available - if command -v nvidia-smi &> /dev/null; then - nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode" - fi - - # Disable SELinux for better container compatibility (optional) - if [[ -f /etc/selinux/config ]]; then - log_warning "Consider setting SELinux to permissive mode for better container compatibility" - log_info "Edit /etc/selinux/config and set SELINUX=permissive" - fi - - log_success "System optimized for ML workloads" -} - -install_fetch_ml() { - log_info "Installing Fetch ML..." - - # Clone or copy Fetch ML - cd $FETCH_ML_HOME - - if [[ ! -d "fetch_ml" ]]; then - log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml" - log_info "Example: git clone https://github.com/your-org/fetch_ml.git" - return - fi - - cd fetch_ml - - # Build - export PATH=$PATH:/usr/local/go/bin - make build - - # Copy binaries - cp bin/* $FETCH_ML_HOME/bin/ - chmod +x $FETCH_ML_HOME/bin/* - - # Copy configs - mkdir -p $FETCH_ML_HOME/configs - cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml - - # Set permissions - chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME - - log_success "Fetch ML installed" -} - -main() { - log_info "Starting Fetch ML Rocky Linux server setup..." 
- - check_root - check_rocky - - update_system - enable_epel - install_go - install_podman - install_redis - install_nvidia_drivers - install_ml_tools - ensure_user - create_directories - setup_firewall - setup_systemd_services - setup_logrotate - hardening_steps - selinux_guidance - install_fetch_ml - - log_success "Fetch ML setup complete!" - echo - log_info "Next steps:" - echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml" - echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml" - echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager" - echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager" - echo "5. View logs: journalctl -u fetch_ml_worker -f" - echo - log_info "Services will be available at:" - echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080" - echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081" -} - -# Run main function -main "$@" diff --git a/scripts/legacy/setup_ubuntu.sh b/scripts/legacy/setup_ubuntu.sh deleted file mode 100755 index 2a112ef..0000000 --- a/scripts/legacy/setup_ubuntu.sh +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env bash - -# Fetch ML Ubuntu Server Setup Script -# Optimized for ML experiments on Ubuntu 20.04/22.04 - -set -euo pipefail - -# shellcheck source=scripts/setup_common.sh -SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -source "$SCRIPT_DIR/setup_common.sh" - -check_root() { - if [[ $EUID -ne 0 ]]; then - log_error "This script must be run as root" - exit 1 - fi -} - -check_ubuntu() { - if ! 
command -v apt-get &> /dev/null; then - log_error "This script is designed for Ubuntu systems" - exit 1 - fi - - local ubuntu_version=$(lsb_release -rs) - log_info "Ubuntu version: $ubuntu_version" - - if (( $(echo "$ubuntu_version < 20.04" | bc -l) == 1 )); then - log_warning "Ubuntu version < 20.04 may not support all features" - fi -} - -update_system() { - log_info "Updating system packages..." - apt-get update -y - apt-get upgrade -y - apt-get install -y curl wget gnupg lsb-release software-properties-common -} - -install_go() { - log_info "Installing Go 1.25..." - - if command -v go &> /dev/null; then - local go_version=$(go version | awk '{print $3}' | sed 's/go//') - log_info "Go already installed: $go_version" - return - fi - - cd /tmp - TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz" - secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "/tmp/go1.25.0.linux-amd64.tar.gz" - tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz - - # Add to PATH - echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile - echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile - export PATH=$PATH:/usr/local/go/bin - - log_success "Go 1.25 installed" -} - -install_podman() { - log_info "Installing Podman..." 
- - if command -v podman &> /dev/null; then - log_info "Podman already installed" - return - fi - - # Add official Podman repository - echo "deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list - curl -L "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/Release.key" | apt-key add - - - apt-get update -y - apt-get install -y podman podman-compose - - # Configure Podman for rootless operation - echo "user_namespace_enable = 1" >> /etc/containers/containers.conf - echo "runtime = \"crun\"" >> /etc/containers/containers.conf - - log_success "Podman installed" -} - -install_redis() { - log_info "Installing Redis..." - - if command -v redis-server &> /dev/null; then - log_info "Redis already installed" - return - fi - - apt-get install -y redis-server - - # Configure Redis for production - sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf - sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf - - systemctl enable redis-server - systemctl start redis-server - - log_success "Redis installed and configured" -} - -install_nvidia_drivers() { - log_info "Checking for NVIDIA GPU..." - - if command -v nvidia-smi &> /dev/null; then - log_info "NVIDIA drivers already installed" - nvidia-smi - return - fi - - if lspci | grep -i nvidia &> /dev/null; then - log_info "NVIDIA GPU detected, installing drivers..." - - # Add NVIDIA repository - TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb" - secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. 
-f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb" - dpkg -i /tmp/cuda-keyring_1.1-1_all.deb - apt-get update -y - - # Install drivers - apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit - - # Configure Podman for NVIDIA (only if needed) - if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then - log_warning "NVIDIA GPU access test failed, you may need to reboot" - else - log_success "NVIDIA drivers installed and GPU access verified" - fi - - else - log_info "No NVIDIA GPU detected, skipping driver installation" - fi -} - -install_ml_tools() { - log_info "Installing ML tools and dependencies..." - - # Python and ML packages - apt-get install -y python3 python3-pip python3-venv - - # System dependencies for ML - apt-get install -y build-essential cmake git pkg-config - apt-get install -y libjpeg-dev libpng-dev libtiff-dev - apt-get install -y libavcodec-dev libavformat-dev libswscale-dev - apt-get install -y libgtk2.0-dev libcanberra-gtk-module - apt-get install -y libxvidcore-dev libx264-dev - apt-get install -y libatlas-base-dev gfortran - - # Install common ML libraries - pip3 install --upgrade pip - pip3 install numpy scipy scikit-learn pandas - pip3 install jupyter matplotlib seaborn - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - - log_success "ML tools installed" -} - -create_user() { - log_info "Creating fetchml user..." - ensure_user - create_directories - log_success "User $FETCH_ML_USER and directories created" -} - -setup_firewall() { - log_info "Configuring firewall..." 
- - if command -v ufw &> /dev/null; then - ufw --force enable - ufw allow ssh - ufw allow 8080/tcp # Worker API - ufw allow 8081/tcp # Data manager API - ufw allow 6379/tcp # Redis - ufw status - else - log_warning "UFW not available, skipping firewall configuration" - fi -} - -setup_systemd_services() { - log_info "Setting up systemd services..." - - setup_systemd_service "fetch_ml_worker" "$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml" - setup_systemd_service "fetch_ml_data_manager" "$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml" - - # Enable services - systemctl daemon-reload - systemctl enable fetch_ml_worker - systemctl enable fetch_ml_data_manager - - log_success "Systemd services configured" -} - -setup_log_rotation() { - log_info "Setting up log rotation..." - setup_logrotate - log_success "Log rotation configured" -} - -optimize_system() { - log_info "Optimizing system for ML workloads..." - hardening_steps - - # Optimize kernel parameters for ML - cat >> /etc/sysctl.conf << EOF -# ML Optimization -net.core.rmem_max = 134217728 -net.core.wmem_max = 134217728 -vm.swappiness = 10 -vm.dirty_ratio = 15 -vm.dirty_background_ratio = 5 -EOF - - sysctl -p - - # Configure GPU persistence mode if NVIDIA available - if command -v nvidia-smi &> /dev/null; then - nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode" - fi - - log_success "System optimized for ML workloads" -} - -install_fetch_ml() { - log_info "Installing Fetch ML..." - - # Clone or copy Fetch ML - cd $FETCH_ML_HOME - - if [[ ! 
-d "fetch_ml" ]]; then - # This would be replaced with actual repository URL - log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml" - log_info "Example: git clone https://github.com/your-org/fetch_ml.git" - return - fi - - cd fetch_ml - - # Build - export PATH=$PATH:/usr/local/go/bin - make build - - # Copy binaries - cp bin/* $FETCH_ML_HOME/bin/ - chmod +x $FETCH_ML_HOME/bin/* - - # Copy configs - mkdir -p $FETCH_ML_HOME/configs - cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml - - # Set permissions - chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME - - log_success "Fetch ML installed" -} - -main() { - log_info "Starting Fetch ML Ubuntu server setup..." - - check_root - check_ubuntu - - update_system - install_go - install_podman - install_redis - install_nvidia_drivers - install_ml_tools - ensure_user - create_directories - setup_firewall - setup_systemd_services - setup_logrotate - hardening_steps - install_fetch_ml - - log_success "Fetch ML setup complete!" - echo - log_info "Next steps:" - echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml" - echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml" - echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager" - echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager" - echo "5. 
View logs: journalctl -u fetch_ml_worker -f" - echo - log_info "Services will be available at:" - echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080" - echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081" -} - -# Run main function -main "$@" diff --git a/scripts/legacy/test_tools.sh b/scripts/legacy/test_tools.sh deleted file mode 100755 index efd1cfb..0000000 --- a/scripts/legacy/test_tools.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -set -e - -echo "=== Test Tools Harness ===" - -# Function to check if Redis is running, start temporary instance if needed -ensure_redis() { - if ! redis-cli ping >/dev/null 2>&1; then - echo "Starting temporary Redis instance..." - redis-server --daemonize yes --port 6379 - sleep 2 - if ! redis-cli ping >/dev/null 2>&1; then - echo "Failed to start Redis" - exit 1 - fi - echo "Redis started successfully" - # Set up cleanup trap - trap 'echo "Stopping temporary Redis..."; redis-cli shutdown || true' EXIT - else - echo "Redis is already running" - fi -} - -# Step 1: Build Go binaries -echo "Building Go binaries..." -go build -o bin/api-server ./cmd/api-server -go build -o bin/worker ./cmd/worker -go build -o bin/data_manager ./cmd/data_manager -go build -o bin/user_manager ./cmd/user_manager - -# Step 2: Build Zig CLI -echo "Building Zig CLI..." -cd cli -zig build -cd .. - -# Step 3: Ensure Redis is running -ensure_redis - -# Step 4: Run Go tests -echo "Running Go tests..." -go test ./... - -# Step 5: Run Zig tests -echo "Running Zig CLI tests..." -cd cli -zig test -cd .. - -# Step 6: Run Go E2E tests (Redis is already available) -echo "Running Go E2E tests..." -go test ./tests/e2e/... - -# Step 7: Smoke test API server and CLI -echo "Running smoke test..." -# Start API server in background on different port -./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 & -API_PID=$! 
-sleep 2 - -# Test CLI status -./cli/zig-out/bin/ml status -server http://localhost:19101 - -# Clean up -kill $API_PID 2>/dev/null || true - -echo "=== All tests completed successfully ===" diff --git a/scripts/maintenance/auto-cleanup.service b/scripts/maintenance/auto-cleanup.service index f6694c9..1ac2208 100644 --- a/scripts/maintenance/auto-cleanup.service +++ b/scripts/maintenance/auto-cleanup.service @@ -5,7 +5,7 @@ Requires=docker.service [Service] Type=oneshot -ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/cleanup.sh --force +ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/maintenance/cleanup.sh --dry-run User=jfraeys Group=staff StandardOutput=journal diff --git a/scripts/maintenance/cleanup-benchmarks.sh b/scripts/maintenance/cleanup-benchmarks.sh index 3ecdf6b..451a025 100755 --- a/scripts/maintenance/cleanup-benchmarks.sh +++ b/scripts/maintenance/cleanup-benchmarks.sh @@ -8,6 +8,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" +ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive" # Colors for output RED='\033[0;31m' @@ -43,22 +44,34 @@ cleanup_benchmark_artifacts() { case "${1:-keep-10}" in "all") - print_status "Removing ALL benchmark artifacts..." - rm -rf "$LOCAL_ARTIFACTS_DIR" - print_success "Removed all artifacts (was $size_before)" + print_status "Archiving ALL benchmark artifacts..." + local stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" + mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true + print_success "Archived all artifacts (was $size_before)" ;; "keep-5") - print_status "Keeping last 5 runs, removing older ones..." + print_status "Keeping last 5 runs, archiving older ones..." 
+ local stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" cd "$LOCAL_ARTIFACTS_DIR" - ls -1t run_* 2>/dev/null | tail -n +6 | xargs rm -rf 2>/dev/null || true + ls -1t run_* 2>/dev/null | tail -n +6 | while read -r run; do + [ -n "$run" ] || continue + mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true + done local count_after=$(ls -1d run_* 2>/dev/null | wc -l) local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B") print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)" ;; "keep-10") - print_status "Keeping last 10 runs, removing older ones..." + print_status "Keeping last 10 runs, archiving older ones..." + local stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" cd "$LOCAL_ARTIFACTS_DIR" - ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || true + ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do + [ -n "$run" ] || continue + mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true + done local count_after=$(ls -1d run_* 2>/dev/null | wc -l) local size_after=$(du -sh . 
2>/dev/null | cut -f1 || echo "0B") print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)" @@ -80,12 +93,18 @@ cleanup_temp_files() { # Clean temp directories local temp_cleaned=0 + local stamp=$(date -u +%Y%m%d-%H%M%S) + local tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp" + mkdir -p "$tmp_archive_dir" + # /tmp cleanup if [ -d "/tmp" ]; then local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l) if [ "$tmp_files" -gt 0 ]; then - find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true - print_success "Cleaned $tmp_files temporary files from /tmp" + find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do + mv "$f" "$tmp_archive_dir/" 2>/dev/null || true + done + print_success "Archived $tmp_files temporary files from /tmp" temp_cleaned=$((temp_cleaned + tmp_files)) fi fi @@ -94,8 +113,10 @@ cleanup_temp_files() { if [ -d "/var/tmp" ]; then local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l) if [ "$vartmp_files" -gt 0 ]; then - find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true - print_success "Cleaned $vartmp_files temporary files from /var/tmp" + find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do + mv "$f" "$tmp_archive_dir/" 2>/dev/null || true + done + print_success "Archived $vartmp_files temporary files from /var/tmp" temp_cleaned=$((temp_cleaned + vartmp_files)) fi fi @@ -104,8 +125,10 @@ cleanup_temp_files() { if [ -d "$HOME/tmp" ]; then local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l) if [ "$user_tmp_files" -gt 0 ]; then - find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true - print_success "Cleaned $user_tmp_files temporary files from ~/tmp" + find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r 
-d '' f; do + mv "$f" "$tmp_archive_dir/" 2>/dev/null || true + done + print_success "Archived $user_tmp_files temporary files from ~/tmp" temp_cleaned=$((temp_cleaned + user_tmp_files)) fi fi @@ -177,9 +200,16 @@ cleanup_logs() { for log_dir in "${log_dirs[@]}"; do if [ -d "$log_dir" ]; then local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B") - # Remove log files older than 7 days - find "$log_dir" -name "*.log" -type f -mtime +7 -delete 2>/dev/null || true - find "$log_dir" -name "*.log.*" -type f -mtime +7 -delete 2>/dev/null || true + local stamp=$(date -u +%Y%m%d-%H%M%S) + local log_archive_dir="$log_dir/archive/$stamp" + mkdir -p "$log_archive_dir" + # Move log files older than 7 days to archive + find "$log_dir" -name "*.log" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do + mv "$f" "$log_archive_dir/" 2>/dev/null || true + done + find "$log_dir" -name "*.log.*" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do + mv "$f" "$log_archive_dir/" 2>/dev/null || true + done local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B") if [ "$log_size_before" != "$log_size_after" ]; then print_success "Cleaned old logs in $log_dir: $log_size_before → $log_size_after" diff --git a/scripts/maintenance/cleanup.sh b/scripts/maintenance/cleanup.sh index ed7dbd5..ae0209c 100755 --- a/scripts/maintenance/cleanup.sh +++ b/scripts/maintenance/cleanup.sh @@ -144,12 +144,12 @@ else log_info "No running containers found" fi -# Remove containers + # Remove containers log_info "Removing containers..." containers=$(docker ps -aq --filter "name=ml-") if [ -n "$containers" ]; then if [ "$DRY_RUN" = false ]; then - echo "$containers" | xargs docker rm -f + echo "$containers" | xargs docker rm log_success "Containers removed" fi else @@ -168,9 +168,9 @@ else log_info "No networks found" fi -# Remove volumes (with caution) -log_warning "Removing volumes (this will delete data)..." 
-if [ "$FORCE" = true ] || [ "$ALL" = true ]; then + # Remove volumes (with caution) +log_warning "Skipping volumes by default (use --all to remove them)" +if [ "$ALL" = true ]; then volumes=$(docker volume ls -q --filter "name=ml-") if [ -n "$volumes" ]; then if [ "$DRY_RUN" = false ]; then @@ -181,16 +181,16 @@ if [ "$FORCE" = true ] || [ "$ALL" = true ]; then log_info "No volumes found" fi else - log_info "Skipping volumes (use --force or --all to remove them)" + log_info "Skipping volumes" fi -# Remove images if requested + # Remove images if requested if [ "$ALL" = true ]; then log_info "Removing images..." images=$(docker images -q --filter "reference=fetch_ml-*") if [ -n "$images" ]; then if [ "$DRY_RUN" = false ]; then - echo "$images" | xargs docker rmi -f + echo "$images" | xargs docker rmi log_success "Images removed" fi else @@ -200,11 +200,15 @@ else log_info "Skipping images (use --all to remove them)" fi -# General Docker cleanup -log_info "Running general Docker cleanup..." -if [ "$DRY_RUN" = false ]; then - docker system prune -f - log_success "General cleanup completed" + # General Docker cleanup +if [ "$ALL" = true ]; then + log_info "Running general Docker cleanup (docker system prune)..." 
+ if [ "$DRY_RUN" = false ]; then + docker system prune -f + log_success "General cleanup completed" + fi +else + log_info "Skipping docker system prune (use --all to enable)" fi # Show final state diff --git a/scripts/manage-artifacts.sh b/scripts/manage-artifacts.sh index c16d32f..35f1aea 100755 --- a/scripts/manage-artifacts.sh +++ b/scripts/manage-artifacts.sh @@ -8,6 +8,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" +ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive" # Create artifacts directory if it doesn't exist mkdir -p "$LOCAL_ARTIFACTS_DIR" @@ -41,17 +42,21 @@ case "${1:-help}" in echo "=== Cleaning Artifacts ===" case "${2:-all}" in "all") - echo "Removing all artifacts..." - rm -rf "$LOCAL_ARTIFACTS_DIR" - echo "All artifacts removed" + echo "Archiving all artifacts..." + stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" + mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true + echo "All artifacts archived" ;; "old") keep_count="${3:-10}" - echo "Keeping last $keep_count runs, removing older ones..." + echo "Keeping last $keep_count runs, archiving older ones..." 
+ stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" cd "$LOCAL_ARTIFACTS_DIR" ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do - echo "Removing: $run" - rm -rf "$run" + echo "Archiving: $run" + mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true done ;; "run") @@ -64,8 +69,10 @@ case "${1:-help}" in fi run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id" if [ -d "$run_dir" ]; then - echo "Removing run: $run_id" - rm -rf "$run_dir" + echo "Archiving run: $run_id" + stamp=$(date -u +%Y%m%d-%H%M%S) + mkdir -p "$ARCHIVE_DIR/$stamp" + mv "$run_dir" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true else echo "Run not found: $run_id" fi diff --git a/scripts/setup-auto-cleanup.sh b/scripts/setup-auto-cleanup.sh deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/setup-secure-homelab.sh b/scripts/setup-secure-homelab.sh deleted file mode 100755 index 3eb6dd6..0000000 --- a/scripts/setup-secure-homelab.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash - -# Secure Homelab Setup Script for Fetch ML -# This script generates secure API keys and TLS certificates - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -CONFIG_DIR="$PROJECT_ROOT/configs/environments" -SSL_DIR="$PROJECT_ROOT/ssl" - -echo "🔒 Setting up secure homelab configuration..." - -# Create SSL directory -mkdir -p "$SSL_DIR" - -# Generate TLS certificates -echo "📜 Generating TLS certificates..." -if [[ ! -f "$SSL_DIR/cert.pem" ]] || [[ ! 
-f "$SSL_DIR/key.pem" ]]; then - openssl req -x509 -newkey rsa:4096 -keyout "$SSL_DIR/key.pem" -out "$SSL_DIR/cert.pem" -days 365 -nodes \ - -subj "/C=US/ST=Homelab/L=Local/O=FetchML/OU=Homelab/CN=localhost" \ - -addext "subjectAltName=DNS:localhost,DNS:$(hostname),IP:127.0.0.1" - chmod 600 "$SSL_DIR/key.pem" - chmod 644 "$SSL_DIR/cert.pem" - echo "✅ TLS certificates generated in $SSL_DIR/" -else - echo "ℹ️ TLS certificates already exist, skipping generation" -fi - -# Generate secure API keys -echo "🔑 Generating secure API keys..." -generate_api_key() { - openssl rand -hex 32 -} - -# Hash function -hash_key() { - echo -n "$1" | sha256sum | cut -d' ' -f1 -} - -# Generate keys -ADMIN_KEY=$(generate_api_key) -USER_KEY=$(generate_api_key) -ADMIN_HASH=$(hash_key "$ADMIN_KEY") -USER_HASH=$(hash_key "$USER_KEY") - -# Create secure config -echo "⚙️ Creating secure configuration..." -cat > "$CONFIG_DIR/config-homelab-secure.yaml" << EOF -# Secure Homelab Configuration -# IMPORTANT: Keep your API keys safe and never share them! 
- -redis: - url: "redis://localhost:6379" - max_connections: 10 - -auth: - enabled: true - api_keys: - homelab_admin: - hash: $ADMIN_HASH - admin: true - roles: - - admin - permissions: - '*': true - homelab_user: - hash: $USER_HASH - admin: false - roles: - - researcher - permissions: - 'experiments': true - 'datasets': true - 'jupyter': true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "$SSL_DIR/cert.pem" - key_file: "$SSL_DIR/key.pem" - -security: - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "localhost" - - "192.168.1.0/24" # Adjust to your network - - "10.0.0.0/8" - -logging: - level: "info" - file: "logs/fetch_ml.log" - console: true - -resources: - cpu_limit: "2" - memory_limit: "4Gi" - gpu_limit: 0 - disk_limit: "10Gi" - -# Prometheus metrics -metrics: - enabled: true - listen_addr: ":9100" - tls: - enabled: false -EOF - -# Save API keys to a secure file -echo "🔐 Saving API keys..." -cat > "$PROJECT_ROOT/.api-keys" << EOF -# Fetch ML Homelab API Keys -# IMPORTANT: Keep this file secure and never commit to version control! - -ADMIN_API_KEY: $ADMIN_KEY -USER_API_KEY: $USER_KEY - -# Usage examples: -# curl -H "X-API-Key: $ADMIN_KEY" https://localhost:9101/health -# curl -H "X-API-Key: $USER_KEY" https://localhost:9101/api/jupyter/services -EOF - -chmod 600 "$PROJECT_ROOT/.api-keys" - -# Create environment file for JWT secret -JWT_SECRET=$(generate_api_key) -cat > "$PROJECT_ROOT/.env.secure" << EOF -# Secure environment variables for Fetch ML -# IMPORTANT: Keep this file secure and never commit to version control! - -JWT_SECRET=$JWT_SECRET - -# Source this file before running the server: -# source .env.secure -EOF - -chmod 600 "$PROJECT_ROOT/.env.secure" - -# Update .gitignore to exclude sensitive files -echo "📝 Updating .gitignore..." -if ! 
grep -q ".api-keys" "$PROJECT_ROOT/.gitignore"; then - echo -e "\n# Security files\n.api-keys\n.env.secure\nssl/\n*.pem\n*.key" >> "$PROJECT_ROOT/.gitignore" -fi - -echo "" -echo "🎉 Secure homelab setup complete!" -echo "" -echo "📋 Next steps:" -echo "1. Review and adjust the IP whitelist in config-homelab-secure.yaml" -echo "2. Start the server with: ./api-server -config configs/environments/config-homelab-secure.yaml" -echo "3. Source the environment: source .env.secure" -echo "4. Your API keys are saved in .api-keys" -echo "" -echo "🔐 API Keys:" -echo " Admin: $ADMIN_KEY" -echo " User: $USER_KEY" -echo "" -echo "⚠️ IMPORTANT:" -echo " - Never share your API keys" -echo " - Never commit .api-keys or .env.secure to version control" -echo " - Backup your SSL certificates and API keys securely" -echo " - Consider using a password manager for storing keys" diff --git a/scripts/setup.sh b/scripts/setup.sh deleted file mode 100755 index c0c89f7..0000000 --- a/scripts/setup.sh +++ /dev/null @@ -1,311 +0,0 @@ -#!/bin/bash -# setup.sh: One-shot homelab setup (security + core services) -# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity - -set -euo pipefail - -readonly RED='\033[0;31m' -readonly GREEN='\033[0;32m' -readonly YELLOW='\033[1;33m' -readonly BLUE='\033[0;34m' -readonly NC='\033[0m' - -print_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -print_success() { - echo -e "${GREEN}[SUCCESS]${NC} $1" -} - -print_warning() { - echo -e "${YELLOW}[WARNING]${NC} $1" -} - -print_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Simple dependency check -check_deps() { - print_info "Checking dependencies..." - - local missing=() - - if ! command -v go &> /dev/null; then - missing+=("go") - fi - - if ! command -v zig &> /dev/null; then - missing+=("zig") - fi - - if ! command -v redis-server &> /dev/null; then - missing+=("redis-server") - fi - - if ! 
command -v docker &> /dev/null; then - missing+=("docker") - fi - - if [[ ${#missing[@]} -gt 0 ]]; then - print_error "Missing dependencies: ${missing[*]}" - echo "" - echo "Install with:" - echo " macOS: brew install ${missing[*]}" - echo " Ubuntu: sudo apt-get install ${missing[*]}" - exit 1 - fi - - print_success "Dependencies OK" -} - -# Simple setup -setup_project() { - print_info "Setting up project..." - - # Create essential directories - mkdir -p ssl logs configs data monitoring - - # Generate simple SSL cert - if [[ ! -f "ssl/cert.pem" ]]; then - openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \ - -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \ - -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null - print_success "SSL certificates generated" - fi - - # Create balanced config - cat > configs/config.yaml << 'EOF' -base_path: "./data/experiments" - -auth: - enabled: true - api_keys: - homelab_user: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password" - admin: true - roles: ["user", "admin"] - permissions: - read: true - write: true - delete: true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "./ssl/cert.pem" - key_file: "./ssl/key.pem" - -security: - rate_limit: - enabled: true - requests_per_minute: 30 - burst_size: 5 - ip_whitelist: - - "127.0.0.1" - - "::1" - - "192.168.0.0/16" - - "10.0.0.0/8" - - "172.16.0.0/12" - failed_login_lockout: - enabled: true - max_attempts: 3 - lockout_duration: "15m" - -redis: - url: "redis://localhost:6379" - -logging: - level: "info" - file: "./logs/app.log" - audit_log: "./logs/audit.log" - access_log: "./logs/access.log" - -monitoring: - enabled: true - metrics_port: 9090 - health_check_interval: "30s" -EOF - - print_success "Configuration created" -} - -# Simple build -build_project() { - print_info "Building project..." 
- - # Build Go apps - go build -o bin/api-server ./cmd/api-server - go build -o bin/worker ./cmd/worker - go build -o bin/tui ./cmd/tui - - # Build Zig CLI - cd cli && zig build && cd .. - - print_success "Build completed" -} - -# Setup Fail2Ban -setup_fail2ban() { - print_info "Setting up Fail2Ban..." - - if ! command -v fail2ban-server &> /dev/null; then - print_warning "Fail2Ban not installed, skipping..." - return - fi - - # Create Fail2Ban configuration - sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true - - cat > /tmp/ml-experiments-jail.conf << 'EOF' -[DEFAULT] -bantime = 3600 -findtime = 600 -maxretry = 3 -backend = systemd - -[sshd] -enabled = true -port = ssh -logpath = /var/log/auth.log -maxretry = 3 - -[ml-experiments-api] -enabled = true -port = 9101 -filter = ml-experiments-api -logpath = ./logs/audit.log -maxretry = 5 -bantime = 7200 - -[ml-experiments-auth] -enabled = true -filter = ml-experiments-auth -logpath = ./logs/audit.log -maxretry = 3 -bantime = 3600 -EOF - - # Create filter definitions - cat > /tmp/ml-experiments-api.conf << 'EOF' -[Definition] -failregex = ^.*.*"status":40[13].*$ -ignoreregex = -EOF - - cat > /tmp/ml-experiments-auth.conf << 'EOF' -[Definition] -failregex = ^.*"event":"failed_login".*"client_ip":"".*$ -ignoreregex = -EOF - - # Try to install configurations - if sudo cp /tmp/ml-experiments-jail.conf /etc/fail2ban/jail.d/ 2>/dev/null; then - sudo cp /tmp/ml-experiments-*.conf /etc/fail2ban/filter.d/ 2>/dev/null || true - sudo systemctl restart fail2ban 2>/dev/null || true - print_success "Fail2Ban configured" - else - print_warning "Could not configure Fail2Ban (requires sudo)" - fi - - rm -f /tmp/ml-experiments-*.conf -} - -# Setup Redis -setup_redis() { - print_info "Setting up Redis..." - - if ! 
pgrep -f "redis-server" > /dev/null; then - redis-server --daemonize yes --port 6379 - print_success "Redis started" - else - print_info "Redis already running" - fi -} - -# Create simple management script -create_manage_script() { - cat > manage.sh << 'EOF' -#!/bin/bash - -# Simple management script - -case "${1:-status}" in - "start") - echo "Starting services..." - redis-server --daemonize yes --port 6379 2>/dev/null || true - ./bin/api-server -config configs/config.yaml & - echo "Services started" - ;; - "stop") - echo "Stopping services..." - pkill -f "api-server" || true - redis-cli shutdown 2>/dev/null || true - echo "Services stopped" - ;; - "status") - echo "=== Status ===" - if pgrep -f "redis-server" > /dev/null; then - echo "✅ Redis: Running" - else - echo "❌ Redis: Stopped" - fi - - if pgrep -f "api-server" > /dev/null; then - echo "✅ API Server: Running" - else - echo "❌ API Server: Stopped" - fi - ;; - "logs") - echo "=== Recent Logs ===" - tail -20 logs/app.log 2>/dev/null || echo "No logs yet" - ;; - "test") - echo "=== Testing ===" - curl -k -s https://localhost:9101/health || echo "API server not responding" - ;; - *) - echo "Usage: $0 {start|stop|status|logs|test}" - ;; -esac -EOF - - chmod +x manage.sh - print_success "Management script created" -} - -# Show next steps -show_next_steps() { - print_success "Setup completed!" - echo "" - echo "🎉 Setup complete!" - echo "" - echo "Next steps:" - echo " 1. Start services: ./tools/manage.sh start" - echo " 2. Check status: ./tools/manage.sh status" - echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health" - echo "" - echo "Configuration: configs/config.yaml" - echo "Logs: logs/app.log and logs/audit.log" - echo "" - print_success "Ready for homelab use!" 
-} - -# Main setup -main() { - echo "ML Experiment Manager - Homelab Setup" - echo "=====================================" - echo "" - - check_deps - setup_project - build_project - setup_redis - create_manage_script - show_next_steps -} - -main "$@" diff --git a/scripts/setup_monitoring.py b/scripts/setup_monitoring.py new file mode 100644 index 0000000..174a242 --- /dev/null +++ b/scripts/setup_monitoring.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +import os + +# Create monitoring directory structure +repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +monitoring_dir = os.path.join(repo_root, 'monitoring') +grafana_dir = os.path.join(monitoring_dir, 'grafana') + +datasources_dir = os.path.join(grafana_dir, 'provisioning', 'datasources') +providers_dir = os.path.join(grafana_dir, 'provisioning', 'dashboards') + +os.makedirs(datasources_dir, exist_ok=True) +os.makedirs(providers_dir, exist_ok=True) + +# Essential datasource configurations +datasources = { + 'prometheus.yml': """apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: "5s" +""", + 'loki.yml': """apiVersion: 1 +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + editable: true + jsonData: + maxLines: 1000 +""", + 'dashboards.yml': """apiVersion: 1 +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +""" +} + +# Write configuration files +for filename, content in datasources.items(): + if filename == 'dashboards.yml': + path = os.path.join(providers_dir, filename) + else: + path = os.path.join(datasources_dir, filename) + + with open(path, 'w') as f: + f.write(content) + +print("Monitoring setup completed!") \ No newline at end of file diff --git a/scripts/smoke-test.sh 
b/scripts/smoke-test.sh new file mode 100644 index 0000000..8b4ff80 --- /dev/null +++ b/scripts/smoke-test.sh @@ -0,0 +1,111 @@ +set -euo pipefail; + +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +export FETCHML_REPO_ROOT="$repo_root" + +env="${1:-dev}"; +if [ "$env" != "dev" ] && [ "$env" != "prod" ]; then + echo "usage: $0 [dev|prod]" >&2 + exit 2 +fi + +probe_https_health_openssl() { + host="$1" + port="$2" + path="$3" + + req="GET ${path} HTTP/1.1\r\nHost: ${host}\r\nConnection: close\r\n\r\n" + resp=$(printf "%b" "$req" | openssl s_client -connect "127.0.0.1:${port}" -servername "${host}" -tls1_2 -quiet 2>/dev/null || true) + printf "%s" "$resp" | tr -d '\r' | head -n 1 | grep -Eq '^HTTP/1\.[01] 200' +} + +compose_cmd="docker-compose"; +if ! command -v docker-compose >/dev/null 2>&1; then + compose_cmd="docker compose"; +fi + +compose_files=() +compose_project_args=("--project-directory" "$repo_root") +api_base="" +prometheus_base="" +stack_name="" + +if [ "$env" = "dev" ]; then + mkdir -p \ + "$repo_root/data/dev/redis" \ + "$repo_root/data/dev/minio" \ + "$repo_root/data/dev/prometheus" \ + "$repo_root/data/dev/grafana" \ + "$repo_root/data/dev/loki" \ + "$repo_root/data/dev/logs" \ + "$repo_root/data/dev/experiments" \ + "$repo_root/data/dev/active" \ + "$repo_root/data/dev/workspaces" + + stack_name="dev" + compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml") + api_base="https://localhost:9101" + if ! 
curl -skf "$api_base/health" >/dev/null 2>&1; then + api_base="http://localhost:9101" + fi + prometheus_base="http://localhost:9090" +else + mkdir -p \ + "$repo_root/data/prod-smoke/caddy/data" \ + "$repo_root/data/prod-smoke/caddy/config" \ + "$repo_root/data/prod-smoke/redis" \ + "$repo_root/data/prod-smoke/logs" \ + "$repo_root/data/prod-smoke/experiments" \ + "$repo_root/data/prod-smoke/active" + + stack_name="prod" + compose_files=("-f" "$repo_root/deployments/docker-compose.prod.smoke.yml") + api_base="https://localhost:8443" + export FETCHML_DOMAIN=localhost + export CADDY_EMAIL=smoke@example.invalid +fi + +cleanup() { + status=$?; + if [ "$status" -ne 0 ]; then + $compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" logs --no-color || true; + fi + if [ "${KEEP_STACK:-0}" != "1" ]; then + $compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" down -v >/dev/null 2>&1 || true; + fi + exit "$status"; +} + +trap cleanup EXIT; +echo "Starting $stack_name stack for smoke test..."; + +$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null; +echo "Waiting for API to become healthy..."; + +deadline=$(($(date +%s) + 90)); +while true; do + if [ "$env" = "dev" ]; then + if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi; + else + if probe_https_health_openssl "localhost" "8443" "/health"; then break; fi; + fi + if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for $api_base/health"; exit 1; fi; + sleep 2; +done; + +if [ "$env" = "dev" ]; then + echo "Checking metrics endpoint..."; + curl -skf "$api_base/metrics" >/dev/null; + + echo "Waiting for Prometheus target api-server to be up..."; + deadline=$(($(date +%s) + 90)); + query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D"; + + while true; do + resp=$(curl -sf "$query_url" || true); + resp_compact=$(printf "%s" "$resp" | tr -d '\n' | tr -d '\r'); + if echo "$resp_compact" | grep -Fq '"instance":"api-server:9101"' 
&& echo "$resp_compact" | grep -Fq ',"1"]'; then break; fi; + if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for Prometheus api-server target to be up"; echo "$resp"; exit 1; fi; + sleep 2; + done; +fi \ No newline at end of file diff --git a/scripts/testing/run-full-test-suite.sh b/scripts/testing/run-full-test-suite.sh deleted file mode 100755 index e69de29..0000000 diff --git a/scripts/testing/test-homelab-secure.sh b/scripts/testing/test-homelab-secure.sh deleted file mode 100755 index ef10f40..0000000 --- a/scripts/testing/test-homelab-secure.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# Homelab Secure Test Environment Script -set -e - -echo "Starting Homelab Secure Production Environment..." - -# Clean up any existing containers -echo "Cleaning up existing containers..." -docker-compose -f deployments/docker-compose.homelab-secure.yml down -v - -# Create necessary directories with proper permissions -echo "Creating directories..." -mkdir -p data logs -chmod 750 data logs - -# Build and start services -echo "Building and starting services..." -docker-compose -f deployments/docker-compose.homelab-secure.yml up --build -d - -# Wait for services to be healthy -echo "Waiting for services to be healthy..." -sleep 20 - -# Check service health -echo "Checking service health..." -docker-compose -f deployments/docker-compose.homelab-secure.yml ps - -# Test API server with TLS -echo "Testing API server..." -curl -k -s https://localhost:9104/health || echo "API health check failed" - -# Test Redis with authentication -echo "Testing Redis with authentication..." -docker exec ml-homelab-redis redis-cli -a "HomelabRedis2024!" ping || echo "Redis health check failed" - -# Test SSH connectivity with security -echo "Testing SSH connectivity..." -docker exec -u worker ml-homelab-worker ssh -o StrictHostKeyChecking=no -o Port=2222 worker@localhost "echo 'SSH OK'" || echo "SSH test failed" - -# Test fail2ban status -echo "Testing fail2ban..." 
#!/bin/bash
# Simple performance tracking script: runs the medium load-test suite,
# extracts throughput / error-rate / P99-latency metrics into a timestamped
# JSON file, and compares throughput against the previous run when one exists.

set -u

RESULTS_DIR="test_results/performance"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
RAW_LOG="$RESULTS_DIR/raw_$TIMESTAMP.log"

mkdir -p "$RESULTS_DIR"

echo "Running load test performance tracking..."
echo "Timestamp: $TIMESTAMP"

# Run tests and capture results (stdout AND stderr, so compile errors land in
# the log too). Keep going on failure so partial metrics are still recorded,
# but surface the non-zero exit instead of silently ignoring it.
if ! go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RAW_LOG" 2>&1; then
    echo "WARNING: go test exited non-zero; metrics below may be incomplete" >&2
fi

# extract_metric NAME OFFSET PATTERN FIELD
# Pulls one awk field from the lines following NAME in the raw log.
# Emits "null" when the metric is missing so the emitted JSON stays valid
# (the original inlined grep|awk produced `"key": ,` on a miss).
extract_metric() {
    local name=$1 offset=$2 pattern=$3 field=$4 value
    value=$(grep -A"$offset" "$name" "$RAW_LOG" | grep "$pattern" | awk -v f="$field" '{print $f}' | head -1)
    echo "${value:-null}"
}

# emit_test_json NAME — one JSON object for a single load-test scenario.
emit_test_json() {
    local name=$1 rps error p99
    rps=$(extract_metric "$name" 1 "Throughput" 2)
    error=$(extract_metric "$name" 2 "Error rate" 3)
    p99=$(extract_metric "$name" 4 "P99 latency" 3)
    cat <<EOF
    {
      "name": "$name",
      "throughput_rps": $rps,
      "error_rate_percent": $error,
      "p99_latency_ms": "$p99"
    }
EOF
}

{
    echo "{"
    echo "  \"timestamp\": \"$TIMESTAMP\","
    echo "  \"tests\": ["
    emit_test_json "LightLoad"
    echo "    ,"
    emit_test_json "MediumLoad"
    echo "  ]"
    echo "}"
} > "$RESULTS_FILE"

echo "Results saved to: $RESULTS_FILE"
echo "Raw logs: $RAW_LOG"

# Show comparison with the previous run if one exists: second-newest results
# file (the newest is the one we just wrote). Guard the glob so a first run
# does not error, and only dive into jq/bc when the tools are present.
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json 2>/dev/null | sed -n '2p')
if [ -n "$PREV_FILE" ] && command -v jq >/dev/null 2>&1; then
    prev_rps=$(jq -r '.tests[0].throughput_rps' "$PREV_FILE")
    curr_rps=$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE")
    echo ""
    echo "=== Comparison with previous run ==="
    echo "Previous: $(basename "$PREV_FILE")"
    echo "Current: $(basename "$RESULTS_FILE")"
    echo ""
    echo "Light Load Throughput:"
    echo "  Previous: $prev_rps RPS"
    echo "  Current: $curr_rps RPS"
    # Only compute the delta when both values are numeric; bc on "null"
    # (missing metric) would abort the comparison with a parse error.
    if command -v bc >/dev/null 2>&1 && [ "$prev_rps" != "null" ] && [ "$curr_rps" != "null" ]; then
        echo "  Change: $(echo "$curr_rps - $prev_rps" | bc -l) RPS"
    fi
fi
Check Worker config (if provided) -if [ -f "$WORKER_CONFIG" ]; then - echo -e "${BOLD}Checking Worker Configuration${NC}" - check_pass "Worker config found: $WORKER_CONFIG" - - # Extract base_path from worker config - WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "') - echo " Base path: $WORKER_BASE_PATH" - - # Compare paths - if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then - check_pass "API and Worker base_path match" - else - check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH" - fi - - # Check podman_image configured - if grep -q 'podman_image' "$WORKER_CONFIG"; then - PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "') - check_pass "Podman image configured: $PODMAN_IMAGE" - else - check_fail "podman_image not configured" - fi -else - check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)" -fi - -echo "" - -# 3. Check directory structure (if base_path exists) -if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then - echo -e "${BOLD}Checking Directory Structure${NC}" - check_pass "Base directory exists: $API_BASE_PATH" - - # Check subdirectories - for dir in experiments pending running finished failed; do - if [ -d "$API_BASE_PATH/$dir" ]; then - check_pass "$dir/ directory exists" - else - check_warn "$dir/ directory missing (will be created automatically)" - fi - done - - # Check permissions - if [ -w "$API_BASE_PATH" ]; then - check_pass "Base directory is writable" - else - check_fail "Base directory is not writable (check permissions)" - fi - -elif [ -n "$API_BASE_PATH" ]; then - check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)" -fi - -echo "" - -# 4. 
Check Redis connectivity (if server is running) -echo -e "${BOLD}Checking Redis Connectivity${NC}" -if command -v redis-cli &> /dev/null; then - if redis-cli ping &> /dev/null; then - check_pass "Redis server is running and accessible" - - # Check queue - QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0") - echo " Queue size: $QUEUE_SIZE tasks" - else - check_warn "Redis server not accessible (start with: redis-server)" - fi -else - check_warn "redis-cli not installed (cannot verify Redis connectivity)" -fi - -echo "" - -# 5. Check Podman (if worker config exists) -if [ -f "$WORKER_CONFIG" ]; then - echo -e "${BOLD}Checking Podman${NC}" - if command -v podman &> /dev/null; then - check_pass "Podman is installed" - - # Check if image exists - if [ -n "$PODMAN_IMAGE" ]; then - if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then - check_pass "Podman image exists: $PODMAN_IMAGE" - else - check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)" - fi - fi - - # Check GPU access (if configured) - if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then - if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then - check_pass "GPU access working" - else - check_warn "GPU access configured but not working (check nvidia-container-toolkit)" - fi - fi - else - check_fail "Podman not installed (required for worker)" - fi -fi - -echo "" - -# 6. 
Check CLI config consistency -echo -e "${BOLD}Checking CLI Configuration${NC}" -CLI_CONFIG="$HOME/.ml/config.toml" -if [ -f "$CLI_CONFIG" ]; then - check_pass "CLI config found: $CLI_CONFIG" - - CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "') - if [ "$CLI_BASE" = "$API_BASE_PATH" ]; then - check_pass "CLI worker_base matches server base_path" - else - check_warn "CLI worker_base ($CLI_BASE) differs from server ($API_BASE_PATH)" - fi -else - check_warn "CLI config not found (run: ml init)" -fi - -echo "" - -# Summary -echo -e "${BOLD}=== Summary ===${NC}" -if [ $errors -eq 0 ] && [ $warnings -eq 0 ]; then - echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}" - exit 0 -elif [ $errors -eq 0 ]; then - echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}" - exit 0 -else - echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}" - exit 1 -fi diff --git a/scripts/verify_release.sh b/scripts/verify_release.sh new file mode 100644 index 0000000..743fc07 --- /dev/null +++ b/scripts/verify_release.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: + scripts/verify_release.sh --dir [--repo /] + +What it does: +- Verifies checksums.txt signature (keyless cosign) if cosign + checksums.txt.sig/.cert are present +- Verifies *.tar.gz files against checksums.txt + +Notes: +- --repo enables strict Sigstore identity checking against the release workflow. +- Without cosign, the script still verifies SHA256 hashes. + +Examples: + scripts/verify_release.sh --dir ./release --repo jfraeys/fetch_ml + scripts/verify_release.sh --dir . 
+EOF +} + +release_dir="" +repo="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --dir) + release_dir="${2:-}" + shift 2 + ;; + --repo) + repo="${2:-}" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage >&2 + exit 2 + ;; + esac +done + +if [[ -z "$release_dir" ]]; then + echo "missing --dir" >&2 + usage >&2 + exit 2 +fi + +if [[ ! -d "$release_dir" ]]; then + echo "directory not found: $release_dir" >&2 + exit 2 +fi + +cd "$release_dir" + +if [[ ! -f checksums.txt ]]; then + echo "missing checksums.txt in $release_dir" >&2 + exit 2 +fi + +has_cosign=false +if command -v cosign >/dev/null 2>&1; then + has_cosign=true +fi + +verify_sigstore() { + if [[ ! -f checksums.txt.sig ]] || [[ ! -f checksums.txt.cert ]]; then + echo "[verify] cosign available, but checksums.txt.sig/.cert not found; skipping signature verification" >&2 + return 0 + fi + + if [[ -z "$repo" ]]; then + echo "[verify] verifying signature (no repo identity pin; pass --repo to pin identity)" >&2 + COSIGN_YES=true cosign verify-blob \ + --certificate checksums.txt.cert \ + --signature checksums.txt.sig \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + checksums.txt >/dev/null + echo "[ok] checksums.txt signature verified (un-pinned identity)" + return 0 + fi + + local identity + identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$" + + COSIGN_YES=true cosign verify-blob \ + --certificate checksums.txt.cert \ + --signature checksums.txt.sig \ + --certificate-identity-regexp "$identity" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + checksums.txt >/dev/null + + echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)" +} + +verify_hashes() { + local failures=0 + + local has_sha256sum=false + if command -v sha256sum >/dev/null 2>&1; then + has_sha256sum=true + fi + + while IFS= read -r expected file; do + [[ -z "${expected}" ]] && continue + [[ -z 
"${file}" ]] && continue + + if [[ ! -f "$file" ]]; then + continue + fi + + local actual + if [[ "$has_sha256sum" == true ]]; then + actual="$(sha256sum "$file" | awk '{print $1}')" + else + actual="$(shasum -a 256 "$file" | awk '{print $1}')" + fi + + if [[ "$actual" != "$expected" ]]; then + echo "[fail] $file" >&2 + echo " expected: $expected" >&2 + echo " actual: $actual" >&2 + failures=$((failures+1)) + fi + done < <(awk '{print $1, $2}' checksums.txt) + + if [[ $failures -gt 0 ]]; then + echo "[fail] checksum verification failed ($failures file(s))" >&2 + exit 1 + fi + + echo "[ok] all available artifacts match checksums.txt" +} + +if [[ "$has_cosign" == true ]]; then + verify_sigstore +else + echo "[verify] cosign not installed; skipping signature verification" >&2 +fi + +verify_hashes + +echo "[ok] release verification complete" diff --git a/tools/manage.sh b/tools/manage.sh index 5f79e79..25f5ce5 100755 --- a/tools/manage.sh +++ b/tools/manage.sh @@ -5,6 +5,10 @@ set -euo pipefail +make_target_exists() { + make -n "$1" >/dev/null 2>&1 +} + # Colors RED='\033[0;31m' GREEN='\033[0;32m' @@ -45,7 +49,7 @@ show_status() { # Check Go apps print_app "Go Applications:" - local go_apps=("api-server" "worker" "tui" "data_manager" "user_manager") + local go_apps=("api-server" "worker" "tui") for app in "${go_apps[@]}"; do if [[ -f "bin/$app" ]]; then echo " ✅ $app: Built" @@ -85,7 +89,7 @@ show_status() { # Check configuration print_app "Configuration:" - if [[ -f "configs/config-local.yaml" ]]; then + if [[ -f "configs/api/dev.yaml" ]]; then echo " ✅ Security config: Found" else echo " ⚠️ Security config: Not found" @@ -110,14 +114,14 @@ build_all() { echo "=============================" echo "" - print_info "Building Go applications..." - make build - if command -v zig &> /dev/null; then - print_info "Building Zig CLI..." - make cli-build + print_info "Building all components (Go + Zig CLI)..." 
+ make build else - print_warning "Zig not found, skipping CLI build" + print_warning "Zig not found, building Go components only" + go build -o bin/api-server cmd/api-server/main.go + go build -o bin/worker cmd/worker/worker_server.go + go build -o bin/tui ./cmd/tui fi print_success "Build completed!" @@ -128,11 +132,13 @@ test_all() { echo "====================" echo "" - print_info "Running main test suite..." - make test - - print_info "Running comprehensive tests..." - make test-all + if make_target_exists test-full; then + print_info "Running full test suite..." + make test-full + else + print_info "Running test suite..." + make test + fi print_success "All tests completed!" } @@ -156,8 +162,8 @@ start_services() { # Start API server if built if [[ -f "bin/api-server" ]]; then print_info "Starting API server..." - if [[ -f "configs/config-local.yaml" ]]; then - ./bin/api-server --config configs/config-local.yaml & + if [[ -f "configs/api/dev.yaml" ]]; then + ./bin/api-server --config configs/api/dev.yaml & else print_warning "No config found, using defaults" ./bin/api-server & @@ -187,13 +193,25 @@ check_health() { print_info "Port 9101 is open, checking API health endpoint..." 
# Try the health endpoint - response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null) + local api_key_header="" + if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then + api_key_header="-H X-API-Key: ${FETCH_ML_API_KEY}" + fi + + response=$(curl -s --max-time 3 ${api_key_header} http://localhost:9101/health 2>/dev/null || true) + if [[ -z "$response" ]]; then + response=$(curl -k -s --max-time 3 ${api_key_header} https://localhost:9101/health 2>/dev/null || true) + fi if [[ "$response" == "OK" ]]; then print_success "API is healthy: $response" elif [[ "$response" == *"IP not whitelisted"* ]]; then print_warning "API running but IP not whitelisted (expected behavior)" - print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health" + if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then + print_info "Try: curl -k -H 'X-API-Key: $FETCH_ML_API_KEY' https://localhost:9101/health" + else + print_info "Try: curl -k https://localhost:9101/health" + fi else print_error "Unexpected response: $response" fi @@ -229,19 +247,36 @@ run_security() { case "${1:-check}" in "check") print_info "Running security checks..." - make security-check + if make_target_exists security-check; then + make security-check + else + print_warning "No 'security-check' Make target found" + print_info "Try: make ci-local" + fi ;; "monitor") print_info "Starting security monitoring..." - make security-monitor + if make_target_exists security-monitor; then + make security-monitor + else + print_warning "No 'security-monitor' Make target found" + fi ;; "deploy") print_info "Deploying with security..." - make security-deploy + if make_target_exists security-deploy; then + make security-deploy + else + print_warning "No 'security-deploy' Make target found" + fi ;; "audit") print_info "Running security audit..." 
- make security-audit + if make_target_exists security-audit; then + make security-audit + else + print_warning "No 'security-audit' Make target found" + fi ;; *) echo "Usage: $0 security {check|monitor|deploy|audit}" @@ -258,15 +293,22 @@ run_development() { case "${1:-setup}" in "setup") print_info "Setting up development environment..." - ./scripts/auto_setup.sh + print_warning "Legacy setup scripts were removed; using Makefile/deployments instead" + print_info "Try: make dev" + print_info "Or: ./deployments/deploy.sh dev up" ;; "quick") print_info "Running quick start..." - ./scripts/quick_start.sh + print_warning "Legacy quick start script was removed; using deployments instead" + print_info "Try: ./deployments/deploy.sh dev up" ;; "deps") print_info "Installing dependencies..." - make install-deps + if make_target_exists install-deps; then + make install-deps + else + print_warning "No 'install-deps' Make target found" + fi ;; *) echo "Usage: $0 dev {setup|quick|deps}" @@ -309,7 +351,7 @@ cleanup() { echo "" print_info "Cleaning project artifacts..." - make clean-all + make clean print_info "Stopping services..." 
stop_services @@ -330,7 +372,7 @@ show_help() { echo " start - Start all services" echo " stop - Stop all services" echo " health - Check API health endpoint" -echo " security - Security management (check|monitor|deploy|audit)" + echo " security - Security management (check|monitor|deploy|audit)" echo " dev - Development environment (setup|quick|deps)" echo " logs - Show application logs" echo " cleanup - Clean project artifacts and stop services" diff --git a/tools/performance_regression_detector.go b/tools/performance_regression_detector.go index a334d37..4186dca 100644 --- a/tools/performance_regression_detector.go +++ b/tools/performance_regression_detector.go @@ -47,7 +47,10 @@ type Improvement struct { } // NewPerformanceRegressionDetector creates a new detector instance -func NewPerformanceRegressionDetector(baselineFile string, threshold float64) *PerformanceRegressionDetector { +func NewPerformanceRegressionDetector( + baselineFile string, + threshold float64, +) *PerformanceRegressionDetector { return &PerformanceRegressionDetector{ BaselineFile: baselineFile, Threshold: threshold, @@ -74,7 +77,9 @@ func (prd *PerformanceRegressionDetector) LoadBaseline() ([]BenchmarkResult, err } // AnalyzeResults analyzes current results against baseline -func (prd *PerformanceRegressionDetector) AnalyzeResults(current []BenchmarkResult) (*RegressionReport, error) { +func (prd *PerformanceRegressionDetector) AnalyzeResults( + current []BenchmarkResult, +) (*RegressionReport, error) { baseline, err := prd.LoadBaseline() if err != nil { return nil, fmt.Errorf("failed to load baseline: %w", err) diff --git a/tools/profiler.go b/tools/profiler.go index a3be521..e2a2e18 100644 --- a/tools/profiler.go +++ b/tools/profiler.go @@ -315,11 +315,17 @@ func (p *Profiler) generateRecommendations(analysis *ProfileAnalysis) []string { // Memory usage recommendations if analysis.MemoryUsage.Alloc > 100*1024*1024 { // > 100MB - recommendations = append(recommendations, "High memory 
usage detected. Consider optimizing memory allocations.") + recommendations = append( + recommendations, + "High memory usage detected. Consider optimizing memory allocations.", + ) } if analysis.GoroutineCount > 1000 { - recommendations = append(recommendations, "High goroutine count detected. Check for goroutine leaks.") + recommendations = append( + recommendations, + "High goroutine count detected. Check for goroutine leaks.", + ) } // GC recommendations @@ -330,7 +336,10 @@ func (p *Profiler) generateRecommendations(analysis *ProfileAnalysis) []string { if len(analysis.GCStats.Pause) > 0 { avgPause := analysis.GCStats.PauseTotal / time.Duration(len(analysis.GCStats.Pause)) if avgPause > 10*time.Millisecond { - recommendations = append(recommendations, "Long GC pauses detected. Consider tuning GC parameters.") + recommendations = append( + recommendations, + "Long GC pauses detected. Consider tuning GC parameters.", + ) } } @@ -359,7 +368,8 @@ func (p *Profiler) PrintAnalysis(analysis *ProfileAnalysis) { fmt.Printf(" Goroutines: %d\n", analysis.GoroutineCount) fmt.Printf(" Heap Size: %.2f MB\n", float64(analysis.HeapSize)/1024/1024) fmt.Printf(" Memory Allocated: %.2f MB\n", float64(analysis.MemoryUsage.Alloc)/1024/1024) - fmt.Printf(" Total Memory Allocated: %.2f MB\n", float64(analysis.MemoryUsage.TotalAlloc)/1024/1024) + totalAllocMB := float64(analysis.MemoryUsage.TotalAlloc) / 1024 / 1024 + fmt.Printf(" Total Memory Allocated: %.2f MB\n", totalAllocMB) fmt.Printf("\nGarbage Collection:\n") fmt.Printf(" GC Cycles: %d\n", analysis.GCStats.NumGC)