chore(ops): reorganize deployments/monitoring and remove legacy scripts

This commit is contained in:
Jeremie Fraeys 2026-01-05 12:31:26 -05:00
parent 5ef24e4c6d
commit f726806770
101 changed files with 3598 additions and 4982 deletions

56
configs/api/dev.yaml Normal file
View file

@ -0,0 +1,56 @@
base_path: "/data/experiments"
data_dir: "/data/active"
auth:
enabled: false
server:
address: "0.0.0.0:9101"
tls:
enabled: false
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: false
allowed_origins:
- "http://localhost:3000"
api_key_rotation_days: 90
audit_logging:
enabled: true
log_path: "/tmp/fetchml-audit.log"
rate_limit:
enabled: false
requests_per_minute: 60
burst_size: 10
ip_whitelist: []
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
addr: "redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/tmp/fetchml.sqlite"
logging:
level: "info"
file: ""
audit_log: ""
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -0,0 +1,71 @@
base_path: "/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
homelab_admin:
hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY"
admin: true
roles:
- admin
permissions:
"*": true
homelab_user:
hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY"
admin: false
roles:
- researcher
permissions:
experiments: true
datasets: true
jupyter: true
server:
address: ":9101"
tls:
enabled: false
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: true
allowed_origins:
- "https://ml-experiments.example.com"
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "192.168.0.0/16"
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/fetch_ml.log"
audit_log: ""
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -0,0 +1,74 @@
base_path: "/app/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
admin_user:
hash: "CHANGE_ME_SHA256_ADMIN_USER_KEY"
admin: true
roles: ["user", "admin"]
permissions:
"*": true
researcher1:
hash: "CHANGE_ME_SHA256_RESEARCHER1_KEY"
admin: false
roles: ["user", "researcher"]
permissions:
"jobs:read": true
"jobs:create": true
"jobs:update": true
"jobs:delete": false
analyst1:
hash: "CHANGE_ME_SHA256_ANALYST1_KEY"
admin: false
roles: ["user", "analyst"]
permissions:
"jobs:read": true
"jobs:create": false
"jobs:update": false
"jobs:delete": false
server:
address: ":9101"
tls:
enabled: false
security:
production_mode: false
allowed_origins: []
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 20
ip_whitelist: []
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
url: "redis://redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/app.log"
audit_log: ""
resources:
max_workers: 3
desired_rps_per_worker: 3
podman_cpus: "2"
podman_memory: "4Gi"

59
configs/api/prod.yaml Normal file
View file

@ -0,0 +1,59 @@
base_path: "/app/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
admin:
hash: "replace-with-sha256-of-your-api-key"
admin: true
roles:
- admin
permissions:
"*": true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: false
allowed_origins: []
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist: []
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
addr: "redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/fetch_ml.log"
audit_log: ""
resources:
max_workers: 2
desired_rps_per_worker: 5
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -1,8 +0,0 @@
# Local development config (TOML)
# Used by both CLI and TUI when no overrides are set
worker_host = "127.0.0.1"
worker_user = "dev_user"
worker_base = "/tmp/ml-experiments"
worker_port = 9101
api_key = "your-api-key-here"

View file

@ -1,26 +0,0 @@
auth:
enabled: true
api_keys:
dev_user:
hash: "replace-with-sha256-of-your-api-key"
admin: true
roles:
- admin
permissions:
'*': true
server:
address: ":9101"
tls:
enabled: false
security:
rate_limit:
enabled: false
redis:
url: "redis://redis:6379"
logging:
level: info
console: true

View file

@ -1,17 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: false
server:
address: ":9101"
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.db"
redis:
url: "redis://redis:6379"
logging:
level: "debug"

View file

@ -1,46 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: true
api_keys:
homelab_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["user", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 30
ip_whitelist: []
# SQLite database for persistence
database:
type: "sqlite"
connection: "/app/data/fetch_ml.db"
redis:
url: "redis://redis:6379"
max_connections: 10
logging:
level: "info"
file: "/app/logs/app.log"
audit_file: "/app/logs/audit.log"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "8g"

View file

@ -1,39 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: true
api_keys:
homelab_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["user", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 30
ip_whitelist:
- "127.0.0.1"
- "::1"
- "192.168.0.0/16"
- "10.0.0.0/8"
redis:
url: "redis://redis:6379"
max_connections: 10
logging:
level: "info"
file: "/app/logs/app.log"
audit_file: "/app/logs/audit.log"

View file

@ -1,58 +0,0 @@
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!
redis:
url: "redis://redis:6379"
max_connections: 10
auth:
enabled: true
api_keys:
homelab_admin:
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
admin: true
roles:
- admin
permissions:
'*': true
homelab_user:
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
admin: false
roles:
- researcher
permissions:
'experiments': true
'datasets': true
'jupyter': true
server:
address: ":9101"
tls:
enabled: true
key_file: "/app/ssl/key.pem"
cert_file: "/app/ssl/cert.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist: []
logging:
level: "info"
file: "logs/fetch_ml.log"
console: true
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 0
disk_limit: "10Gi"
# Prometheus metrics
metrics:
enabled: true
listen_addr: ":9100"
tls:
enabled: false

View file

@ -1,49 +0,0 @@
redis:
url: "redis://redis:6379"
max_connections: 10
auth:
enabled: true
api_keys:
homelab_admin:
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
admin: true
roles:
- admin
permissions:
'*': true
homelab_user:
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
admin: false
roles:
- researcher
permissions:
'experiments': true
'datasets': true
'jupyter': true
server:
address: ":9101"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "172.21.0.1" # Docker gateway
# Prometheus metrics
metrics:
enabled: true
listen_addr: ":9100"
tls:
enabled: true
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"

View file

@ -1,78 +0,0 @@
base_path: "/app/data/experiments"
auth:
enabled: true
api_keys:
admin_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["user", "admin"]
permissions:
read: true
write: true
delete: true
researcher1:
hash: "ef92b778ba7a6c8f2150019a5678047b6a9a2b95cef8189518f9b35c54d2e3ae" # "research123"
admin: false
roles: ["user", "researcher"]
permissions:
jobs:read: true
jobs:create: true
jobs:update: true
jobs:delete: false
analyst1:
hash: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" # "analyst123"
admin: false
roles: ["user", "analyst"]
permissions:
jobs:read: true
jobs:create: false
jobs:update: false
jobs:delete: false
server:
address: ":9101"
tls:
enabled: false
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 20
ip_whitelist: []
cors:
enabled: true
allowed_origins: ["https://localhost:9103", "https://localhost:3000"]
allowed_methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
allowed_headers: ["Content-Type", "Authorization"]
database:
type: "sqlite"
connection: "/app/data/experiments/fetch_ml.db"
max_connections: 20
connection_timeout: "30s"
redis:
url: "redis://redis:6379"
max_connections: 15
connection_timeout: "10s"
logging:
level: "info"
file: "/app/logs/app.log"
max_size: "100MB"
max_backups: 5
compress: true
resources:
max_workers: 3
desired_rps_per_worker: 3
podman_cpus: "2"
podman_memory: "4g"
job_timeout: "30m"
monitoring:
enabled: true
metrics_path: "/metrics"
health_check_interval: "30s"

View file

@ -1,59 +0,0 @@
base_path: "./data/ml-experiments"
auth:
enabled: true
apikeys:
homelab_user:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
admin: true
roles: ["admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: false # Disabled for local testing
cert_file: "./ssl/cert.pem"
key_file: "./ssl/key.pem"
min_version: "1.3"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "localhost"
- "10.0.0.0/8"
- "192.168.0.0/16"
- "172.16.0.0/12"
failed_login_lockout:
enabled: true
max_attempts: 5
lockout_duration: "15m"
# SQLite database for production
database:
type: "sqlite"
connection: "data/fetch_ml.db"
redis:
url: "redis://localhost:6379"
addr: "localhost:6379"
password: "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
logging:
level: "info"
file: "logs/fetch_ml.log"
audit_log: "logs/audit.log"
resources:
max_workers: 2
desired_rps_per_worker: 5
podman_cpus: "8"
podman_memory: "32g"

View file

@ -1,13 +1,17 @@
# Fetch ML Configuration Example for PostgreSQL # Fetch ML Configuration Example for PostgreSQL
# This example shows how to configure Fetch ML to use PostgreSQL as the database # This example shows how to configure Fetch ML to use PostgreSQL as the database
base_path: "./data/experiments"
auth: auth:
enabled: true enabled: true
apikeys: api_keys:
admin: admin:
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password" hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password"
admin: true admin: true
roles: ["admin"] roles: ["admin"]
permissions:
"*": true
server: server:
address: ":9101" address: ":9101"
@ -25,40 +29,34 @@ database:
# connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable" # connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
redis: redis:
host: "localhost" addr: "localhost:6379"
port: 6379
password: "" password: ""
db: 0 db: 0
pool_size: 10
max_retries: 3
logging: logging:
level: "info" level: "info"
console: true file: ""
format: "text" audit_log: ""
security: security:
secret_key: "your-secret-key-here-at-least-16-characters" production_mode: false
jwt_expiry: "24h"
rate_limit: rate_limit:
enabled: false enabled: false
requests_per_minute: 60 requests_per_minute: 60
burst_size: 10 burst_size: 10
ip_whitelist: []
containers: monitoring:
runtime: "podman" prometheus:
registry: "docker.io"
pull_policy: "missing"
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 1
storage:
data_path: "data"
results_path: "results"
temp_path: "/tmp/fetch_ml"
cleanup:
enabled: true enabled: true
max_age_hours: 168 port: 9101
max_size_gb: 10 path: "/metrics"
health_checks:
enabled: true
interval: "30s"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -1,6 +1,8 @@
# Fetch ML Configuration Example # Fetch ML Configuration Example
# Copy this file to config.yaml and customize for your environment # Copy this file to config.yaml and customize for your environment
base_path: "./data/experiments"
auth: auth:
enabled: true enabled: true
api_keys: api_keys:
@ -13,54 +15,43 @@ auth:
"*": true "*": true
server: server:
host: "localhost" address: ":9101"
port: 8080 tls:
enabled: false
database: database:
type: "sqlite" type: "sqlite"
connection: "data/fetch_ml.db" connection: "data/fetch_ml.db"
host: ""
port: 5432
username: ""
password: ""
database: "fetch_ml"
redis: redis:
url: "redis://localhost:6379" addr: "localhost:6379"
host: "localhost"
port: 6379
password: "" password: ""
db: 0 db: 0
pool_size: 10
max_retries: 3
logging: logging:
level: "info" level: "info"
file: "logs/fetch_ml.log" file: "logs/fetch_ml.log"
format: "text" audit_log: "logs/audit.log"
console: true
security: security:
secret_key: "your-secret-key-at-least-16-chars"
jwt_expiry: "24h"
rate_limit: rate_limit:
enabled: false enabled: false
requests_per_minute: 60 requests_per_minute: 60
burst_size: 10
ip_whitelist: []
production_mode: false
containers: monitoring:
runtime: "podman" prometheus:
registry: "docker.io"
pull_policy: "missing"
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 1
storage:
data_path: "data"
results_path: "results"
temp_path: "/tmp/fetch_ml"
cleanup:
enabled: true enabled: true
max_age_hours: 168 port: 9101
max_size_gb: 10 path: "/metrics"
health_checks:
enabled: true
interval: "30s"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"

View file

@ -12,6 +12,10 @@ properties:
type: string type: string
description: Base path for experiment data description: Base path for experiment data
default: "/tmp/ml-experiments" default: "/tmp/ml-experiments"
data_dir:
type: string
description: Data directory (datasets/snapshots) for integrity validation
default: "/data/active"
auth: auth:
type: object type: object
additionalProperties: false additionalProperties: false
@ -40,7 +44,6 @@ properties:
type: array type: array
items: items:
type: string type: string
enum: [admin, data_scientist, data_engineer, viewer, operator]
permissions: permissions:
type: object type: object
additionalProperties: additionalProperties:
@ -64,9 +67,30 @@ properties:
type: string type: string
key_file: key_file:
type: string type: string
min_version: monitoring:
type: object
additionalProperties: false
properties:
prometheus:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
port:
type: integer
minimum: 1
maximum: 65535
path:
type: string
health_checks:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
interval:
type: string type: string
description: Minimum TLS version (e.g. "1.3")
database: database:
type: object type: object
additionalProperties: false additionalProperties: false
@ -99,58 +123,56 @@ properties:
addr: addr:
type: string type: string
description: Optional host:port shorthand for Redis description: Optional host:port shorthand for Redis
host:
type: string
default: "localhost"
port:
type: integer
minimum: 1
maximum: 65535
default: 6379
password: password:
type: string type: string
db: db:
type: integer type: integer
minimum: 0 minimum: 0
default: 0 default: 0
pool_size: queue:
type: integer type: object
minimum: 1 additionalProperties: false
default: 10 properties:
max_retries: backend:
type: integer type: string
minimum: 0 enum: [redis, sqlite]
default: 3 default: redis
sqlite_path:
type: string
logging: logging:
type: object type: object
additionalProperties: false additionalProperties: false
properties: properties:
level: level:
type: string type: string
enum: [debug, info, warn, error, fatal] enum: [debug, info, warn, error]
default: "info" default: "info"
file: file:
type: string type: string
audit_log: audit_log:
type: string type: string
format:
type: string
enum: [text, json]
default: "text"
console:
type: boolean
default: true
security: security:
type: object type: object
additionalProperties: false additionalProperties: false
properties: properties:
secret_key: production_mode:
type: string type: boolean
minLength: 16 default: false
jwt_expiry: allowed_origins:
type: string type: array
pattern: "^\\d+[smhd]$" items:
default: "24h" type: string
api_key_rotation_days:
type: integer
minimum: 0
audit_logging:
type: object
additionalProperties: false
properties:
enabled:
type: boolean
log_path:
type: string
ip_whitelist: ip_whitelist:
type: array type: array
items: items:
@ -183,23 +205,23 @@ properties:
minimum: 1 minimum: 1
resources: resources:
type: object type: object
description: Resource configuration defaults description: Resource configuration
additionalProperties: false additionalProperties: false
properties: properties:
cpu_limit: max_workers:
type: string
description: Default CPU limit (e.g., "2" or "500m")
default: "2"
memory_limit:
type: string
description: Default memory limit (e.g., "1Gi" or "512Mi")
default: "4Gi"
gpu_limit:
type: integer type: integer
description: Default GPU limit minimum: 1
minimum: 0 default: 1
default: 0 desired_rps_per_worker:
disk_limit: type: integer
minimum: 1
requests_per_sec:
type: integer
minimum: 1
podman_cpus:
type: string type: string
description: Default disk limit podman_memory:
default: "10Gi" type: string
request_burst:
type: integer
minimum: 0

View file

@ -2,10 +2,28 @@ $schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML Worker Configuration" title: "Fetch ML Worker Configuration"
type: object type: object
additionalProperties: false additionalProperties: false
allOf:
# forbid both index and UUID at once (allow zero or one)
- not:
required: [gpu_visible_devices, gpu_visible_device_ids]
- if:
properties:
queue:
properties:
backend:
const: sqlite
required: [queue]
then:
properties:
queue:
required: [sqlite_path]
else:
anyOf:
- required: [redis_addr]
- required: [redis_url]
required: required:
- base_path - base_path
- worker_id - worker_id
- redis_addr
- podman_image - podman_image
- container_workspace - container_workspace
- container_results - container_results
@ -31,6 +49,9 @@ properties:
train_script: train_script:
type: string type: string
description: Path to training script description: Path to training script
redis_url:
type: string
description: Legacy Redis URL (if set, redis_addr/password/db are derived)
redis_addr: redis_addr:
type: string type: string
description: Redis server address description: Redis server address
@ -42,6 +63,18 @@ properties:
minimum: 0 minimum: 0
default: 0 default: 0
description: Redis database number description: Redis database number
queue:
type: object
description: Queue backend configuration (optional; defaults to redis)
additionalProperties: false
properties:
backend:
type: string
enum: [redis, sqlite]
default: redis
sqlite_path:
type: string
description: Path to queue.db (sqlite backend only)
known_hosts: known_hosts:
type: string type: string
description: Path to SSH known hosts file description: Path to SSH known hosts file
@ -116,6 +149,48 @@ properties:
type: string type: string
description: Dataset cache TTL duration description: Dataset cache TTL duration
default: "30m" default: "30m"
snapshot_store:
type: object
description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id)
additionalProperties: false
properties:
enabled:
type: boolean
default: false
endpoint:
type: string
description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
secure:
type: boolean
default: true
region:
type: string
bucket:
type: string
prefix:
type: string
description: Object key prefix where snapshots are stored
access_key:
type: string
description: Optional static access key (otherwise uses env credentials)
secret_key:
type: string
description: Optional static secret key (otherwise uses env credentials)
session_token:
type: string
description: Optional session token for temporary credentials
timeout:
type: string
description: Duration string (e.g., "10m")
default: "10m"
max_retries:
type: integer
minimum: 0
default: 3
prewarm_enabled:
type: boolean
description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off.
default: false
podman_image: podman_image:
type: string type: string
minLength: 1 minLength: 1
@ -126,10 +201,40 @@ properties:
container_results: container_results:
type: string type: string
description: Container results path description: Container results path
gpu_access: gpu_devices:
type: boolean type: array
default: false description: GPU device paths to expose to the container (e.g. ["/dev/dri"]).
description: Enable GPU access items:
type: string
gpu_vendor:
type: string
enum: [nvidia, amd, apple, none]
description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none).
default: "none"
gpu_visible_devices:
type: array
description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
items:
type: integer
gpu_visible_device_ids:
type: array
description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
items:
type: string
apple_gpu:
type: object
description: Apple M-series GPU configuration
additionalProperties: false
properties:
enabled:
type: boolean
default: false
metal_device:
type: string
description: Path to Metal device node (e.g. /dev/metal)
mps_runtime:
type: string
description: Path to MPS runtime device node (e.g. /dev/mps)
task_lease_duration: task_lease_duration:
type: string type: string
description: Task lease duration description: Task lease duration

View file

@ -0,0 +1,58 @@
worker_id: "docker-worker"
base_path: "/data/experiments"
train_script: "train.py"
redis_url: "redis://redis:6379/0"
local_mode: true
prewarm_enabled: true
max_workers: 1
poll_interval_seconds: 2
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "2m"
max_retries: 3
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_devices:
- "/dev/dri"
gpu_vendor: "apple"
gpu_visible_devices: []
# Apple M-series GPU configuration
apple_gpu:
enabled: true
metal_device: "/dev/metal"
mps_runtime: "/dev/mps"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -0,0 +1,50 @@
worker_id: "docker-worker"
base_path: "/tmp/fetchml-jobs"
train_script: "train.py"
redis_url: "redis://redis:6379/0"
local_mode: true
max_workers: 1
poll_interval_seconds: 2
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "5m"
max_retries: 3
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_vendor: "nvidia"
gpu_visible_devices: [0]
gpu_devices: ["/dev/nvidia0"]
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -0,0 +1,43 @@
worker_id: "docker-worker"
base_path: "/tmp/fetchml-jobs"
train_script: "train.py"
redis_addr: "redis:6379"
redis_password: ""
redis_db: 0
local_mode: true
max_workers: 1
poll_interval_seconds: 5
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
snapshot_store:
enabled: false
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_devices: []
gpu_vendor: "none"
gpu_visible_devices: []
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -0,0 +1,27 @@
worker_id: "test-prewarm-worker"
host: "localhost"
port: 8081
base_path: "/tmp/fetch-ml-test"
data_dir: "/tmp/fetch-ml-test/data"
max_workers: 2
local_mode: true
auto_fetch_data: true
prewarm_enabled: true
metrics:
enabled: true
listen_addr: ":9102"
train_script: "train.py"
snapshot_store:
enabled: false
endpoint: ""
secure: false
region: ""
bucket: ""
prefix: ""
access_key: ""
secret_key: ""
session_token: ""
max_retries: 3
  timeout: "0s"
gpu_devices: []
gpu_vendor: "none"

View file

@ -0,0 +1,47 @@
worker_id: "homelab-worker"
base_path: "/tmp/fetchml-jobs"
train_script: "train.py"
redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0"
local_mode: true
max_workers: 1
poll_interval_seconds: 2
auto_fetch_data: false
data_manager_path: "./data_manager"
dataset_cache_ttl: "30m"
data_dir: "/data/active"
snapshot_store:
enabled: true
endpoint: "minio:9000"
secure: false
bucket: "fetchml-snapshots"
prefix: "snapshots"
timeout: "5m"
max_retries: 3
podman_image: "python:3.9-slim"
container_workspace: "/workspace"
container_results: "/results"
gpu_devices: []
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"
metrics:
enabled: true
listen_addr: ":9100"
metrics_flush_interval: "500ms"
task_lease_duration: "30m"
heartbeat_interval: "1m"
max_retries: 3
graceful_timeout: "5m"

View file

@ -1,51 +0,0 @@
# Worker configuration for Docker production-like testing
worker_id: "docker-test-worker-1"
# Redis configuration
redis:
url: "redis://redis:6379"
max_connections: 10
# Local mode settings
local_mode: false # Use Podman for containerized job execution
# Job paths
base_path: "/tmp/fetchml-jobs"
# Container workspace (not used in local mode)
container_workspace: "/workspace"
container_results: "/results"
# Podman settings (not used in local mode)
podman_image: "python:3.9-slim"
podman_cpus: "2"
podman_memory: "4g"
# Worker configuration
heartbeat_interval: "30s"
lease_duration: "5m"
max_concurrent_tasks: 1
# Data manager settings
data_manager:
enabled: false
base_path: "/data"
# SSH settings for Podman communication
ssh:
enabled: true
host: "localhost"
port: 2222
user: "worker"
password: "SecureWorkerPass2024!"
key_path: "/home/worker/.ssh/id_rsa"
# Logging
logging:
level: "info"
file: "/logs/worker.log"
# Metrics
metrics:
enabled: true
endpoint: ":9100"

View file

@ -1,79 +0,0 @@
# Worker configuration for Homelab secure environment
worker_id: "homelab-secure-worker-1"
# Redis configuration with connection pooling
redis:
url: "redis://redis:6379"
max_connections: 10
connection_timeout: "10s"
read_timeout: "5s"
write_timeout: "5s"
# Local mode disabled for containerized execution
local_mode: false
# Job paths with security considerations
base_path: "/tmp/fetchml-jobs"
container_workspace: "/workspace"
container_results: "/results"
# Podman settings with resource limits
podman_image: "python:3.11-slim"
podman_cpus: "2"
podman_memory: "4g"
podman_network: "ml-job-network"
podman_timeout: "30m"
# Worker configuration with security
heartbeat_interval: "30s"
lease_duration: "5m"
max_concurrent_tasks: 2
task_timeout: "30m"
# Data manager settings
data_manager:
enabled: true
base_path: "/data"
encryption_enabled: true
backup_enabled: true
# SSH settings with secure configuration
ssh:
enabled: true
host: "localhost"
port: 2222
user: "worker"
password: "HomelabWorker2024!"
key_path: "/home/worker/.ssh/id_rsa"
max_retries: 3
connection_timeout: "30s"
strict_host_key_checking: false
# Logging with rotation and security
logging:
level: "info"
file: "/logs/worker.log"
max_size: "50MB"
max_backups: 5
compress: true
audit_enabled: true
# Metrics and monitoring
metrics:
enabled: true
endpoint: ":9100"
path: "/metrics"
# Security settings
security:
enable_job_isolation: true
sandbox_enabled: true
resource_monitoring: true
audit_commands: true
# Health check configuration
health_check:
enabled: true
interval: "30s"
timeout: "10s"
failure_threshold: 3

View file

@ -4,7 +4,7 @@ max_workers = 4
# Redis connection # Redis connection
redis_addr = "localhost:6379" redis_addr = "localhost:6379"
redis_password = "your-redis-password" redis_password = "CHANGE_ME_REDIS_PASSWORD"
redis_db = 0 redis_db = 0
# SSH connection (for remote operations) # SSH connection (for remote operations)
@ -15,17 +15,13 @@ ssh_key = "~/.ssh/id_rsa"
# Podman configuration # Podman configuration
podman_image = "ml-training:latest" podman_image = "ml-training:latest"
gpu_access = true gpu_vendor = "none"
gpu_visible_devices = []
gpu_devices = []
container_workspace = "/workspace" container_workspace = "/workspace"
container_results = "/results" container_results = "/results"
train_script = "train.py" train_script = "train.py"
[resources]
max_workers = 4
desired_rps_per_worker = 2
podman_cpus = "4"
podman_memory = "16g"
# Dataset management # Dataset management
auto_fetch_data = true auto_fetch_data = true
data_dir = "/data/datasets" data_dir = "/data/datasets"
@ -36,10 +32,16 @@ dataset_cache_ttl = "24h"
task_lease_duration = "1h" task_lease_duration = "1h"
heartbeat_interval = "30s" heartbeat_interval = "30s"
graceful_timeout = "5m" graceful_timeout = "5m"
poll_interval = "100ms" poll_interval_seconds = 1
metrics_flush_interval = "10s" metrics_flush_interval = "10s"
[resources]
max_workers = 4
desired_rps_per_worker = 2
podman_cpus = "4"
podman_memory = "16g"
# Metrics exporter # Metrics exporter
[metrics] [metrics]
enabled = true enabled = true
listen_addr = ":9090" listen_addr = ":9100"

45
deployments/Caddyfile.dev Normal file
View file

@ -0,0 +1,45 @@
{
auto_https off
admin off
servers {
protocols h1 h2
}
}
http://localhost {
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
handle {
respond 404
}
}
https://localhost {
tls internal
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
handle {
respond 404
}
}

View file

@ -0,0 +1,44 @@
{
admin off
servers {
protocols h1 h2
}
}
{$FETCHML_DOMAIN} {
encode gzip
tls /etc/caddy/ssl/cert.pem /etc/caddy/ssl/key.pem
header {
-Server
X-Frame-Options "DENY"
X-Content-Type-Options "nosniff"
Referrer-Policy "strict-origin-when-cross-origin"
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
}
@admin path /admin/*
@admin_private remote_ip private_ranges
handle @admin {
respond @admin_private 404
respond 404
}
handle /health {
reverse_proxy api-server:9101
}
handle /ws* {
reverse_proxy api-server:9101
}
handle /api/* {
reverse_proxy api-server:9101
}
handle {
respond 404
}
}

View file

@ -0,0 +1,47 @@
# Global options for the production proxy.
{
	# ACME account email used for automatic certificate issuance.
	email {$CADDY_EMAIL}
	admin off
	servers {
		protocols h1 h2
	}
}
# Production site (FETCHML_DOMAIN environment variable).
{$FETCHML_DOMAIN} {
	encode gzip
	# Cap request bodies to protect the API from oversized uploads.
	request_body {
		max_size 10MB
	}
	# Hardening headers; -Server strips the Server response banner.
	header {
		-Server
		X-Frame-Options "DENY"
		X-Content-Type-Options "nosniff"
		Referrer-Policy "strict-origin-when-cross-origin"
		Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
		Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
	}
	@admin path /admin/*
	@admin_private remote_ip private_ranges
	handle @admin {
		# NOTE(review): both respond directives return 404, so the
		# @admin_private matcher is currently a no-op -- /admin/* is hidden
		# from everyone. Confirm whether private-range clients were meant
		# to be proxied through instead.
		respond @admin_private 404
		respond 404
	}
	handle /health {
		reverse_proxy api-server:9101
	}
	# TLS/WSS terminates here; the API server stays on internal ws://.
	handle /ws* {
		reverse_proxy api-server:9101
	}
	handle /api/* {
		reverse_proxy api-server:9101
	}
	# Anything not matched above is not served.
	handle {
		respond 404
	}
}

View file

@ -0,0 +1,23 @@
# Minimal Caddyfile used by the smoke test: HTTPS on localhost with an
# internally-issued certificate, proxying the API paths to api-server.
{
	# NOTE(review): auto_https off disables redirects/auto-provisioning;
	# the explicit `tls internal` below presumably still enables HTTPS for
	# the localhost site -- confirm intended behavior.
	auto_https off
}
localhost {
	# Self-signed certificate from Caddy's internal CA.
	tls internal
	handle /health {
		reverse_proxy api-server:9101
	}
	handle /ws* {
		reverse_proxy api-server:9101
	}
	handle /api/* {
		reverse_proxy api-server:9101
	}
	# Anything not matched above is not served.
	handle {
		respond 404
	}
}

76
deployments/Makefile Normal file
View file

@ -0,0 +1,76 @@
# Docker Compose Deployment Management

# Every target in this file is a command, not a file. The alias targets
# (up/down/logs/restart) are listed too: without them, a stray file named
# e.g. "up" in the repo root would shadow the target and make would say
# "'up' is up to date".
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean up down logs restart

# Default target
help: ## Show this help message
	@echo "Available commands:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

# Development environment
dev-up: ## Start development environment
	@echo "Starting development environment..."
	docker-compose -f deployments/docker-compose.dev.yml up -d
	@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"

dev-down: ## Stop development environment
	@echo "Stopping development environment..."
	docker-compose -f deployments/docker-compose.dev.yml down

dev-logs: ## Show development logs
	docker-compose -f deployments/docker-compose.dev.yml logs -f

dev-restart: ## Restart development environment
	@echo "Restarting development environment..."
	docker-compose -f deployments/docker-compose.dev.yml restart

# Homelab environment
homelab-secure-up: ## Start secure homelab environment
	@echo "Starting secure homelab environment..."
	docker-compose -f deployments/docker-compose.homelab-secure.yml up -d

homelab-secure-down: ## Stop secure homelab environment
	@echo "Stopping secure homelab environment..."
	docker-compose -f deployments/docker-compose.homelab-secure.yml down

# Production environment
prod-up: ## Start production environment
	@echo "Starting production environment..."
	docker-compose -f deployments/docker-compose.prod.yml up -d

prod-down: ## Stop production environment
	@echo "Stopping production environment..."
	docker-compose -f deployments/docker-compose.prod.yml down

# Utility commands
status: ## Show status of all environments
	@echo "=== Development Status ==="
	@if [ -f deployments/docker-compose.dev.yml ]; then \
		docker-compose -f deployments/docker-compose.dev.yml ps; \
	fi
	@echo ""
	@echo "=== Homelab Secure Status ==="
	@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
		docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
	fi
	@echo ""
	@echo "=== Production Status ==="
	@if [ -f deployments/docker-compose.prod.yml ]; then \
		docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
	fi

clean: ## Clean up all containers and volumes
	@echo "Cleaning up all Docker resources..."
	@echo "This will remove all containers and volumes. Continue? [y/N]"
	@read -r confirm && [ "$$confirm" = "y" ] || exit 1
	docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
	docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
	docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
	docker system prune -f
	@echo "Cleanup complete."

# Quick aliases
up: dev-up ## Alias for dev-up
down: dev-down ## Alias for dev-down
logs: dev-logs ## Alias for dev-logs
restart: dev-restart ## Alias for dev-restart

View file

@ -2,33 +2,123 @@
This directory contains Docker Compose configurations for different deployment environments. This directory contains Docker Compose configurations for different deployment environments.
## Environment Configurations

### Development (`docker-compose.dev.yml`)
- Full development stack with monitoring
- Includes: API, Worker, Redis, MinIO (snapshots), Prometheus, Grafana, Loki, Promtail
- Optimized for local development and testing
- **Usage**: `docker-compose -f deployments/docker-compose.dev.yml up -d`
## Usage ### Homelab - Secure (`docker-compose.homelab-secure.yml`)
- Secure homelab deployment with authentication and a Caddy reverse proxy
- TLS is terminated at the reverse proxy (Approach A)
- Includes: API, Redis (password protected), Caddy reverse proxy
- **Usage**: `docker-compose -f deployments/docker-compose.homelab-secure.yml up -d`
### Production (`docker-compose.prod.yml`)
- Production deployment configuration
- Optimized for performance and security
- External services assumed (Redis, monitoring)
- **Usage**: `docker-compose -f deployments/docker-compose.prod.yml up -d`
Note: `docker-compose.prod.yml` is a reproducible staging/testing harness. Real production deployments do not require Docker; you can run the Go services directly (systemd) and use Caddy for TLS/WSS termination.
## TLS / WSS Policy
- The Zig CLI currently supports `ws://` only (native `wss://` is not implemented).
- Production deployments terminate TLS/WSS at a reverse proxy (Caddy in `docker-compose.prod.yml`) and keep the API server on internal `ws://`.
- Homelab deployments terminate TLS/WSS at a reverse proxy (Caddy) and keep the API server on internal `ws://`.
- Health checks in compose files should use `http://localhost:9101/health` when `server.tls.enabled: false`.
## Required Volume Mounts
- `base_path` (experiments) must be writable by the API server.
- `data_dir` should be mounted if you want snapshot/dataset integrity validation via `ml validate`.
For the default configs:
- `base_path`: `/data/experiments` (dev/homelab configs) or `/app/data/experiments` (prod configs)
- `data_dir`: `/data/active`
## Quick Start
### Development
```bash ```bash
# Use the main docker-compose.yml in project root # Development (most common)
docker-compose up -d docker-compose -f deployments/docker-compose.dev.yml up -d
# Check status
docker-compose -f deployments/docker-compose.dev.yml ps
# View logs
docker-compose -f deployments/docker-compose.dev.yml logs -f api-server
# Stop services
docker-compose -f deployments/docker-compose.dev.yml down
``` ```
### Homelab (Secure) ## Dev: MinIO-backed snapshots (smoke test)
```bash
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
```
### Production The dev compose file provisions a MinIO bucket and uploads a small example snapshot object at:
```bash
docker-compose -f deployments/docker-compose.prod.yml up -d `s3://fetchml-snapshots/snapshots/snap-1.tar.gz`
```
To queue a task that forces the worker to pull the snapshot from MinIO:
1. Start the dev stack:
`docker-compose -f deployments/docker-compose.dev.yml up -d`
2. Read the `snapshot_sha256` printed by the init job:
`docker-compose -f deployments/docker-compose.dev.yml logs minio-init`
3. Queue a job using the snapshot fields:
`ml queue <job-name> --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
## Smoke tests
- `make dev-smoke` runs the development stack smoke test.
- `make prod-smoke` runs a Docker-based staging smoke test for the production stack, using a localhost-only Caddy configuration.
Note: `ml queue` by itself will generate a random commit ID. For full provenance enforcement (manifest + dependency manifest), use `ml sync ./your-project --queue` so the server has real code + dependency files.
Examples:
- `ml queue train-mnist --priority 3 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
- `ml queue train-a train-b train-c --priority 5 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
## Environment Variables

Create a `.env` file in the project root:
```bash
# Grafana
GRAFANA_ADMIN_PASSWORD=your_secure_password
# API Configuration
LOG_LEVEL=info
# TLS (for secure deployments)
TLS_CERT_PATH=/app/ssl/cert.pem
TLS_KEY_PATH=/app/ssl/key.pem
```
## Service Ports
| Service | Development | Homelab | Production |
|---------|-------------|---------|------------|
| API Server | 9101 | 9101 | 9101 |
| Redis | 6379 | 6379 | - |
| Prometheus | 9090 | - | - |
| Grafana | 3000 | - | - |
| Loki | 3100 | - | - |
## Monitoring ## Monitoring
Performance monitoring configurations are in `monitoring/docker-compose.performance.yml` - **Development**: Full monitoring stack included
- **Homelab**: Basic monitoring (configurable)
- **Production**: External monitoring assumed
## Security Notes
- If you need HTTPS externally, terminate TLS at a reverse proxy.
- API keys should be managed via environment variables
- Database credentials should use secrets management in production

162
deployments/deploy.sh Executable file
View file

@ -0,0 +1,162 @@
#!/bin/bash
# Quick deployment script for fetch_ml
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored output
# Emit an informational line tagged [INFO] in blue.
print_status() {
    local msg="${BLUE}[INFO]${NC} $1"
    echo -e "$msg"
}
# Emit a success line tagged [SUCCESS] in green.
print_success() {
    local msg="${GREEN}[SUCCESS]${NC} $1"
    echo -e "$msg"
}
# Emit a warning line tagged [WARNING] in yellow.
# Writes to stderr so warnings stay visible even when a caller's stdout
# is being captured by command substitution (see check_compose_file).
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}
# Emit an error line tagged [ERROR] in red.
# Writes to stderr: main() calls `compose_file=$(check_compose_file ...)`,
# and with the original stdout echo the error text was captured into the
# variable instead of being shown to the user.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Function to show usage
# Function to show usage
# Prints the accepted environments (dev|secure|prod) and actions
# (up|down|restart|logs|status), plus invocation examples.
show_usage() {
    echo "Usage: $0 [ENVIRONMENT] [ACTION]"
    echo ""
    echo "Environments:"
    echo "  dev     Development environment"
    echo "  secure  Secure homelab environment"
    echo "  prod    Production environment"
    echo ""
    echo "Actions:"
    echo "  up      Start services"
    echo "  down    Stop services"
    echo "  restart Restart services"
    echo "  logs    Show logs"
    echo "  status  Show status"
    echo ""
    echo "Examples:"
    echo "  $0 dev up       # Start development environment"
    echo "  $0 prod down    # Stop production environment"
    echo "  $0 secure logs  # Show secure environment logs"
}
# Function to check if docker-compose file exists
# Map an environment name (dev|secure|prod) to its docker-compose file and
# verify the file exists.
# Prints the compose file path on stdout (the ONLY stdout output, since
# main() captures it with $(...)); all diagnostics are redirected to
# stderr so they are not swallowed by the command substitution.
check_compose_file() {
    local env=$1
    local compose_file=""

    case $env in
        "dev")
            compose_file="deployments/docker-compose.dev.yml"
            ;;
        "secure")
            compose_file="deployments/docker-compose.homelab-secure.yml"
            ;;
        "prod")
            compose_file="deployments/docker-compose.prod.yml"
            ;;
        *)
            print_error "Unknown environment: $env" >&2
            show_usage >&2
            exit 1
            ;;
    esac

    if [ ! -f "$compose_file" ]; then
        print_error "Docker Compose file not found: $compose_file" >&2
        exit 1
    fi

    echo "$compose_file"
}
# Function to check if .env file exists
# Ensure a .env file exists in the working directory, seeding it from the
# matching example file when missing.
# $1: environment name; "prod" uses env.prod.example, everything else
#     falls back to env.dev.example (same mapping as before).
# Fails with a clear message if the example file itself is absent, instead
# of letting `cp` abort cryptically under `set -e`.
check_env_file() {
    local env=$1
    local example

    # Nothing to do if the user already has a .env.
    if [ -f ".env" ]; then
        return 0
    fi

    case $env in
        "prod") example="deployments/env.prod.example" ;;
        *)      example="deployments/env.dev.example" ;;
    esac

    if [ ! -f "$example" ]; then
        print_error "Example env file not found: $example" >&2
        exit 1
    fi

    print_warning ".env file not found. Creating from example..."
    cp "$example" .env
    print_warning "Please edit .env file with your configuration"
}
# Main script
# Entry point: requires exactly two arguments (ENVIRONMENT ACTION).
# Resolves the compose file for the environment, ensures a .env exists,
# then dispatches the requested docker-compose action.
main() {
    if [ $# -ne 2 ]; then
        show_usage
        exit 1
    fi

    local environment=$1
    local action=$2

    print_status "Environment: $environment"
    print_status "Action: $action"

    # Check compose file (path is echoed on stdout by check_compose_file)
    compose_file=$(check_compose_file "$environment")
    print_status "Using: $compose_file"

    # Check .env file (seeds one from the example if missing)
    check_env_file "$environment"

    # Execute action
    case $action in
        "up")
            print_status "Starting $environment environment..."
            docker-compose -f "$compose_file" up -d
            print_success "$environment environment started successfully!"

            # Show service URLs
            echo ""
            print_status "Service URLs:"
            echo "  API Server: http://localhost:9101"
            # Dev is the only stack that bundles Grafana/Prometheus.
            if [ "$environment" = "dev" ]; then
                echo "  Grafana: http://localhost:3000 (admin/admin123)"
                echo "  Prometheus: http://localhost:9090"
            fi
            ;;
        "down")
            print_status "Stopping $environment environment..."
            docker-compose -f "$compose_file" down
            print_success "$environment environment stopped successfully!"
            ;;
        "restart")
            print_status "Restarting $environment environment..."
            docker-compose -f "$compose_file" restart
            print_success "$environment environment restarted successfully!"
            ;;
        "logs")
            # Follows logs until interrupted (docker-compose logs -f).
            print_status "Showing logs for $environment environment..."
            docker-compose -f "$compose_file" logs -f
            ;;
        "status")
            print_status "Status of $environment environment:"
            docker-compose -f "$compose_file" ps
            ;;
        *)
            print_error "Unknown action: $action"
            show_usage
            exit 1
            ;;
    esac
}
# Run main function
main "$@"

View file

@ -0,0 +1,225 @@
# Homelab Docker Compose with Centralized Monitoring
# Includes: API, Redis, Prometheus, Grafana, Loki
services:
caddy:
image: caddy:2-alpine
container_name: ml-dev-caddy
restart: unless-stopped
ports:
- "8080:80"
- "8443:443"
volumes:
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config
depends_on:
api-server:
condition: service_healthy
redis:
image: redis:7-alpine
container_name: ml-experiments-redis
user: "999:999"
ports:
- "6379:6379"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data
restart: unless-stopped
command: redis-server --appendonly yes
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
interval: 30s
timeout: 10s
retries: 3
  # FetchML API server (dev). Only exposed on the internal network;
  # external traffic reaches it through the Caddy reverse proxy.
  api-server:
    build:
      context: ${FETCHML_REPO_ROOT:-.}
      dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
    container_name: ml-experiments-api
    # Runs as root so the startup command can create the data directories.
    user: "0:0"
    expose:
      - "9101" # API and health endpoints (internal; external access via Caddy)
    volumes:
      - ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
      - ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
      - ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
      - ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
      - ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml
      - ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl
    # NOTE(review): plain depends_on does not wait for redis to be healthy
    # (the worker service uses service_healthy) -- confirm the API server
    # tolerates redis starting late.
    depends_on:
      - redis
    restart: unless-stopped
    # Create the expected data dirs, then exec the server binary.
    command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
    environment:
      - LOG_LEVEL=info
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    # Labels consumed by promtail for log scraping.
    labels:
      logging: "promtail"
      job: "api-server"
minio:
image: minio/minio:latest
container_name: ml-experiments-minio
ports:
- "9000:9000"
- "9001:9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data
environment:
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin123
command: ["server", "/data", "--console-address", ":9001"]
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 10s
timeout: 5s
retries: 10
restart: unless-stopped
  # One-shot bootstrap job: installs the MinIO client (mc), waits for the
  # minio service, creates the fetchml-snapshots bucket, and uploads a
  # small example snapshot object (snap-1) for dev smoke tests. It prints
  # "snapshot_id=... snapshot_sha256=..." on success; the digest it prints
  # is sha256(sha256(hello.txt)) as computed in the script below.
  minio-init:
    image: alpine:3.19
    container_name: ml-experiments-minio-init
    depends_on:
      minio:
        condition: service_healthy
    entrypoint: ["/bin/sh", "-c"]
    # The block scalar below is the literal shell program passed to sh -c;
    # $$ is compose's escape for a literal $.
    command:
      - |
        set -eu
        apk add --no-cache ca-certificates curl tar gzip
        ARCH=$$(uname -m)
        MC_ARCH=amd64
        if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
          MC_ARCH=arm64
        fi
        curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
        chmod +x /usr/local/bin/mc
        i=0
        while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
          i=$$((i+1))
          if [ $$i -ge 30 ]; then
            echo "minio not ready after 30 attempts" >&2
            exit 1
          fi
          echo "waiting for minio... ($$i/30)"
          sleep 1
        done
        mc mb -p local/fetchml-snapshots || true
        mkdir -p /tmp/snapshots/snap-1
        echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
        tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
        mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
        FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
        SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
        echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
    # Run once; do not restart after the job completes.
    restart: "no"
worker:
build:
context: ${FETCHML_REPO_ROOT:-.}
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-experiments-worker
user: "0:0"
ports:
- "8888:8888"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
- /sys/fs/cgroup:/sys/fs/cgroup:rw
depends_on:
redis:
condition: service_healthy
api-server:
condition: service_healthy
minio-init:
condition: service_completed_successfully
restart: unless-stopped
environment:
- LOG_LEVEL=info
- MINIO_ROOT_USER=minioadmin
- MINIO_ROOT_PASSWORD=minioadmin123
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest
- FETCHML_JUPYTER_CONDA_ENV=base
- FETCHML_JUPYTER_KERNEL_NAME=python
- FETCHML_PODMAN_CGROUPS=disabled
privileged: true
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
# Prometheus - Metrics collection
prometheus:
image: prom/prometheus:latest
container_name: ml-experiments-prometheus
ports:
- "9090:9090"
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# Grafana - Visualization
grafana:
image: grafana/grafana:latest
container_name: ml-experiments-grafana
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
- GF_USERS_ALLOW_SIGN_UP=false
restart: unless-stopped
depends_on:
- prometheus
- loki
# Loki - Log aggregation
loki:
image: grafana/loki:latest
container_name: ml-experiments-loki
ports:
- "3100:3100"
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
- loki_data:/loki
command: -config.file=/etc/loki/local-config.yaml
restart: unless-stopped
# Promtail - Log collector
promtail:
image: grafana/promtail:latest
container_name: ml-experiments-promtail
volumes:
- ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock
command: -config.file=/etc/promtail/config.yml
restart: unless-stopped
depends_on:
- loki
volumes:
prometheus_data:
driver: local
grafana_data:
driver: local
loki_data:
driver: local

View file

@ -1,104 +1,152 @@
# Homelab Secure Docker Environment # Secure Homelab Docker Compose Configuration
services: # Use with: docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
redis:
image: redis:7-alpine
container_name: ml-homelab-redis
ports:
- "6379:6379"
volumes:
- redis_homelab_data:/data
restart: unless-stopped
command: >
redis-server
--appendonly yes
--requirepass "HomelabRedis2024!"
--maxmemory 512mb
--maxmemory-policy allkeys-lru
healthcheck:
test: ["CMD", "redis-cli", "-a", "HomelabRedis2024!", "ping"]
interval: 30s
timeout: 10s
retries: 3
networks:
- ml-homelab-network
services:
api-server: api-server:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/homelab-secure.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-homelab-api container_name: ml-experiments-api
ports: ports:
- "9104:9101" # API server port - "9101:9101"
- "2223:2222" # Secure SSH port - "9100:9100" # Prometheus metrics endpoint
- "9101:9100" # Prometheus metrics
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
- ./configs/config-homelab-secure.yaml:/app/configs/config.yaml - ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl:ro
- ${FETCHML_REPO_ROOT:-.}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
- ${FETCHML_REPO_ROOT:-.}/.env.secure:/app/.env.secure:ro
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
- TZ=America/New_York # Load secure environment variables
- JWT_SECRET_FILE=/app/.env.secure
healthcheck: healthcheck:
test: ["CMD", "curl", "-k", "-f", "https://localhost:9101/health"] test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s
command: > labels:
sh -c " logging: "promtail"
sudo /app/start-security.sh & job: "api-server"
/usr/local/bin/api-server -config /app/configs/config.yaml command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
"
networks: networks:
- ml-homelab-network - ml-experiments-network
# Add internal network for secure communication
- ml-backend-network
minio:
image: minio/minio:latest
container_name: ml-experiments-minio
ports:
- "9000:9000"
- "9001:9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/homelab/minio:/data
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
command: ["server", "/data", "--console-address", ":9001"]
restart: unless-stopped
networks:
- ml-backend-network
minio-init:
image: alpine:3.19
container_name: ml-experiments-minio-init
depends_on:
- minio
entrypoint: ["/bin/sh", "-c"]
command:
- |
apk add --no-cache ca-certificates curl >/dev/null
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
mc mb -p local/fetchml-snapshots || true
restart: "no"
networks:
- ml-backend-network
worker: worker:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/homelab-secure.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-homelab-worker container_name: ml-experiments-worker
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/app/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
- ./configs/worker-homelab-secure.yaml:/app/configs/worker.yaml - ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
api-server: api-server:
condition: service_healthy condition: service_healthy
minio-init:
condition: service_started
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
- TZ=America/New_York - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
privileged: true # Required for Podman - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
security_opt: - REDIS_PASSWORD=${REDIS_PASSWORD}
- no-new-privileges:true privileged: true
cap_drop: command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
- ALL
cap_add:
- NET_ADMIN
- SYS_ADMIN
command: >
sh -c "
sudo /app/start-security.sh &
/usr/local/bin/worker -config /app/configs/worker.yaml
"
networks: networks:
- ml-homelab-network - ml-backend-network
volumes: caddy:
redis_homelab_data: image: caddy:2-alpine
driver: local container_name: ml-experiments-caddy
restart: unless-stopped
ports:
- "80:80"
- "443:443"
volumes:
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/ssl:/etc/caddy/ssl:ro
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/config:/config
environment:
- FETCHML_DOMAIN=${FETCHML_DOMAIN:-ml.local}
depends_on:
api-server:
condition: service_healthy
networks:
- ml-experiments-network
# Redis with authentication
redis:
image: redis:7-alpine
container_name: ml-experiments-redis
user: "999:999"
ports:
- "127.0.0.1:6379:6379" # Bind to localhost only
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/homelab/redis:/data
- ${FETCHML_REPO_ROOT:-.}/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
restart: unless-stopped
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
healthcheck:
test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]
interval: 30s
timeout: 10s
retries: 3
networks:
- ml-backend-network
environment:
- REDIS_PASSWORD=${REDIS_PASSWORD}
volumes: {}
networks: networks:
ml-homelab-network: ml-experiments-network:
driver: bridge
ml-backend-network:
driver: bridge driver: bridge
ipam:
config:
- subnet: 172.25.0.0/16

View file

@ -0,0 +1,75 @@
services:
caddy:
image: caddy:2-alpine
environment:
- FETCHML_DOMAIN=localhost
- CADDY_EMAIL=smoke@example.invalid
ports:
- "8080:80"
- "8443:443"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/config:/config
command:
- /bin/sh
- -c
- |
cat > /etc/caddy/Caddyfile <<'EOF'
{
debug
servers {
protocols h1 h2
}
}
https://localhost {
tls internal {
protocols tls1.2 tls1.3
}
handle {
reverse_proxy api-server:9101
}
}
EOF
exec caddy run --config /etc/caddy/Caddyfile
redis:
image: redis:7-alpine
user: "999:999"
restart: unless-stopped
expose:
- "6379"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/redis:/data
command: redis-server --appendonly yes
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
interval: 10s
timeout: 5s
retries: 10
api-server:
build:
context: ${FETCHML_REPO_ROOT:-.}
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
user: "0:0"
restart: unless-stopped
expose:
- "9101"
depends_on:
redis:
condition: service_healthy
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/experiments:/data/experiments
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/active:/data/active
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
interval: 10s
timeout: 5s
retries: 10
volumes: {}

View file

@ -1,12 +1,31 @@
# Full Production Docker Environment with Podman and SQLite # Full Production Docker Environment with Podman and SQLite
services: services:
caddy:
image: caddy:2-alpine
container_name: ml-prod-caddy
restart: unless-stopped
ports:
- "80:80"
- "443:443"
volumes:
- ./Caddyfile.prod:/etc/caddy/Caddyfile:ro
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config
environment:
- FETCHML_DOMAIN=${FETCHML_DOMAIN}
- CADDY_EMAIL=${CADDY_EMAIL}
depends_on:
api-server:
condition: service_healthy
redis: redis:
image: redis:7-alpine image: redis:7-alpine
container_name: ml-prod-redis container_name: ml-prod-redis
ports: user: "999:999"
- "6379:6379" expose:
- "6379"
volumes: volumes:
- redis_prod_data:/data - ${FETCHML_REPO_ROOT:-.}/data/prod/redis:/data
restart: unless-stopped restart: unless-stopped
command: redis-server --appendonly yes command: redis-server --appendonly yes
healthcheck: healthcheck:
@ -17,57 +36,87 @@ services:
api-server: api-server:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/secure-prod.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile
container_name: ml-prod-api container_name: ml-prod-api
ports: expose:
- "9103:9101" # API server port - "9101" # API server port (internal; external access via Caddy)
- "2222:2222" # Secure SSH port for Podman communication - "2222" # Secure SSH port for Podman communication (internal)
- "9100:9100" # Prometheus metrics
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
- ./configs/config-multi-user.yaml:/app/configs/config.yaml - ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
healthcheck: healthcheck:
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ] test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s
# Start SSH daemon for Podman communication # Start API server (ensure data_dir exists for snapshot/dataset validation)
command: ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
minio:
image: minio/minio:latest
container_name: ml-prod-minio
expose:
- "9000"
- "9001"
volumes:
- ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
command: ["server", "/data", "--console-address", ":9001"]
restart: unless-stopped
minio-init:
image: alpine:3.19
container_name: ml-prod-minio-init
depends_on:
- minio
entrypoint: ["/bin/sh", "-c"]
command:
- |
apk add --no-cache ca-certificates curl >/dev/null
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x /usr/local/bin/mc
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
mc mb -p local/fetchml-snapshots || true
restart: "no"
worker: worker:
build: build:
context: . context: ${FETCHML_REPO_ROOT:-.}
dockerfile: build/docker/secure-prod.Dockerfile dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
container_name: ml-prod-worker container_name: ml-prod-worker
volumes: volumes:
- ./data:/app/data/experiments - ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
- ./logs:/logs - ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
- ./configs/worker-docker.yaml:/app/configs/worker.yaml - ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-prod.yaml:/app/configs/worker.yaml
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
api-server: api-server:
condition: service_healthy condition: service_healthy
minio-init:
condition: service_started
restart: unless-stopped restart: unless-stopped
environment: environment:
- REDIS_URL=redis://redis:6379
- LOG_LEVEL=info - LOG_LEVEL=info
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
privileged: true # Required for Podman to work in Docker privileged: true # Required for Podman to work in Docker
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"] command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
volumes: volumes: {}
redis_prod_data:
driver: local
networks: networks:
default: default:

View file

@ -0,0 +1,17 @@
# Development Environment Variables
# Copy this file to .env and modify as needed
# Grafana
GRAFANA_ADMIN_PASSWORD=admin123
# API Configuration
LOG_LEVEL=info
# TLS (development uses self-signed certs)
TLS_CERT_PATH=/app/ssl/cert.pem
TLS_KEY_PATH=/app/ssl/key.pem
# Development-specific
ENVIRONMENT=development
DEBUG=true
API_KEY=development_key_only

View file

@ -0,0 +1,28 @@
# Production Environment Variables
# Copy this file to .env and modify as needed
# Grafana (if using)
GRAFANA_ADMIN_PASSWORD=CHANGE_ME_SECURE_PASSWORD
# API Configuration
LOG_LEVEL=warn
# TLS (production should use CA-signed certs)
TLS_CERT_PATH=/app/ssl/cert.pem
TLS_KEY_PATH=/app/ssl/key.pem
# Caddy (TLS/WSS termination)
FETCHML_DOMAIN=ml.example.com
CADDY_EMAIL=admin@example.com
# Production-specific
ENVIRONMENT=production
DEBUG=false
# Security
API_KEY=CHANGE_ME_SECURE_API_KEY
ALLOWED_ORIGINS=https://ml.example.com
# External services (if applicable)
EXTERNAL_REDIS_URL=redis://external-redis:6379
EXTERNAL_PROMETHEUS_URL=http://external-prometheus:9090

112
deployments/setup.sh Normal file
View file

@ -0,0 +1,112 @@
#!/usr/bin/env bash
set -euo pipefail

# Print invocation help. This script is purely informational and never
# modifies the system.
usage() {
cat <<'EOF'
Usage: ./deployments/setup.sh
This script DOES NOT install dependencies.
It prints the manual steps and required/optional dependencies for a real (non-Docker) production deployment.
EOF
}

# Emit the full manual-deployment guide for a real (non-Docker) production
# install: build, user/dirs, API config, Caddy TLS termination, Redis,
# systemd supervision, and a final smoke check.
print_guide() {
cat <<'EOF'
== FetchML production setup (non-Docker) ==
Required (core):
- Go-built binaries: api-server, worker
- Redis (reachable from api-server + worker)
- A writable base_path for experiments
- A writable data_dir if you want snapshot/dataset staging + integrity validation
Required (TLS/WSS):
- Caddy (recommended) OR another reverse proxy that can terminate TLS and proxy WebSockets
Optional:
- systemd (recommended) for service supervision
- MinIO / S3-compatible storage (only if you use remote snapshot_store)
- Podman (only if your worker executes jobs in Podman)
Notes:
- The Zig CLI currently supports ws:// only. In production, keep the API server internal on ws:// and terminate TLS/WSS at Caddy.
- This script is informational; it will not modify your system.
---
1) Build binaries
make prod
Artifacts:
./bin/api-server
./bin/worker
---
2) Create a dedicated user (recommended)
useradd --system --create-home --shell /usr/sbin/nologin fetchml
---
3) Create directories (example paths)
mkdir -p /var/lib/fetchml/experiments
mkdir -p /var/lib/fetchml/active/datasets /var/lib/fetchml/active/snapshots
mkdir -p /var/log/fetchml
Ensure ownership:
chown -R fetchml:fetchml /var/lib/fetchml /var/log/fetchml
---
4) Configure the API server
- Start from: configs/api/prod.yaml (or your multi-user config)
- For real production, keep server.tls.enabled: false
- Ensure monitoring.health_checks.enabled is set appropriately
Example flags:
./bin/api-server -config /etc/fetchml/api.yaml
---
5) Configure Caddy (TLS/WSS termination)
- Recommended: use deployments/Caddyfile.prod as a baseline.
- Caddy should listen on 443 and reverse proxy to the API server (internal) on 9101.
Example layout:
/etc/caddy/Caddyfile
/var/lib/caddy
---
6) Configure Redis
- Use Redis AUTH in production.
- Ensure the api-server + worker can reach it.
---
7) Run under systemd (recommended)
Create unit files (example):
/etc/systemd/system/fetchml-api.service
/etc/systemd/system/fetchml-worker.service
/etc/systemd/system/caddy.service (if not already provided)
Then:
systemctl daemon-reload
systemctl enable --now fetchml-api
systemctl enable --now fetchml-worker
systemctl enable --now caddy
---
8) Smoke check
Internal health (no TLS):
curl -f http://127.0.0.1:9101/health
External health (through Caddy TLS termination):
curl -f https://YOUR_DOMAIN/health
EOF
}

# Only recognized flag is help; anything else (including no args) prints
# the guide, matching the original single-purpose behavior.
case "${1:-}" in
-h|--help)
usage
exit 0
;;
esac

print_guide

View file

@ -1,13 +1,52 @@
# Centralized Monitoring Stack # Monitoring Stack
## Directory Structure (Canonical)
All monitoring configuration lives under `monitoring/`.
```text
monitoring/
prometheus/
prometheus.yml # Prometheus scrape configuration
grafana/
dashboards/ # Grafana dashboards (JSON)
provisioning/
datasources/ # Grafana data sources (Prometheus/Loki)
dashboards/ # Grafana dashboard provider (points at dashboards/)
loki-config.yml # Loki configuration
promtail-config.yml # Promtail configuration
```
### What is "Grafana provisioning"?
Grafana provisioning is how Grafana auto-configures itself on startup (no clicking in the UI):
- **`grafana/provisioning/datasources/*.yml`**
- Defines where Grafana reads data from (e.g. Prometheus at `http://prometheus:9090`, Loki at `http://loki:3100`).
- **`grafana/provisioning/dashboards/*.yml`**
- Tells Grafana to load dashboard JSON files from `/var/lib/grafana/dashboards`.
- **`grafana/dashboards/*.json`**
- The dashboards themselves.
### Source of truth
- **Dashboards**: edit/add JSON in `monitoring/grafana/dashboards/`.
- **Grafana provisioning**: edit files in `monitoring/grafana/provisioning/`.
- **Prometheus scrape config**: edit `monitoring/prometheus/prometheus.yml`.
`scripts/setup_monitoring.py` is intentionally **provisioning-only**:
- It (re)writes Grafana **datasources** and the **dashboard provider**.
- It does **not** create or overwrite any dashboard JSON files.
## Quick Start

```bash
# Start deployment
make deploy-up

# Access services
open http://localhost:3000  # Grafana (admin/admin123)
open http://localhost:9090  # Prometheus
```
@ -15,137 +54,80 @@ open http://localhost:9090 # Prometheus
### Grafana (Port 3000)
**Main monitoring dashboard**
- Username: `admin`
- Password: `admin123`
- Data source: Prometheus (http://localhost:9090)
- Pre-loaded ML Queue dashboard
### Prometheus (Port 9090) ### Prometheus (Port 9090)
**Metrics collection** **Metrics collection and storage**
- Scrapes metrics from API server (`:9100/metrics`)
- 15s scrape interval
- Data retention: 15 days (default)
### Loki (Port 3100) ### Loki (Port 3100)
**Log aggregation** **Log aggregation**
- Collects logs from all containers
- Collects application logs from `./logs/`
- Retention: 7 days
### Promtail ## Dashboards
**Log shipping**
- Watches Docker container logs
- Watches `./logs/*.log`
- Sends to Loki
## Viewing Data Available dashboard configurations in `grafana/dashboards/`:
### Metrics - `load-test-performance.json` - Load test metrics
1. Open Grafana: http://localhost:3000 - `websocket-performance.json` - WebSocket performance
2. Go to "ML Task Queue Monitoring" dashboard - `system-health.json` - System health monitoring
3. See: queue depth, task duration, error rates, etc. - `rsync-performance.json` - Rsync performance metrics
### Logs ### Importing Dashboards
1. Open Grafana → Explore
2. Select "Loki" datasource
3. Query examples:
```logql
{job="app_logs"} # All app logs
{job="docker",service="api-server"} # API server logs
{job="docker"} |= "error" # All errors
```
## Architecture 1. Go to Grafana → "+" → "Import"
2. Upload JSON files from `grafana/dashboards/` directory
``` 3. Select Prometheus data source
┌─────────────┐
│ API Server │──┐
└─────────────┘ │
├──► Prometheus ──► Grafana
┌─────────────┐ │ ▲
│ Worker │──┘ │
└─────────────┘ │
┌─────────────┐ │
│ App Logs │──┐ │
└─────────────┘ │ │
├──► Promtail ──► Loki ┘
┌─────────────┐ │
│Docker Logs │──┘
└─────────────┘
```
## Configuration Files ## Configuration Files
- `prometheus.yml` - Metrics scraping config - `prometheus/prometheus.yml` - Prometheus configuration
- `loki-config.yml` - Log storage config - `loki-config.yml` - Loki configuration
- `promtail-config.yml` - Log collection config - `promtail-config.yml` - Promtail configuration
- `grafana/provisioning/` - Auto-configuration - `security_rules.yml` - Security rules
## Customization ## Usage
### Add More Scrapers 1. Start monitoring stack: `make deploy-up`
Edit `monitoring/prometheus.yml`: 2. Access Grafana: http://localhost:3000 (admin/admin123)
```yaml 3. Import dashboards from `grafana/dashboards/` directory
scrape_configs: 4. View metrics and test results in real-time
- job_name: 'my-service'
static_configs:
- targets: ['my-service:9100']
```
### Change Retention ## Health Endpoints
**Prometheus:** Add to command in docker-compose:
```yaml
- '--storage.tsdb.retention.time=30d'
```
**Loki:** Edit `loki-config.yml`: The API server provides health check endpoints for monitoring:
```yaml
limits_config:
retention_period: 720h # 30 days
```
## Troubleshooting - **`/health`** - Overall service health (for Docker healthcheck)
- **`/health/live`** - Liveness probe (is the service running?)
- **`/health/ready`** - Readiness probe (can the service accept traffic?)
**No metrics showing:** ### Testing Health Endpoints
```bash
# Check if Prometheus can reach targets
curl http://localhost:9090/api/v1/targets
# Check if API exposes metrics
curl http://localhost:9100/metrics
```
**No logs showing:**
```bash
# Check Promtail status
docker logs ml-experiments-promtail
# Verify Loki is receiving logs
curl http://localhost:3100/ready
```
**Grafana can't connect to datasources:**
```bash
# Restart Grafana
docker-compose restart grafana
```
## Profiling Quick Start
To capture CPU profiles while exercising real workloads:
```bash ```bash
# HTTP LoadTestSuite (MediumLoad scenario) # Basic health check
make profile-load curl -k https://localhost:9101/health
# WebSocket → Redis queue → worker integration # Liveness check (for K8s or monitoring)
make profile-ws-queue curl -k https://localhost:9101/health/live
# Readiness check (verifies dependencies)
curl -k https://localhost:9101/health/ready
``` ```
Then inspect profiles with: See `health-testing.md` for detailed testing procedures.
```bash ## Prometheus Integration
go tool pprof cpu_load.out # HTTP load
go tool pprof cpu_ws.out # WebSocket/queue/worker Prometheus scrapes the following endpoints:
``` - `api-server:9101/metrics` - Application metrics (future)
- `api-server:9101/health` - Health status monitoring
- `host.docker.internal:9100/metrics` - Worker metrics (when the worker runs on the host)
- `worker:9100/metrics` - Worker metrics (when the worker runs as a container in the compose network)
## Cleanup (deprecated paths)
These legacy paths may still exist in the repo but are **not used** by the current dev compose config:
- `monitoring/dashboards/` (old dashboards location)
- `monitoring/prometheus.yml` (old Prometheus config location)
- `monitoring/grafana/provisioning/dashboards/dashboard.yml` (duplicate of `dashboards.yml`)

View file

@ -1,147 +0,0 @@
{
"dashboard": {
"title": "ML Task Queue Monitoring",
"tags": [
"ml",
"queue",
"fetch_ml"
],
"timezone": "browser",
"panels": [
{
"title": "Queue Depth",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "fetch_ml_queue_depth",
"legendFormat": "Queue Depth"
}
]
},
{
"title": "Active Tasks",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(fetch_ml_active_tasks) by (worker_id)",
"legendFormat": "{{worker_id}}"
}
]
},
{
"title": "Task Duration (p50, p95, p99)",
"type": "graph",
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
"legendFormat": "p99"
}
]
},
{
"title": "Task Completion Rate",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"targets": [
{
"expr": "rate(fetch_ml_tasks_completed_total[5m])",
"legendFormat": "{{status}}"
}
]
},
{
"title": "Failure Rate by Error Category",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"targets": [
{
"expr": "rate(fetch_ml_task_failures_total[5m])",
"legendFormat": "{{error_category}}"
}
]
},
{
"title": "Retry Rate",
"type": "graph",
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"targets": [
{
"expr": "rate(fetch_ml_task_retries_total[5m])",
"legendFormat": "{{error_category}}"
}
]
},
{
"title": "Dead Letter Queue Size",
"type": "stat",
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 24
},
"targets": [
{
"expr": "fetch_ml_dlq_size"
}
]
},
{
"title": "Lease Expirations",
"type": "stat",
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 24
},
"targets": [
{
"expr": "fetch_ml_lease_expirations_total"
}
]
}
]
}
}

View file

@ -1,278 +0,0 @@
{
"dashboard": {
"title": "Application Logs",
"tags": [
"logs",
"loki",
"fetch_ml"
],
"timezone": "browser",
"editable": true,
"graphTooltip": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"panels": [
{
"title": "Log Stream",
"type": "logs",
"gridPos": {
"x": 0,
"y": 0,
"w": 24,
"h": 12
},
"id": 1,
"targets": [
{
"expr": "{job=\"app_logs\"}",
"refId": "A",
"datasource": "Loki"
}
],
"options": {
"showTime": true,
"showLabels": true,
"showCommonLabels": false,
"wrapLogMessage": false,
"prettifyLogMessage": false,
"enableLogDetails": true,
"dedupStrategy": "none",
"sortOrder": "Descending"
}
},
{
"title": "Log Level Distribution",
"type": "bargauge",
"gridPos": {
"x": 0,
"y": 12,
"w": 8,
"h": 8
},
"id": 2,
"targets": [
{
"expr": "sum by (level) (count_over_time({job=\"app_logs\"} | logfmt | level != \"\" [5m]))",
"refId": "A",
"datasource": "Loki",
"legendFormat": "{{level}}"
}
],
"options": {
"orientation": "horizontal",
"displayMode": "gradient",
"showUnfilled": true
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "INFO"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "green"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "WARN"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "ERROR"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
}
},
{
"title": "Error Logs (Last Hour)",
"type": "table",
"gridPos": {
"x": 8,
"y": 12,
"w": 16,
"h": 8
},
"id": 3,
"targets": [
{
"expr": "{job=\"app_logs\"} | logfmt | level=\"ERROR\"",
"refId": "A",
"datasource": "Loki"
}
],
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"title": "Logs by Component",
"type": "timeseries",
"gridPos": {
"x": 0,
"y": 20,
"w": 12,
"h": 8
},
"id": 4,
"targets": [
{
"expr": "sum by (component) (rate({job=\"app_logs\"} | logfmt [1m]))",
"refId": "A",
"datasource": "Loki",
"legendFormat": "{{component}}"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "smooth",
"fillOpacity": 10,
"spanNulls": false,
"showPoints": "never",
"stacking": {
"mode": "none"
}
},
"unit": "reqps"
}
}
},
{
"title": "Warning Logs Timeline",
"type": "timeseries",
"gridPos": {
"x": 12,
"y": 20,
"w": 12,
"h": 8
},
"id": 5,
"targets": [
{
"expr": "sum(count_over_time({job=\"app_logs\"} | logfmt | level=\"WARN\" [1m]))",
"refId": "A",
"datasource": "Loki",
"legendFormat": "Warnings"
}
],
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "bars",
"fillOpacity": 50
},
"color": {
"mode": "fixed",
"fixedColor": "yellow"
}
}
}
},
{
"title": "Search Logs",
"type": "logs",
"gridPos": {
"x": 0,
"y": 28,
"w": 24,
"h": 10
},
"id": 6,
"targets": [
{
"expr": "{job=\"app_logs\"} |= \"$search_term\"",
"refId": "A",
"datasource": "Loki"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true
}
}
],
"templating": {
"list": [
{
"name": "search_term",
"type": "textbox",
"label": "Search Term",
"current": {
"value": "",
"text": ""
}
}
]
},
"refresh": "30s"
}
}

View file

@ -1,157 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "loki",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkAPIServerCreateJobSimple\"",
"legendFormat": "API Job Creation",
"refId": "A"
},
{
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkMLExperimentExecution/SmallExperiment\"",
"legendFormat": "ML Small Experiment",
"refId": "B"
},
{
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkDatasetOperations/DatasetCreation\"",
"legendFormat": "Dataset Creation",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "API Performance Trends",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "Time (ns/op)",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"datasource": "loki",
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"showLabels": true
},
"targets": [
{
"expr": "{job=\"fetchml-performance\"} |= \"Performance Summary\"",
"legendFormat": "{{timestamp}}",
"refId": "A"
}
],
"title": "Latest Performance Summary",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 27,
"style": "dark",
"tags": ["fetchml", "performance"],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Fetch ML Performance Dashboard",
"uid": "fetchml-performance",
"version": 1
}

View file

@ -1,64 +0,0 @@
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus-data:/prometheus
networks:
- monitoring
loki:
image: grafana/loki:2.9.0
ports:
- "3100:3100"
command: -config.file=/etc/loki/local-config.yaml
volumes:
- ./loki-performance-config.yaml:/etc/loki/local-config.yaml
networks:
- monitoring
promtail:
image: grafana/promtail:latest
volumes:
- ./promtail-performance-config.yaml:/etc/promtail/config.yml
- /var/log:/var/log:ro
command: -config.file=/etc/promtail/config.yml
networks:
- monitoring
pushgateway:
image: prom/pushgateway:latest
ports:
- "9091:9091"
networks:
- monitoring
grafana:
image: grafana/grafana:latest
ports:
- "3001:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
networks:
- monitoring
volumes:
loki-data:
grafana-data:
prometheus-data:
networks:
monitoring:
driver: bridge

View file

@ -0,0 +1,51 @@
{
"dashboard": {
"id": null,
"title": "Load Test Performance",
"tags": [
"load-test",
"performance"
],
"panels": [
{
"id": 1,
"title": "Service Health",
"type": "stat",
"targets": [
{
"expr": "up",
"legendFormat": "{{job}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "RPS"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

View file

@ -0,0 +1 @@
{"dashboard": {"id": null, "title": "Load Test Performance", "tags": ["load-test", "performance"], "panels": [{"id": 1, "title": "Service Status", "type": "stat", "targets": [{"expr": "up", "legendFormat": "{{job}}"}]}]}}

View file

@ -0,0 +1,51 @@
{
"dashboard": {
"id": null,
"title": "Log Analysis",
"tags": [
"loki",
"logs"
],
"panels": [
{
"id": 1,
"title": "Error Logs",
"type": "logs",
"targets": [
{
"expr": "{job=~\".+\"} |= \"error\"",
"legendFormat": "Errors"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "All Logs",
"type": "logs",
"targets": [
{
"expr": "{job=~\".+\"}",
"legendFormat": "All logs"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
}
],
"time": {
"from": "now-30m",
"to": "now"
},
"refresh": "30s"
}
}

View file

@ -0,0 +1,135 @@
{
"dashboard": {
"id": null,
"title": "Prewarm Performance",
"tags": ["prewarm", "performance", "worker"],
"panels": [
{
"id": 1,
"title": "Environment Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 2,
"title": "Snapshot Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 0},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 3,
"title": "Environment Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_env_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
{"expr": "rate(fetchml_prewarm_env_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"yAxes": [{"unit": "reqps"}]
},
{
"id": 4,
"title": "Snapshot Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
{"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"yAxes": [{"unit": "reqps"}]
},
{
"id": 5,
"title": "Environment Build Time",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", "legendFormat": "build time {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"yAxes": [{"unit": "seconds"}]
},
{
"id": 6,
"title": "Snapshot Prewarm Time",
"type": "graph",
"targets": [
{"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", "legendFormat": "prewarm time {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"yAxes": [{"unit": "seconds"}]
},
{
"id": 7,
"title": "Environment Images Built",
"type": "graph",
"targets": [
{"expr": "increase(fetchml_prewarm_env_built_total[1h])", "legendFormat": "built {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
"yAxes": [{"unit": "short"}]
},
{
"id": 8,
"title": "Snapshots Prewarmed",
"type": "graph",
"targets": [
{"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", "legendFormat": "prewarmed {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
"yAxes": [{"unit": "short"}]
},
{
"id": 9,
"title": "Prewarm Efficiency",
"type": "graph",
"targets": [
{"expr": "fetchml_prewarm_env_hit_total + fetchml_prewarm_snapshot_hit_total", "legendFormat": "total hits {{worker_id}}"},
{"expr": "fetchml_prewarm_env_miss_total + fetchml_prewarm_snapshot_miss_total", "legendFormat": "total misses {{worker_id}}"}
],
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
"yAxes": [{"unit": "short"}]
}
],
"time": {"from": "now-1h", "to": "now"},
"refresh": "5s"
}
}

View file

@ -0,0 +1,86 @@
{
"dashboard": {
"id": null,
"title": "Rsync Performance",
"tags": [
"rsync",
"sync",
"performance"
],
"panels": [
{
"id": 1,
"title": "Rsync Operations",
"type": "graph",
"targets": [
{
"expr": "rate(rsync_operations_total[5m])",
"legendFormat": "Operations/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Data Transfer Rate",
"type": "graph",
"targets": [
{
"expr": "rate(rsync_bytes_transferred_total[5m])",
"legendFormat": "Bytes/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "Sync Duration",
"type": "graph",
"targets": [
{
"expr": "rsync_sync_duration_seconds",
"legendFormat": "Duration"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
}
},
{
"id": 4,
"title": "Sync Errors",
"type": "graph",
"targets": [
{
"expr": "rate(rsync_errors_total[5m])",
"legendFormat": "Errors/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

View file

@ -0,0 +1,51 @@
{
"dashboard": {
"id": null,
"title": "System Health",
"tags": [
"system",
"health"
],
"panels": [
{
"id": 1,
"title": "Service Status",
"type": "stat",
"targets": [
{
"expr": "up",
"legendFormat": "{{job}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "process_resident_memory_bytes",
"legendFormat": "Memory"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "10s"
}
}

View file

@ -0,0 +1,68 @@
{
"dashboard": {
"id": null,
"title": "WebSocket Performance",
"tags": [
"websocket",
"performance"
],
"panels": [
{
"id": 1,
"title": "WebSocket Connections",
"type": "graph",
"targets": [
{
"expr": "websocket_connections_active",
"legendFormat": "Active Connections"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "WebSocket Messages",
"type": "graph",
"targets": [
{
"expr": "rate(websocket_messages_total[5m])",
"legendFormat": "Messages/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "Connection Errors",
"type": "graph",
"targets": [
{
"expr": "rate(websocket_connection_errors_total[5m])",
"legendFormat": "Errors/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

View file

@ -0,0 +1,280 @@
{
"id": null,
"title": "Worker Resources",
"tags": [
"worker",
"resources"
],
"panels": [
{
"id": 1,
"title": "CPU Free",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_cpu_free",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "CPU Total",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_cpu_total",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 0
}
},
{
"id": 3,
"title": "CPU Utilization (%)",
"type": "graph",
"targets": [
{
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 4,
"title": "GPU Slots Free",
"type": "graph",
"targets": [
{
"expr": "fetchml_resources_gpu_slots_free",
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
}
},
{
"id": 5,
"title": "Acquire Wait / Timeout (Totals)",
"type": "graph",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_total",
"legendFormat": "wait {{worker_id}}"
},
{
"expr": "fetchml_resources_acquire_timeout_total",
"legendFormat": "timeout {{worker_id}}"
},
{
"expr": "fetchml_resources_acquire_total",
"legendFormat": "total {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
}
},
{
"id": 6,
"title": "Avg Acquire Wait (seconds)",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 14
}
},
{
"id": 7,
"title": "Acquire Wait Ratio",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 14
}
},
{
"id": 8,
"title": "Environment Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 9,
"title": "Snapshot Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 10,
"title": "Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
"legendFormat": "env hits {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
"legendFormat": "env misses {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
"legendFormat": "snapshot hits {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
"legendFormat": "snapshot misses {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 20
},
"yAxes": [
{"unit": "reqps"}
]
},
{
"id": 11,
"title": "Prewarm Build Time",
"type": "graph",
"targets": [
{
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
"legendFormat": "env build {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
"legendFormat": "snapshot prewarm {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 28
},
"yAxes": [
{"unit": "seconds"}
]
},
{
"id": 12,
"title": "Prewarm Builds",
"type": "graph",
"targets": [
{
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
"legendFormat": "env built {{worker_id}}"
},
{
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
"legendFormat": "snapshots prewarmed {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 28
},
"yAxes": [
{"unit": "short"}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}

View file

@ -1,5 +1,4 @@
apiVersion: 1 apiVersion: 1
providers: providers:
- name: 'default' - name: 'default'
orgId: 1 orgId: 1

View file

@ -0,0 +1,9 @@
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki:3100
editable: true
jsonData:
maxLines: 1000

View file

@ -1,16 +1,10 @@
apiVersion: 1 apiVersion: 1
datasources: datasources:
- name: Prometheus - name: Prometheus
type: prometheus type: prometheus
access: proxy access: proxy
url: http://prometheus:9090 url: http://prometheus:9090
isDefault: false
editable: false
- name: Loki
type: loki
access: proxy
url: http://loki:3100
isDefault: true isDefault: true
editable: false editable: true
jsonData:
timeInterval: "5s"

View file

@ -0,0 +1,100 @@
# Testing Health Endpoints with Monitoring Stack
## Verify Health Endpoints
```bash
# 1. Start the monitoring stack
cd deployments
docker-compose -f docker-compose.dev.yml up -d
# 2. Wait for services to start (30 seconds)
sleep 30
# 3. Test health endpoints
curl -k https://localhost:9101/health
# Expected: {"status":"healthy","timestamp":"...","checks":{}}
curl -k https://localhost:9101/health/live
# Expected: {"status":"alive","timestamp":"..."}
curl -k https://localhost:9101/health/ready
# Expected: {"status":"ready","timestamp":"...","checks":{"queue":"ok","experiments":"ok"}}
# 4. Check Docker health status
docker ps | grep api-server
# Should show: (healthy)
# 5. Access Grafana
open http://localhost:3000
# Login: admin / admin123
# 6. Access Prometheus
open http://localhost:9090
# Check targets: Status > Targets
# Should see: api-server (plus the worker, benchmark, and loki jobs)
# 7. Query health metrics in Prometheus
# Go to Graph and enter: up{job="api-server"}
# Should show: value=1 (service is up)
```
## Health Check Integration
### Docker Compose
The health check is configured in `deployments/docker-compose.dev.yml`:
```yaml
healthcheck:
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
```
### Prometheus Monitoring
Prometheus scrapes health status every 30s from:
- `/health` - Overall service health
- `/metrics` - Future Prometheus metrics (when implemented)
### Kubernetes (Future)
Health endpoints ready for K8s probes:
```yaml
livenessProbe:
httpGet:
path: /health/live
port: 9101
scheme: HTTPS
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health/ready
port: 9101
scheme: HTTPS
initialDelaySeconds: 10
periodSeconds: 5
```
## Monitoring Stack Services
- **Grafana** (port 3000): Dashboards and visualization
- **Prometheus** (port 9090): Metrics collection
- **Loki** (port 3100): Log aggregation
- **Promtail**: Log shipping
## Troubleshooting
```bash
# Check API server logs
docker logs ml-experiments-api
# Check Prometheus targets
curl http://localhost:9090/api/v1/targets
# Check health endpoint directly
docker exec ml-experiments-api curl -k https://localhost:9101/health
# Restart services
docker-compose -f deployments/docker-compose.dev.yml restart api-server
```

View file

@ -12,7 +12,7 @@ common:
rules_directory: /loki/rules rules_directory: /loki/rules
replication_factor: 1 replication_factor: 1
ring: ring:
instance_addr: 127.0.0.1 instance_addr: 0.0.0.0
kvstore: kvstore:
store: inmemory store: inmemory

View file

@ -1,40 +0,0 @@
# Loki single-binary configuration for local/dev use:
# in-memory ring, filesystem storage, no authentication.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      # Single instance: no replication.
      replication_factor: 1
    final_sleep: 0s
    # NOTE(review): indentation was lost in extraction; in upstream Loki
    # examples min_ready_duration belongs under lifecycler — confirm.
    min_ready_duration: 0s
  # Flush chunks that have been idle for an hour.
  chunk_idle_period: 1h
  max_chunk_age: 1h
  # Target chunk size in bytes (1 MiB) before cutting a new chunk.
  chunk_target_size: 1048576
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
  filesystem:
    # Chunk data is written to the local filesystem.
    directory: /loki/chunks

limits_config:
  reject_old_samples: true
  # 168h = 7 days.
  reject_old_samples_max_age: 168h
  allow_structured_metadata: false

View file

@ -5,39 +5,35 @@ global:
evaluation_interval: 15s evaluation_interval: 15s
scrape_configs: scrape_configs:
# API Server metrics # API Server metrics and health
- job_name: 'api-server' - job_name: 'api-server'
scheme: http
static_configs: static_configs:
- targets: ['api-server:9100'] - targets: ['api-server:9101']
labels: labels:
service: 'api-server' service: 'api-server'
metrics_path: /metrics # Future: Prometheus metrics endpoint
# Worker metrics (if running in docker) # Benchmark metrics from Pushgateway
- job_name: 'benchmark'
static_configs: []
# Worker metrics (ResourceManager + task execution)
# For docker-compose dev on macOS/Windows, Prometheus can reach a locally running worker
# via host.docker.internal.
- job_name: 'worker' - job_name: 'worker'
scrape_interval: 15s
static_configs: static_configs:
- targets: ['worker:9100'] - targets: ['worker:9100']
labels: labels:
service: 'worker' service: 'worker'
# Allow failures if worker not running target_type: 'container'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
# Benchmark metrics from Pushgateway
- job_name: 'benchmark'
static_configs:
- targets: ['localhost:9091']
labels:
service: 'benchmark'
metrics_path: /metrics metrics_path: /metrics
honor_labels: true
# Loki metrics # Loki metrics
- job_name: 'loki' - job_name: 'loki'
static_configs: static_configs:
- targets: ['ml-experiments-loki:3100'] - targets: ['loki:3100']
labels: labels:
service: 'loki' service: 'loki'
metrics_path: /metrics metrics_path: /metrics

View file

@ -1,50 +0,0 @@
# Promtail configuration: ships FetchML benchmark/performance logs to Loki.
server:
  http_listen_port: 9080
  # 0 disables the gRPC listener.
  grpc_listen_port: 0

positions:
  # Tracks read offsets so tailing resumes after restarts.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Structured (JSON) per-benchmark performance records.
  - job_name: fetchml-performance
    static_configs:
      - targets:
          - localhost
        labels:
          job: fetchml-performance
          __path__: /reports/performance.log
    pipeline_stages:
      # Extract fields from each JSON log line.
      - json:
          expressions:
            timestamp: timestamp
            git_commit: git_commit
            benchmark_name: name
            time_per_op: time_per_op_ns
            memory_per_op: memory_per_op_b
            allocs_per_op: allocs_per_op
      # Promote selected extracted fields to Loki labels for querying.
      - labels:
          benchmark_name:
          git_commit:
      # NOTE(review): 'output' is not among the extracted expressions above —
      # confirm this stage is intentional.
      - output:
          source: output
  # Free-form performance summary log (shares the same Loki job label).
  - job_name: fetchml-performance-summary
    static_configs:
      - targets:
          - localhost
        labels:
          job: fetchml-performance
          __path__: /reports/performance_summary.log
    pipeline_stages:
      - regex:
          expression: "=== Performance Summary ==="
      - output:
          source: output

View file

@ -1,112 +0,0 @@
# Prometheus alerting rules covering FetchML security signals and basic
# host health (memory, CPU, disk, service availability).
#
# NOTE: rate() yields a per-second average over the given window, so the
# numeric thresholds below are events *per second*, not per minute. The
# annotation texts previously claimed "per minute", which overstated the
# thresholds by 60x; they are corrected here to match the expressions.
groups:
  - name: security.rules
    rules:
      # Sustained elevated authentication failures (credential probing).
      - alert: HighFailedAuthRate
        expr: rate(failed_auth_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High rate of failed authentication attempts"
          description: "More than 10 failed auth attempts per second (5m average) sustained for 2 minutes"

      # Short, sharp spike in auth failures (possible brute force attack).
      - alert: BruteForceAttack
        expr: rate(failed_auth_total[1m]) > 30
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Potential brute force attack detected"
          description: "More than 30 failed auth attempts per second (1m average)"

      # Unusual WebSocket connection patterns.
      - alert: UnusualWebSocketActivity
        expr: rate(websocket_connections_total[5m]) > 100
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Unusual WebSocket connection activity"
          description: "WebSocket connection rate is unusually high"

      # Clients repeatedly hitting the API rate limiter.
      - alert: RateLimitBreached
        expr: rate(rate_limit_exceeded_total[5m]) > 5
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Rate limits being exceeded"
          description: "Rate limit exceeded more than 5 times per second (5m average)"

      # SSL certificate expiration warning.
      - alert: SSLCertificateExpiring
        expr: ssl_certificate_expiry_days < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon"
          description: "SSL certificate will expire in less than 30 days"

      # High memory usage (node_exporter metrics).
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90%"

      # High CPU usage, averaged across all cores per instance.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80%"

      # Disk space running low.
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space"
          description: "Disk space is below 10%"

      # Any scraped target reporting down.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.instance }} service has been down for more than 1 minute"

      # Elevated ratio of HTTP 5xx responses.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 10%"

      # Single client generating an abnormal request volume.
      # NOTE(review): 'requests_by_ip' looks like a counter but lacks the
      # conventional '_total' suffix, and the expression does not aggregate
      # per IP — confirm the metric name and add 'by (ip)' if appropriate.
      - alert: SuspiciousIPActivity
        expr: rate(requests_by_ip[5m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Suspicious IP activity"
          description: "IP address making unusually many requests"

View file

@ -118,7 +118,7 @@ jupyter>=1.0.0
"allow_network": false, "allow_network": false,
"blocked_packages": ["requests", "urllib3", "httpx"], "blocked_packages": ["requests", "urllib3", "httpx"],
"max_execution_time": 3600, "max_execution_time": 3600,
"gpu_access": true, "gpu_devices": ["/dev/dri"],
"ml_env": "ml_env", "ml_env": "ml_env",
"package_manager": "mamba" "package_manager": "mamba"
} }

View file

@ -32,6 +32,10 @@ RUN mamba install -n ml_env \
-c pytorch -c conda-forge -y && \ -c pytorch -c conda-forge -y && \
conda clean -afy conda clean -afy
# Poetry (for pyproject.toml + poetry.lock projects)
RUN mamba install -n ml_env poetry -c conda-forge -y && \
conda clean -afy
# Copy security wrapper # Copy security wrapper
COPY secure_runner.py /usr/local/bin/secure_runner.py COPY secure_runner.py /usr/local/bin/secure_runner.py
COPY security_policy.json /etc/ml_runner/security_policy.json COPY security_policy.json /etc/ml_runner/security_policy.json

View file

@ -45,7 +45,7 @@ class SecurityPolicy:
], ],
"max_execution_time": 3600, "max_execution_time": 3600,
"max_memory_gb": 16, "max_memory_gb": 16,
"gpu_access": True, "gpu_devices": ["/dev/dri"],
"allow_file_writes": True, "allow_file_writes": True,
"resource_limits": { "resource_limits": {
"cpu_count": 4, "cpu_count": 4,
@ -106,97 +106,197 @@ class CondaRunner:
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda") self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}" self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
def setup_environment(self, requirements_file: Path) -> bool: self.gpu_devices = self.security_policy.policy.get("gpu_devices", [])
"""Setup Conda environment with mamba"""
def setup_environment(self, deps_file: Path) -> bool:
"""Setup Conda environment based on a dependency manifest."""
try: try:
# Read requirements name = deps_file.name
with open(requirements_file, "r") as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
# Check each package for security print(f"[MANIFEST] Using dependency manifest: {name}")
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
if not self.security_policy.check_package_safety(package_name):
print(
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
)
return False
# Install packages with mamba (super fast!) if name in ("environment.yml", "environment.yaml"):
for req in requirements: print(f"[SETUP] Applying conda environment file: {deps_file}")
package_name = ( cmd = [
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
# Check if already installed with conda
check_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"python",
"-c",
f"import {package_name.replace('-', '_')}",
]
result = subprocess.run(
check_cmd, capture_output=True, text=True
)
if result.returncode == 0:
print(f"[OK] {package_name} already installed in conda env")
continue
# Try conda-forge first (faster and more reliable)
print(
f"[INSTALL] Installing {req} with {self.package_manager}..."
)
install_cmd = [
self.package_manager, self.package_manager,
"install", "env",
"update",
"-n", "-n",
self.conda_env, self.conda_env,
req, "-f",
"-c", str(deps_file),
"conda-forge",
"-y", "-y",
] ]
result = subprocess.run( result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
install_cmd, capture_output=True, text=True, timeout=300 if result.returncode != 0:
print(f"[ERROR] Failed to apply environment file: {result.stderr}")
return False
return True
if name == "poetry.lock":
pyproject = self.workspace_dir / "pyproject.toml"
if not pyproject.exists():
print("[ERROR] poetry.lock provided but pyproject.toml is missing")
return False
print(f"[SETUP] Installing dependencies from Poetry lockfile: {deps_file}")
env = os.environ.copy()
env.update(
{
"POETRY_VIRTUALENVS_CREATE": "false",
"POETRY_NO_INTERACTION": "1",
}
) )
if result.returncode == 0: # Ensure Poetry is available in the conda env.
print(f"[OK] Installed {req} with {self.package_manager}") check = subprocess.run(
continue ["conda", "run", "-n", self.conda_env, "poetry", "--version"],
capture_output=True,
text=True,
env=env,
)
if check.returncode != 0:
print("[ERROR] Poetry is not available in the container environment")
print(check.stderr)
return False
# Fallback to pip if conda fails # Install into the conda env (no separate venv).
print(f"[FALLBACK] Trying pip for {req}...") install = subprocess.run(
pip_cmd = [ [
"conda",
"run",
"-n",
self.conda_env,
"poetry",
"install",
"--no-ansi",
],
capture_output=True,
text=True,
timeout=900,
cwd=str(self.workspace_dir),
env=env,
)
if install.returncode != 0:
print("[ERROR] Poetry install failed")
print(install.stderr)
return False
return True
if name == "pyproject.toml":
# Use pip's PEP517/pyproject support (no Poetry required).
# This installs the project itself; dependencies may be fetched as needed.
print(f"[SETUP] Installing project from pyproject.toml: {deps_file}")
cmd = [
"conda", "conda",
"run", "run",
"-n", "-n",
self.conda_env, self.conda_env,
"pip", "pip",
"install", "install",
req, str(self.workspace_dir),
"--no-cache-dir", "--no-cache-dir",
] ]
result = subprocess.run( result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
pip_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode != 0: if result.returncode != 0:
print(f"[ERROR] Failed to install {req}: {result.stderr}") print(f"[ERROR] Failed to install project from pyproject.toml: {result.stderr}")
return False return False
return True
print(f"[OK] Installed {req} with pip") if name == "requirements.txt":
# Read requirements
with open(deps_file, "r") as f:
requirements = [
line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
return True # Check each package for security
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
if not self.security_policy.check_package_safety(package_name):
print(
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
)
return False
# Install packages with mamba (super fast!)
for req in requirements:
package_name = (
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
)
# Check if already installed with conda
check_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"python",
"-c",
f"import {package_name.replace('-', '_')}",
]
result = subprocess.run(
check_cmd, capture_output=True, text=True
)
if result.returncode == 0:
print(f"[OK] {package_name} already installed in conda env")
continue
# Try conda-forge first (faster and more reliable)
print(
f"[INSTALL] Installing {req} with {self.package_manager}..."
)
install_cmd = [
self.package_manager,
"install",
"-n",
self.conda_env,
req,
"-c",
"conda-forge",
"-y",
]
result = subprocess.run(
install_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode == 0:
print(f"[OK] Installed {req} with {self.package_manager}")
continue
# Fallback to pip if conda fails
print(f"[FALLBACK] Trying pip for {req}...")
pip_cmd = [
"conda",
"run",
"-n",
self.conda_env,
"pip",
"install",
req,
"--no-cache-dir",
]
result = subprocess.run(
pip_cmd, capture_output=True, text=True, timeout=300
)
if result.returncode != 0:
print(f"[ERROR] Failed to install {req}: {result.stderr}")
return False
print(f"[OK] Installed {req} with pip")
return True
print(f"[ERROR] Unsupported dependency manifest: {deps_file}")
print("Supported: environment.yml, environment.yaml, poetry.lock (requires pyproject.toml), pyproject.toml, requirements.txt")
return False
except Exception as e: except Exception as e:
print(f"[ERROR] Environment setup failed: {e}") print(f"[ERROR] Environment setup failed: {e}")
@ -217,7 +317,7 @@ class CondaRunner:
env.update( env.update(
{ {
"CONDA_DEFAULT_ENV": self.conda_env, "CONDA_DEFAULT_ENV": self.conda_env,
"CUDA_VISIBLE_DEVICES": "0", # Allow GPU access "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", ""), # Allow GPU access
"SECURE_MODE": "1", "SECURE_MODE": "1",
"NETWORK_ACCESS": ( "NETWORK_ACCESS": (
"1" "1"
@ -280,7 +380,7 @@ class CondaRunner:
"stdout": stdout, "stdout": stdout,
"stderr": stderr, "stderr": stderr,
"return_code": process.returncode, "return_code": process.returncode,
"gpu_accessible": True, "gpu_accessible": len(self.gpu_devices) > 0,
"security_mode": "enabled", "security_mode": "enabled",
"container_type": "conda", "container_type": "conda",
"conda_env": self.conda_env, "conda_env": self.conda_env,
@ -338,8 +438,12 @@ def main():
parser.add_argument( parser.add_argument(
"--workspace", default="/workspace", help="Workspace directory" "--workspace", default="/workspace", help="Workspace directory"
) )
parser.add_argument("--requirements", help="Requirements file path") parser.add_argument("--deps", help="Dependency manifest path (environment.yml | poetry.lock | pyproject.toml | requirements.txt)")
parser.add_argument("--requirements", help="Deprecated alias for --deps")
parser.add_argument("--script", help="Training script path") parser.add_argument("--script", help="Training script path")
parser.add_argument(
"--prepare-only", action="store_true", help="Only prepare dependencies and exit"
)
parser.add_argument( parser.add_argument(
"--args", "--args",
nargs=argparse.REMAINDER, nargs=argparse.REMAINDER,
@ -383,17 +487,26 @@ def main():
if args.check_gpu: if args.check_gpu:
return 0 return 0
deps_arg = args.deps or args.requirements
if not deps_arg:
print("[ERROR] Missing dependency manifest. Provide --deps.")
return 1
# Setup environment # Setup environment
requirements_path = Path(args.requirements) deps_path = Path(deps_arg)
if not requirements_path.exists(): if not deps_path.exists():
print(f"[ERROR] Requirements file not found: {requirements_path}") print(f"[ERROR] Dependency manifest not found: {deps_path}")
return 1 return 1
print("[SETUP] Setting up secure environment...") print("[SETUP] Setting up secure environment...")
if not runner.setup_environment(requirements_path): if not runner.setup_environment(deps_path):
print("[ERROR] Failed to setup secure environment") print("[ERROR] Failed to setup secure environment")
return 1 return 1
if args.prepare_only:
print("[DONE] Environment prepared successfully")
return 0
# Run experiment # Run experiment
script_path = Path(args.script) script_path = Path(args.script)
if not script_path.exists(): if not script_path.exists():

View file

@ -24,7 +24,7 @@
], ],
"max_execution_time": 3600, "max_execution_time": 3600,
"max_memory_gb": 16, "max_memory_gb": 16,
"gpu_access": true, "gpu_devices": ["/dev/dri"],
"allow_file_writes": true, "allow_file_writes": true,
"resource_limits": { "resource_limits": {
"cpu_count": 4, "cpu_count": 4,

View file

@ -20,19 +20,12 @@ This directory contains setup and utility scripts for FetchML.
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group
``` ```
### `validate-prod-config.sh` ### Configuration validation
**Purpose**: Validates production configuration files Validate configs using the built-in config lint targets:
**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]`
**What it does**:
- Checks config file syntax
- Verifies base_path consistency
- Tests Redis connectivity
- Validates Podman setup
- Checks directory permissions
**Example**:
```bash ```bash
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml make configlint
make worker-configlint
``` ```
## Legacy Setup Scripts (Deprecated) ## Legacy Setup Scripts (Deprecated)
@ -44,12 +37,11 @@ The following scripts are from earlier iterations and are **deprecated** in favo
- `auto_setup.sh` - Old automated setup (superseded) - `auto_setup.sh` - Old automated setup (superseded)
- `setup_common.sh` - Common functions (integrated into setup-prod.sh) - `setup_common.sh` - Common functions (integrated into setup-prod.sh)
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead) - `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh)
### Cleanup Recommendation ### Cleanup Recommendation
These legacy scripts can be removed or archived. The current production setup only needs: These legacy scripts can be removed or archived. The current production setup only needs:
- `setup-prod.sh` - `setup-prod.sh`
- `validate-prod-config.sh`
## Usage Workflow ## Usage Workflow
@ -59,8 +51,8 @@ These legacy scripts can be removed or archived. The current production setup on
sudo ./scripts/setup-prod.sh sudo ./scripts/setup-prod.sh
# 2. Copy and configure # 2. Copy and configure
sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml sudo cp configs/api/prod.yaml /etc/fetch_ml/config.yaml
sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml sudo cp configs/workers/worker-prod.toml /etc/fetch_ml/worker.toml
sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc. sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc.
# 3. Build and install # 3. Build and install
@ -68,7 +60,8 @@ make prod
sudo make install sudo make install
# 4. Validate # 4. Validate
./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml ./bin/configlint --schema configs/schema/api_server_config.yaml /etc/fetch_ml/config.yaml
./bin/configlint --schema configs/schema/worker_config_schema.yaml /etc/fetch_ml/worker.toml
# 5. Start services # 5. Start services
sudo systemctl start fetchml-api fetchml-worker sudo systemctl start fetchml-api fetchml-worker
@ -82,7 +75,7 @@ docker-compose up -d
# Or run components directly # Or run components directly
make dev make dev
./bin/api-server -config configs/config-local.yaml ./bin/api-server -config configs/api/dev.yaml
``` ```
## Script Maintenance ## Script Maintenance

View file

@ -8,6 +8,7 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S") TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP" RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP"
@ -168,14 +169,25 @@ if [ -f "$SCRIPT_DIR/cleanup-benchmarks.sh" ]; then
"$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks "$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks
else else
# Fallback cleanup if script not available # Fallback cleanup if script not available
echo "Cleaning old benchmark runs (keeping last 10)..." echo "Archiving old benchmark runs (keeping last 10)..."
stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || echo "No old runs to clean" ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
[ -n "$run" ] || continue
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done
# Clean temporary files # Clean temporary files
echo "Cleaning temporary files..." echo "Archiving temporary files..."
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true mkdir -p "$tmp_archive_dir"
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
# Clean Go build cache # Clean Go build cache
echo "Cleaning Go build cache..." echo "Cleaning Go build cache..."

View file

View file

@ -1,49 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

# Create a Bitwarden item for a FetchML API user.
#
# Usage:
#   ./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash>
#
# Requirements:
#   - Bitwarden CLI (bw) installed
#   - You are logged in and unlocked (bw login; bw unlock)
#   - jq installed
#
# This script does NOT run on the homelab server. Run it from your
# own machine where you manage Bitwarden.

# Fail fast with a clear message if a required tool is missing
# (the script's stated requirements were previously unchecked).
for tool in bw jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required tool '$tool' is not installed or not in PATH" >&2
    exit 1
  fi
done

if [[ $# -ne 3 ]]; then
  echo "Usage: $0 <username> <api_key> <api_key_hash>" >&2
  exit 1
fi

USER_NAME="$1"
API_KEY="$2"
API_KEY_HASH="$3"

# Fixed: the separator here was a mis-encoded (mojibake) dash character.
ITEM_NAME="FetchML API - $USER_NAME"

# Get base item template
TEMPLATE_JSON=$(bw get template item)

# Build item JSON with jq. The API key is stored as the login password and
# its hash as a custom field (field type 1 = hidden).
ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \
  --arg name "$ITEM_NAME" \
  --arg username "$USER_NAME" \
  --arg password "$API_KEY" \
  --arg hash "$API_KEY_HASH" \
  '.name = $name
   | .login.username = $username
   | .login.password = $password
   | .notes = "FetchML API key for user " + $username
   | .fields = [{"name":"api_key_hash","value":$hash,"type":1}]')

# Create item in Bitwarden.
# If you ever want to edit instead, you can capture the ID from this call
# and use: bw edit item <id> <json>
echo "$ITEM_JSON" | bw encode | bw create item

echo "Created Bitwarden item: $ITEM_NAME"

View file

@ -1,90 +0,0 @@
#!/bin/bash
# Setup auto-cleanup service for fetch_ml.
# On Linux this installs a systemd timer that runs cleanup daily;
# on macOS it installs an equivalent launchd agent.
set -euo pipefail

# Resolve the directory containing this script and the project root above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Colors
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'

# Print an informational message with a blue [INFO] prefix.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

# Print a success message with a green [SUCCESS] prefix.
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_info "Setting up auto-cleanup service..."

# Check if running on macOS or Linux
if [[ "$OSTYPE" == "darwin"* ]]; then
    log_info "Detected macOS - setting up launchd agent"

    # Create launchd plist.
    # The heredoc delimiter is deliberately unquoted so that $PROJECT_DIR is
    # expanded into the generated plist. StartInterval 86400 = every 24 hours.
    cat > ~/Library/LaunchAgents/com.fetchml.cleanup.plist << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>com.fetchml.cleanup</string>
    <key>ProgramArguments</key>
    <array>
        <string>$PROJECT_DIR/scripts/cleanup.sh</string>
        <string>--force</string>
    </array>
    <key>StartInterval</key>
    <integer>86400</integer>
    <key>RunAtLoad</key>
    <false/>
    <key>StandardOutPath</key>
    <string>/tmp/fetchml-cleanup.log</string>
    <key>StandardErrorPath</key>
    <string>/tmp/fetchml-cleanup.error.log</string>
</dict>
</plist>
EOF

    # Load the launchd agent so the daily schedule takes effect immediately.
    # NOTE(review): 'launchctl load' is the legacy subcommand on newer macOS
    # ('launchctl bootstrap' is preferred) — confirm target macOS versions.
    launchctl load ~/Library/LaunchAgents/com.fetchml.cleanup.plist

    log_success "Auto-cleanup service installed for macOS"
    log_info "Logs will be in /tmp/fetchml-cleanup.log"

elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
    log_info "Detected Linux - setting up systemd timer"

    # Copy service files (unit + timer are expected to live next to this script).
    sudo cp "$SCRIPT_DIR/auto-cleanup.service" /etc/systemd/system/
    sudo cp "$SCRIPT_DIR/auto-cleanup.timer" /etc/systemd/system/

    # Reload systemd and enable timer
    sudo systemctl daemon-reload
    sudo systemctl enable auto-cleanup.timer
    sudo systemctl start auto-cleanup.timer

    log_success "Auto-cleanup service installed for Linux"
    log_info "Check status with: systemctl status auto-cleanup.timer"
else
    echo "Unsupported OS: $OSTYPE"
    exit 1
fi

log_info "Auto-cleanup will run daily"
log_info "To uninstall:"
if [[ "$OSTYPE" == "darwin"* ]]; then
    echo "  launchctl unload ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
    echo "  rm ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
else
    echo "  sudo systemctl stop auto-cleanup.timer"
    echo "  sudo systemctl disable auto-cleanup.timer"
    echo "  sudo rm /etc/systemd/system/auto-cleanup.*"
fi

View file

@ -1,275 +0,0 @@
#!/bin/bash
# Production Monitoring Stack Setup for Linux
# Deploys Prometheus/Grafana/Loki/Promtail as Podman containers with systemd
# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc.
set -e
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[0;33m'
NC='\033[0m'
echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n"
# Detect Linux distribution and package manager
detect_distro() {
if [ -f /etc/os-release ]; then
. /etc/os-release
DISTRO=$ID
DISTRO_VERSION=$VERSION_ID
elif [ -f /etc/redhat-release ]; then
DISTRO="rhel"
else
DISTRO="unknown"
fi
# Detect package manager
if command -v dnf &>/dev/null; then
PKG_MANAGER="dnf"
elif command -v yum &>/dev/null; then
PKG_MANAGER="yum"
elif command -v apt-get &>/dev/null; then
PKG_MANAGER="apt"
elif command -v pacman &>/dev/null; then
PKG_MANAGER="pacman"
elif command -v zypper &>/dev/null; then
PKG_MANAGER="zypper"
else
echo -e "${YELLOW}Warning: No known package manager found${NC}"
PKG_MANAGER="unknown"
fi
echo "Detected distribution: $DISTRO (using $PKG_MANAGER)"
}
detect_distro
# Configuration
DATA_PATH="${1:-/data/monitoring}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"
echo "Configuration:"
echo " Monitoring data path: $DATA_PATH"
echo " User: $ML_USER"
echo " Group: $ML_GROUP"
echo ""
# Create pod for monitoring stack
POD_NAME="monitoring"
# 1. Create directories
echo -e "${BLUE}[1/6]${NC} Creating directory structure..."
sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config}
sudo mkdir -p /etc/fetch_ml/monitoring
sudo mkdir -p /var/lib/grafana/dashboards
sudo chown -R $ML_USER:$ML_GROUP $DATA_PATH
sudo chmod 755 $DATA_PATH
echo -e "${GREEN}${NC} Directories created"
# 2. Copy configuration files
echo -e "${BLUE}[2/6]${NC} Copying configuration files..."
sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/grafana/provisioning /etc/fetch_ml/monitoring/ -r
sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json
sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json
sudo chown -R $ML_USER:$ML_GROUP /etc/fetch_ml/monitoring
sudo chown -R $ML_USER:$ML_GROUP /var/lib/grafana
echo -e "${GREEN}${NC} Configuration copied"
# 3. Create Podman pod
echo -e "${BLUE}[3/6]${NC} Creating Podman pod..."
sudo -u $ML_USER podman pod create \\
--name $POD_NAME \\
-p 3000:3000 \\
-p 9090:9090 \\
-p 3100:3100 \\
|| echo "Pod may already exist"
echo -e "${GREEN}${NC} Pod created"
# 4. Create systemd service for monitoring pod
echo -e "${BLUE}[4/6]${NC} Creating systemd services..."
# Prometheus service
sudo tee /etc/systemd/system/prometheus.service >/dev/null <<EOF
[Unit]
Description=Prometheus Monitoring
After=network.target
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 9090:9090
ExecStart=/usr/bin/podman run --rm --name prometheus \\
--pod $POD_NAME \\
-v /etc/fetch_ml/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro \\
-v ${DATA_PATH}/prometheus:/prometheus \\
docker.io/prom/prometheus:latest \\
--config.file=/etc/prometheus/prometheus.yml \\
--storage.tsdb.path=/prometheus \\
--web.enable-lifecycle
ExecStop=/usr/bin/podman stop -t 10 prometheus
[Install]
WantedBy=multi-user.target
EOF
# Loki service.
# systemd Exec* lines do not understand "||"; "ExecStartPre=-..." ignores
# the pre-command's failure when the pod already exists.
sudo tee /etc/systemd/system/loki.service >/dev/null <<EOF
[Unit]
Description=Loki Log Aggregation
After=network.target
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=-/usr/bin/podman pod create --name $POD_NAME -p 3100:3100
ExecStart=/usr/bin/podman run --rm --name loki \\
--pod $POD_NAME \\
-v /etc/fetch_ml/monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro \\
-v ${DATA_PATH}/loki:/loki \\
docker.io/grafana/loki:latest \\
-config.file=/etc/loki/local-config.yaml
ExecStop=/usr/bin/podman stop -t 10 loki
[Install]
WantedBy=multi-user.target
EOF
# Grafana service.
# Two fixes vs. the shell-style original:
#  - systemd Exec* lines are not shell: "||" is unsupported, so the
#    pre-command uses the "ExecStartPre=-..." ignore-failure prefix.
#  - systemd does not perform ${VAR:-default} expansion in unit files,
#    so the admin password default is expanded here, at file-creation
#    time (unquoted heredoc). The default "admin" matches the summary
#    printed at the end of this script ("change on first login");
#    export GRAFANA_ADMIN_PASSWORD before running to override.
sudo tee /etc/systemd/system/grafana.service >/dev/null <<EOF
[Unit]
Description=Grafana Visualization
After=network.target prometheus.service loki.service
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=-/usr/bin/podman pod create --name $POD_NAME -p 3000:3000
ExecStart=/usr/bin/podman run --rm --name grafana \\
--pod $POD_NAME \\
-v ${DATA_PATH}/grafana:/var/lib/grafana \\
-v /etc/fetch_ml/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro \\
-v /var/lib/grafana/dashboards:/var/lib/grafana/dashboards:ro \\
-e GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} \\
-e GF_USERS_ALLOW_SIGN_UP=false \\
-e GF_AUTH_ANONYMOUS_ENABLED=false \\
docker.io/grafana/grafana:latest
ExecStop=/usr/bin/podman stop -t 10 grafana
[Install]
WantedBy=multi-user.target
EOF
# Promtail service.
# systemd Exec* lines do not understand "||"; "ExecStartPre=-..." ignores
# the pre-command's failure when the pod already exists. Promtail joins
# the pod but publishes no ports of its own.
sudo tee /etc/systemd/system/promtail.service >/dev/null <<EOF
[Unit]
Description=Promtail Log Collector
After=network.target loki.service
PartOf=$POD_NAME-pod.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10
ExecStartPre=-/usr/bin/podman pod create --name $POD_NAME
ExecStart=/usr/bin/podman run --rm --name promtail \\
--pod $POD_NAME \\
-v /etc/fetch_ml/monitoring/promtail-config.yml:/etc/promtail/config.yml:ro \\
-v /var/log/fetch_ml:/var/log/app:ro \\
docker.io/grafana/promtail:latest \\
-config.file=/etc/promtail/config.yml
ExecStop=/usr/bin/podman stop -t 10 promtail
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
echo -e "${GREEN}${NC} Systemd services created"
# 5. Create monitoring pod service
# Generate a systemd unit that manages the pod lifecycle itself, so the
# per-container services above (PartOf=$POD_NAME-pod.service) stop with it.
# NOTE(review): "podman generate systemd" requires the pod to already exist
# (created in step 3) and is deprecated in newer Podman in favor of Quadlet
# — verify against the target Podman version.
# NOTE(review): the trailing "\\" escapes the backslash rather than the
# newline, which would cut the pipeline in two — verify this should be a
# single "\" continuation.
echo -e "${BLUE}[5/6]${NC} Creating pod management service..."
sudo -u $ML_USER podman generate systemd --new --name $POD_NAME \\
| sudo tee /etc/systemd/system/$POD_NAME-pod.service >/dev/null
sudo systemctl daemon-reload
echo -e "${GREEN}${NC} Pod service created"
# 6. Setup firewall rules
# Opens only the externally-facing UI/API ports (Grafana 3000, Prometheus
# 9090); Loki's 3100 stays internal to the pod/host.
echo -e "${BLUE}[6/6]${NC} Configuring firewall..."
if command -v firewall-cmd &>/dev/null; then
# RHEL/Rocky/Fedora (firewalld)
sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana
sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus
sudo firewall-cmd --reload
echo -e "${GREEN}${NC} Firewall configured (firewalld)"
elif command -v ufw &>/dev/null; then
# Ubuntu/Debian (ufw)
sudo ufw allow 3000/tcp comment 'Grafana'
sudo ufw allow 9090/tcp comment 'Prometheus'
echo -e "${GREEN}${NC} Firewall configured (ufw)"
else
echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090"
fi
# Summary
# Printed instructions for the operator; start order matters (promtail
# ships to loki, grafana reads from prometheus/loki).
echo ""
echo -e "${BOLD}=== Monitoring Stack Setup Complete! ===${NC}"
echo ""
echo "Services created:"
echo " - prometheus.service (Metrics collection)"
echo " - loki.service (Log aggregation)"
echo " - grafana.service (Visualization)"
echo " - promtail.service (Log shipping)"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Start services:"
echo " sudo systemctl start prometheus"
echo " sudo systemctl start loki"
echo " sudo systemctl start promtail"
echo " sudo systemctl start grafana"
echo ""
echo "2. Enable on boot:"
echo " sudo systemctl enable prometheus loki promtail grafana"
echo ""
echo "3. Access Grafana:"
echo " http://YOUR_SERVER_IP:3000"
echo " Username: admin"
echo " Password: admin (change on first login)"
echo ""
echo "4. Check logs:"
echo " sudo journalctl -u prometheus -f"
echo " sudo journalctl -u grafana -f"
echo ""

View file

@ -1,229 +0,0 @@
#!/bin/bash
# Production Setup Script for Rocky Linux (Bare Metal)
# This script sets up the complete FetchML environment on bare metal
#
# Usage: setup.sh [BASE_PATH] [ML_USER] [ML_GROUP]
# Requires sudo privileges; runs 8 sequential steps (user, directories,
# permissions, packages, podman/GPU, redis, systemd units, logrotate).
set -e
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'
echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n"
# Configuration — all three settings are overridable positional arguments.
BASE_PATH="${1:-/data/ml-experiments}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"
echo "Configuration:"
echo " Base path: $BASE_PATH"
echo " ML user: $ML_USER"
echo " ML group: $ML_GROUP"
echo ""
# 1. Create system user if it doesn't exist
# -r: system account, with a home dir so interactive debugging is possible.
echo -e "${BLUE}[1/8]${NC} Creating system user..."
if id "$ML_USER" &>/dev/null; then
echo " User $ML_USER already exists"
else
sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER
echo -e "${GREEN}${NC} Created user: $ML_USER"
fi
# 2. Create directory structure
# Brace expansion creates the per-state experiment queues in one call.
echo -e "${BLUE}[2/8]${NC} Creating directory structure..."
sudo mkdir -p "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets}
sudo mkdir -p /var/log/fetch_ml
sudo mkdir -p /etc/fetch_ml
echo -e "${GREEN}${NC} Created directories:"
echo " $BASE_PATH/experiments/"
echo " $BASE_PATH/pending/"
echo " $BASE_PATH/running/"
echo " $BASE_PATH/finished/"
echo " $BASE_PATH/failed/"
echo " $BASE_PATH/datasets/"
echo " /var/log/fetch_ml/"
echo " /etc/fetch_ml/"
# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."
sudo chown -R $ML_USER:$ML_GROUP $BASE_PATH
sudo chmod 755 $BASE_PATH
sudo chmod 700 $BASE_PATH/experiments # Restrict experiment data
sudo chown -R $ML_USER:$ML_GROUP /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml
echo -e "${GREEN}${NC} Permissions set"
# 4. Install system dependencies (Rocky Linux)
# The trailing "|| echo" is deliberate best-effort: dnf exits non-zero on
# already-installed conflicts and set -e would otherwise abort the script.
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."
sudo dnf install -y \
golang \
podman \
redis \
git \
make \
gcc \
|| echo "Some packages may already be installed"
echo -e "${GREEN}${NC} Dependencies installed"
# 5. Configure Podman for GPU access (if NVIDIA GPU present)
# Uses CDI (Container Device Interface) so containers can request
# --device nvidia.com/gpu=... without the legacy hook mechanism.
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if lspci | grep -i nvidia &>/dev/null; then
echo " NVIDIA GPU detected, configuring GPU access..."
# Install nvidia-container-toolkit if not present
if ! command -v nvidia-container-toolkit &>/dev/null; then
echo " Installing nvidia-container-toolkit..."
sudo dnf config-manager --add-repo \
https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
sudo dnf install -y nvidia-container-toolkit
fi
# Configure Podman CDI
sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
echo -e "${GREEN}${NC} GPU support configured"
else
echo " No NVIDIA GPU detected, skipping GPU setup"
fi
# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
sudo systemctl enable redis
sudo systemctl start redis || echo "Redis may already be running"
# Set Redis password if not already configured
# NOTE(review): the config path is /etc/redis/redis.conf here, but
# Rocky 8's redis package uses /etc/redis.conf (the sibling Rocky setup
# script edits that path) — verify which layout the target release uses;
# tee -a will fail if the parent directory does not exist.
if ! sudo grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then
REDIS_PASSWORD=$(openssl rand -base64 32)
echo "requirepass $REDIS_PASSWORD" | sudo tee -a /etc/redis/redis.conf >/dev/null
sudo systemctl restart redis
echo " Generated Redis password: $REDIS_PASSWORD"
echo " Save this password for your configuration!"
else
echo " Redis password already configured"
fi
echo -e "${GREEN}${NC} Redis configured"
# 7. Setup systemd services
# Both units are hardened (NoNewPrivileges/ProtectSystem=strict) and may
# only write under $BASE_PATH and /var/log/fetch_ml.
echo -e "${BLUE}[7/8]${NC} Creating systemd services..."
# API Server service
sudo tee /etc/systemd/system/fetchml-api.service >/dev/null <<EOF
[Unit]
Description=FetchML API Server
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-api -config /etc/fetch_ml/config.yaml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/api.log
StandardError=append:/var/log/fetch_ml/api-error.log
# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml
[Install]
WantedBy=multi-user.target
EOF
# Worker service
sudo tee /etc/systemd/system/fetchml-worker.service >/dev/null <<EOF
[Unit]
Description=FetchML Worker
After=network.target redis.service fetchml-api.service
Wants=redis.service
[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetch_ml/worker.toml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/worker.log
StandardError=append:/var/log/fetch_ml/worker-error.log
# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
echo -e "${GREEN}${NC} Systemd services created"
# 8. Setup logrotate
# Daily rotation, 14 kept; postrotate reloads both services best-effort.
echo -e "${BLUE}[8/8]${NC} Configuring log rotation..."
sudo tee /etc/logrotate.d/fetchml >/dev/null <<EOF
/var/log/fetch_ml/*.log {
daily
rotate 14
compress
delaycompress
notifempty
missingok
create 0640 $ML_USER $ML_GROUP
sharedscripts
postrotate
systemctl reload fetchml-api >/dev/null 2>&1 || true
systemctl reload fetchml-worker >/dev/null 2>&1 || true
endscript
}
EOF
echo -e "${GREEN}${NC} Log rotation configured"
# Summary
echo ""
echo -e "${BOLD}=== Setup Complete! ===${NC}"
echo ""
echo "Directory structure created at: $BASE_PATH"
echo "Logs will be written to: /var/log/fetch_ml/"
echo "Configuration directory: /etc/fetch_ml/"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Copy your config files:"
echo " sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml"
echo " sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml"
echo ""
echo "2. Build and install binaries:"
echo " make build"
echo " sudo cp bin/api-server /usr/local/bin/fetchml-api"
echo " sudo cp bin/worker /usr/local/bin/fetchml-worker"
echo ""
echo "3. Update config files with your settings (Redis password, API keys, etc.)"
echo ""
echo "4. Start services:"
echo " sudo systemctl start fetchml-api"
echo " sudo systemctl start fetchml-worker"
echo ""
echo "5. Enable services to start on boot:"
echo " sudo systemctl enable fetchml-api"
echo " sudo systemctl enable fetchml-worker"
echo ""
echo "6. Check status:"
echo " sudo systemctl status fetchml-api"
echo " sudo systemctl status fetchml-worker"
echo " sudo journalctl -u fetchml-api -f"
echo ""

View file

@ -1,455 +0,0 @@
#!/bin/bash
# Automatic Setup Script for ML Experiment Manager
# Handles complete environment setup with security features
set -euo pipefail

# ANSI color codes used by the logging helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Leveled logging helpers: colored tag followed by the caller's message.
print_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error()   { echo -e "${RED}[ERROR]${NC} $1"; }

# Classify the host OS from bash's $OSTYPE: "macos", "linux", or "unknown".
detect_os() {
    case "$OSTYPE" in
        darwin*)    echo "macos" ;;
        linux-gnu*) echo "linux" ;;
        *)          echo "unknown" ;;
    esac
}
# Install Go via Homebrew (macOS) or the official tarball (Linux).
# NOTE(review): the Linux path downloads without checksum verification,
# unlike secure_download in setup_common.sh — consider unifying.
install_go() {
print_info "Installing Go..."
local os=$(detect_os)
local go_version="1.23.0"
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install go
else
print_error "Homebrew not found. Please install Go manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
wget -q "https://go.dev/dl/go${go_version}.linux-amd64.tar.gz"
sudo rm -rf /usr/local/go
sudo tar -C /usr/local -xzf "go${go_version}.linux-amd64.tar.gz"
rm "go${go_version}.linux-amd64.tar.gz"
# Add to PATH
echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
export PATH=$PATH:/usr/local/go/bin
fi
print_success "Go installed"
}
# Install Zig via Homebrew (macOS) or the upstream binary tarball (Linux).
install_zig() {
print_info "Installing Zig..."
local os=$(detect_os)
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install zig
else
print_error "Homebrew not found. Please install Zig manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
# Download Zig binary
local zig_version="0.13.0"
wget -q "https://ziglang.org/download/${zig_version}/zig-linux-x86_64-${zig_version}.tar.xz"
tar -xf "zig-linux-x86_64-${zig_version}.tar.xz"
sudo mv "zig-linux-x86_64-${zig_version}/zig" /usr/local/bin/
rm -rf "zig-linux-x86_64-${zig_version}.tar.xz" "zig-linux-x86_64-${zig_version}"
fi
print_success "Zig installed"
}
# Install Docker: Docker Desktop cask on macOS, get.docker.com script on
# Linux (adds $USER to the docker group; takes effect on next login).
install_docker() {
print_info "Installing Docker..."
local os=$(detect_os)
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install --cask docker
print_warning "Docker Desktop installed. Please start it manually."
else
print_error "Homebrew not found. Please install Docker manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
# Install Docker using official script
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh
sudo usermod -aG docker $USER
rm get-docker.sh
# Start Docker
sudo systemctl enable docker
sudo systemctl start docker
print_success "Docker installed. You may need to log out and log back in."
fi
}
# Install and start Redis.
# NOTE(review): the Linux branch assumes apt-get (Debian/Ubuntu); this
# will fail on dnf/yum systems — verify the intended target distros.
install_redis() {
print_info "Installing Redis..."
local os=$(detect_os)
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install redis
brew services start redis
else
print_error "Homebrew not found. Please install Redis manually."
return 1
fi
elif [[ "$os" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y redis-server
sudo systemctl enable redis-server
sudo systemctl start redis-server
fi
print_success "Redis installed and started"
}
# Install auxiliary CLI tools plus Go linters/formatters (if Go is present).
install_dependencies() {
print_info "Installing dependencies..."
local os=$(detect_os)
# Install basic tools
if [[ "$os" == "macos" ]]; then
if command -v brew &> /dev/null; then
brew install openssl curl jq
fi
elif [[ "$os" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y openssl curl jq build-essential
fi
# Install Go tools
if command -v go &> /dev/null; then
go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
go install golang.org/x/tools/cmd/goimports@latest
fi
print_success "Dependencies installed"
}
# Create the project directory layout and build all binaries.
# Prefers the Makefile; falls back to invoking go/zig directly.
setup_project() {
    print_info "Setting up project..."
    # Working directories (idempotent).
    mkdir -p bin data logs db ssl configs scripts
    # Build: Makefile path first, manual builds otherwise.
    if command -v make &> /dev/null; then
        make build
        if command -v zig &> /dev/null; then
            make cli-build
        fi
    else
        print_warning "Make not found, building manually..."
        go build -o bin/worker ./cmd/worker
        go build -o bin/tui ./cmd/tui
        go build -o bin/data_manager ./cmd/data_manager
        go build -o bin/user_manager ./cmd/user_manager
        go build -o bin/api-server ./cmd/api-server
        if command -v zig &> /dev/null; then
            cd cli && zig build && cd ..
        fi
    fi
    print_success "Project setup completed"
}
# Generate development TLS certs, random secrets, and a secure config
# file. Secrets fall back to fixed dev strings if openssl is missing.
setup_security() {
print_info "Setting up security features..."
# Generate SSL certificates
# Self-signed, 365 days, SAN for localhost/127.0.0.1 (dev use only).
if command -v openssl &> /dev/null; then
openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
-days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
-addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
print_warning "Failed to generate SSL certificates"
}
print_success "SSL certificates generated"
fi
# Generate secure configuration
local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")
# Unquoted heredoc: $(...) and ${...} below expand NOW, baking the
# generated hash/password into the written YAML.
cat > configs/security-config.yaml << EOF
base_path: "/data/ml-experiments"
auth:
enabled: true
api_keys:
test_user:
hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
admin: true
roles: ["data_scientist", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "./ssl/cert.pem"
key_file: "./ssl/key.pem"
min_version: "1.3"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "10.0.0.0/8"
- "192.168.0.0/16"
- "172.16.0.0/12"
failed_login_lockout:
enabled: true
max_attempts: 5
lockout_duration: "15m"
redis:
url: "redis://localhost:6379"
password: "${redis_password}"
logging:
level: "info"
file: "logs/fetch_ml.log"
audit_log: "logs/audit.log"
EOF
# Environment file consumed via "source .env.dev" (see show_next_steps).
cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF
print_success "Security configuration created"
}
# Smoke-test the toolchain: counts required tools as pass/fail and
# removes optional tools (zig/docker/redis) from the total when absent.
test_installation() {
print_info "Testing installation..."
local tests_passed=0
local tests_total=0
# Test Go
tests_total=$((tests_total + 1))
if command -v go &> /dev/null; then
print_success "Go: Installed"
tests_passed=$((tests_passed + 1))
else
print_error "Go: Not found"
fi
# Test Zig
tests_total=$((tests_total + 1))
if command -v zig &> /dev/null; then
print_success "Zig: Installed"
tests_passed=$((tests_passed + 1))
else
print_warning "Zig: Not found (optional)"
tests_total=$((tests_total - 1))
fi
# Test Docker
tests_total=$((tests_total + 1))
if command -v docker &> /dev/null; then
print_success "Docker: Installed"
tests_passed=$((tests_passed + 1))
else
print_warning "Docker: Not found (optional)"
tests_total=$((tests_total - 1))
fi
# Test Redis
tests_total=$((tests_total + 1))
if command -v redis-cli &> /dev/null; then
if redis-cli ping | grep -q "PONG"; then
print_success "Redis: Running"
tests_passed=$((tests_passed + 1))
else
print_warning "Redis: Not running"
fi
else
print_warning "Redis: Not found (optional)"
tests_total=$((tests_total - 1))
fi
# Test binaries
# Only counted when the binary exists; --help must exit 0.
if [[ -f "bin/api-server" ]]; then
tests_total=$((tests_total + 1))
if ./bin/api-server --help > /dev/null 2>&1; then
print_success "API Server: Built"
tests_passed=$((tests_passed + 1))
else
print_error "API Server: Build failed"
fi
fi
# Integer percentage; guarded against division by zero.
if [[ $tests_total -gt 0 ]]; then
local success_rate=$((tests_passed * 100 / tests_total))
print_info "Tests: $tests_passed/$tests_total passed ($success_rate%)"
fi
print_success "Installation testing completed"
}
# Print the post-setup usage instructions for the operator.
show_next_steps() {
print_success "Automatic setup completed!"
echo
echo "Next Steps:"
echo "==========="
echo ""
echo "1. Load environment variables:"
echo " source .env.dev"
echo ""
echo "2. Start the API server:"
echo " ./bin/api-server -config configs/config.yaml"
echo ""
echo "3. Test the Zig CLI (if installed):"
echo " ./cli/zig-out/bin/ml --help"
echo ""
echo "4. Deploy with Docker (optional):"
echo " make docker-run"
echo ""
echo "5. Docker Compose deployment:"
echo " docker-compose up -d"
echo ""
echo "Configuration Files:"
echo " configs/config.yaml # Main configuration"
echo " configs/config_local.yaml # Local development"
echo " ssl/cert.pem, ssl/key.pem # TLS certificates"
echo ""
echo "Documentation:"
echo " docs/DEPLOYMENT.md # Deployment guide"
echo ""
echo "Quick Commands:"
echo " make help # Show all commands"
echo " make test # Run tests"
echo " docker-compose up -d # Start services"
echo ""
print_success "Ready to use ML Experiment Manager!"
}
# Main setup function
# Installs each missing tool, then runs project/security setup and the
# installation smoke test, in dependency order.
main() {
echo "ML Experiment Manager Automatic Setup"
echo "====================================="
echo ""
print_info "Starting automatic setup..."
echo ""
# Check and install dependencies
if ! command -v go &> /dev/null; then
print_info "Go not found, installing..."
install_go
fi
if ! command -v zig &> /dev/null; then
print_info "Zig not found, installing..."
install_zig
fi
if ! command -v docker &> /dev/null; then
print_info "Docker not found, installing..."
install_docker
fi
if ! command -v redis-cli &> /dev/null; then
print_info "Redis not found, installing..."
install_redis
fi
# Install additional dependencies
install_dependencies
# Setup project
setup_project
# Setup security
setup_security
# Test installation
test_installation
# Show next steps
show_next_steps
}
# Handle command line arguments
# Default subcommand is "setup" when no argument is given.
case "${1:-setup}" in
"setup")
main
;;
"deps")
install_dependencies
;;
"test")
test_installation
;;
"help"|"-h"|"--help")
echo "Automatic Setup Script"
echo "Usage: $0 {setup|deps|test|help}"
echo ""
echo "Commands:"
echo " setup - Run full automatic setup"
echo " deps - Install dependencies only"
echo " test - Test installation"
echo " help - Show this help"
;;
*)
print_error "Unknown command: $1"
echo "Use '$0 help' for usage information"
exit 1
;;
esac

View file

@ -1,314 +0,0 @@
#!/usr/bin/env bash
# Fetch ML Quick Start Script with Security
# Sets up development environment with security features and creates test user
set -euo pipefail

# ANSI color codes for the leveled logging helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Logging helpers: colored level tag followed by the caller's message.
print_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error()   { echo -e "${RED}[ERROR]${NC} $1"; }
# Verify the toolchain: Go is mandatory (exits 1 if missing); Zig,
# Docker, Redis, and OpenSSL are optional and only produce warnings.
check_prerequisites() {
print_info "Checking prerequisites..."
# Check Go
if ! command -v go &> /dev/null; then
print_error "Go is not installed. Please install Go 1.25 or later."
exit 1
fi
# Version is reported only; no minimum-version enforcement is done here.
local go_version=$(go version | awk '{print $3}' | sed 's/go//')
print_info "Go version: $go_version"
# Check Zig
if ! command -v zig &> /dev/null; then
print_warning "Zig is not installed. CLI features will not be available."
else
local zig_version=$(zig version)
print_info "Zig version: $zig_version"
fi
# Check Docker
if ! command -v docker &> /dev/null; then
print_warning "Docker is not installed. Container features will not work."
fi
# Check Redis
if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then
print_warning "Redis is not installed. Starting local Redis..."
fi
# Check OpenSSL for certificates
if ! command -v openssl &> /dev/null; then
print_warning "OpenSSL is not installed. TLS certificates will not be generated."
fi
print_success "Prerequisites checked"
}
# Create the working-directory layout for the quick-start environment.
setup_project() {
    print_info "Setting up Fetch ML project..."
    # All directories in one idempotent call.
    mkdir -p bin data logs db ssl configs
    print_success "Project directories created"
}
# Build Go binaries via make; additionally build the Zig CLI when zig is
# available.
build_project() {
print_info "Building Fetch ML..."
# Build Go binaries
make build
# Build Zig CLI if available
if command -v zig &> /dev/null; then
make cli-build
print_success "Zig CLI built"
fi
print_success "Build completed"
}
# Generate a self-signed development certificate (365 days, SAN for
# localhost/127.0.0.1) into ssl/. Returns 1 when generation fails.
generate_ssl_certificates() {
print_info "Generating SSL certificates..."
if command -v openssl &> /dev/null; then
# Generate self-signed certificate for development
openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
-days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
-addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
print_warning "Failed to generate SSL certificates"
return 1
}
print_success "SSL certificates generated in ssl/"
print_info "Certificates are self-signed (development only)"
else
print_warning "OpenSSL not available, skipping SSL certificates"
fi
}
# Start a local Redis daemon on 6379 if one is not already running.
setup_redis() {
print_info "Setting up Redis..."
if command -v redis-server &> /dev/null; then
if ! pgrep -f "redis-server" > /dev/null; then
redis-server --daemonize yes --port 6379
print_success "Redis started"
else
print_info "Redis already running"
fi
else
print_warning "Redis not available, some features may be limited"
fi
}
# Write configs/config.yaml and .env.dev with freshly generated secrets.
# Secrets fall back to fixed dev strings when openssl is unavailable.
create_secure_config() {
print_info "Creating secure development configuration..."
# Generate secure passwords and secrets
local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")
# Create development config
# Unquoted heredoc: $(...) and ${...} expand at write time, so the
# API-key hash and redis password are baked into the YAML file.
cat > configs/config.yaml << EOF
base_path: "/data/ml-experiments"
auth:
enabled: true
api_keys:
test_user:
hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
admin: true
roles: ["data_scientist", "admin"]
permissions:
read: true
write: true
delete: true
server:
address: ":9101"
tls:
enabled: true
cert_file: "./ssl/cert.pem"
key_file: "./ssl/key.pem"
min_version: "1.3"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "10.0.0.0/8"
- "192.168.0.0/16"
- "172.16.0.0/12"
failed_login_lockout:
enabled: true
max_attempts: 5
lockout_duration: "15m"
redis:
url: "redis://localhost:6379"
password: "${redis_password}"
logging:
level: "info"
file: "logs/fetch_ml.log"
audit_log: "logs/audit.log"
EOF
# Create environment file
cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF
print_success "Secure configuration created"
print_warning "Using development certificates and passwords"
}
# Print the fixed development API key and its SHA-256 hash.
# NOTE(review): the key matches the hash baked into configs/config.yaml
# by create_secure_config; this function only reports it — no user store
# is modified here.
create_test_user() {
print_info "Creating test user..."
# Generate API key for test user
local api_key="dev_test_api_key_12345"
local api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1)
print_success "Test user created successfully"
echo "Username: test_user"
echo "API Key: $api_key"
echo "API Key Hash: $api_key_hash"
echo "Store this key safely!"
echo ""
echo "Environment variables in .env.dev"
echo "Run: source .env.dev"
}
# Smoke-test the built artifacts and local services; every check is
# best-effort ("|| true") so a failure never aborts the quick start.
test_setup() {
print_info "Testing setup..."
# Test Go binaries
if [[ -f "bin/api-server" ]]; then
./bin/api-server --help > /dev/null 2>&1 || true
print_success "API server binary OK"
fi
if [[ -f "bin/worker" ]]; then
./bin/worker --help > /dev/null 2>&1 || true
print_success "Worker binary OK"
fi
# Test Zig CLI
if [[ -f "cli/zig-out/bin/ml" ]]; then
./cli/zig-out/bin/ml --help > /dev/null 2>&1 || true
print_success "Zig CLI binary OK"
fi
# Test Redis connection
if command -v redis-cli &> /dev/null; then
if redis-cli ping > /dev/null 2>&1; then
print_success "Redis connection OK"
else
print_warning "Redis not responding"
fi
fi
# Test SSL certificates
# -checkend 86400: fail if the cert expires within the next 24 hours.
if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then
if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then
print_success "SSL certificates valid"
else
print_warning "SSL certificates expired or invalid"
fi
fi
}
# Print post-setup usage instructions for the operator.
show_next_steps() {
print_success "Secure quick start completed!"
echo
echo "Next steps:"
echo "1. Load environment variables:"
echo " source .env.dev"
echo
echo "2. Start API server:"
echo " ./bin/api-server -config configs/config.yaml"
echo
echo "3. Test Zig CLI:"
echo " ./cli/zig-out/bin/ml --help"
echo
echo "4. Test with curl (HTTPS):"
echo " curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health"
echo
echo "5. Deploy with Docker:"
echo " docker-compose up -d"
echo
echo "Features Enabled:"
echo " ✅ HTTPS/TLS encryption"
echo " ✅ API key authentication"
echo " ✅ Rate limiting"
echo " ✅ IP whitelisting"
echo " ✅ Security headers"
echo " ✅ Audit logging"
echo
echo "Configuration Files:"
echo " configs/config.yaml # Main configuration"
echo " .env.dev # Environment variables"
echo " ssl/cert.pem, ssl/key.pem # TLS certificates"
echo
echo "Documentation:"
echo " docs/DEPLOYMENT.md # Deployment guide"
echo ""
print_success "Ready to run ML experiments!"
}
# Main function
# Runs the full quick-start pipeline in dependency order.
main() {
echo "Fetch ML Quick Start Script (with Security & Zig CLI)"
echo "===================================================="
echo ""
check_prerequisites
setup_project
build_project
generate_ssl_certificates
setup_redis
create_secure_config
create_test_user
test_setup
show_next_steps
}
# Run main function
main "$@"

View file

@ -1,124 +0,0 @@
#!/usr/bin/env bash
# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky)
set -euo pipefail

# ANSI color codes used by the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration defaults shared by every setup script that sources this file.
FETCH_ML_USER="fetchml"
FETCH_ML_HOME="/opt/fetchml"
SERVICE_DIR="/etc/systemd/system"
LOG_DIR="/var/log/fetchml"
DATA_DIR="/var/lib/fetchml"
CONFIG_DIR="$FETCH_ML_HOME/configs"

# Leveled logging helpers: colored tag followed by the caller's message.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
# Download file with checksum verification
# Args: url, checksum, dest
# Downloads $url to $dest and verifies its SHA-256; on mismatch the file
# is removed and the whole script exits non-zero.
secure_download() {
    local url="$1" checksum="$2" dest="$3"
    curl -fsSL "$url" -o "$dest"
    # GNU sha256sum --check requires TWO spaces (or " *") between the
    # hash and the filename; a single space is rejected as "no properly
    # formatted checksum lines".
    echo "$checksum  $dest" | sha256sum --check --status || {
        log_error "Checksum verification failed for $dest"
        rm -f "$dest"
        exit 1
    }
}
# Remove any temp files the caller registered in $TMP_FILES.
# $TMP_FILES is intentionally unquoted: it may hold a space-separated
# list of paths that must word-split into separate rm arguments.
cleanup_temp() {
if [[ -n "${TMP_FILES:-}" ]]; then
rm -f $TMP_FILES || true
fi
}
trap cleanup_temp EXIT
# Create the service account (idempotent) and add it to the podman group.
# "|| true": the podman group may not exist on all distros.
ensure_user() {
if ! id "$FETCH_ML_USER" &>/dev/null; then
useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
fi
usermod -aG podman "$FETCH_ML_USER" || true
}
# Create the install/log/data directory tree and hand it to the service user.
create_directories() {
mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" "$FETCH_ML_HOME/bin" "$CONFIG_DIR"
chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"
}
# Write a systemd unit for a Fetch ML service.
# Args: name (unit basename), exec (full ExecStart command line).
# "${name^}" (capitalize first letter) is a bash 4+ feature.
setup_systemd_service() {
local name="$1" exec="$2"
cat > "$SERVICE_DIR/${name}.service" <<EOF
[Unit]
Description=Fetch ML ${name^} Service
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$exec
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_${name}
[Install]
WantedBy=multi-user.target
EOF
}
# Install daily 14-generation log rotation for /var/log/fetchml.
# The quoted 'EOF' heredoc writes the body verbatim (no expansion).
setup_logrotate() {
cat > /etc/logrotate.d/fetch_ml <<'EOF'
/var/log/fetchml/*.log {
daily
missingok
rotate 14
compress
delaycompress
notifempty
create 0640 fetchml fetchml
}
EOF
}
# Raise file-descriptor limits for the service user and enable automatic
# security updates where a known package manager is present (best-effort).
hardening_steps() {
# Increase file limits
if ! grep -q fetchml /etc/security/limits.conf; then
cat >> /etc/security/limits.conf <<'EOF'
fetchml soft nofile 65536
fetchml hard nofile 65536
EOF
fi
# Enable unattended security upgrades if available
if command -v apt-get &>/dev/null; then
apt-get install -y unattended-upgrades >/dev/null || true
elif command -v dnf &>/dev/null; then
dnf install -y dnf-automatic >/dev/null || true
fi
}
# Report the SELinux mode and, when Enforcing, print the semanage /
# restorecon commands an operator will likely need for the binaries.
selinux_guidance() {
if command -v getenforce &>/dev/null; then
local mode=$(getenforce)
log_info "SELinux mode: $mode"
if [[ "$mode" == "Enforcing" ]]; then
log_info "Ensure systemd units and directories have proper contexts. Example:"
echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'"
echo " restorecon -Rv $FETCH_ML_HOME/bin"
fi
fi
}

View file

@ -1,417 +0,0 @@
#!/usr/bin/env bash
# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9
set -euo pipefail
# Resolve this script's directory so the shared helpers can be sourced
# regardless of the caller's working directory.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
# Abort unless the script is running with root privileges.
check_root() {
    if (( EUID != 0 )); then
        log_error "This script must be run as root"
        exit 1
    fi
}
# Verify we are on an RPM-based system, report the Rocky release, and
# pick the package manager (dnf on Rocky 9+, yum on Rocky 8).
# Sets the global PKG_MANAGER used by every install function.
check_rocky() {
    if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
        log_error "This script is designed for Rocky Linux systems"
        exit 1
    fi
    # Read the release directly from the file (no cat|grep pipeline) and
    # tolerate a missing /etc/rocky-release (e.g. RHEL/Alma clones), so
    # an informational check cannot print a stray error or abort.
    local rocky_version
    rocky_version=$(grep -oE '[0-9]+\.[0-9]+' /etc/rocky-release 2>/dev/null || echo "unknown")
    log_info "Rocky Linux version: $rocky_version"
    # Use dnf for Rocky 9+, yum for Rocky 8
    if command -v dnf &> /dev/null; then
        PKG_MANAGER="dnf"
    else
        PKG_MANAGER="yum"
    fi
}
# Bring the system fully up to date and install bootstrap tools.
update_system() {
log_info "Updating system packages..."
$PKG_MANAGER update -y
$PKG_MANAGER upgrade -y
$PKG_MANAGER install -y curl wget gnupg2
}
# Enable the EPEL repository (idempotent).
# NOTE(review): "powertools" is the Rocky 8 repo name; Rocky 9 renamed it
# to "crb" — verify against the supported releases.
enable_epel() {
log_info "Enabling EPEL repository..."
if $PKG_MANAGER repolist | grep -q "epel"; then
log_info "EPEL already enabled"
return
fi
$PKG_MANAGER install -y epel-release
$PKG_MANAGER config-manager --set-enabled powertools
log_success "EPEL repository enabled"
}
# Install Go from the official tarball with SHA-256 verification
# (secure_download from setup_common.sh); skipped if go is already present.
install_go() {
log_info "Installing Go 1.25..."
if command -v go &> /dev/null; then
local go_version=$(go version | awk '{print $3}' | sed 's/go//')
log_info "Go already installed: $go_version"
return
fi
cd /tmp
# Register the tarball for the EXIT-trap cleanup in setup_common.sh.
TMP_FILES="/tmp/go1.25.0.linux-amd64.tar.gz"
secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "/tmp/go1.25.0.linux-amd64.tar.gz"
tar -C /usr/local -xzf go1.25.0.linux-amd64.tar.gz
# Add to PATH
echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
export PATH=$PATH:/usr/local/go/bin
log_success "Go 1.25 installed"
}
# Install Podman with netavark networking and systemd cgroups, and enable
# user namespaces for rootless containers.
install_podman() {
    log_info "Installing Podman..."
    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi
    # Install Podman and related tools
    $PKG_MANAGER install -y podman podman-compose containernetworking-plugins
    # Configure Podman
    mkdir -p /etc/containers
    cat > /etc/containers/containers.conf << EOF
[containers]
user_namespace_enable = 1
runtime = "crun"
[network]
network_backend = "netavark"
[engine]
cgroup_manager = "systemd"
EOF
    # Enable user namespaces. Guard the append so re-running the script does
    # not duplicate the line, and apply the value with `sysctl -w`: the
    # original `sysctl -p user.max_user_namespaces=15000` treated the
    # key=value as a *filename* argument to -p, so nothing was applied.
    grep -q '^user.max_user_namespaces=' /etc/sysctl.conf 2>/dev/null ||
        echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf
    sysctl -w user.max_user_namespaces=15000
    log_success "Podman installed"
}
# Install Redis from distro packages, hand supervision to systemd, and keep
# it bound to localhost only.
install_redis() {
    log_info "Installing Redis..."
    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi
    "$PKG_MANAGER" install -y redis
    # systemd supervision + localhost-only bind
    sed -i 's/supervised no/supervised systemd/' /etc/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis.conf
    systemctl enable --now redis
    log_success "Redis installed and configured"
}
# Detect an NVIDIA GPU and, when present, install the vendor driver stack
# from NVIDIA's CUDA repository. No-op when drivers are already installed or
# no GPU is found. A reboot is required before the new driver is usable.
install_nvidia_drivers() {
log_info "Checking for NVIDIA GPU..."
if command -v nvidia-smi &> /dev/null; then
log_info "NVIDIA drivers already installed"
nvidia-smi
return
fi
if lspci | grep -i nvidia &> /dev/null; then
log_info "NVIDIA GPU detected, installing drivers..."
# Enable NVIDIA repository
# `rpm -E %rhel` expands to the RHEL major version of this host.
$PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo
# Clean and install
$PKG_MANAGER clean all
$PKG_MANAGER module enable -y nvidia-driver:latest-dkms
$PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit
# Configure Podman for NVIDIA (only if needed)
# NOTE(review): this CDI-style device test normally requires
# nvidia-container-toolkit and a generated CDI spec — confirm it can ever
# succeed before the post-install reboot.
if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
log_warning "NVIDIA GPU access test failed, you may need to reboot"
else
log_success "NVIDIA drivers installed and GPU access verified"
fi
# Reboot required
log_warning "System reboot required for NVIDIA drivers"
log_info "Run: reboot"
else
log_info "No NVIDIA GPU detected, skipping driver installation"
fi
}
# Install the Python/ML userland: interpreter, native build dependencies for
# common ML wheels, and a CPU-only PyTorch stack via pip.
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."
    # Python toolchain
    "$PKG_MANAGER" install -y python3 python3-pip python3-devel
    # Native build dependencies
    "$PKG_MANAGER" groupinstall -y "Development Tools"
    "$PKG_MANAGER" install -y \
        cmake git pkgconfig \
        libjpeg-turbo-devel libpng-devel libtiff-devel \
        mesa-libGL-devel mesa-libGLU-devel \
        gtk3-devel \
        atlas-devel blas-devel lapack-devel
    # Python libraries (CPU-only torch wheel index)
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    log_success "ML tools installed"
}
# Create the fetchml service account and its directory tree, then hand
# ownership over. Idempotent: returns early when the user already exists.
create_user() {
    log_info "Creating fetchml user..."
    if id "$FETCH_ML_USER" &>/dev/null; then
        log_info "User $FETCH_ML_USER already exists"
        return
    fi
    # Quote every expansion so values containing spaces cannot word-split.
    useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
    # The "podman" group does not exist by default on Rocky; do not let a
    # missing group abort the whole setup under `set -e`.
    usermod -aG podman "$FETCH_ML_USER" || log_warning "podman group not found; skipping group membership"
    # Create directories
    mkdir -p "$FETCH_ML_HOME/.config/containers"
    mkdir -p "$FETCH_ML_HOME/go/bin"
    mkdir -p "$LOG_DIR"
    mkdir -p "$DATA_DIR"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$LOG_DIR"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$DATA_DIR"
    log_success "User $FETCH_ML_USER created"
}
# Open the service ports through firewalld when it is available; otherwise
# warn and continue.
setup_firewall() {
    log_info "Configuring firewall..."
    if ! command -v firewall-cmd &> /dev/null; then
        log_warning "Firewalld not available, skipping firewall configuration"
        return
    fi
    systemctl enable firewalld
    systemctl start firewalld
    firewall-cmd --permanent --add-service=ssh
    local port
    for port in 8080 8081 6379; do  # worker API, data-manager API, Redis
        firewall-cmd --permanent --add-port="${port}/tcp"
    done
    firewall-cmd --reload
    firewall-cmd --list-all
}
# Write systemd units for the worker and data-manager daemons and enable
# them. Both run as $FETCH_ML_USER, are ordered after Redis, restart on
# failure, and log to the journal. The heredocs expand $FETCH_ML_* and
# $SERVICE_DIR at generation time, so the units contain concrete paths.
setup_systemd_services() {
log_info "Setting up systemd services..."
# Fetch ML Worker service
cat > $SERVICE_DIR/fetch_ml_worker.service << EOF
[Unit]
Description=Fetch ML Worker Service
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_worker
[Install]
WantedBy=multi-user.target
EOF
# Fetch ML Data Manager service
cat > $SERVICE_DIR/fetch_ml_data_manager.service << EOF
[Unit]
Description=Fetch ML Data Manager Service
After=network.target redis.service
Wants=redis.service
[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_data_manager
[Install]
WantedBy=multi-user.target
EOF
# Enable services (started later by the operator; see main's "Next steps")
systemctl daemon-reload
systemctl enable fetch_ml_worker
systemctl enable fetch_ml_data_manager
log_success "Systemd services configured"
}
# Install a logrotate policy for $LOG_DIR: daily rotation, 30 rotations
# kept, compressed (delayed one cycle), files recreated owned by the
# service user.
setup_log_rotation() {
log_info "Setting up log rotation..."
cat > /etc/logrotate.d/fetch_ml << EOF
$LOG_DIR/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 0644 $FETCH_ML_USER $FETCH_ML_USER
postrotate
systemctl reload fetch_ml_worker || true
systemctl reload fetch_ml_data_manager || true
endscript
}
EOF
# NOTE(review): the units above are Type=simple without ExecReload, so
# `systemctl reload` likely fails (harmless thanks to `|| true`) — confirm
# whether the daemons reopen log files on SIGHUP or need `restart` here.
log_success "Log rotation configured"
}
# Kernel, ulimit and GPU tweaks for ML workloads. Every append is guarded so
# re-running the script does not stack duplicate entries in limits.conf and
# sysctl.conf (the original appended unconditionally on each run).
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    # Increase file limits (once)
    if ! grep -q '^\* soft nofile 65536' /etc/security/limits.conf; then
        echo "* soft nofile 65536" >> /etc/security/limits.conf
        echo "* hard nofile 65536" >> /etc/security/limits.conf
    fi
    # Optimize kernel parameters for ML (once)
    if ! grep -q '# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi
    sysctl -p
    # Keep the GPU initialized between jobs when NVIDIA tooling is present
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi
    # SELinux guidance only — enforcement is never changed automatically
    if [[ -f /etc/selinux/config ]]; then
        log_warning "Consider setting SELinux to permissive mode for better container compatibility"
        log_info "Edit /etc/selinux/config and set SELINUX=permissive"
    fi
    log_success "System optimized for ML workloads"
}
# Build Fetch ML from a pre-cloned checkout under $FETCH_ML_HOME and install
# binaries plus a default config. Returns early (with instructions) when the
# checkout is missing.
install_fetch_ml() {
    log_info "Installing Fetch ML..."
    # Quote all path expansions against word-splitting.
    cd "$FETCH_ML_HOME"
    if [[ ! -d "fetch_ml" ]]; then
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi
    cd fetch_ml
    # Build with the Go toolchain installed earlier
    export PATH=$PATH:/usr/local/go/bin
    make build
    # Copy binaries — $FETCH_ML_HOME/bin is not created anywhere else
    # (create_user only makes go/bin), so create it here first.
    mkdir -p "$FETCH_ML_HOME/bin"
    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME/bin/"*
    # Copy configs
    mkdir -p "$FETCH_ML_HOME/configs"
    cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"
    # Hand ownership to the service user
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
    log_success "Fetch ML installed"
}
# Orchestrate the full Rocky Linux setup, then print operator follow-ups.
# NOTE(review): ensure_user/create_directories/setup_logrotate/
# hardening_steps/selinux_guidance come from setup_common.sh, while the
# locally defined create_user, setup_log_rotation and optimize_system are
# never invoked — confirm whether that dead code is intentional.
main() {
log_info "Starting Fetch ML Rocky Linux server setup..."
check_root
check_rocky
update_system
enable_epel
install_go
install_podman
install_redis
install_nvidia_drivers
install_ml_tools
ensure_user
create_directories
setup_firewall
setup_systemd_services
setup_logrotate
hardening_steps
selinux_guidance
install_fetch_ml
log_success "Fetch ML setup complete!"
echo
log_info "Next steps:"
echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
echo "5. View logs: journalctl -u fetch_ml_worker -f"
echo
log_info "Services will be available at:"
echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}
# Run main function
main "$@"

View file

@ -1,294 +0,0 @@
#!/usr/bin/env bash
# Fetch ML Ubuntu Server Setup Script
# Optimized for ML experiments on Ubuntu 20.04/22.04
# Fail fast: abort on any error, unset variable, or failed pipe stage.
set -euo pipefail
# shellcheck source=scripts/setup_common.sh
# Resolve this script's own directory so the shared helpers can be sourced
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# NOTE(review): setup_common.sh presumably provides log_*, secure_download,
# ensure_user, create_directories, setup_systemd_service, setup_logrotate
# and hardening_steps used below — confirm against that file.
source "$SCRIPT_DIR/setup_common.sh"
# Abort unless the script is running with root privileges.
check_root() {
    [[ $EUID -eq 0 ]] && return
    log_error "This script must be run as root"
    exit 1
}
# Verify an apt-based system and warn when the release is older than 20.04.
check_ubuntu() {
    if ! command -v apt-get &> /dev/null; then
        log_error "This script is designed for Ubuntu systems"
        exit 1
    fi
    local ubuntu_version
    ubuntu_version=$(lsb_release -rs)
    log_info "Ubuntu version: $ubuntu_version"
    # Compare with dpkg instead of piping through `bc`: bc is not part of a
    # minimal Ubuntu install and is never installed by this script, so the
    # original comparison could fail under `set -o pipefail`.
    if dpkg --compare-versions "$ubuntu_version" lt "20.04"; then
        log_warning "Ubuntu version < 20.04 may not support all features"
    fi
}
# Refresh the package index, upgrade installed packages, and install the
# baseline tooling used by later steps.
update_system() {
    log_info "Updating system packages..."
    apt-get update -y
    apt-get upgrade -y
    apt-get install -y curl wget gnupg lsb-release software-properties-common
}
# Install Go 1.25 from the official tarball (checksum-verified) and add it
# to PATH for future shells and for the remainder of this script.
install_go() {
    log_info "Installing Go 1.25..."
    if command -v go &> /dev/null; then
        local go_version
        go_version=$(go version | awk '{print $3}' | sed 's/go//')
        log_info "Go already installed: $go_version"
        return
    fi
    cd /tmp
    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    TMP_FILES="$tarball"
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" "$tarball"
    # Per the official install instructions, remove any previous tree first:
    # untarring over an existing /usr/local/go leaves a broken mix of old
    # and new files.
    rm -rf /usr/local/go
    tar -C /usr/local -xzf "$tarball"
    # Add to PATH (persistent for login shells, and for this process)
    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin
    log_success "Go 1.25 installed"
}
# Install Podman from the Kubic repository and enable rootless defaults.
# The repository key is installed as a keyring file referenced via
# [signed-by=...]: `apt-key` is deprecated and removed on Ubuntu 22.04+, so
# the original `apt-key add -` call breaks on current releases.
install_podman() {
    log_info "Installing Podman..."
    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi
    local release
    release=$(lsb_release -rs)
    mkdir -p /etc/apt/keyrings
    curl -fsSL "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}/Release.key" \
        | gpg --dearmor -o /etc/apt/keyrings/libcontainers.gpg
    echo "deb [signed-by=/etc/apt/keyrings/libcontainers.gpg] https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}/ /" \
        > /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list
    apt-get update -y
    apt-get install -y podman podman-compose
    # Configure Podman for rootless operation. /etc/containers does not
    # exist on a fresh Ubuntu system, so create it before appending.
    mkdir -p /etc/containers
    echo "user_namespace_enable = 1" >> /etc/containers/containers.conf
    echo "runtime = \"crun\"" >> /etc/containers/containers.conf
    log_success "Podman installed"
}
# Install Redis from Ubuntu packages, hand supervision to systemd, and keep
# it bound to localhost only.
install_redis() {
    log_info "Installing Redis..."
    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi
    apt-get install -y redis-server
    # systemd supervision + localhost-only bind
    sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf
    systemctl enable --now redis-server
    log_success "Redis installed and configured"
}
# Detect an NVIDIA GPU and, when present, install the driver stack from
# NVIDIA's CUDA apt repository (keyring package is checksum-verified).
# No-op when drivers are already installed or no GPU is found.
install_nvidia_drivers() {
log_info "Checking for NVIDIA GPU..."
if command -v nvidia-smi &> /dev/null; then
log_info "NVIDIA drivers already installed"
nvidia-smi
return
fi
if lspci | grep -i nvidia &> /dev/null; then
log_info "NVIDIA GPU detected, installing drivers..."
# Add NVIDIA repository (keyring .deb pinned by SHA-256)
TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb"
secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. -f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb"
dpkg -i /tmp/cuda-keyring_1.1-1_all.deb
apt-get update -y
# Install drivers
apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit
# Configure Podman for NVIDIA (only if needed)
# NOTE(review): this CDI-style device test normally requires
# nvidia-container-toolkit and a generated CDI spec — confirm it can ever
# succeed before a reboot.
if ! podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
log_warning "NVIDIA GPU access test failed, you may need to reboot"
else
log_success "NVIDIA drivers installed and GPU access verified"
fi
else
log_info "No NVIDIA GPU detected, skipping driver installation"
fi
}
# Install the Python/ML userland: interpreter, native build dependencies for
# common ML wheels, and a CPU-only PyTorch stack via pip.
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."
    # Python toolchain
    apt-get install -y python3 python3-pip python3-venv
    # Native build dependencies
    apt-get install -y \
        build-essential cmake git pkg-config \
        libjpeg-dev libpng-dev libtiff-dev \
        libavcodec-dev libavformat-dev libswscale-dev \
        libgtk2.0-dev libcanberra-gtk-module \
        libxvidcore-dev libx264-dev \
        libatlas-base-dev gfortran
    # Python libraries (CPU-only torch wheel index)
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
    log_success "ML tools installed"
}
# Delegate account and directory creation to the shared setup helpers.
create_user() {
    log_info "Creating fetchml user..."
    ensure_user
    create_directories
    log_success "User $FETCH_ML_USER and directories created"
}
# Open the service ports through UFW when it is installed; otherwise warn
# and continue.
setup_firewall() {
    log_info "Configuring firewall..."
    if ! command -v ufw &> /dev/null; then
        log_warning "UFW not available, skipping firewall configuration"
        return
    fi
    ufw --force enable
    ufw allow ssh
    local port
    for port in 8080 8081 6379; do  # worker API, data-manager API, Redis
        ufw allow "${port}/tcp"
    done
    ufw status
}
# Register the worker and data-manager units via the shared
# setup_systemd_service helper, then enable both.
setup_systemd_services() {
    log_info "Setting up systemd services..."
    local svc
    for svc in worker data_manager; do
        setup_systemd_service "fetch_ml_${svc}" "$FETCH_ML_HOME/bin/${svc} --config $FETCH_ML_HOME/configs/config-local.yaml"
    done
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager
    log_success "Systemd services configured"
}
# Thin wrapper around the shared logrotate helper.
setup_log_rotation() {
    log_info "Setting up log rotation..."
    setup_logrotate
    log_success "Log rotation configured"
}
# Kernel and GPU tweaks for ML workloads. The sysctl block is guarded so a
# re-run of the script does not append duplicate entries to sysctl.conf
# (the original appended unconditionally on each run).
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    hardening_steps
    # Optimize kernel parameters for ML (once)
    if ! grep -q '# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi
    sysctl -p
    # Keep the GPU initialized between jobs when NVIDIA tooling is present
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi
    log_success "System optimized for ML workloads"
}
# Build Fetch ML from a pre-cloned checkout under $FETCH_ML_HOME and install
# binaries plus a default config. Returns early (with instructions) when the
# checkout is missing.
install_fetch_ml() {
    log_info "Installing Fetch ML..."
    # Quote all path expansions against word-splitting.
    cd "$FETCH_ML_HOME"
    if [[ ! -d "fetch_ml" ]]; then
        # This would be replaced with actual repository URL
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi
    cd fetch_ml
    # Build with the Go toolchain installed earlier
    export PATH=$PATH:/usr/local/go/bin
    make build
    # Copy binaries — ensure the target directory exists first; nothing else
    # in this script creates $FETCH_ML_HOME/bin.
    mkdir -p "$FETCH_ML_HOME/bin"
    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME/bin/"*
    # Copy configs
    mkdir -p "$FETCH_ML_HOME/configs"
    cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"
    # Hand ownership to the service user
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"
    log_success "Fetch ML installed"
}
# Orchestrate the full Ubuntu setup, then print operator follow-ups.
# NOTE(review): ensure_user/create_directories/setup_logrotate/
# hardening_steps come from setup_common.sh; the locally defined
# create_user, setup_log_rotation and optimize_system wrappers are never
# invoked — confirm whether that dead code is intentional.
main() {
log_info "Starting Fetch ML Ubuntu server setup..."
check_root
check_ubuntu
update_system
install_go
install_podman
install_redis
install_nvidia_drivers
install_ml_tools
ensure_user
create_directories
setup_firewall
setup_systemd_services
setup_logrotate
hardening_steps
install_fetch_ml
log_success "Fetch ML setup complete!"
echo
log_info "Next steps:"
echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
echo "5. View logs: journalctl -u fetch_ml_worker -f"
echo
log_info "Services will be available at:"
echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}
# Run main function
main "$@"

View file

@ -1,67 +0,0 @@
#!/bin/bash
# Test harness: builds the Go binaries and the Zig CLI, ensures Redis is
# available, runs unit/E2E tests, then smoke-tests the API server via the
# CLI. A single EXIT trap reaps everything this script spawns, so a failing
# step under `set -e` no longer leaks the background API server, and the
# Redis shutdown trap is no longer at risk of being overwritten.
set -e

API_PID=""
STARTED_REDIS=false

# Reap any background API server and any Redis instance we started.
cleanup() {
    if [ -n "$API_PID" ]; then
        kill "$API_PID" 2>/dev/null || true
    fi
    if [ "$STARTED_REDIS" = true ]; then
        echo "Stopping temporary Redis..."
        redis-cli shutdown || true
    fi
}
trap cleanup EXIT

echo "=== Test Tools Harness ==="

# Check if Redis is running; start a temporary instance if needed.
ensure_redis() {
    if redis-cli ping >/dev/null 2>&1; then
        echo "Redis is already running"
        return
    fi
    echo "Starting temporary Redis instance..."
    redis-server --daemonize yes --port 6379
    sleep 2
    if ! redis-cli ping >/dev/null 2>&1; then
        echo "Failed to start Redis"
        exit 1
    fi
    STARTED_REDIS=true
    echo "Redis started successfully"
}

# Step 1: Build Go binaries
echo "Building Go binaries..."
go build -o bin/api-server ./cmd/api-server
go build -o bin/worker ./cmd/worker
go build -o bin/data_manager ./cmd/data_manager
go build -o bin/user_manager ./cmd/user_manager

# Step 2: Build Zig CLI (subshell so the cwd change cannot leak)
echo "Building Zig CLI..."
(cd cli && zig build)

# Step 3: Ensure Redis is running
ensure_redis

# Step 4: Run Go tests
echo "Running Go tests..."
go test ./...

# Step 5: Run Zig tests. Bare `zig test` requires a root source file and
# always errored in the previous version; run the build system's test step.
echo "Running Zig CLI tests..."
(cd cli && zig build test)

# Step 6: Run Go E2E tests (Redis is already available)
echo "Running Go E2E tests..."
go test ./tests/e2e/...

# Step 7: Smoke test API server and CLI
echo "Running smoke test..."
# Start API server in background on a non-default port
./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 &
API_PID=$!
sleep 2
# Test CLI status (the EXIT trap kills the server even if this fails)
./cli/zig-out/bin/ml status -server http://localhost:19101
echo "=== All tests completed successfully ==="

View file

@ -5,7 +5,7 @@ Requires=docker.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/cleanup.sh --force ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/maintenance/cleanup.sh --dry-run
User=jfraeys User=jfraeys
Group=staff Group=staff
StandardOutput=journal StandardOutput=journal

View file

@ -8,6 +8,7 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
# Colors for output # Colors for output
RED='\033[0;31m' RED='\033[0;31m'
@ -43,22 +44,34 @@ cleanup_benchmark_artifacts() {
case "${1:-keep-10}" in case "${1:-keep-10}" in
"all") "all")
print_status "Removing ALL benchmark artifacts..." print_status "Archiving ALL benchmark artifacts..."
rm -rf "$LOCAL_ARTIFACTS_DIR" local stamp=$(date -u +%Y%m%d-%H%M%S)
print_success "Removed all artifacts (was $size_before)" mkdir -p "$ARCHIVE_DIR/$stamp"
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
print_success "Archived all artifacts (was $size_before)"
;; ;;
"keep-5") "keep-5")
print_status "Keeping last 5 runs, removing older ones..." print_status "Keeping last 5 runs, archiving older ones..."
local stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +6 | xargs rm -rf 2>/dev/null || true ls -1t run_* 2>/dev/null | tail -n +6 | while read -r run; do
[ -n "$run" ] || continue
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done
local count_after=$(ls -1d run_* 2>/dev/null | wc -l) local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B") local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)" print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)"
;; ;;
"keep-10") "keep-10")
print_status "Keeping last 10 runs, removing older ones..." print_status "Keeping last 10 runs, archiving older ones..."
local stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || true ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
[ -n "$run" ] || continue
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done
local count_after=$(ls -1d run_* 2>/dev/null | wc -l) local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B") local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)" print_success "Cleaned old runs: $count_before$count_after runs ($size_before$size_after)"
@ -80,12 +93,18 @@ cleanup_temp_files() {
# Clean temp directories # Clean temp directories
local temp_cleaned=0 local temp_cleaned=0
local stamp=$(date -u +%Y%m%d-%H%M%S)
local tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
mkdir -p "$tmp_archive_dir"
# /tmp cleanup # /tmp cleanup
if [ -d "/tmp" ]; then if [ -d "/tmp" ]; then
local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l) local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
if [ "$tmp_files" -gt 0 ]; then if [ "$tmp_files" -gt 0 ]; then
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
print_success "Cleaned $tmp_files temporary files from /tmp" mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
print_success "Archived $tmp_files temporary files from /tmp"
temp_cleaned=$((temp_cleaned + tmp_files)) temp_cleaned=$((temp_cleaned + tmp_files))
fi fi
fi fi
@ -94,8 +113,10 @@ cleanup_temp_files() {
if [ -d "/var/tmp" ]; then if [ -d "/var/tmp" ]; then
local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l) local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
if [ "$vartmp_files" -gt 0 ]; then if [ "$vartmp_files" -gt 0 ]; then
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
print_success "Cleaned $vartmp_files temporary files from /var/tmp" mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
print_success "Archived $vartmp_files temporary files from /var/tmp"
temp_cleaned=$((temp_cleaned + vartmp_files)) temp_cleaned=$((temp_cleaned + vartmp_files))
fi fi
fi fi
@ -104,8 +125,10 @@ cleanup_temp_files() {
if [ -d "$HOME/tmp" ]; then if [ -d "$HOME/tmp" ]; then
local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l) local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l)
if [ "$user_tmp_files" -gt 0 ]; then if [ "$user_tmp_files" -gt 0 ]; then
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
print_success "Cleaned $user_tmp_files temporary files from ~/tmp" mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
done
print_success "Archived $user_tmp_files temporary files from ~/tmp"
temp_cleaned=$((temp_cleaned + user_tmp_files)) temp_cleaned=$((temp_cleaned + user_tmp_files))
fi fi
fi fi
@ -177,9 +200,16 @@ cleanup_logs() {
for log_dir in "${log_dirs[@]}"; do for log_dir in "${log_dirs[@]}"; do
if [ -d "$log_dir" ]; then if [ -d "$log_dir" ]; then
local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B") local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
# Remove log files older than 7 days local stamp=$(date -u +%Y%m%d-%H%M%S)
find "$log_dir" -name "*.log" -type f -mtime +7 -delete 2>/dev/null || true local log_archive_dir="$log_dir/archive/$stamp"
find "$log_dir" -name "*.log.*" -type f -mtime +7 -delete 2>/dev/null || true mkdir -p "$log_archive_dir"
# Move log files older than 7 days to archive
find "$log_dir" -name "*.log" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$log_archive_dir/" 2>/dev/null || true
done
find "$log_dir" -name "*.log.*" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
mv "$f" "$log_archive_dir/" 2>/dev/null || true
done
local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B") local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
if [ "$log_size_before" != "$log_size_after" ]; then if [ "$log_size_before" != "$log_size_after" ]; then
print_success "Cleaned old logs in $log_dir: $log_size_before$log_size_after" print_success "Cleaned old logs in $log_dir: $log_size_before$log_size_after"

View file

@ -144,12 +144,12 @@ else
log_info "No running containers found" log_info "No running containers found"
fi fi
# Remove containers # Remove containers
log_info "Removing containers..." log_info "Removing containers..."
containers=$(docker ps -aq --filter "name=ml-") containers=$(docker ps -aq --filter "name=ml-")
if [ -n "$containers" ]; then if [ -n "$containers" ]; then
if [ "$DRY_RUN" = false ]; then if [ "$DRY_RUN" = false ]; then
echo "$containers" | xargs docker rm -f echo "$containers" | xargs docker rm
log_success "Containers removed" log_success "Containers removed"
fi fi
else else
@ -168,9 +168,9 @@ else
log_info "No networks found" log_info "No networks found"
fi fi
# Remove volumes (with caution) # Remove volumes (with caution)
log_warning "Removing volumes (this will delete data)..." log_warning "Skipping volumes by default (use --all to remove them)"
if [ "$FORCE" = true ] || [ "$ALL" = true ]; then if [ "$ALL" = true ]; then
volumes=$(docker volume ls -q --filter "name=ml-") volumes=$(docker volume ls -q --filter "name=ml-")
if [ -n "$volumes" ]; then if [ -n "$volumes" ]; then
if [ "$DRY_RUN" = false ]; then if [ "$DRY_RUN" = false ]; then
@ -181,16 +181,16 @@ if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
log_info "No volumes found" log_info "No volumes found"
fi fi
else else
log_info "Skipping volumes (use --force or --all to remove them)" log_info "Skipping volumes"
fi fi
# Remove images if requested # Remove images if requested
if [ "$ALL" = true ]; then if [ "$ALL" = true ]; then
log_info "Removing images..." log_info "Removing images..."
images=$(docker images -q --filter "reference=fetch_ml-*") images=$(docker images -q --filter "reference=fetch_ml-*")
if [ -n "$images" ]; then if [ -n "$images" ]; then
if [ "$DRY_RUN" = false ]; then if [ "$DRY_RUN" = false ]; then
echo "$images" | xargs docker rmi -f echo "$images" | xargs docker rmi
log_success "Images removed" log_success "Images removed"
fi fi
else else
@ -200,11 +200,15 @@ else
log_info "Skipping images (use --all to remove them)" log_info "Skipping images (use --all to remove them)"
fi fi
# General Docker cleanup # General Docker cleanup
log_info "Running general Docker cleanup..." if [ "$ALL" = true ]; then
if [ "$DRY_RUN" = false ]; then log_info "Running general Docker cleanup (docker system prune)..."
docker system prune -f if [ "$DRY_RUN" = false ]; then
log_success "General cleanup completed" docker system prune -f
log_success "General cleanup completed"
fi
else
log_info "Skipping docker system prune (use --all to enable)"
fi fi
# Show final state # Show final state

View file

@ -8,6 +8,7 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts" LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
# Create artifacts directory if it doesn't exist # Create artifacts directory if it doesn't exist
mkdir -p "$LOCAL_ARTIFACTS_DIR" mkdir -p "$LOCAL_ARTIFACTS_DIR"
@ -41,17 +42,21 @@ case "${1:-help}" in
echo "=== Cleaning Artifacts ===" echo "=== Cleaning Artifacts ==="
case "${2:-all}" in case "${2:-all}" in
"all") "all")
echo "Removing all artifacts..." echo "Archiving all artifacts..."
rm -rf "$LOCAL_ARTIFACTS_DIR" stamp=$(date -u +%Y%m%d-%H%M%S)
echo "All artifacts removed" mkdir -p "$ARCHIVE_DIR/$stamp"
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
echo "All artifacts archived"
;; ;;
"old") "old")
keep_count="${3:-10}" keep_count="${3:-10}"
echo "Keeping last $keep_count runs, removing older ones..." echo "Keeping last $keep_count runs, archiving older ones..."
stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
cd "$LOCAL_ARTIFACTS_DIR" cd "$LOCAL_ARTIFACTS_DIR"
ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do
echo "Removing: $run" echo "Archiving: $run"
rm -rf "$run" mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
done done
;; ;;
"run") "run")
@ -64,8 +69,10 @@ case "${1:-help}" in
fi fi
run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id" run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id"
if [ -d "$run_dir" ]; then if [ -d "$run_dir" ]; then
echo "Removing run: $run_id" echo "Archiving run: $run_id"
rm -rf "$run_dir" stamp=$(date -u +%Y%m%d-%H%M%S)
mkdir -p "$ARCHIVE_DIR/$stamp"
mv "$run_dir" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
else else
echo "Run not found: $run_id" echo "Run not found: $run_id"
fi fi

View file

@ -1,169 +0,0 @@
#!/bin/bash
# Secure Homelab Setup Script for Fetch ML
# This script generates secure API keys and TLS certificates
set -euo pipefail
# Resolve project layout relative to this script's location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
CONFIG_DIR="$PROJECT_ROOT/configs/environments"
SSL_DIR="$PROJECT_ROOT/ssl"
echo "🔒 Setting up secure homelab configuration..."
# Create SSL directory
mkdir -p "$SSL_DIR"
# Generate TLS certificates
# Self-signed, 1-year validity, RSA-4096; -nodes leaves the key unencrypted
# on disk (acceptable for a homelab; key file is chmod 600 below). SANs
# cover localhost, the machine's hostname, and 127.0.0.1. Skipped when both
# files already exist so re-runs keep the existing certificate.
echo "📜 Generating TLS certificates..."
if [[ ! -f "$SSL_DIR/cert.pem" ]] || [[ ! -f "$SSL_DIR/key.pem" ]]; then
openssl req -x509 -newkey rsa:4096 -keyout "$SSL_DIR/key.pem" -out "$SSL_DIR/cert.pem" -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=FetchML/OU=Homelab/CN=localhost" \
-addext "subjectAltName=DNS:localhost,DNS:$(hostname),IP:127.0.0.1"
chmod 600 "$SSL_DIR/key.pem"
chmod 644 "$SSL_DIR/cert.pem"
echo "✅ TLS certificates generated in $SSL_DIR/"
else
echo "ℹ️ TLS certificates already exist, skipping generation"
fi
# Generate secure API keys
echo "🔑 Generating secure API keys..."
# Emit a fresh 256-bit random key as 64 hex characters.
generate_api_key() {
    openssl rand -hex 32
}
# Hash function
hash_key() {
echo -n "$1" | sha256sum | cut -d' ' -f1
}
# Generate keys
# Two independent keys: an admin key with wildcard permissions and a
# restricted researcher key. Only the SHA-256 hashes go into the config;
# the plaintext keys are written to the chmod-600 .api-keys file below.
ADMIN_KEY=$(generate_api_key)
USER_KEY=$(generate_api_key)
ADMIN_HASH=$(hash_key "$ADMIN_KEY")
USER_HASH=$(hash_key "$USER_KEY")
# Create secure config
echo "⚙️ Creating secure configuration..."
cat > "$CONFIG_DIR/config-homelab-secure.yaml" << EOF
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!
redis:
url: "redis://localhost:6379"
max_connections: 10
auth:
enabled: true
api_keys:
homelab_admin:
hash: $ADMIN_HASH
admin: true
roles:
- admin
permissions:
'*': true
homelab_user:
hash: $USER_HASH
admin: false
roles:
- researcher
permissions:
'experiments': true
'datasets': true
'jupyter': true
server:
address: ":9101"
tls:
enabled: true
cert_file: "$SSL_DIR/cert.pem"
key_file: "$SSL_DIR/key.pem"
security:
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "::1"
- "localhost"
- "192.168.1.0/24" # Adjust to your network
- "10.0.0.0/8"
logging:
level: "info"
file: "logs/fetch_ml.log"
console: true
resources:
cpu_limit: "2"
memory_limit: "4Gi"
gpu_limit: 0
disk_limit: "10Gi"
# Prometheus metrics
metrics:
enabled: true
listen_addr: ":9100"
tls:
enabled: false
EOF
# Save API keys to a secure file
# Plaintext keys land only in this owner-read-only file.
echo "🔐 Saving API keys..."
cat > "$PROJECT_ROOT/.api-keys" << EOF
# Fetch ML Homelab API Keys
# IMPORTANT: Keep this file secure and never commit to version control!
ADMIN_API_KEY: $ADMIN_KEY
USER_API_KEY: $USER_KEY
# Usage examples:
# curl -H "X-API-Key: $ADMIN_KEY" https://localhost:9101/health
# curl -H "X-API-Key: $USER_KEY" https://localhost:9101/api/jupyter/services
EOF
chmod 600 "$PROJECT_ROOT/.api-keys"
# Create environment file for JWT secret
JWT_SECRET=$(generate_api_key)
cat > "$PROJECT_ROOT/.env.secure" << EOF
# Secure environment variables for Fetch ML
# IMPORTANT: Keep this file secure and never commit to version control!
JWT_SECRET=$JWT_SECRET
# Source this file before running the server:
# source .env.secure
EOF
chmod 600 "$PROJECT_ROOT/.env.secure"
# Update .gitignore to exclude sensitive files
# If .gitignore is missing, grep fails and the append below creates it.
echo "📝 Updating .gitignore..."
if ! grep -q ".api-keys" "$PROJECT_ROOT/.gitignore"; then
echo -e "\n# Security files\n.api-keys\n.env.secure\nssl/\n*.pem\n*.key" >> "$PROJECT_ROOT/.gitignore"
fi
echo ""
echo "🎉 Secure homelab setup complete!"
echo ""
echo "📋 Next steps:"
echo "1. Review and adjust the IP whitelist in config-homelab-secure.yaml"
echo "2. Start the server with: ./api-server -config configs/environments/config-homelab-secure.yaml"
echo "3. Source the environment: source .env.secure"
echo "4. Your API keys are saved in .api-keys"
echo ""
echo "🔐 API Keys:"
echo "   Admin: $ADMIN_KEY"
echo "   User:  $USER_KEY"
echo ""
echo "⚠️  IMPORTANT:"
echo "   - Never share your API keys"
echo "   - Never commit .api-keys or .env.secure to version control"
echo "   - Backup your SSL certificates and API keys securely"
echo "   - Consider using a password manager for storing keys"

View file

@ -1,311 +0,0 @@
#!/bin/bash
# setup.sh: One-shot homelab setup (security + core services)
# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity
set -euo pipefail
# ANSI color codes used by the print_* helpers below.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'
# print_info MESSAGE — informational status line (blue tag).
print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}
# print_success MESSAGE — success status line (green tag).
print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}
# print_warning MESSAGE — non-fatal warning (yellow tag).
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}
# print_error MESSAGE — error line (red tag); caller decides whether to exit.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
# Simple dependency check
# check_deps verifies that every required tool is on PATH. On any miss it
# prints install hints for all missing tools and exits non-zero.
check_deps() {
    print_info "Checking dependencies..."
    local missing=()
    local tool
    # Same tools, same order, as individual checks — just table-driven.
    for tool in go zig redis-server docker; do
        command -v "$tool" &> /dev/null || missing+=("$tool")
    done
    if [[ ${#missing[@]} -gt 0 ]]; then
        print_error "Missing dependencies: ${missing[*]}"
        echo ""
        echo "Install with:"
        echo "  macOS: brew install ${missing[*]}"
        echo "  Ubuntu: sudo apt-get install ${missing[*]}"
        exit 1
    fi
    print_success "Dependencies OK"
}
# Simple setup
# setup_project creates the working directory layout, generates a
# self-signed localhost TLS certificate (once), and writes the default
# server configuration. Idempotent: existing certs/config are overwritten
# only for the config file.
setup_project() {
    print_info "Setting up project..."
    # Create essential directories
    mkdir -p ssl logs configs data monitoring
    # Generate simple SSL cert
    # Only generated when missing, so re-running setup keeps existing certs.
    if [[ ! -f "ssl/cert.pem" ]]; then
        openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null
        print_success "SSL certificates generated"
    fi
    # Create balanced config
    # Quoted 'EOF' delimiter: the YAML below is written verbatim, nothing
    # is expanded by the shell. The api_keys hash is SHA-256 of "password".
    cat > configs/config.yaml << 'EOF'
base_path: "./data/experiments"
auth:
  enabled: true
  api_keys:
    homelab_user:
      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
      admin: true
      roles: ["user", "admin"]
      permissions:
        read: true
        write: true
        delete: true
server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
security:
  rate_limit:
    enabled: true
    requests_per_minute: 30
    burst_size: 5
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "192.168.0.0/16"
    - "10.0.0.0/8"
    - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 3
    lockout_duration: "15m"
redis:
  url: "redis://localhost:6379"
logging:
  level: "info"
  file: "./logs/app.log"
  audit_log: "./logs/audit.log"
  access_log: "./logs/access.log"
monitoring:
  enabled: true
  metrics_port: 9090
  health_check_interval: "30s"
EOF
    print_success "Configuration created"
}
# Simple build
# build_project compiles the Go binaries into ./bin and builds the Zig CLI.
# Fixes vs. the previous version:
#   - `go build -o bin/...` does not create the output directory, so a
#     fresh checkout failed; create it first.
#   - `cd cli && zig build && cd ..` never ran `cd ..` when the build
#     failed (set -e aborts first); a subshell keeps the caller's cwd safe.
build_project() {
    print_info "Building project..."
    mkdir -p bin
    # Build Go apps
    go build -o bin/api-server ./cmd/api-server
    go build -o bin/worker ./cmd/worker
    go build -o bin/tui ./cmd/tui
    # Build Zig CLI in a subshell so the working directory is restored
    # regardless of the build outcome.
    (cd cli && zig build)
    print_success "Build completed"
}
# Setup Fail2Ban
# setup_fail2ban writes jail + filter definitions for sshd and the ML API
# (fed by ./logs/audit.log), then installs them under /etc/fail2ban if
# sudo allows. Best-effort: skips when fail2ban or sudo is unavailable.
setup_fail2ban() {
    print_info "Setting up Fail2Ban..."
    if ! command -v fail2ban-server &> /dev/null; then
        print_warning "Fail2Ban not installed, skipping..."
        return
    fi
    # Create Fail2Ban configuration
    sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true
    # Jails: sshd plus two app jails. Quoted 'EOF' => written verbatim.
    cat > /tmp/ml-experiments-jail.conf << 'EOF'
[DEFAULT]
bantime = 3600
findtime = 600
maxretry = 3
backend = systemd
[sshd]
enabled = true
port = ssh
logpath = /var/log/auth.log
maxretry = 3
[ml-experiments-api]
enabled = true
port = 9101
filter = ml-experiments-api
logpath = ./logs/audit.log
maxretry = 5
bantime = 7200
[ml-experiments-auth]
enabled = true
filter = ml-experiments-auth
logpath = ./logs/audit.log
maxretry = 3
bantime = 3600
EOF
    # Create filter definitions
    # API filter: bans clients producing 401/403 status entries in the log.
    cat > /tmp/ml-experiments-api.conf << 'EOF'
[Definition]
failregex = ^.*<HOST>.*"status":40[13].*$
ignoreregex =
EOF
    # Auth filter: bans clients with repeated failed_login audit events.
    cat > /tmp/ml-experiments-auth.conf << 'EOF'
[Definition]
failregex = ^.*"event":"failed_login".*"client_ip":"<HOST>".*$
ignoreregex =
EOF
    # Try to install configurations
    # NOTE(review): the wildcard also copies the jail file into filter.d —
    # harmless to fail2ban, but confirm that is intended.
    if sudo cp /tmp/ml-experiments-jail.conf /etc/fail2ban/jail.d/ 2>/dev/null; then
        sudo cp /tmp/ml-experiments-*.conf /etc/fail2ban/filter.d/ 2>/dev/null || true
        sudo systemctl restart fail2ban 2>/dev/null || true
        print_success "Fail2Ban configured"
    else
        print_warning "Could not configure Fail2Ban (requires sudo)"
    fi
    rm -f /tmp/ml-experiments-*.conf
}
# Setup Redis
# setup_redis starts a daemonized local Redis on 6379 unless one is
# already running.
setup_redis() {
    print_info "Setting up Redis..."
    # Guard clause: nothing to do when a redis-server process exists.
    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
        return
    fi
    redis-server --daemonize yes --port 6379
    print_success "Redis started"
}
# Create simple management script
# create_manage_script writes ./manage.sh (start/stop/status/logs/test)
# into the current directory and marks it executable. The heredoc
# delimiter is quoted, so the script body is emitted verbatim with no
# expansion at generation time.
create_manage_script() {
    cat > manage.sh << 'EOF'
#!/bin/bash
# Simple management script
case "${1:-status}" in
"start")
echo "Starting services..."
redis-server --daemonize yes --port 6379 2>/dev/null || true
./bin/api-server -config configs/config.yaml &
echo "Services started"
;;
"stop")
echo "Stopping services..."
pkill -f "api-server" || true
redis-cli shutdown 2>/dev/null || true
echo "Services stopped"
;;
"status")
echo "=== Status ==="
if pgrep -f "redis-server" > /dev/null; then
echo "✅ Redis: Running"
else
echo "❌ Redis: Stopped"
fi
if pgrep -f "api-server" > /dev/null; then
echo "✅ API Server: Running"
else
echo "❌ API Server: Stopped"
fi
;;
"logs")
echo "=== Recent Logs ==="
tail -20 logs/app.log 2>/dev/null || echo "No logs yet"
;;
"test")
echo "=== Testing ==="
curl -k -s https://localhost:9101/health || echo "API server not responding"
;;
*)
echo "Usage: $0 {start|stop|status|logs|test}"
;;
esac
EOF
    chmod +x manage.sh
    print_success "Management script created"
}
# Show next steps
# show_next_steps prints the post-setup summary. The helper script is
# written to the project root by create_manage_script, so it is referenced
# as ./manage.sh — the previous text pointed at ./tools/manage.sh, a path
# this setup never creates.
show_next_steps() {
    print_success "Setup completed!"
    echo ""
    echo "🎉 Setup complete!"
    echo ""
    echo "Next steps:"
    echo " 1. Start services: ./manage.sh start"
    echo " 2. Check status: ./manage.sh status"
    echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health"
    echo ""
    echo "Configuration: configs/config.yaml"
    echo "Logs: logs/app.log and logs/audit.log"
    echo ""
    print_success "Ready for homelab use!"
}
# Main setup
# main runs the setup steps in dependency order; set -euo pipefail (at the
# top of the file) aborts on the first failing step.
# NOTE(review): setup_fail2ban is defined above but never invoked here —
# confirm whether it should be part of the default flow.
main() {
    echo "ML Experiment Manager - Homelab Setup"
    echo "====================================="
    echo ""
    check_deps
    setup_project
    build_project
    setup_redis
    create_manage_script
    show_next_steps
}
main "$@"

View file

@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""Bootstrap the Grafana provisioning tree under <repo>/monitoring.

Creates monitoring/grafana/provisioning/{datasources,dashboards} and writes
the Prometheus and Loki datasource definitions plus the dashboard provider
config. Safe to re-run: directories are created with exist_ok and the files
are simply overwritten with the same content.
"""
import os

# Repo root is one level above this script's directory.
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
GRAFANA_DIR = os.path.join(REPO_ROOT, 'monitoring', 'grafana')
DATASOURCES_DIR = os.path.join(GRAFANA_DIR, 'provisioning', 'datasources')
PROVIDERS_DIR = os.path.join(GRAFANA_DIR, 'provisioning', 'dashboards')

PROMETHEUS_YML = """apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "5s"
"""

LOKI_YML = """apiVersion: 1
datasources:
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true
    jsonData:
      maxLines: 1000
"""

DASHBOARDS_YML = """apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
"""

# Each entry carries its full target path, so no per-filename branching
# is needed when writing.
CONFIG_FILES = [
    (os.path.join(DATASOURCES_DIR, 'prometheus.yml'), PROMETHEUS_YML),
    (os.path.join(DATASOURCES_DIR, 'loki.yml'), LOKI_YML),
    (os.path.join(PROVIDERS_DIR, 'dashboards.yml'), DASHBOARDS_YML),
]

os.makedirs(DATASOURCES_DIR, exist_ok=True)
os.makedirs(PROVIDERS_DIR, exist_ok=True)

for target_path, content in CONFIG_FILES:
    with open(target_path, 'w') as handle:
        handle.write(content)

print("Monitoring setup completed!")

111
scripts/smoke-test.sh Normal file
View file

@ -0,0 +1,111 @@
#!/usr/bin/env bash
# smoke-test.sh — bring up the dev or prod docker-compose stack, wait for
# the API (and, for dev, the Prometheus scrape target) to become healthy,
# then tear the stack down again.
#
# Usage: scripts/smoke-test.sh [dev|prod]   (default: dev)
# Env:   KEEP_STACK=1  leaves the stack running on exit.
#
# The shebang was missing before; the script uses bash-only features
# (arrays, process substitution, [[ ]]) and must not run under plain sh.
set -euo pipefail

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
export FETCHML_REPO_ROOT="$repo_root"

env="${1:-dev}"
if [ "$env" != "dev" ] && [ "$env" != "prod" ]; then
    echo "usage: $0 [dev|prod]" >&2
    exit 2
fi

# Probe an HTTPS endpoint through openssl s_client (used for the prod
# stack, whose TLS certificate curl may refuse). Succeeds only when the
# first response line is an HTTP/1.x 200 status.
probe_https_health_openssl() {
    host="$1"
    port="$2"
    path="$3"
    req="GET ${path} HTTP/1.1\r\nHost: ${host}\r\nConnection: close\r\n\r\n"
    resp=$(printf "%b" "$req" | openssl s_client -connect "127.0.0.1:${port}" -servername "${host}" -tls1_2 -quiet 2>/dev/null || true)
    printf "%s" "$resp" | tr -d '\r' | head -n 1 | grep -Eq '^HTTP/1\.[01] 200'
}

# Prefer the standalone docker-compose binary, else the plugin form.
# compose_cmd is expanded UNQUOTED below so "docker compose" word-splits.
compose_cmd="docker-compose"
if ! command -v docker-compose >/dev/null 2>&1; then
    compose_cmd="docker compose"
fi

compose_files=()
compose_project_args=("--project-directory" "$repo_root")
api_base=""
prometheus_base=""
stack_name=""

if [ "$env" = "dev" ]; then
    # Pre-create bind-mount targets so containers do not create them as root.
    mkdir -p \
        "$repo_root/data/dev/redis" \
        "$repo_root/data/dev/minio" \
        "$repo_root/data/dev/prometheus" \
        "$repo_root/data/dev/grafana" \
        "$repo_root/data/dev/loki" \
        "$repo_root/data/dev/logs" \
        "$repo_root/data/dev/experiments" \
        "$repo_root/data/dev/active" \
        "$repo_root/data/dev/workspaces"
    stack_name="dev"
    compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml")
    # Dev may run with or without TLS; fall back to plain HTTP.
    api_base="https://localhost:9101"
    if ! curl -skf "$api_base/health" >/dev/null 2>&1; then
        api_base="http://localhost:9101"
    fi
    prometheus_base="http://localhost:9090"
else
    mkdir -p \
        "$repo_root/data/prod-smoke/caddy/data" \
        "$repo_root/data/prod-smoke/caddy/config" \
        "$repo_root/data/prod-smoke/redis" \
        "$repo_root/data/prod-smoke/logs" \
        "$repo_root/data/prod-smoke/experiments" \
        "$repo_root/data/prod-smoke/active"
    stack_name="prod"
    compose_files=("-f" "$repo_root/deployments/docker-compose.prod.smoke.yml")
    api_base="https://localhost:8443"
    export FETCHML_DOMAIN=localhost
    export CADDY_EMAIL=smoke@example.invalid
fi

# On exit: dump compose logs when failing, then (unless KEEP_STACK=1)
# tear the stack down. The saved status is re-raised so CI sees it.
cleanup() {
    status=$?
    if [ "$status" -ne 0 ]; then
        $compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" logs --no-color || true
    fi
    if [ "${KEEP_STACK:-0}" != "1" ]; then
        $compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" down -v >/dev/null 2>&1 || true
    fi
    exit "$status"
}
trap cleanup EXIT

echo "Starting $stack_name stack for smoke test..."
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null

echo "Waiting for API to become healthy..."
deadline=$(($(date +%s) + 90))
while true; do
    if [ "$env" = "dev" ]; then
        if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi
    else
        if probe_https_health_openssl "localhost" "8443" "/health"; then break; fi
    fi
    if [ "$(date +%s)" -ge "$deadline" ]; then echo "Timed out waiting for $api_base/health"; exit 1; fi
    sleep 2
done

if [ "$env" = "dev" ]; then
    echo "Checking metrics endpoint..."
    curl -skf "$api_base/metrics" >/dev/null
    echo "Waiting for Prometheus target api-server to be up..."
    deadline=$(($(date +%s) + 90))
    # PromQL: up{job="api-server"} (URL-encoded).
    query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D"
    while true; do
        resp=$(curl -sf "$query_url" || true)
        resp_compact=$(printf "%s" "$resp" | tr -d '\n' | tr -d '\r')
        # Target is "up" when its instance appears with a sample value of 1.
        if echo "$resp_compact" | grep -Fq '"instance":"api-server:9101"' && echo "$resp_compact" | grep -Fq ',"1"]'; then break; fi
        if [ "$(date +%s)" -ge "$deadline" ]; then echo "Timed out waiting for Prometheus api-server target to be up"; echo "$resp"; exit 1; fi
        sleep 2
    done
fi

View file

@ -1,80 +0,0 @@
#!/bin/bash
# Homelab Secure Test Environment Script
#
# Rebuilds the homelab-secure docker-compose stack from scratch, waits for
# it to settle, runs best-effort connectivity/security checks (each check
# prints a message on failure instead of aborting), then prints usage
# instructions. Fix: the "To view logs:" section no longer lists the
# `down` command, which belongs only under "To stop:".
set -e

echo "Starting Homelab Secure Production Environment..."

# Clean up any existing containers
echo "Cleaning up existing containers..."
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v

# Create necessary directories with proper permissions
echo "Creating directories..."
mkdir -p data logs
chmod 750 data logs

# Build and start services
echo "Building and starting services..."
docker-compose -f deployments/docker-compose.homelab-secure.yml up --build -d

# Wait for services to be healthy
# NOTE(review): fixed sleep; polling the health endpoint would be more
# reliable, but existing behavior is kept.
echo "Waiting for services to be healthy..."
sleep 20

# Check service health
echo "Checking service health..."
docker-compose -f deployments/docker-compose.homelab-secure.yml ps

# Test API server with TLS
echo "Testing API server..."
curl -k -s https://localhost:9104/health || echo "API health check failed"

# Test Redis with authentication
echo "Testing Redis with authentication..."
docker exec ml-homelab-redis redis-cli -a "HomelabRedis2024!" ping || echo "Redis health check failed"

# Test SSH connectivity with security
echo "Testing SSH connectivity..."
docker exec -u worker ml-homelab-worker ssh -o StrictHostKeyChecking=no -o Port=2222 worker@localhost "echo 'SSH OK'" || echo "SSH test failed"

# Test fail2ban status
echo "Testing fail2ban..."
docker exec ml-homelab-api fail2ban-client status sshd || echo "fail2ban check failed"

echo ""
echo "Homelab secure production environment is ready!"
echo ""
echo "Services:"
echo " - API Server: https://localhost:9104"
echo " - SSH: localhost:2223 (worker user)"
echo " - Redis: localhost:6379 (with password)"
echo " - Metrics: http://localhost:9101"
echo ""
echo "Security Features:"
echo " ✓ Strong TLS 1.3 with modern ciphers"
echo " ✓ SSH with fail2ban protection"
echo " ✓ Redis with password authentication"
echo " ✓ SQLite database with encryption"
echo " ✓ Container security hardening"
echo " ✓ Rate limiting and CORS protection"
echo " ✓ Security headers and CSRF protection"
echo " ✓ Podman sandboxed job execution"
echo " ✓ Audit logging and monitoring"
echo ""
echo "Credentials:"
echo " - API User: homelab_user / password"
echo " - SSH User: worker / HomelabWorker2024!"
echo " - Redis Password: HomelabRedis2024!"
echo ""
echo "To test with CLI:"
echo " ./cli/zig-out/bin/ml queue homelab-secure-test"
echo " ./cli/zig-out/bin/ml status"
echo ""
echo "To view logs:"
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f api-server"
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f worker"
echo ""
echo "To stop:"
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"

64
scripts/track_performance.sh Executable file
View file

@ -0,0 +1,64 @@
#!/bin/bash
# Simple performance tracking script
#
# Runs the medium load-test suite, scrapes key metrics out of the raw
# `go test` output, and stores them as JSON under test_results/performance
# so consecutive runs can be compared. Requires: go, jq, bc.
#
# Fixes vs. the previous version:
#   - Missing metrics now default to null instead of producing malformed
#     JSON like `"throughput_rps": ,`.
#   - $PREV_FILE/$RESULTS_FILE are quoted; the `ls` for previous runs no
#     longer prints errors when no older results exist.

RESULTS_DIR="test_results/performance"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
RAW_LOG="$RESULTS_DIR/raw_$TIMESTAMP.log"

mkdir -p "$RESULTS_DIR"

echo "Running load test performance tracking..."
echo "Timestamp: $TIMESTAMP"

# Run tests and capture results (raw log is kept for later inspection).
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RAW_LOG"

# extract_metric <test-name> <after-lines> <label> <awk-field>
# Pulls one metric value out of the raw log near the named test; prints
# "null" when the metric line is missing so the JSON stays parseable.
extract_metric() {
    local value
    value=$(grep -A"$2" "$1" "$RAW_LOG" | grep "$3" | awk -v f="$4" '{print $f}')
    echo "${value:-null}"
}

LIGHT_RPS=$(extract_metric "LightLoad" 1 "Throughput" 2)
LIGHT_ERROR=$(extract_metric "LightLoad" 2 "Error rate" 3)
LIGHT_P99=$(extract_metric "LightLoad" 4 "P99 latency" 3)
MEDIUM_RPS=$(extract_metric "MediumLoad" 1 "Throughput" 2)
MEDIUM_ERROR=$(extract_metric "MediumLoad" 2 "Error rate" 3)
MEDIUM_P99=$(extract_metric "MediumLoad" 4 "P99 latency" 3)

# Emit the results file.
{
    echo "{"
    echo "  \"timestamp\": \"$TIMESTAMP\","
    echo "  \"tests\": ["
    echo "    {"
    echo "      \"name\": \"LightLoad\","
    echo "      \"throughput_rps\": $LIGHT_RPS,"
    echo "      \"error_rate_percent\": $LIGHT_ERROR,"
    echo "      \"p99_latency_ms\": \"$LIGHT_P99\""
    echo "    },"
    echo "    {"
    echo "      \"name\": \"MediumLoad\","
    echo "      \"throughput_rps\": $MEDIUM_RPS,"
    echo "      \"error_rate_percent\": $MEDIUM_ERROR,"
    echo "      \"p99_latency_ms\": \"$MEDIUM_P99\""
    echo "    }"
    echo "  ]"
    echo "}"
} > "$RESULTS_FILE"

echo "Results saved to: $RESULTS_FILE"
echo "Raw logs: $RAW_LOG"

# Show comparison with previous run if one exists. In newest-first order,
# index 2 is the run before the file we just wrote.
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json 2>/dev/null | sed -n '2p')
if [ -n "$PREV_FILE" ]; then
    echo ""
    echo "=== Comparison with previous run ==="
    echo "Previous: $(basename "$PREV_FILE")"
    echo "Current: $(basename "$RESULTS_FILE")"
    echo ""
    echo "Light Load Throughput:"
    echo " Previous: $(jq -r '.tests[0].throughput_rps' "$PREV_FILE") RPS"
    echo " Current: $(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") RPS"
    echo " Change: $(echo "$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") - $(jq -r '.tests[0].throughput_rps' "$PREV_FILE")" | bc -l) RPS"
fi

View file

@ -1,204 +0,0 @@
#!/bin/bash
# Production Configuration Validator
# Verifies all paths and configs are consistent for experiment lifecycle
set -e
# ANSI colors for check output.
BOLD='\033[1m'
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${BOLD}=== FetchML Production Configuration Validator ===${NC}\n"
# Configuration file paths
# Positional overrides: $1 = API server YAML, $2 = worker TOML.
API_CONFIG="${1:-configs/config-prod.yaml}"
WORKER_CONFIG="${2:-configs/worker-prod.toml}"
# Running totals bumped by check_fail/check_warn; reported in the summary.
errors=0
warnings=0
# check_pass MESSAGE — prints a passing check line (green marker).
check_pass() {
    echo -e "${GREEN}${NC} $1"
}
# check_fail MESSAGE — prints a failing check line (red marker) and bumps
# the global error counter.
# Uses errors=$((errors + 1)) instead of ((errors++)): the arithmetic
# command's exit status reflects the PRE-increment value, so with `set -e`
# the first failure (errors == 0) aborted the entire script before any
# summary could be printed.
check_fail() {
    echo -e "${RED}${NC} $1"
    errors=$((errors + 1))
}
# check_warn MESSAGE — prints a warning check line (yellow marker) and
# bumps the global warning counter.
# Same fix as check_fail: ((warnings++)) returns non-zero when warnings
# is 0, which under `set -e` killed the script on the first warning.
check_warn() {
    echo -e "${YELLOW}${NC} $1"
    warnings=$((warnings + 1))
}
# 1. Check API server config exists
echo -e "${BOLD}Checking API Server Configuration${NC}"
if [ ! -f "$API_CONFIG" ]; then
    check_fail "API config not found: $API_CONFIG"
else
    check_pass "API config found: $API_CONFIG"
    # Extract base_path from API config (first match; quotes stripped).
    API_BASE_PATH=$(grep 'base_path:' "$API_CONFIG" | head -1 | awk '{print $2}' | tr -d '"')
    echo " Base path: $API_BASE_PATH"
    # Check if path is absolute
    if [[ "$API_BASE_PATH" != /* ]]; then
        check_fail "base_path must be absolute: $API_BASE_PATH"
    else
        check_pass "base_path is absolute"
    fi
    # Check Redis config
    if grep -q 'redis:' "$API_CONFIG"; then
        check_pass "Redis configuration present"
    else
        check_fail "Redis configuration missing"
    fi
    # Check auth enabled
    # NOTE(review): this matches ANY "enabled: true" line in the file
    # (tls, rate_limit, ...), not specifically auth.enabled — confirm.
    if grep -q 'enabled: true' "$API_CONFIG"; then
        check_pass "Authentication enabled"
    else
        check_warn "Authentication disabled (not recommended for production)"
    fi
fi
echo ""
# 2. Check Worker config (if provided)
if [ -f "$WORKER_CONFIG" ]; then
    echo -e "${BOLD}Checking Worker Configuration${NC}"
    check_pass "Worker config found: $WORKER_CONFIG"
    # Extract base_path from worker config (TOML "key = value" form).
    WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
    echo " Base path: $WORKER_BASE_PATH"
    # Compare paths — API and worker must share the experiments tree.
    if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then
        check_pass "API and Worker base_path match"
    else
        check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH"
    fi
    # Check podman_image configured
    if grep -q 'podman_image' "$WORKER_CONFIG"; then
        PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
        check_pass "Podman image configured: $PODMAN_IMAGE"
    else
        check_fail "podman_image not configured"
    fi
else
    check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)"
fi
echo ""
# 3. Check directory structure (if base_path exists)
if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then
    echo -e "${BOLD}Checking Directory Structure${NC}"
    check_pass "Base directory exists: $API_BASE_PATH"
    # Check the lifecycle subdirectories; missing ones are only warnings.
    for dir in experiments pending running finished failed; do
        if [ -d "$API_BASE_PATH/$dir" ]; then
            check_pass "$dir/ directory exists"
        else
            check_warn "$dir/ directory missing (will be created automatically)"
        fi
    done
    # Check permissions
    if [ -w "$API_BASE_PATH" ]; then
        check_pass "Base directory is writable"
    else
        check_fail "Base directory is not writable (check permissions)"
    fi
elif [ -n "$API_BASE_PATH" ]; then
    check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)"
fi
echo ""
# 4. Check Redis connectivity (if server is running)
echo -e "${BOLD}Checking Redis Connectivity${NC}"
if command -v redis-cli &> /dev/null; then
    if redis-cli ping &> /dev/null; then
        check_pass "Redis server is running and accessible"
        # Check queue depth (informational only).
        QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0")
        echo " Queue size: $QUEUE_SIZE tasks"
    else
        check_warn "Redis server not accessible (start with: redis-server)"
    fi
else
    check_warn "redis-cli not installed (cannot verify Redis connectivity)"
fi
echo ""
# 5. Check Podman (if worker config exists)
if [ -f "$WORKER_CONFIG" ]; then
    echo -e "${BOLD}Checking Podman${NC}"
    if command -v podman &> /dev/null; then
        check_pass "Podman is installed"
        # Check if the configured image has been built locally.
        if [ -n "$PODMAN_IMAGE" ]; then
            if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then
                check_pass "Podman image exists: $PODMAN_IMAGE"
            else
                check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)"
            fi
        fi
        # Check GPU access (if configured) by running nvidia-smi in a container.
        if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then
            if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then
                check_pass "GPU access working"
            else
                check_warn "GPU access configured but not working (check nvidia-container-toolkit)"
            fi
        fi
    else
        check_fail "Podman not installed (required for worker)"
    fi
fi
echo ""
# 6. Check CLI config consistency
echo -e "${BOLD}Checking CLI Configuration${NC}"
CLI_CONFIG="$HOME/.ml/config.toml"
if [ -f "$CLI_CONFIG" ]; then
    check_pass "CLI config found: $CLI_CONFIG"
    CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
    if [ "$CLI_BASE" = "$API_BASE_PATH" ]; then
        check_pass "CLI worker_base matches server base_path"
    else
        check_warn "CLI worker_base ($CLI_BASE) differs from server ($API_BASE_PATH)"
    fi
else
    check_warn "CLI config not found (run: ml init)"
fi
echo ""
# Summary — exit 0 when there are no errors (warnings allowed), 1 otherwise.
echo -e "${BOLD}=== Summary ===${NC}"
if [ $errors -eq 0 ] && [ $warnings -eq 0 ]; then
    echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}"
    exit 0
elif [ $errors -eq 0 ]; then
    echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}"
    exit 0
else
    echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}"
    exit 1
fi

148
scripts/verify_release.sh Normal file
View file

@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -euo pipefail
# usage prints the help text to stdout (error paths redirect it to stderr).
# Quoted 'EOF' delimiter: the text is emitted verbatim.
usage() {
cat <<'EOF'
Usage:
scripts/verify_release.sh --dir <release_dir> [--repo <org>/<repo>]
What it does:
- Verifies checksums.txt signature (keyless cosign) if cosign + checksums.txt.sig/.cert are present
- Verifies *.tar.gz files against checksums.txt
Notes:
- --repo enables strict Sigstore identity checking against the release workflow.
- Without cosign, the script still verifies SHA256 hashes.
Examples:
scripts/verify_release.sh --dir ./release --repo jfraeys/fetch_ml
scripts/verify_release.sh --dir .
EOF
}
release_dir=""
repo=""
# Parse flags: --dir (required), --repo (optional Sigstore identity pin).
while [[ $# -gt 0 ]]; do
    case "$1" in
        --dir)
            release_dir="${2:-}"
            shift 2
            ;;
        --repo)
            repo="${2:-}"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
done
if [[ -z "$release_dir" ]]; then
    echo "missing --dir" >&2
    usage >&2
    exit 2
fi
if [[ ! -d "$release_dir" ]]; then
    echo "directory not found: $release_dir" >&2
    exit 2
fi
# All subsequent paths (checksums.txt, signature files, artifacts) are
# resolved relative to the release directory.
cd "$release_dir"
if [[ ! -f checksums.txt ]]; then
    echo "missing checksums.txt in $release_dir" >&2
    exit 2
fi
# Signature verification is optional and only attempted when cosign exists.
has_cosign=false
if command -v cosign >/dev/null 2>&1; then
    has_cosign=true
fi
# verify_sigstore verifies the keyless cosign signature over checksums.txt.
# Skips (returns 0) when the .sig/.cert pair is absent. When --repo was
# given, the signing certificate identity is pinned to that repository's
# release workflow; otherwise only the OIDC issuer is checked.
verify_sigstore() {
    if [[ ! -f checksums.txt.sig ]] || [[ ! -f checksums.txt.cert ]]; then
        echo "[verify] cosign available, but checksums.txt.sig/.cert not found; skipping signature verification" >&2
        return 0
    fi
    if [[ -z "$repo" ]]; then
        echo "[verify] verifying signature (no repo identity pin; pass --repo to pin identity)" >&2
        COSIGN_YES=true cosign verify-blob \
            --certificate checksums.txt.cert \
            --signature checksums.txt.sig \
            --certificate-oidc-issuer https://token.actions.githubusercontent.com \
            checksums.txt >/dev/null
        echo "[ok] checksums.txt signature verified (un-pinned identity)"
        return 0
    fi
    # Pin the certificate identity to this repo's release workflow on a
    # v* tag (regexp matched by cosign).
    local identity
    identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$"
    COSIGN_YES=true cosign verify-blob \
        --certificate checksums.txt.cert \
        --signature checksums.txt.sig \
        --certificate-identity-regexp "$identity" \
        --certificate-oidc-issuer https://token.actions.githubusercontent.com \
        checksums.txt >/dev/null
    echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)"
}
# verify_hashes checks every artifact listed in checksums.txt that is
# present in the current directory against its recorded SHA256 hash.
# Missing files are skipped; any mismatch makes the function exit 1.
# Uses sha256sum when available, otherwise shasum -a 256 (macOS).
verify_hashes() {
    local failures=0
    local has_sha256sum=false
    if command -v sha256sum >/dev/null 2>&1; then
        has_sha256sum=true
    fi
    # Use the DEFAULT IFS so read splits each "<hash> <file>" line into
    # two fields. The previous `while IFS= read -r expected file` disabled
    # word splitting entirely: the whole line landed in $expected, $file
    # stayed empty, every entry hit the `continue`, and the function
    # always reported success without verifying a single artifact.
    while read -r expected file; do
        [[ -z "${expected}" ]] && continue
        [[ -z "${file}" ]] && continue
        if [[ ! -f "$file" ]]; then
            continue
        fi
        local actual
        if [[ "$has_sha256sum" == true ]]; then
            actual="$(sha256sum "$file" | awk '{print $1}')"
        else
            actual="$(shasum -a 256 "$file" | awk '{print $1}')"
        fi
        if [[ "$actual" != "$expected" ]]; then
            echo "[fail] $file" >&2
            echo " expected: $expected" >&2
            echo " actual: $actual" >&2
            failures=$((failures+1))
        fi
    done < <(awk '{print $1, $2}' checksums.txt)
    if [[ $failures -gt 0 ]]; then
        echo "[fail] checksum verification failed ($failures file(s))" >&2
        exit 1
    fi
    echo "[ok] all available artifacts match checksums.txt"
}
# Signature verification is best-effort (requires cosign); hash
# verification always runs and is the hard gate.
if [[ "$has_cosign" == true ]]; then
    verify_sigstore
else
    echo "[verify] cosign not installed; skipping signature verification" >&2
fi
verify_hashes
echo "[ok] release verification complete"

View file

@ -5,6 +5,10 @@
set -euo pipefail set -euo pipefail
make_target_exists() {
make -n "$1" >/dev/null 2>&1
}
# Colors # Colors
RED='\033[0;31m' RED='\033[0;31m'
GREEN='\033[0;32m' GREEN='\033[0;32m'
@ -45,7 +49,7 @@ show_status() {
# Check Go apps # Check Go apps
print_app "Go Applications:" print_app "Go Applications:"
local go_apps=("api-server" "worker" "tui" "data_manager" "user_manager") local go_apps=("api-server" "worker" "tui")
for app in "${go_apps[@]}"; do for app in "${go_apps[@]}"; do
if [[ -f "bin/$app" ]]; then if [[ -f "bin/$app" ]]; then
echo "$app: Built" echo "$app: Built"
@ -85,7 +89,7 @@ show_status() {
# Check configuration # Check configuration
print_app "Configuration:" print_app "Configuration:"
if [[ -f "configs/config-local.yaml" ]]; then if [[ -f "configs/api/dev.yaml" ]]; then
echo " ✅ Security config: Found" echo " ✅ Security config: Found"
else else
echo " ⚠️ Security config: Not found" echo " ⚠️ Security config: Not found"
@ -110,14 +114,14 @@ build_all() {
echo "=============================" echo "============================="
echo "" echo ""
print_info "Building Go applications..."
make build
if command -v zig &> /dev/null; then if command -v zig &> /dev/null; then
print_info "Building Zig CLI..." print_info "Building all components (Go + Zig CLI)..."
make cli-build make build
else else
print_warning "Zig not found, skipping CLI build" print_warning "Zig not found, building Go components only"
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go
go build -o bin/tui ./cmd/tui
fi fi
print_success "Build completed!" print_success "Build completed!"
@ -128,11 +132,13 @@ test_all() {
echo "====================" echo "===================="
echo "" echo ""
print_info "Running main test suite..." if make_target_exists test-full; then
make test print_info "Running full test suite..."
make test-full
print_info "Running comprehensive tests..." else
make test-all print_info "Running test suite..."
make test
fi
print_success "All tests completed!" print_success "All tests completed!"
} }
@ -156,8 +162,8 @@ start_services() {
# Start API server if built # Start API server if built
if [[ -f "bin/api-server" ]]; then if [[ -f "bin/api-server" ]]; then
print_info "Starting API server..." print_info "Starting API server..."
if [[ -f "configs/config-local.yaml" ]]; then if [[ -f "configs/api/dev.yaml" ]]; then
./bin/api-server --config configs/config-local.yaml & ./bin/api-server --config configs/api/dev.yaml &
else else
print_warning "No config found, using defaults" print_warning "No config found, using defaults"
./bin/api-server & ./bin/api-server &
@ -187,13 +193,25 @@ check_health() {
print_info "Port 9101 is open, checking API health endpoint..." print_info "Port 9101 is open, checking API health endpoint..."
# Try the health endpoint # Try the health endpoint
response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null) local api_key_header=""
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
api_key_header="-H X-API-Key: ${FETCH_ML_API_KEY}"
fi
response=$(curl -s --max-time 3 ${api_key_header} http://localhost:9101/health 2>/dev/null || true)
if [[ -z "$response" ]]; then
response=$(curl -k -s --max-time 3 ${api_key_header} https://localhost:9101/health 2>/dev/null || true)
fi
if [[ "$response" == "OK" ]]; then if [[ "$response" == "OK" ]]; then
print_success "API is healthy: $response" print_success "API is healthy: $response"
elif [[ "$response" == *"IP not whitelisted"* ]]; then elif [[ "$response" == *"IP not whitelisted"* ]]; then
print_warning "API running but IP not whitelisted (expected behavior)" print_warning "API running but IP not whitelisted (expected behavior)"
print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health" if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
print_info "Try: curl -k -H 'X-API-Key: $FETCH_ML_API_KEY' https://localhost:9101/health"
else
print_info "Try: curl -k https://localhost:9101/health"
fi
else else
print_error "Unexpected response: $response" print_error "Unexpected response: $response"
fi fi
@ -229,19 +247,36 @@ run_security() {
case "${1:-check}" in case "${1:-check}" in
"check") "check")
print_info "Running security checks..." print_info "Running security checks..."
make security-check if make_target_exists security-check; then
make security-check
else
print_warning "No 'security-check' Make target found"
print_info "Try: make ci-local"
fi
;; ;;
"monitor") "monitor")
print_info "Starting security monitoring..." print_info "Starting security monitoring..."
make security-monitor if make_target_exists security-monitor; then
make security-monitor
else
print_warning "No 'security-monitor' Make target found"
fi
;; ;;
"deploy") "deploy")
print_info "Deploying with security..." print_info "Deploying with security..."
make security-deploy if make_target_exists security-deploy; then
make security-deploy
else
print_warning "No 'security-deploy' Make target found"
fi
;; ;;
"audit") "audit")
print_info "Running security audit..." print_info "Running security audit..."
make security-audit if make_target_exists security-audit; then
make security-audit
else
print_warning "No 'security-audit' Make target found"
fi
;; ;;
*) *)
echo "Usage: $0 security {check|monitor|deploy|audit}" echo "Usage: $0 security {check|monitor|deploy|audit}"
@ -258,15 +293,22 @@ run_development() {
case "${1:-setup}" in case "${1:-setup}" in
"setup") "setup")
print_info "Setting up development environment..." print_info "Setting up development environment..."
./scripts/auto_setup.sh print_warning "Legacy setup scripts were removed; using Makefile/deployments instead"
print_info "Try: make dev"
print_info "Or: ./deployments/deploy.sh dev up"
;; ;;
"quick") "quick")
print_info "Running quick start..." print_info "Running quick start..."
./scripts/quick_start.sh print_warning "Legacy quick start script was removed; using deployments instead"
print_info "Try: ./deployments/deploy.sh dev up"
;; ;;
"deps") "deps")
print_info "Installing dependencies..." print_info "Installing dependencies..."
make install-deps if make_target_exists install-deps; then
make install-deps
else
print_warning "No 'install-deps' Make target found"
fi
;; ;;
*) *)
echo "Usage: $0 dev {setup|quick|deps}" echo "Usage: $0 dev {setup|quick|deps}"
@ -309,7 +351,7 @@ cleanup() {
echo "" echo ""
print_info "Cleaning project artifacts..." print_info "Cleaning project artifacts..."
make clean-all make clean
print_info "Stopping services..." print_info "Stopping services..."
stop_services stop_services
@ -330,7 +372,7 @@ show_help() {
echo " start - Start all services" echo " start - Start all services"
echo " stop - Stop all services" echo " stop - Stop all services"
echo " health - Check API health endpoint" echo " health - Check API health endpoint"
echo " security - Security management (check|monitor|deploy|audit)" echo " security - Security management (check|monitor|deploy|audit)"
echo " dev - Development environment (setup|quick|deps)" echo " dev - Development environment (setup|quick|deps)"
echo " logs - Show application logs" echo " logs - Show application logs"
echo " cleanup - Clean project artifacts and stop services" echo " cleanup - Clean project artifacts and stop services"

View file

@ -47,7 +47,10 @@ type Improvement struct {
} }
// NewPerformanceRegressionDetector creates a new detector instance // NewPerformanceRegressionDetector creates a new detector instance
func NewPerformanceRegressionDetector(baselineFile string, threshold float64) *PerformanceRegressionDetector { func NewPerformanceRegressionDetector(
baselineFile string,
threshold float64,
) *PerformanceRegressionDetector {
return &PerformanceRegressionDetector{ return &PerformanceRegressionDetector{
BaselineFile: baselineFile, BaselineFile: baselineFile,
Threshold: threshold, Threshold: threshold,
@ -74,7 +77,9 @@ func (prd *PerformanceRegressionDetector) LoadBaseline() ([]BenchmarkResult, err
} }
// AnalyzeResults analyzes current results against baseline // AnalyzeResults analyzes current results against baseline
func (prd *PerformanceRegressionDetector) AnalyzeResults(current []BenchmarkResult) (*RegressionReport, error) { func (prd *PerformanceRegressionDetector) AnalyzeResults(
current []BenchmarkResult,
) (*RegressionReport, error) {
baseline, err := prd.LoadBaseline() baseline, err := prd.LoadBaseline()
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to load baseline: %w", err) return nil, fmt.Errorf("failed to load baseline: %w", err)

Some files were not shown because too many files have changed in this diff Show more