chore(ops): reorganize deployments/monitoring and remove legacy scripts
This commit is contained in:
parent
5ef24e4c6d
commit
f726806770
101 changed files with 3598 additions and 4982 deletions
56
configs/api/dev.yaml
Normal file
56
configs/api/dev.yaml
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
base_path: "/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: false
|
||||
|
||||
server:
|
||||
address: "0.0.0.0:9101"
|
||||
tls:
|
||||
enabled: false
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
production_mode: false
|
||||
allowed_origins:
|
||||
- "http://localhost:3000"
|
||||
api_key_rotation_days: 90
|
||||
audit_logging:
|
||||
enabled: true
|
||||
log_path: "/tmp/fetchml-audit.log"
|
||||
rate_limit:
|
||||
enabled: false
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
addr: "redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/tmp/fetchml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: ""
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
71
configs/api/homelab-secure.yaml
Normal file
71
configs/api/homelab-secure.yaml
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
base_path: "/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_admin:
|
||||
hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY"
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
"*": true
|
||||
homelab_user:
|
||||
hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY"
|
||||
admin: false
|
||||
roles:
|
||||
- researcher
|
||||
permissions:
|
||||
experiments: true
|
||||
datasets: true
|
||||
jupyter: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
production_mode: true
|
||||
allowed_origins:
|
||||
- "https://ml-experiments.example.com"
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "192.168.0.0/16"
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/data/experiments/fetch_ml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/fetch_ml.log"
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
74
configs/api/multi-user.yaml
Normal file
74
configs/api/multi-user.yaml
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
admin_user:
|
||||
hash: "CHANGE_ME_SHA256_ADMIN_USER_KEY"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
"*": true
|
||||
researcher1:
|
||||
hash: "CHANGE_ME_SHA256_RESEARCHER1_KEY"
|
||||
admin: false
|
||||
roles: ["user", "researcher"]
|
||||
permissions:
|
||||
"jobs:read": true
|
||||
"jobs:create": true
|
||||
"jobs:update": true
|
||||
"jobs:delete": false
|
||||
analyst1:
|
||||
hash: "CHANGE_ME_SHA256_ANALYST1_KEY"
|
||||
admin: false
|
||||
roles: ["user", "analyst"]
|
||||
permissions:
|
||||
"jobs:read": true
|
||||
"jobs:create": false
|
||||
"jobs:update": false
|
||||
"jobs:delete": false
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
security:
|
||||
production_mode: false
|
||||
allowed_origins: []
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 20
|
||||
ip_whitelist: []
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/app.log"
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 3
|
||||
desired_rps_per_worker: 3
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
59
configs/api/prod.yaml
Normal file
59
configs/api/prod.yaml
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
admin:
|
||||
hash: "replace-with-sha256-of-your-api-key"
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
"*": true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
production_mode: false
|
||||
allowed_origins: []
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
redis:
|
||||
addr: "redis:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.sqlite"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/fetch_ml.log"
|
||||
audit_log: ""
|
||||
|
||||
resources:
|
||||
max_workers: 2
|
||||
desired_rps_per_worker: 5
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# Local development config (TOML)
|
||||
# Used by both CLI and TUI when no overrides are set
|
||||
|
||||
worker_host = "127.0.0.1"
|
||||
worker_user = "dev_user"
|
||||
worker_base = "/tmp/ml-experiments"
|
||||
worker_port = 9101
|
||||
api_key = "your-api-key-here"
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
dev_user:
|
||||
hash: "replace-with-sha256-of-your-api-key"
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
'*': true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: false
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
|
||||
logging:
|
||||
level: info
|
||||
console: true
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: false
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.db"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
|
||||
logging:
|
||||
level: "debug"
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 30
|
||||
ip_whitelist: []
|
||||
|
||||
# SQLite database for persistence
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/fetch_ml.db"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/app/logs/app.log"
|
||||
audit_file: "/app/logs/audit.log"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "8g"
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 30
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "::1"
|
||||
- "192.168.0.0/16"
|
||||
- "10.0.0.0/8"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/app/logs/app.log"
|
||||
audit_file: "/app/logs/audit.log"
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
# Secure Homelab Configuration
|
||||
# IMPORTANT: Keep your API keys safe and never share them!
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_admin:
|
||||
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
'*': true
|
||||
homelab_user:
|
||||
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
|
||||
admin: false
|
||||
roles:
|
||||
- researcher
|
||||
permissions:
|
||||
'experiments': true
|
||||
'datasets': true
|
||||
'jupyter': true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
key_file: "/app/ssl/key.pem"
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "logs/fetch_ml.log"
|
||||
console: true
|
||||
|
||||
resources:
|
||||
cpu_limit: "2"
|
||||
memory_limit: "4Gi"
|
||||
gpu_limit: 0
|
||||
disk_limit: "10Gi"
|
||||
|
||||
# Prometheus metrics
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
tls:
|
||||
enabled: false
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
homelab_admin:
|
||||
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
|
||||
admin: true
|
||||
roles:
|
||||
- admin
|
||||
permissions:
|
||||
'*': true
|
||||
homelab_user:
|
||||
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
|
||||
admin: false
|
||||
roles:
|
||||
- researcher
|
||||
permissions:
|
||||
'experiments': true
|
||||
'datasets': true
|
||||
'jupyter': true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "::1"
|
||||
- "172.21.0.1" # Docker gateway
|
||||
|
||||
# Prometheus metrics
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
tls:
|
||||
enabled: true
|
||||
cert_file: "/app/ssl/cert.pem"
|
||||
key_file: "/app/ssl/key.pem"
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
base_path: "/app/data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
admin_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["user", "admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
researcher1:
|
||||
hash: "ef92b778ba7a6c8f2150019a5678047b6a9a2b95cef8189518f9b35c54d2e3ae" # "research123"
|
||||
admin: false
|
||||
roles: ["user", "researcher"]
|
||||
permissions:
|
||||
jobs:read: true
|
||||
jobs:create: true
|
||||
jobs:update: true
|
||||
jobs:delete: false
|
||||
analyst1:
|
||||
hash: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" # "analyst123"
|
||||
admin: false
|
||||
roles: ["user", "analyst"]
|
||||
permissions:
|
||||
jobs:read: true
|
||||
jobs:create: false
|
||||
jobs:update: false
|
||||
jobs:delete: false
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 20
|
||||
ip_whitelist: []
|
||||
cors:
|
||||
enabled: true
|
||||
allowed_origins: ["https://localhost:9103", "https://localhost:3000"]
|
||||
allowed_methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
|
||||
allowed_headers: ["Content-Type", "Authorization"]
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "/app/data/experiments/fetch_ml.db"
|
||||
max_connections: 20
|
||||
connection_timeout: "30s"
|
||||
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 15
|
||||
connection_timeout: "10s"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/app/logs/app.log"
|
||||
max_size: "100MB"
|
||||
max_backups: 5
|
||||
compress: true
|
||||
|
||||
resources:
|
||||
max_workers: 3
|
||||
desired_rps_per_worker: 3
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4g"
|
||||
job_timeout: "30m"
|
||||
|
||||
monitoring:
|
||||
enabled: true
|
||||
metrics_path: "/metrics"
|
||||
health_check_interval: "30s"
|
||||
|
|
@ -1,59 +0,0 @@
|
|||
base_path: "./data/ml-experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
apikeys:
|
||||
homelab_user:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
||||
admin: true
|
||||
roles: ["admin"]
|
||||
permissions:
|
||||
read: true
|
||||
write: true
|
||||
delete: true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false # Disabled for local testing
|
||||
cert_file: "./ssl/cert.pem"
|
||||
key_file: "./ssl/key.pem"
|
||||
min_version: "1.3"
|
||||
|
||||
security:
|
||||
rate_limit:
|
||||
enabled: true
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist:
|
||||
- "127.0.0.1"
|
||||
- "::1"
|
||||
- "localhost"
|
||||
- "10.0.0.0/8"
|
||||
- "192.168.0.0/16"
|
||||
- "172.16.0.0/12"
|
||||
failed_login_lockout:
|
||||
enabled: true
|
||||
max_attempts: 5
|
||||
lockout_duration: "15m"
|
||||
|
||||
# SQLite database for production
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "data/fetch_ml.db"
|
||||
|
||||
redis:
|
||||
url: "redis://localhost:6379"
|
||||
addr: "localhost:6379"
|
||||
password: "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "logs/fetch_ml.log"
|
||||
audit_log: "logs/audit.log"
|
||||
|
||||
resources:
|
||||
max_workers: 2
|
||||
desired_rps_per_worker: 5
|
||||
podman_cpus: "8"
|
||||
podman_memory: "32g"
|
||||
|
|
@ -1,13 +1,17 @@
|
|||
# Fetch ML Configuration Example for PostgreSQL
|
||||
# This example shows how to configure Fetch ML to use PostgreSQL as the database
|
||||
|
||||
base_path: "./data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
apikeys:
|
||||
api_keys:
|
||||
admin:
|
||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password"
|
||||
admin: true
|
||||
roles: ["admin"]
|
||||
permissions:
|
||||
"*": true
|
||||
|
||||
server:
|
||||
address: ":9101"
|
||||
|
|
@ -25,40 +29,34 @@ database:
|
|||
# connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
|
||||
|
||||
redis:
|
||||
host: "localhost"
|
||||
port: 6379
|
||||
addr: "localhost:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
pool_size: 10
|
||||
max_retries: 3
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
console: true
|
||||
format: "text"
|
||||
file: ""
|
||||
audit_log: ""
|
||||
|
||||
security:
|
||||
secret_key: "your-secret-key-here-at-least-16-characters"
|
||||
jwt_expiry: "24h"
|
||||
production_mode: false
|
||||
rate_limit:
|
||||
enabled: false
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
|
||||
containers:
|
||||
runtime: "podman"
|
||||
registry: "docker.io"
|
||||
pull_policy: "missing"
|
||||
resources:
|
||||
cpu_limit: "2"
|
||||
memory_limit: "4Gi"
|
||||
gpu_limit: 1
|
||||
|
||||
storage:
|
||||
data_path: "data"
|
||||
results_path: "results"
|
||||
temp_path: "/tmp/fetch_ml"
|
||||
cleanup:
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
max_age_hours: 168
|
||||
max_size_gb: 10
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
# Fetch ML Configuration Example
|
||||
# Copy this file to config.yaml and customize for your environment
|
||||
|
||||
base_path: "./data/experiments"
|
||||
|
||||
auth:
|
||||
enabled: true
|
||||
api_keys:
|
||||
|
|
@ -13,54 +15,43 @@ auth:
|
|||
"*": true
|
||||
|
||||
server:
|
||||
host: "localhost"
|
||||
port: 8080
|
||||
address: ":9101"
|
||||
tls:
|
||||
enabled: false
|
||||
|
||||
database:
|
||||
type: "sqlite"
|
||||
connection: "data/fetch_ml.db"
|
||||
host: ""
|
||||
port: 5432
|
||||
username: ""
|
||||
password: ""
|
||||
database: "fetch_ml"
|
||||
|
||||
redis:
|
||||
url: "redis://localhost:6379"
|
||||
host: "localhost"
|
||||
port: 6379
|
||||
addr: "localhost:6379"
|
||||
password: ""
|
||||
db: 0
|
||||
pool_size: 10
|
||||
max_retries: 3
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file: "logs/fetch_ml.log"
|
||||
format: "text"
|
||||
console: true
|
||||
audit_log: "logs/audit.log"
|
||||
|
||||
security:
|
||||
secret_key: "your-secret-key-at-least-16-chars"
|
||||
jwt_expiry: "24h"
|
||||
rate_limit:
|
||||
enabled: false
|
||||
requests_per_minute: 60
|
||||
burst_size: 10
|
||||
ip_whitelist: []
|
||||
production_mode: false
|
||||
|
||||
containers:
|
||||
runtime: "podman"
|
||||
registry: "docker.io"
|
||||
pull_policy: "missing"
|
||||
resources:
|
||||
cpu_limit: "2"
|
||||
memory_limit: "4Gi"
|
||||
gpu_limit: 1
|
||||
|
||||
storage:
|
||||
data_path: "data"
|
||||
results_path: "results"
|
||||
temp_path: "/tmp/fetch_ml"
|
||||
cleanup:
|
||||
monitoring:
|
||||
prometheus:
|
||||
enabled: true
|
||||
max_age_hours: 168
|
||||
max_size_gb: 10
|
||||
port: 9101
|
||||
path: "/metrics"
|
||||
health_checks:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
|
|
|||
|
|
@ -12,6 +12,10 @@ properties:
|
|||
type: string
|
||||
description: Base path for experiment data
|
||||
default: "/tmp/ml-experiments"
|
||||
data_dir:
|
||||
type: string
|
||||
description: Data directory (datasets/snapshots) for integrity validation
|
||||
default: "/data/active"
|
||||
auth:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
|
|
@ -40,7 +44,6 @@ properties:
|
|||
type: array
|
||||
items:
|
||||
type: string
|
||||
enum: [admin, data_scientist, data_engineer, viewer, operator]
|
||||
permissions:
|
||||
type: object
|
||||
additionalProperties:
|
||||
|
|
@ -64,9 +67,30 @@ properties:
|
|||
type: string
|
||||
key_file:
|
||||
type: string
|
||||
min_version:
|
||||
monitoring:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
prometheus:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
port:
|
||||
type: integer
|
||||
minimum: 1
|
||||
maximum: 65535
|
||||
path:
|
||||
type: string
|
||||
health_checks:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
interval:
|
||||
type: string
|
||||
description: Minimum TLS version (e.g. "1.3")
|
||||
database:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
|
|
@ -99,58 +123,56 @@ properties:
|
|||
addr:
|
||||
type: string
|
||||
description: Optional host:port shorthand for Redis
|
||||
host:
|
||||
type: string
|
||||
default: "localhost"
|
||||
port:
|
||||
type: integer
|
||||
minimum: 1
|
||||
maximum: 65535
|
||||
default: 6379
|
||||
password:
|
||||
type: string
|
||||
db:
|
||||
type: integer
|
||||
minimum: 0
|
||||
default: 0
|
||||
pool_size:
|
||||
type: integer
|
||||
minimum: 1
|
||||
default: 10
|
||||
max_retries:
|
||||
type: integer
|
||||
minimum: 0
|
||||
default: 3
|
||||
queue:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
backend:
|
||||
type: string
|
||||
enum: [redis, sqlite]
|
||||
default: redis
|
||||
sqlite_path:
|
||||
type: string
|
||||
logging:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
level:
|
||||
type: string
|
||||
enum: [debug, info, warn, error, fatal]
|
||||
enum: [debug, info, warn, error]
|
||||
default: "info"
|
||||
file:
|
||||
type: string
|
||||
audit_log:
|
||||
type: string
|
||||
format:
|
||||
type: string
|
||||
enum: [text, json]
|
||||
default: "text"
|
||||
console:
|
||||
type: boolean
|
||||
default: true
|
||||
security:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
secret_key:
|
||||
type: string
|
||||
minLength: 16
|
||||
jwt_expiry:
|
||||
type: string
|
||||
pattern: "^\\d+[smhd]$"
|
||||
default: "24h"
|
||||
production_mode:
|
||||
type: boolean
|
||||
default: false
|
||||
allowed_origins:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
api_key_rotation_days:
|
||||
type: integer
|
||||
minimum: 0
|
||||
audit_logging:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
log_path:
|
||||
type: string
|
||||
ip_whitelist:
|
||||
type: array
|
||||
items:
|
||||
|
|
@ -183,23 +205,23 @@ properties:
|
|||
minimum: 1
|
||||
resources:
|
||||
type: object
|
||||
description: Resource configuration defaults
|
||||
description: Resource configuration
|
||||
additionalProperties: false
|
||||
properties:
|
||||
cpu_limit:
|
||||
type: string
|
||||
description: Default CPU limit (e.g., "2" or "500m")
|
||||
default: "2"
|
||||
memory_limit:
|
||||
type: string
|
||||
description: Default memory limit (e.g., "1Gi" or "512Mi")
|
||||
default: "4Gi"
|
||||
gpu_limit:
|
||||
max_workers:
|
||||
type: integer
|
||||
description: Default GPU limit
|
||||
minimum: 0
|
||||
default: 0
|
||||
disk_limit:
|
||||
minimum: 1
|
||||
default: 1
|
||||
desired_rps_per_worker:
|
||||
type: integer
|
||||
minimum: 1
|
||||
requests_per_sec:
|
||||
type: integer
|
||||
minimum: 1
|
||||
podman_cpus:
|
||||
type: string
|
||||
description: Default disk limit
|
||||
default: "10Gi"
|
||||
podman_memory:
|
||||
type: string
|
||||
request_burst:
|
||||
type: integer
|
||||
minimum: 0
|
||||
|
|
|
|||
|
|
@ -2,10 +2,28 @@ $schema: "http://json-schema.org/draft-07/schema#"
|
|||
title: "Fetch ML Worker Configuration"
|
||||
type: object
|
||||
additionalProperties: false
|
||||
allOf:
|
||||
# forbid both index and UUID at once (allow zero or one)
|
||||
- not:
|
||||
required: [gpu_visible_devices, gpu_visible_device_ids]
|
||||
- if:
|
||||
properties:
|
||||
queue:
|
||||
properties:
|
||||
backend:
|
||||
const: sqlite
|
||||
required: [queue]
|
||||
then:
|
||||
properties:
|
||||
queue:
|
||||
required: [sqlite_path]
|
||||
else:
|
||||
anyOf:
|
||||
- required: [redis_addr]
|
||||
- required: [redis_url]
|
||||
required:
|
||||
- base_path
|
||||
- worker_id
|
||||
- redis_addr
|
||||
- podman_image
|
||||
- container_workspace
|
||||
- container_results
|
||||
|
|
@ -31,6 +49,9 @@ properties:
|
|||
train_script:
|
||||
type: string
|
||||
description: Path to training script
|
||||
redis_url:
|
||||
type: string
|
||||
description: Legacy Redis URL (if set, redis_addr/password/db are derived)
|
||||
redis_addr:
|
||||
type: string
|
||||
description: Redis server address
|
||||
|
|
@ -42,6 +63,18 @@ properties:
|
|||
minimum: 0
|
||||
default: 0
|
||||
description: Redis database number
|
||||
queue:
|
||||
type: object
|
||||
description: Queue backend configuration (optional; defaults to redis)
|
||||
additionalProperties: false
|
||||
properties:
|
||||
backend:
|
||||
type: string
|
||||
enum: [redis, sqlite]
|
||||
default: redis
|
||||
sqlite_path:
|
||||
type: string
|
||||
description: Path to queue.db (sqlite backend only)
|
||||
known_hosts:
|
||||
type: string
|
||||
description: Path to SSH known hosts file
|
||||
|
|
@ -116,6 +149,48 @@ properties:
|
|||
type: string
|
||||
description: Dataset cache TTL duration
|
||||
default: "30m"
|
||||
snapshot_store:
|
||||
type: object
|
||||
description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id)
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
default: false
|
||||
endpoint:
|
||||
type: string
|
||||
description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
|
||||
secure:
|
||||
type: boolean
|
||||
default: true
|
||||
region:
|
||||
type: string
|
||||
bucket:
|
||||
type: string
|
||||
prefix:
|
||||
type: string
|
||||
description: Object key prefix where snapshots are stored
|
||||
access_key:
|
||||
type: string
|
||||
description: Optional static access key (otherwise uses env credentials)
|
||||
secret_key:
|
||||
type: string
|
||||
description: Optional static secret key (otherwise uses env credentials)
|
||||
session_token:
|
||||
type: string
|
||||
description: Optional session token for temporary credentials
|
||||
timeout:
|
||||
type: string
|
||||
description: Duration string (e.g., "10m")
|
||||
default: "10m"
|
||||
max_retries:
|
||||
type: integer
|
||||
minimum: 0
|
||||
default: 3
|
||||
prewarm_enabled:
|
||||
type: boolean
|
||||
description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off.
|
||||
default: false
|
||||
podman_image:
|
||||
type: string
|
||||
minLength: 1
|
||||
|
|
@ -126,10 +201,40 @@ properties:
|
|||
container_results:
|
||||
type: string
|
||||
description: Container results path
|
||||
gpu_access:
|
||||
type: boolean
|
||||
default: false
|
||||
description: Enable GPU access
|
||||
gpu_devices:
|
||||
type: array
|
||||
description: GPU device paths to expose to the container (e.g. ["/dev/dri"]).
|
||||
items:
|
||||
type: string
|
||||
gpu_vendor:
|
||||
type: string
|
||||
enum: [nvidia, amd, apple, none]
|
||||
description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none).
|
||||
default: "none"
|
||||
gpu_visible_devices:
|
||||
type: array
|
||||
description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
|
||||
items:
|
||||
type: integer
|
||||
gpu_visible_device_ids:
|
||||
type: array
|
||||
description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
|
||||
items:
|
||||
type: string
|
||||
apple_gpu:
|
||||
type: object
|
||||
description: Apple M-series GPU configuration
|
||||
additionalProperties: false
|
||||
properties:
|
||||
enabled:
|
||||
type: boolean
|
||||
default: false
|
||||
metal_device:
|
||||
type: string
|
||||
description: Path to Metal device node (e.g. /dev/metal)
|
||||
mps_runtime:
|
||||
type: string
|
||||
description: Path to MPS runtime device node (e.g. /dev/mps)
|
||||
task_lease_duration:
|
||||
type: string
|
||||
description: Task lease duration
|
||||
|
|
|
|||
58
configs/workers/docker-dev.yaml
Normal file
58
configs/workers/docker-dev.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
worker_id: "docker-worker"
|
||||
base_path: "/data/experiments"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_url: "redis://redis:6379/0"
|
||||
|
||||
local_mode: true
|
||||
|
||||
prewarm_enabled: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 2
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
snapshot_store:
|
||||
enabled: true
|
||||
endpoint: "minio:9000"
|
||||
secure: false
|
||||
bucket: "fetchml-snapshots"
|
||||
prefix: "snapshots"
|
||||
timeout: "2m"
|
||||
max_retries: 3
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_devices:
|
||||
- "/dev/dri"
|
||||
gpu_vendor: "apple"
|
||||
gpu_visible_devices: []
|
||||
|
||||
# Apple M-series GPU configuration
|
||||
apple_gpu:
|
||||
enabled: true
|
||||
metal_device: "/dev/metal"
|
||||
mps_runtime: "/dev/mps"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
50
configs/workers/docker-prod.yaml
Normal file
50
configs/workers/docker-prod.yaml
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
worker_id: "docker-worker"
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_url: "redis://redis:6379/0"
|
||||
|
||||
local_mode: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 2
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
snapshot_store:
|
||||
enabled: true
|
||||
endpoint: "minio:9000"
|
||||
secure: false
|
||||
bucket: "fetchml-snapshots"
|
||||
prefix: "snapshots"
|
||||
timeout: "5m"
|
||||
max_retries: 3
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_vendor: "nvidia"
|
||||
gpu_visible_devices: [0]
|
||||
gpu_devices: ["/dev/nvidia0"]
|
||||
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
43
configs/workers/docker.yaml
Normal file
43
configs/workers/docker.yaml
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
worker_id: "docker-worker"
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_addr: "redis:6379"
|
||||
redis_password: ""
|
||||
redis_db: 0
|
||||
|
||||
local_mode: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 5
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
snapshot_store:
|
||||
enabled: false
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_devices: []
|
||||
gpu_vendor: "none"
|
||||
gpu_visible_devices: []
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
27
configs/workers/examples/prewarm-worker.yaml
Normal file
27
configs/workers/examples/prewarm-worker.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
worker_id: "test-prewarm-worker"
|
||||
host: "localhost"
|
||||
port: 8081
|
||||
base_path: "/tmp/fetch-ml-test"
|
||||
data_dir: "/tmp/fetch-ml-test/data"
|
||||
max_workers: 2
|
||||
local_mode: true
|
||||
auto_fetch_data: true
|
||||
prewarm_enabled: true
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9102"
|
||||
train_script: "train.py"
|
||||
snapshot_store:
|
||||
enabled: false
|
||||
endpoint: ""
|
||||
secure: false
|
||||
region: ""
|
||||
bucket: ""
|
||||
prefix: ""
|
||||
access_key: ""
|
||||
secret_key: ""
|
||||
session_token: ""
|
||||
max_retries: 3
|
||||
timeout: 0s
|
||||
gpu_devices: []
|
||||
gpu_access: "none"
|
||||
47
configs/workers/homelab-secure.yaml
Normal file
47
configs/workers/homelab-secure.yaml
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
worker_id: "homelab-worker"
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
train_script: "train.py"
|
||||
|
||||
redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0"
|
||||
|
||||
local_mode: true
|
||||
|
||||
max_workers: 1
|
||||
poll_interval_seconds: 2
|
||||
|
||||
auto_fetch_data: false
|
||||
|
||||
data_manager_path: "./data_manager"
|
||||
dataset_cache_ttl: "30m"
|
||||
|
||||
data_dir: "/data/active"
|
||||
|
||||
snapshot_store:
|
||||
enabled: true
|
||||
endpoint: "minio:9000"
|
||||
secure: false
|
||||
bucket: "fetchml-snapshots"
|
||||
prefix: "snapshots"
|
||||
timeout: "5m"
|
||||
max_retries: 3
|
||||
|
||||
podman_image: "python:3.9-slim"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
gpu_devices: []
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
desired_rps_per_worker: 2
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4Gi"
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
# Worker configuration for Docker production-like testing
|
||||
worker_id: "docker-test-worker-1"
|
||||
|
||||
# Redis configuration
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
|
||||
# Local mode settings
|
||||
local_mode: false # Use Podman for containerized job execution
|
||||
|
||||
# Job paths
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
|
||||
# Container workspace (not used in local mode)
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
|
||||
# Podman settings (not used in local mode)
|
||||
podman_image: "python:3.9-slim"
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4g"
|
||||
|
||||
# Worker configuration
|
||||
heartbeat_interval: "30s"
|
||||
lease_duration: "5m"
|
||||
max_concurrent_tasks: 1
|
||||
|
||||
# Data manager settings
|
||||
data_manager:
|
||||
enabled: false
|
||||
base_path: "/data"
|
||||
|
||||
# SSH settings for Podman communication
|
||||
ssh:
|
||||
enabled: true
|
||||
host: "localhost"
|
||||
port: 2222
|
||||
user: "worker"
|
||||
password: "SecureWorkerPass2024!"
|
||||
key_path: "/home/worker/.ssh/id_rsa"
|
||||
|
||||
# Logging
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/worker.log"
|
||||
|
||||
# Metrics
|
||||
metrics:
|
||||
enabled: true
|
||||
endpoint: ":9100"
|
||||
|
|
@ -1,79 +0,0 @@
|
|||
# Worker configuration for Homelab secure environment
|
||||
worker_id: "homelab-secure-worker-1"
|
||||
|
||||
# Redis configuration with connection pooling
|
||||
redis:
|
||||
url: "redis://redis:6379"
|
||||
max_connections: 10
|
||||
connection_timeout: "10s"
|
||||
read_timeout: "5s"
|
||||
write_timeout: "5s"
|
||||
|
||||
# Local mode disabled for containerized execution
|
||||
local_mode: false
|
||||
|
||||
# Job paths with security considerations
|
||||
base_path: "/tmp/fetchml-jobs"
|
||||
container_workspace: "/workspace"
|
||||
container_results: "/results"
|
||||
|
||||
# Podman settings with resource limits
|
||||
podman_image: "python:3.11-slim"
|
||||
podman_cpus: "2"
|
||||
podman_memory: "4g"
|
||||
podman_network: "ml-job-network"
|
||||
podman_timeout: "30m"
|
||||
|
||||
# Worker configuration with security
|
||||
heartbeat_interval: "30s"
|
||||
lease_duration: "5m"
|
||||
max_concurrent_tasks: 2
|
||||
task_timeout: "30m"
|
||||
|
||||
# Data manager settings
|
||||
data_manager:
|
||||
enabled: true
|
||||
base_path: "/data"
|
||||
encryption_enabled: true
|
||||
backup_enabled: true
|
||||
|
||||
# SSH settings with secure configuration
|
||||
ssh:
|
||||
enabled: true
|
||||
host: "localhost"
|
||||
port: 2222
|
||||
user: "worker"
|
||||
password: "HomelabWorker2024!"
|
||||
key_path: "/home/worker/.ssh/id_rsa"
|
||||
max_retries: 3
|
||||
connection_timeout: "30s"
|
||||
strict_host_key_checking: false
|
||||
|
||||
# Logging with rotation and security
|
||||
logging:
|
||||
level: "info"
|
||||
file: "/logs/worker.log"
|
||||
max_size: "50MB"
|
||||
max_backups: 5
|
||||
compress: true
|
||||
audit_enabled: true
|
||||
|
||||
# Metrics and monitoring
|
||||
metrics:
|
||||
enabled: true
|
||||
endpoint: ":9100"
|
||||
path: "/metrics"
|
||||
|
||||
# Security settings
|
||||
security:
|
||||
enable_job_isolation: true
|
||||
sandbox_enabled: true
|
||||
resource_monitoring: true
|
||||
audit_commands: true
|
||||
|
||||
# Health check configuration
|
||||
health_check:
|
||||
enabled: true
|
||||
interval: "30s"
|
||||
timeout: "10s"
|
||||
failure_threshold: 3
|
||||
|
|
@ -4,7 +4,7 @@ max_workers = 4
|
|||
|
||||
# Redis connection
|
||||
redis_addr = "localhost:6379"
|
||||
redis_password = "your-redis-password"
|
||||
redis_password = "CHANGE_ME_REDIS_PASSWORD"
|
||||
redis_db = 0
|
||||
|
||||
# SSH connection (for remote operations)
|
||||
|
|
@ -15,17 +15,13 @@ ssh_key = "~/.ssh/id_rsa"
|
|||
|
||||
# Podman configuration
|
||||
podman_image = "ml-training:latest"
|
||||
gpu_access = true
|
||||
gpu_vendor = "none"
|
||||
gpu_visible_devices = []
|
||||
gpu_devices = []
|
||||
container_workspace = "/workspace"
|
||||
container_results = "/results"
|
||||
train_script = "train.py"
|
||||
|
||||
[resources]
|
||||
max_workers = 4
|
||||
desired_rps_per_worker = 2
|
||||
podman_cpus = "4"
|
||||
podman_memory = "16g"
|
||||
|
||||
# Dataset management
|
||||
auto_fetch_data = true
|
||||
data_dir = "/data/datasets"
|
||||
|
|
@ -36,10 +32,16 @@ dataset_cache_ttl = "24h"
|
|||
task_lease_duration = "1h"
|
||||
heartbeat_interval = "30s"
|
||||
graceful_timeout = "5m"
|
||||
poll_interval = "100ms"
|
||||
poll_interval_seconds = 1
|
||||
metrics_flush_interval = "10s"
|
||||
|
||||
[resources]
|
||||
max_workers = 4
|
||||
desired_rps_per_worker = 2
|
||||
podman_cpus = "4"
|
||||
podman_memory = "16g"
|
||||
|
||||
# Metrics exporter
|
||||
[metrics]
|
||||
enabled = true
|
||||
listen_addr = ":9090"
|
||||
listen_addr = ":9100"
|
||||
|
|
|
|||
45
deployments/Caddyfile.dev
Normal file
45
deployments/Caddyfile.dev
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
{
|
||||
auto_https off
|
||||
admin off
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
http://localhost {
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
|
||||
https://localhost {
|
||||
tls internal
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
44
deployments/Caddyfile.homelab-secure
Normal file
44
deployments/Caddyfile.homelab-secure
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
admin off
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
{$FETCHML_DOMAIN} {
|
||||
encode gzip
|
||||
|
||||
tls /etc/caddy/ssl/cert.pem /etc/caddy/ssl/key.pem
|
||||
|
||||
header {
|
||||
-Server
|
||||
X-Frame-Options "DENY"
|
||||
X-Content-Type-Options "nosniff"
|
||||
Referrer-Policy "strict-origin-when-cross-origin"
|
||||
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
|
||||
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
|
||||
}
|
||||
|
||||
@admin path /admin/*
|
||||
@admin_private remote_ip private_ranges
|
||||
handle @admin {
|
||||
respond @admin_private 404
|
||||
respond 404
|
||||
}
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
47
deployments/Caddyfile.prod
Normal file
47
deployments/Caddyfile.prod
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
{
|
||||
email {$CADDY_EMAIL}
|
||||
admin off
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
{$FETCHML_DOMAIN} {
|
||||
encode gzip
|
||||
|
||||
request_body {
|
||||
max_size 10MB
|
||||
}
|
||||
|
||||
header {
|
||||
-Server
|
||||
X-Frame-Options "DENY"
|
||||
X-Content-Type-Options "nosniff"
|
||||
Referrer-Policy "strict-origin-when-cross-origin"
|
||||
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
|
||||
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
|
||||
}
|
||||
|
||||
@admin path /admin/*
|
||||
@admin_private remote_ip private_ranges
|
||||
handle @admin {
|
||||
respond @admin_private 404
|
||||
respond 404
|
||||
}
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
23
deployments/Caddyfile.smoke
Normal file
23
deployments/Caddyfile.smoke
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
auto_https off
|
||||
}
|
||||
|
||||
localhost {
|
||||
tls internal
|
||||
|
||||
handle /health {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /ws* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle /api/* {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
|
||||
handle {
|
||||
respond 404
|
||||
}
|
||||
}
|
||||
76
deployments/Makefile
Normal file
76
deployments/Makefile
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
# Docker Compose Deployment Management
|
||||
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean
|
||||
|
||||
# Default target
|
||||
help: ## Show this help message
|
||||
@echo "Available commands:"
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
# Development environment
|
||||
dev-up: ## Start development environment
|
||||
@echo "Starting development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||
@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"
|
||||
|
||||
dev-down: ## Stop development environment
|
||||
@echo "Stopping development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml down
|
||||
|
||||
dev-logs: ## Show development logs
|
||||
docker-compose -f deployments/docker-compose.dev.yml logs -f
|
||||
|
||||
dev-restart: ## Restart development environment
|
||||
@echo "Restarting development environment..."
|
||||
docker-compose -f deployments/docker-compose.dev.yml restart
|
||||
|
||||
|
||||
# Homelab environment
|
||||
homelab-secure-up: ## Start secure homelab environment
|
||||
@echo "Starting secure homelab environment..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
||||
|
||||
homelab-secure-down: ## Stop secure homelab environment
|
||||
@echo "Stopping secure homelab environment..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down
|
||||
|
||||
# Production environment
|
||||
prod-up: ## Start production environment
|
||||
@echo "Starting production environment..."
|
||||
docker-compose -f deployments/docker-compose.prod.yml up -d
|
||||
|
||||
prod-down: ## Stop production environment
|
||||
@echo "Stopping production environment..."
|
||||
docker-compose -f deployments/docker-compose.prod.yml down
|
||||
|
||||
# Utility commands
|
||||
status: ## Show status of all environments
|
||||
@echo "=== Development Status ==="
|
||||
@if [ -f deployments/docker-compose.dev.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.dev.yml ps; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Homelab Secure Status ==="
|
||||
@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
@echo ""
|
||||
@echo "=== Production Status ==="
|
||||
@if [ -f deployments/docker-compose.prod.yml ]; then \
|
||||
docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
|
||||
fi
|
||||
|
||||
clean: ## Clean up all containers and volumes
|
||||
@echo "Cleaning up all Docker resources..."
|
||||
@echo "This will remove all containers and volumes. Continue? [y/N]"
|
||||
@read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||
docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
|
||||
docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
|
||||
docker system prune -f
|
||||
@echo "Cleanup complete."
|
||||
|
||||
# Quick aliases
|
||||
up: dev-up ## Alias for dev-up
|
||||
down: dev-down ## Alias for dev-down
|
||||
logs: dev-logs ## Alias for dev-logs
|
||||
restart: dev-restart ## Alias for dev-restart
|
||||
|
|
@ -2,33 +2,123 @@
|
|||
|
||||
This directory contains Docker Compose configurations for different deployment environments.
|
||||
|
||||
## Files
|
||||
## Environment Configurations
|
||||
|
||||
- `docker-compose.homelab-secure.yml` - Secure homelab deployment with TLS and authentication
|
||||
- `docker-compose.prod.yml` - Production deployment configuration
|
||||
### Development (`docker-compose.dev.yml`)
|
||||
- Full development stack with monitoring
|
||||
- Includes: API, Worker, Redis, MinIO (snapshots), Prometheus, Grafana, Loki, Promtail
|
||||
- Optimized for local development and testing
|
||||
- **Usage**: `docker-compose -f deployments/docker-compose.dev.yml up -d`
|
||||
|
||||
## Usage
|
||||
### Homelab - Secure (`docker-compose.homelab-secure.yml`)
|
||||
- Secure homelab deployment with authentication and a Caddy reverse proxy
|
||||
- TLS is terminated at the reverse proxy (Approach A)
|
||||
- Includes: API, Redis (password protected), Caddy reverse proxy
|
||||
- **Usage**: `docker-compose -f deployments/docker-compose.homelab-secure.yml up -d`
|
||||
|
||||
### Production (`docker-compose.prod.yml`)
|
||||
- Production deployment configuration
|
||||
- Optimized for performance and security
|
||||
- External services assumed (Redis, monitoring)
|
||||
- **Usage**: `docker-compose -f deployments/docker-compose.prod.yml up -d`
|
||||
|
||||
Note: `docker-compose.prod.yml` is a reproducible staging/testing harness. Real production deployments do not require Docker; you can run the Go services directly (systemd) and use Caddy for TLS/WSS termination.
|
||||
|
||||
## TLS / WSS Policy
|
||||
|
||||
- The Zig CLI currently supports `ws://` only (native `wss://` is not implemented).
|
||||
- Production deployments terminate TLS/WSS at a reverse proxy (Caddy in `docker-compose.prod.yml`) and keep the API server on internal `ws://`.
|
||||
- Homelab deployments terminate TLS/WSS at a reverse proxy (Caddy) and keep the API server on internal `ws://`.
|
||||
- Health checks in compose files should use `http://localhost:9101/health` when `server.tls.enabled: false`.
|
||||
|
||||
## Required Volume Mounts
|
||||
|
||||
- `base_path` (experiments) must be writable by the API server.
|
||||
- `data_dir` should be mounted if you want snapshot/dataset integrity validation via `ml validate`.
|
||||
|
||||
For the default configs:
|
||||
|
||||
- `base_path`: `/data/experiments` (dev/homelab configs) or `/app/data/experiments` (prod configs)
|
||||
- `data_dir`: `/data/active`
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Development
|
||||
```bash
|
||||
# Use the main docker-compose.yml in project root
|
||||
docker-compose up -d
|
||||
# Development (most common)
|
||||
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||
|
||||
# Check status
|
||||
docker-compose -f deployments/docker-compose.dev.yml ps
|
||||
|
||||
# View logs
|
||||
docker-compose -f deployments/docker-compose.dev.yml logs -f api-server
|
||||
|
||||
# Stop services
|
||||
docker-compose -f deployments/docker-compose.dev.yml down
|
||||
```
|
||||
|
||||
### Homelab (Secure)
|
||||
```bash
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
||||
```
|
||||
## Dev: MinIO-backed snapshots (smoke test)
|
||||
|
||||
### Production
|
||||
```bash
|
||||
docker-compose -f deployments/docker-compose.prod.yml up -d
|
||||
```
|
||||
The dev compose file provisions a MinIO bucket and uploads a small example snapshot object at:
|
||||
|
||||
`s3://fetchml-snapshots/snapshots/snap-1.tar.gz`
|
||||
|
||||
To queue a task that forces the worker to pull the snapshot from MinIO:
|
||||
|
||||
1. Start the dev stack:
|
||||
`docker-compose -f deployments/docker-compose.dev.yml up -d`
|
||||
|
||||
2. Read the `snapshot_sha256` printed by the init job:
|
||||
`docker-compose -f deployments/docker-compose.dev.yml logs minio-init`
|
||||
|
||||
3. Queue a job using the snapshot fields:
|
||||
`ml queue <job-name> --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||
|
||||
## Smoke tests
|
||||
|
||||
- `make dev-smoke` runs the development stack smoke test.
|
||||
- `make prod-smoke` runs a Docker-based staging smoke test for the production stack, using a localhost-only Caddy configuration.
|
||||
|
||||
Note: `ml queue` by itself will generate a random commit ID. For full provenance enforcement (manifest + dependency manifest), use `ml sync ./your-project --queue` so the server has real code + dependency files.
|
||||
|
||||
Examples:
|
||||
- `ml queue train-mnist --priority 3 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||
- `ml queue train-a train-b train-c --priority 5 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||
|
||||
## Environment Variables
|
||||
|
||||
Each deployment may require specific environment variables. Refer to the individual compose files for requirements.
|
||||
Create a `.env` file in the project root:
|
||||
|
||||
```bash
|
||||
# Grafana
|
||||
GRAFANA_ADMIN_PASSWORD=your_secure_password
|
||||
|
||||
# API Configuration
|
||||
LOG_LEVEL=info
|
||||
|
||||
# TLS (for secure deployments)
|
||||
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||
TLS_KEY_PATH=/app/ssl/key.pem
|
||||
```
|
||||
|
||||
## Service Ports
|
||||
|
||||
| Service | Development | Homelab | Production |
|
||||
|---------|-------------|---------|------------|
|
||||
| API Server | 9101 | 9101 | 9101 |
|
||||
| Redis | 6379 | 6379 | - |
|
||||
| Prometheus | 9090 | - | - |
|
||||
| Grafana | 3000 | - | - |
|
||||
| Loki | 3100 | - | - |
|
||||
|
||||
## Monitoring
|
||||
|
||||
Performance monitoring configurations are in `monitoring/docker-compose.performance.yml`
|
||||
- **Development**: Full monitoring stack included
|
||||
- **Homelab**: Basic monitoring (configurable)
|
||||
- **Production**: External monitoring assumed
|
||||
|
||||
## Security Notes
|
||||
|
||||
- If you need HTTPS externally, terminate TLS at a reverse proxy.
|
||||
- API keys should be managed via environment variables
|
||||
- Database credentials should use secrets management in production
|
||||
|
|
|
|||
162
deployments/deploy.sh
Executable file
162
deployments/deploy.sh
Executable file
|
|
@ -0,0 +1,162 @@
|
|||
#!/bin/bash
|
||||
# Quick deployment script for fetch_ml
|
||||
|
||||
set -e
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Function to print colored output
|
||||
print_status() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
print_success() {
|
||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}[ERROR]${NC} $1"
|
||||
}
|
||||
|
||||
# Function to show usage
|
||||
show_usage() {
|
||||
echo "Usage: $0 [ENVIRONMENT] [ACTION]"
|
||||
echo ""
|
||||
echo "Environments:"
|
||||
echo " dev Development environment"
|
||||
echo " secure Secure homelab environment"
|
||||
echo " prod Production environment"
|
||||
echo ""
|
||||
echo "Actions:"
|
||||
echo " up Start services"
|
||||
echo " down Stop services"
|
||||
echo " restart Restart services"
|
||||
echo " logs Show logs"
|
||||
echo " status Show status"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 dev up # Start development environment"
|
||||
echo " $0 prod down # Stop production environment"
|
||||
echo " $0 secure logs # Show secure environment logs"
|
||||
}
|
||||
|
||||
# Function to check if docker-compose file exists
|
||||
check_compose_file() {
|
||||
local env=$1
|
||||
local compose_file=""
|
||||
|
||||
case $env in
|
||||
"dev")
|
||||
compose_file="deployments/docker-compose.dev.yml"
|
||||
;;
|
||||
"secure")
|
||||
compose_file="deployments/docker-compose.homelab-secure.yml"
|
||||
;;
|
||||
"prod")
|
||||
compose_file="deployments/docker-compose.prod.yml"
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown environment: $env"
|
||||
show_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ ! -f "$compose_file" ]; then
|
||||
print_error "Docker Compose file not found: $compose_file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "$compose_file"
|
||||
}
|
||||
|
||||
# Function to check if .env file exists
|
||||
check_env_file() {
|
||||
local env=$1
|
||||
|
||||
if [ ! -f ".env" ]; then
|
||||
print_warning ".env file not found. Creating from example..."
|
||||
if [ "$env" = "dev" ]; then
|
||||
cp deployments/env.dev.example .env
|
||||
elif [ "$env" = "prod" ]; then
|
||||
cp deployments/env.prod.example .env
|
||||
else
|
||||
cp deployments/env.dev.example .env
|
||||
fi
|
||||
print_warning "Please edit .env file with your configuration"
|
||||
fi
|
||||
}
|
||||
|
||||
# Main script
|
||||
main() {
|
||||
if [ $# -ne 2 ]; then
|
||||
show_usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local environment=$1
|
||||
local action=$2
|
||||
|
||||
print_status "Environment: $environment"
|
||||
print_status "Action: $action"
|
||||
|
||||
# Check compose file
|
||||
compose_file=$(check_compose_file "$environment")
|
||||
print_status "Using: $compose_file"
|
||||
|
||||
# Check .env file
|
||||
check_env_file "$environment"
|
||||
|
||||
# Execute action
|
||||
case $action in
|
||||
"up")
|
||||
print_status "Starting $environment environment..."
|
||||
docker-compose -f "$compose_file" up -d
|
||||
print_success "$environment environment started successfully!"
|
||||
|
||||
# Show service URLs
|
||||
echo ""
|
||||
print_status "Service URLs:"
|
||||
echo " API Server: http://localhost:9101"
|
||||
if [ "$environment" = "dev" ]; then
|
||||
echo " Grafana: http://localhost:3000 (admin/admin123)"
|
||||
echo " Prometheus: http://localhost:9090"
|
||||
fi
|
||||
;;
|
||||
"down")
|
||||
print_status "Stopping $environment environment..."
|
||||
docker-compose -f "$compose_file" down
|
||||
print_success "$environment environment stopped successfully!"
|
||||
;;
|
||||
"restart")
|
||||
print_status "Restarting $environment environment..."
|
||||
docker-compose -f "$compose_file" restart
|
||||
print_success "$environment environment restarted successfully!"
|
||||
;;
|
||||
"logs")
|
||||
print_status "Showing logs for $environment environment..."
|
||||
docker-compose -f "$compose_file" logs -f
|
||||
;;
|
||||
"status")
|
||||
print_status "Status of $environment environment:"
|
||||
docker-compose -f "$compose_file" ps
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown action: $action"
|
||||
show_usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main "$@"
|
||||
225
deployments/docker-compose.dev.yml
Normal file
225
deployments/docker-compose.dev.yml
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
# Homelab Docker Compose with Centralized Monitoring
|
||||
# Includes: API, Redis, Prometheus, Grafana, Loki
|
||||
|
||||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: ml-dev-caddy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8080:80"
|
||||
- "8443:443"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-experiments-redis
|
||||
user: "999:999"
|
||||
ports:
|
||||
- "6379:6379"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data
|
||||
restart: unless-stopped
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
test: [ "CMD", "redis-cli", "ping" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
api-server:
|
||||
build:
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-api
|
||||
user: "0:0"
|
||||
expose:
|
||||
- "9101" # API and health endpoints (internal; external access via Caddy)
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl
|
||||
depends_on:
|
||||
- redis
|
||||
restart: unless-stopped
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
labels:
|
||||
logging: "promtail"
|
||||
job: "api-server"
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-experiments-minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=minioadmin
|
||||
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
restart: unless-stopped
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-experiments-minio-init
|
||||
depends_on:
|
||||
minio:
|
||||
condition: service_healthy
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
set -eu
|
||||
apk add --no-cache ca-certificates curl tar gzip
|
||||
ARCH=$$(uname -m)
|
||||
MC_ARCH=amd64
|
||||
if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
|
||||
MC_ARCH=arm64
|
||||
fi
|
||||
curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
|
||||
chmod +x /usr/local/bin/mc
|
||||
i=0
|
||||
while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
|
||||
i=$$((i+1))
|
||||
if [ $$i -ge 30 ]; then
|
||||
echo "minio not ready after 30 attempts" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "waiting for minio... ($$i/30)"
|
||||
sleep 1
|
||||
done
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
mkdir -p /tmp/snapshots/snap-1
|
||||
echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
|
||||
tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
|
||||
mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
|
||||
FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
|
||||
SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
|
||||
echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
|
||||
restart: "no"
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-worker
|
||||
user: "0:0"
|
||||
ports:
|
||||
- "8888:8888"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
|
||||
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_completed_successfully
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- LOG_LEVEL=info
|
||||
- MINIO_ROOT_USER=minioadmin
|
||||
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest
|
||||
- FETCHML_JUPYTER_CONDA_ENV=base
|
||||
- FETCHML_JUPYTER_KERNEL_NAME=python
|
||||
- FETCHML_PODMAN_CGROUPS=disabled
|
||||
privileged: true
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
|
||||
# Prometheus - Metrics collection
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: ml-experiments-prometheus
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
- '--web.console.templates=/etc/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
restart: unless-stopped
|
||||
|
||||
# Grafana - Visualization
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: ml-experiments-grafana
|
||||
ports:
|
||||
- "3000:3000"
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin123
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- prometheus
|
||||
- loki
|
||||
|
||||
# Loki - Log aggregation
|
||||
loki:
|
||||
image: grafana/loki:latest
|
||||
container_name: ml-experiments-loki
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
|
||||
- loki_data:/loki
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
restart: unless-stopped
|
||||
|
||||
# Promtail - Log collector
|
||||
promtail:
|
||||
image: grafana/promtail:latest
|
||||
container_name: ml-experiments-promtail
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- loki
|
||||
|
||||
volumes:
|
||||
prometheus_data:
|
||||
driver: local
|
||||
grafana_data:
|
||||
driver: local
|
||||
loki_data:
|
||||
driver: local
|
||||
|
|
@ -1,104 +1,152 @@
|
|||
# Homelab Secure Docker Environment
|
||||
services:
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-homelab-redis
|
||||
ports:
|
||||
- "6379:6379"
|
||||
volumes:
|
||||
- redis_homelab_data:/data
|
||||
restart: unless-stopped
|
||||
command: >
|
||||
redis-server
|
||||
--appendonly yes
|
||||
--requirepass "HomelabRedis2024!"
|
||||
--maxmemory 512mb
|
||||
--maxmemory-policy allkeys-lru
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "-a", "HomelabRedis2024!", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- ml-homelab-network
|
||||
# Secure Homelab Docker Compose Configuration
|
||||
# Use with: docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
|
||||
|
||||
services:
|
||||
api-server:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/homelab-secure.Dockerfile
|
||||
container_name: ml-homelab-api
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-api
|
||||
ports:
|
||||
- "9104:9101" # API server port
|
||||
- "2223:2222" # Secure SSH port
|
||||
- "9101:9100" # Prometheus metrics
|
||||
- "9101:9101"
|
||||
- "9100:9100" # Prometheus metrics endpoint
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/config-homelab-secure.yaml:/app/configs/config.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/.env.secure:/app/.env.secure:ro
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
|
||||
- LOG_LEVEL=info
|
||||
- TZ=America/New_York
|
||||
# Load secure environment variables
|
||||
- JWT_SECRET_FILE=/app/.env.secure
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-k", "-f", "https://localhost:9101/health"]
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
command: >
|
||||
sh -c "
|
||||
sudo /app/start-security.sh &
|
||||
/usr/local/bin/api-server -config /app/configs/config.yaml
|
||||
"
|
||||
labels:
|
||||
logging: "promtail"
|
||||
job: "api-server"
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||
networks:
|
||||
- ml-homelab-network
|
||||
- ml-experiments-network
|
||||
# Add internal network for secure communication
|
||||
- ml-backend-network
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-experiments-minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- ml-backend-network
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-experiments-minio-init
|
||||
depends_on:
|
||||
- minio
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
apk add --no-cache ca-certificates curl >/dev/null
|
||||
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||
chmod +x /usr/local/bin/mc
|
||||
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
restart: "no"
|
||||
networks:
|
||||
- ml-backend-network
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/homelab-secure.Dockerfile
|
||||
container_name: ml-homelab-worker
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-experiments-worker
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/worker-homelab-secure.yaml:/app/configs/worker.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/app/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_started
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
|
||||
- LOG_LEVEL=info
|
||||
- TZ=America/New_York
|
||||
privileged: true # Required for Podman
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
cap_drop:
|
||||
- ALL
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
command: >
|
||||
sh -c "
|
||||
sudo /app/start-security.sh &
|
||||
/usr/local/bin/worker -config /app/configs/worker.yaml
|
||||
"
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
privileged: true
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
networks:
|
||||
- ml-homelab-network
|
||||
- ml-backend-network
|
||||
|
||||
volumes:
|
||||
redis_homelab_data:
|
||||
driver: local
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: ml-experiments-caddy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/ssl:/etc/caddy/ssl:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/config:/config
|
||||
environment:
|
||||
- FETCHML_DOMAIN=${FETCHML_DOMAIN:-ml.local}
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- ml-experiments-network
|
||||
|
||||
# Redis with authentication
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-experiments-redis
|
||||
user: "999:999"
|
||||
ports:
|
||||
- "127.0.0.1:6379:6379" # Bind to localhost only
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/homelab/redis:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
|
||||
restart: unless-stopped
|
||||
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- ml-backend-network
|
||||
environment:
|
||||
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||
|
||||
volumes: {}
|
||||
|
||||
networks:
|
||||
ml-homelab-network:
|
||||
ml-experiments-network:
|
||||
driver: bridge
|
||||
ml-backend-network:
|
||||
driver: bridge
|
||||
ipam:
|
||||
config:
|
||||
- subnet: 172.25.0.0/16
|
||||
|
|
|
|||
75
deployments/docker-compose.prod.smoke.yml
Normal file
75
deployments/docker-compose.prod.smoke.yml
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
environment:
|
||||
- FETCHML_DOMAIN=localhost
|
||||
- CADDY_EMAIL=smoke@example.invalid
|
||||
ports:
|
||||
- "8080:80"
|
||||
- "8443:443"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/config:/config
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
cat > /etc/caddy/Caddyfile <<'EOF'
|
||||
{
|
||||
debug
|
||||
servers {
|
||||
protocols h1 h2
|
||||
}
|
||||
}
|
||||
|
||||
https://localhost {
|
||||
tls internal {
|
||||
protocols tls1.2 tls1.3
|
||||
}
|
||||
|
||||
handle {
|
||||
reverse_proxy api-server:9101
|
||||
}
|
||||
}
|
||||
EOF
|
||||
exec caddy run --config /etc/caddy/Caddyfile
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
user: "999:999"
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- "6379"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/redis:/data
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
test: [ "CMD", "redis-cli", "ping" ]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
api-server:
|
||||
build:
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
user: "0:0"
|
||||
restart: unless-stopped
|
||||
expose:
|
||||
- "9101"
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/experiments:/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
|
||||
volumes: {}
|
||||
|
|
@ -1,12 +1,31 @@
|
|||
# Full Production Docker Environment with Podman and SQLite
|
||||
services:
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: ml-prod-caddy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
volumes:
|
||||
- ./Caddyfile.prod:/etc/caddy/Caddyfile:ro
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config
|
||||
environment:
|
||||
- FETCHML_DOMAIN=${FETCHML_DOMAIN}
|
||||
- CADDY_EMAIL=${CADDY_EMAIL}
|
||||
depends_on:
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: ml-prod-redis
|
||||
ports:
|
||||
- "6379:6379"
|
||||
user: "999:999"
|
||||
expose:
|
||||
- "6379"
|
||||
volumes:
|
||||
- redis_prod_data:/data
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/redis:/data
|
||||
restart: unless-stopped
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
|
|
@ -17,57 +36,87 @@ services:
|
|||
|
||||
api-server:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/secure-prod.Dockerfile
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile
|
||||
container_name: ml-prod-api
|
||||
ports:
|
||||
- "9103:9101" # API server port
|
||||
- "2222:2222" # Secure SSH port for Podman communication
|
||||
- "9100:9100" # Prometheus metrics
|
||||
expose:
|
||||
- "9101" # API server port (internal; external access via Caddy)
|
||||
- "2222" # Secure SSH port for Podman communication (internal)
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/config-multi-user.yaml:/app/configs/config.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- LOG_LEVEL=info
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
|
||||
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
# Start SSH daemon for Podman communication
|
||||
command: ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
|
||||
# Start API server (ensure data_dir exists for snapshot/dataset validation)
|
||||
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
container_name: ml-prod-minio
|
||||
expose:
|
||||
- "9000"
|
||||
- "9001"
|
||||
volumes:
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data
|
||||
environment:
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
command: ["server", "/data", "--console-address", ":9001"]
|
||||
restart: unless-stopped
|
||||
|
||||
minio-init:
|
||||
image: alpine:3.19
|
||||
container_name: ml-prod-minio-init
|
||||
depends_on:
|
||||
- minio
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
command:
|
||||
- |
|
||||
apk add --no-cache ca-certificates curl >/dev/null
|
||||
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||
chmod +x /usr/local/bin/mc
|
||||
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
mc mb -p local/fetchml-snapshots || true
|
||||
restart: "no"
|
||||
|
||||
worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: build/docker/secure-prod.Dockerfile
|
||||
context: ${FETCHML_REPO_ROOT:-.}
|
||||
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||
container_name: ml-prod-worker
|
||||
volumes:
|
||||
- ./data:/app/data/experiments
|
||||
- ./logs:/logs
|
||||
- ./configs/worker-docker.yaml:/app/configs/worker.yaml
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
|
||||
- ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
|
||||
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-prod.yaml:/app/configs/worker.yaml
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
api-server:
|
||||
condition: service_healthy
|
||||
minio-init:
|
||||
condition: service_started
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- LOG_LEVEL=info
|
||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||
privileged: true # Required for Podman to work in Docker
|
||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||
|
||||
volumes:
|
||||
redis_prod_data:
|
||||
driver: local
|
||||
volumes: {}
|
||||
|
||||
networks:
|
||||
default:
|
||||
|
|
|
|||
17
deployments/env.dev.example
Normal file
17
deployments/env.dev.example
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Development Environment Variables
|
||||
# Copy this file to .env and modify as needed
|
||||
|
||||
# Grafana
|
||||
GRAFANA_ADMIN_PASSWORD=admin123
|
||||
|
||||
# API Configuration
|
||||
LOG_LEVEL=info
|
||||
|
||||
# TLS (development uses self-signed certs)
|
||||
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||
TLS_KEY_PATH=/app/ssl/key.pem
|
||||
|
||||
# Development-specific
|
||||
ENVIRONMENT=development
|
||||
DEBUG=true
|
||||
API_KEY=development_key_only
|
||||
28
deployments/env.prod.example
Normal file
28
deployments/env.prod.example
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Production Environment Variables
|
||||
# Copy this file to .env and modify as needed
|
||||
|
||||
# Grafana (if using)
|
||||
GRAFANA_ADMIN_PASSWORD=CHANGE_ME_SECURE_PASSWORD
|
||||
|
||||
# API Configuration
|
||||
LOG_LEVEL=warn
|
||||
|
||||
# TLS (production should use CA-signed certs)
|
||||
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||
TLS_KEY_PATH=/app/ssl/key.pem
|
||||
|
||||
# Caddy (TLS/WSS termination)
|
||||
FETCHML_DOMAIN=ml.example.com
|
||||
CADDY_EMAIL=admin@example.com
|
||||
|
||||
# Production-specific
|
||||
ENVIRONMENT=production
|
||||
DEBUG=false
|
||||
|
||||
# Security
|
||||
API_KEY=CHANGE_ME_SECURE_API_KEY
|
||||
ALLOWED_ORIGINS=https://yourdomain.com
|
||||
|
||||
# External services (if applicable)
|
||||
EXTERNAL_REDIS_URL=redis://external-redis:6379
|
||||
EXTERNAL_PROMETHEUS_URL=http://external-prometheus:9090
|
||||
112
deployments/setup.sh
Normal file
112
deployments/setup.sh
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage: ./deployments/setup.sh
|
||||
|
||||
This script DOES NOT install dependencies.
|
||||
It prints the manual steps and required/optional dependencies for a real (non-Docker) production deployment.
|
||||
EOF
|
||||
}
|
||||
|
||||
if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
|
||||
usage
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cat <<'EOF'
|
||||
== FetchML production setup (non-Docker) ==
|
||||
|
||||
Required (core):
|
||||
- Go-built binaries: api-server, worker
|
||||
- Redis (reachable from api-server + worker)
|
||||
- A writable base_path for experiments
|
||||
- A writable data_dir if you want snapshot/dataset staging + integrity validation
|
||||
|
||||
Required (TLS/WSS):
|
||||
- Caddy (recommended) OR another reverse proxy that can terminate TLS and proxy WebSockets
|
||||
|
||||
Optional:
|
||||
- systemd (recommended) for service supervision
|
||||
- MinIO / S3-compatible storage (only if you use remote snapshot_store)
|
||||
- Podman (only if your worker executes jobs in Podman)
|
||||
|
||||
Notes:
|
||||
- The Zig CLI currently supports ws:// only. In production, keep the API server internal on ws:// and terminate TLS/WSS at Caddy.
|
||||
- This script is informational; it will not modify your system.
|
||||
|
||||
---
|
||||
1) Build binaries
|
||||
|
||||
make prod
|
||||
|
||||
Artifacts:
|
||||
./bin/api-server
|
||||
./bin/worker
|
||||
|
||||
---
|
||||
2) Create a dedicated user (recommended)
|
||||
|
||||
useradd --system --create-home --shell /usr/sbin/nologin fetchml
|
||||
|
||||
---
|
||||
3) Create directories (example paths)
|
||||
|
||||
mkdir -p /var/lib/fetchml/experiments
|
||||
mkdir -p /var/lib/fetchml/active/datasets /var/lib/fetchml/active/snapshots
|
||||
mkdir -p /var/log/fetchml
|
||||
|
||||
Ensure ownership:
|
||||
chown -R fetchml:fetchml /var/lib/fetchml /var/log/fetchml
|
||||
|
||||
---
|
||||
4) Configure the API server
|
||||
|
||||
- Start from: configs/api/prod.yaml (or your multi-user config)
|
||||
- For real production, keep server.tls.enabled: false
|
||||
- Ensure monitoring.health_checks.enabled is set appropriately
|
||||
|
||||
Example flags:
|
||||
./bin/api-server -config /etc/fetchml/api.yaml
|
||||
|
||||
---
|
||||
5) Configure Caddy (TLS/WSS termination)
|
||||
|
||||
- Recommended: use deployments/Caddyfile.prod as a baseline.
|
||||
- Caddy should listen on 443 and reverse proxy to the API server (internal) on 9101.
|
||||
|
||||
Example layout:
|
||||
/etc/caddy/Caddyfile
|
||||
/var/lib/caddy
|
||||
|
||||
---
|
||||
6) Configure Redis
|
||||
|
||||
- Use Redis AUTH in production.
|
||||
- Ensure the api-server + worker can reach it.
|
||||
|
||||
---
|
||||
7) Run under systemd (recommended)
|
||||
|
||||
Create unit files (example):
|
||||
/etc/systemd/system/fetchml-api.service
|
||||
/etc/systemd/system/fetchml-worker.service
|
||||
/etc/systemd/system/caddy.service (if not already provided)
|
||||
|
||||
Then:
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now fetchml-api
|
||||
systemctl enable --now fetchml-worker
|
||||
systemctl enable --now caddy
|
||||
|
||||
---
|
||||
8) Smoke check
|
||||
|
||||
Internal health (no TLS):
|
||||
curl -f http://127.0.0.1:9101/health
|
||||
|
||||
External health (through Caddy TLS termination):
|
||||
curl -f https://YOUR_DOMAIN/health
|
||||
|
||||
EOF
|
||||
|
|
@ -1,13 +1,52 @@
|
|||
# Centralized Monitoring Stack
|
||||
# Monitoring Stack
|
||||
|
||||
## Directory Structure (Canonical)
|
||||
|
||||
All monitoring configuration lives under `monitoring/`.
|
||||
|
||||
```text
|
||||
monitoring/
|
||||
prometheus/
|
||||
prometheus.yml # Prometheus scrape configuration
|
||||
grafana/
|
||||
dashboards/ # Grafana dashboards (JSON)
|
||||
provisioning/
|
||||
datasources/ # Grafana data sources (Prometheus/Loki)
|
||||
dashboards/ # Grafana dashboard provider (points at dashboards/)
|
||||
loki-config.yml # Loki configuration
|
||||
promtail-config.yml # Promtail configuration
|
||||
```
|
||||
|
||||
### What is "Grafana provisioning"?
|
||||
|
||||
Grafana provisioning is how Grafana auto-configures itself on startup (no clicking in the UI):
|
||||
|
||||
- **`grafana/provisioning/datasources/*.yml`**
|
||||
- Defines where Grafana reads data from (e.g. Prometheus at `http://prometheus:9090`, Loki at `http://loki:3100`).
|
||||
- **`grafana/provisioning/dashboards/*.yml`**
|
||||
- Tells Grafana to load dashboard JSON files from `/var/lib/grafana/dashboards`.
|
||||
- **`grafana/dashboards/*.json`**
|
||||
- The dashboards themselves.
|
||||
|
||||
### Source of truth
|
||||
|
||||
- **Dashboards**: edit/add JSON in `monitoring/grafana/dashboards/`.
|
||||
- **Grafana provisioning**: edit files in `monitoring/grafana/provisioning/`.
|
||||
- **Prometheus scrape config**: edit `monitoring/prometheus/prometheus.yml`.
|
||||
|
||||
`scripts/setup_monitoring.py` is intentionally **provisioning-only**:
|
||||
|
||||
- It (re)writes Grafana **datasources** and the **dashboard provider**.
|
||||
- It does **not** create or overwrite any dashboard JSON files.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Start everything
|
||||
docker-compose up -d
|
||||
# Start deployment
|
||||
make deploy-up
|
||||
|
||||
# Access services
|
||||
open http://localhost:3000 # Grafana (admin/admin)
|
||||
open http://localhost:3000 # Grafana (admin/admin123)
|
||||
open http://localhost:9090 # Prometheus
|
||||
```
|
||||
|
||||
|
|
@ -15,137 +54,80 @@ open http://localhost:9090 # Prometheus
|
|||
|
||||
### Grafana (Port 3000)
|
||||
**Main monitoring dashboard**
|
||||
- Username: `admin`
|
||||
- Password: `admin`
|
||||
- Pre-configured datasources: Prometheus + Loki
|
||||
- Pre-loaded ML Queue dashboard
|
||||
- Username: `admin`
|
||||
- Password: `admin123`
|
||||
- Data source: Prometheus (http://localhost:9090)
|
||||
|
||||
### Prometheus (Port 9090)
|
||||
**Metrics collection**
|
||||
- Scrapes metrics from API server (`:9100/metrics`)
|
||||
- 15s scrape interval
|
||||
- Data retention: 15 days (default)
|
||||
**Metrics collection and storage**
|
||||
|
||||
### Loki (Port 3100)
|
||||
**Log aggregation**
|
||||
- Collects logs from all containers
|
||||
- Collects application logs from `./logs/`
|
||||
- Retention: 7 days
|
||||
|
||||
### Promtail
|
||||
**Log shipping**
|
||||
- Watches Docker container logs
|
||||
- Watches `./logs/*.log`
|
||||
- Sends to Loki
|
||||
## Dashboards
|
||||
|
||||
## Viewing Data
|
||||
Available dashboard configurations in `grafana/dashboards/`:
|
||||
|
||||
### Metrics
|
||||
1. Open Grafana: http://localhost:3000
|
||||
2. Go to "ML Task Queue Monitoring" dashboard
|
||||
3. See: queue depth, task duration, error rates, etc.
|
||||
- `load-test-performance.json` - Load test metrics
|
||||
- `websocket-performance.json` - WebSocket performance
|
||||
- `system-health.json` - System health monitoring
|
||||
- `rsync-performance.json` - Rsync performance metrics
|
||||
|
||||
### Logs
|
||||
1. Open Grafana → Explore
|
||||
2. Select "Loki" datasource
|
||||
3. Query examples:
|
||||
```logql
|
||||
{job="app_logs"} # All app logs
|
||||
{job="docker",service="api-server"} # API server logs
|
||||
{job="docker"} |= "error" # All errors
|
||||
```
|
||||
### Importing Dashboards
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────┐
|
||||
│ API Server │──┐
|
||||
└─────────────┘ │
|
||||
├──► Prometheus ──► Grafana
|
||||
┌─────────────┐ │ ▲
|
||||
│ Worker │──┘ │
|
||||
└─────────────┘ │
|
||||
│
|
||||
┌─────────────┐ │
|
||||
│ App Logs │──┐ │
|
||||
└─────────────┘ │ │
|
||||
├──► Promtail ──► Loki ┘
|
||||
┌─────────────┐ │
|
||||
│Docker Logs │──┘
|
||||
└─────────────┘
|
||||
```
|
||||
1. Go to Grafana → "+" → "Import"
|
||||
2. Upload JSON files from `grafana/dashboards/` directory
|
||||
3. Select Prometheus data source
|
||||
|
||||
## Configuration Files
|
||||
|
||||
- `prometheus.yml` - Metrics scraping config
|
||||
- `loki-config.yml` - Log storage config
|
||||
- `promtail-config.yml` - Log collection config
|
||||
- `grafana/provisioning/` - Auto-configuration
|
||||
- `prometheus/prometheus.yml` - Prometheus configuration
|
||||
- `loki-config.yml` - Loki configuration
|
||||
- `promtail-config.yml` - Promtail configuration
|
||||
- `security_rules.yml` - Security rules
|
||||
|
||||
## Customization
|
||||
## Usage
|
||||
|
||||
### Add More Scrapers
|
||||
Edit `monitoring/prometheus.yml`:
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'my-service'
|
||||
static_configs:
|
||||
- targets: ['my-service:9100']
|
||||
```
|
||||
1. Start monitoring stack: `make deploy-up`
|
||||
2. Access Grafana: http://localhost:3000 (admin/admin123)
|
||||
3. Import dashboards from `grafana/dashboards/` directory
|
||||
4. View metrics and test results in real-time
|
||||
|
||||
### Change Retention
|
||||
**Prometheus:** Add to command in docker-compose:
|
||||
```yaml
|
||||
- '--storage.tsdb.retention.time=30d'
|
||||
```
|
||||
## Health Endpoints
|
||||
|
||||
**Loki:** Edit `loki-config.yml`:
|
||||
```yaml
|
||||
limits_config:
|
||||
retention_period: 720h # 30 days
|
||||
```
|
||||
The API server provides health check endpoints for monitoring:
|
||||
|
||||
## Troubleshooting
|
||||
- **`/health`** - Overall service health (for Docker healthcheck)
|
||||
- **`/health/live`** - Liveness probe (is the service running?)
|
||||
- **`/health/ready`** - Readiness probe (can the service accept traffic?)
|
||||
|
||||
**No metrics showing:**
|
||||
```bash
|
||||
# Check if Prometheus can reach targets
|
||||
curl http://localhost:9090/api/v1/targets
|
||||
|
||||
# Check if API exposes metrics
|
||||
curl http://localhost:9100/metrics
|
||||
```
|
||||
|
||||
**No logs showing:**
|
||||
```bash
|
||||
# Check Promtail status
|
||||
docker logs ml-experiments-promtail
|
||||
|
||||
# Verify Loki is receiving logs
|
||||
curl http://localhost:3100/ready
|
||||
```
|
||||
|
||||
**Grafana can't connect to datasources:**
|
||||
```bash
|
||||
# Restart Grafana
|
||||
docker-compose restart grafana
|
||||
```
|
||||
|
||||
## Profiling Quick Start
|
||||
|
||||
To capture CPU profiles while exercising real workloads:
|
||||
### Testing Health Endpoints
|
||||
|
||||
```bash
|
||||
# HTTP LoadTestSuite (MediumLoad scenario)
|
||||
make profile-load
|
||||
# Basic health check
|
||||
curl -k https://localhost:9101/health
|
||||
|
||||
# WebSocket → Redis queue → worker integration
|
||||
make profile-ws-queue
|
||||
# Liveness check (for K8s or monitoring)
|
||||
curl -k https://localhost:9101/health/live
|
||||
|
||||
# Readiness check (verifies dependencies)
|
||||
curl -k https://localhost:9101/health/ready
|
||||
```
|
||||
|
||||
Then inspect profiles with:
|
||||
See `health-testing.md` for detailed testing procedures.
|
||||
|
||||
```bash
|
||||
go tool pprof cpu_load.out # HTTP load
|
||||
go tool pprof cpu_ws.out # WebSocket/queue/worker
|
||||
```
|
||||
## Prometheus Integration
|
||||
|
||||
Prometheus scrapes the following endpoints:
|
||||
- `api-server:9101/metrics` - Application metrics (future)
|
||||
- `api-server:9101/health` - Health status monitoring
|
||||
- `host.docker.internal:9100/metrics` - Worker metrics (when the worker runs on the host)
|
||||
- `worker:9100/metrics` - Worker metrics (when the worker runs as a container in the compose network)
|
||||
|
||||
## Cleanup (deprecated paths)
|
||||
|
||||
These legacy paths may still exist in the repo but are **not used** by the current dev compose config:
|
||||
|
||||
- `monitoring/dashboards/` (old dashboards location)
|
||||
- `monitoring/prometheus.yml` (old Prometheus config location)
|
||||
- `monitoring/grafana/provisioning/dashboards/dashboard.yml` (duplicate of `dashboards.yml`)
|
||||
|
|
@ -1,147 +0,0 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"title": "ML Task Queue Monitoring",
|
||||
"tags": [
|
||||
"ml",
|
||||
"queue",
|
||||
"fetch_ml"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"panels": [
|
||||
{
|
||||
"title": "Queue Depth",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetch_ml_queue_depth",
|
||||
"legendFormat": "Queue Depth"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Active Tasks",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(fetch_ml_active_tasks) by (worker_id)",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Task Duration (p50, p95, p99)",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Task Completion Rate",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_ml_tasks_completed_total[5m])",
|
||||
"legendFormat": "{{status}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Failure Rate by Error Category",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_ml_task_failures_total[5m])",
|
||||
"legendFormat": "{{error_category}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Retry Rate",
|
||||
"type": "graph",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetch_ml_task_retries_total[5m])",
|
||||
"legendFormat": "{{error_category}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Dead Letter Queue Size",
|
||||
"type": "stat",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetch_ml_dlq_size"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Lease Expirations",
|
||||
"type": "stat",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 24
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetch_ml_lease_expirations_total"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -1,278 +0,0 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"title": "Application Logs",
|
||||
"tags": [
|
||||
"logs",
|
||||
"loki",
|
||||
"fetch_ml"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"graphTooltip": 1,
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"title": "Log Stream",
|
||||
"type": "logs",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"w": 24,
|
||||
"h": 12
|
||||
},
|
||||
"id": 1,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"app_logs\"}",
|
||||
"refId": "A",
|
||||
"datasource": "Loki"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": false,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"dedupStrategy": "none",
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Log Level Distribution",
|
||||
"type": "bargauge",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 12,
|
||||
"w": 8,
|
||||
"h": 8
|
||||
},
|
||||
"id": 2,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (level) (count_over_time({job=\"app_logs\"} | logfmt | level != \"\" [5m]))",
|
||||
"refId": "A",
|
||||
"datasource": "Loki",
|
||||
"legendFormat": "{{level}}"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"orientation": "horizontal",
|
||||
"displayMode": "gradient",
|
||||
"showUnfilled": true
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "INFO"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "green"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "WARN"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "yellow"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "ERROR"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "red"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Error Logs (Last Hour)",
|
||||
"type": "table",
|
||||
"gridPos": {
|
||||
"x": 8,
|
||||
"y": 12,
|
||||
"w": 16,
|
||||
"h": 8
|
||||
},
|
||||
"id": 3,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"app_logs\"} | logfmt | level=\"ERROR\"",
|
||||
"refId": "A",
|
||||
"datasource": "Loki"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Logs by Component",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 20,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"id": 4,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (component) (rate({job=\"app_logs\"} | logfmt [1m]))",
|
||||
"refId": "A",
|
||||
"datasource": "Loki",
|
||||
"legendFormat": "{{component}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 10,
|
||||
"spanNulls": false,
|
||||
"showPoints": "never",
|
||||
"stacking": {
|
||||
"mode": "none"
|
||||
}
|
||||
},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Warning Logs Timeline",
|
||||
"type": "timeseries",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 20,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"id": 5,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({job=\"app_logs\"} | logfmt | level=\"WARN\" [1m]))",
|
||||
"refId": "A",
|
||||
"datasource": "Loki",
|
||||
"legendFormat": "Warnings"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 50
|
||||
},
|
||||
"color": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "yellow"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Search Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 28,
|
||||
"w": 24,
|
||||
"h": 10
|
||||
},
|
||||
"id": 6,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"app_logs\"} |= \"$search_term\"",
|
||||
"refId": "A",
|
||||
"datasource": "Loki"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true
|
||||
}
|
||||
}
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "search_term",
|
||||
"type": "textbox",
|
||||
"label": "Search Term",
|
||||
"current": {
|
||||
"value": "",
|
||||
"text": ""
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"refresh": "30s"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,157 +0,0 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "loki",
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"dataLinks": []
|
||||
},
|
||||
"percentage": false,
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkAPIServerCreateJobSimple\"",
|
||||
"legendFormat": "API Job Creation",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkMLExperimentExecution/SmallExperiment\"",
|
||||
"legendFormat": "ML Small Experiment",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkDatasetOperations/DatasetCreation\"",
|
||||
"legendFormat": "Dataset Creation",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "API Performance Trends",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": "Time (ns/op)",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"datasource": "loki",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"showLabels": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=\"fetchml-performance\"} |= \"Performance Summary\"",
|
||||
"legendFormat": "{{timestamp}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Latest Performance Summary",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 27,
|
||||
"style": "dark",
|
||||
"tags": ["fetchml", "performance"],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Fetch ML Performance Dashboard",
|
||||
"uid": "fetchml-performance",
|
||||
"version": 1
|
||||
}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
ports:
|
||||
- "9090:9090"
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||
- '--web.console.templates=/etc/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus-data:/prometheus
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
loki:
|
||||
image: grafana/loki:2.9.0
|
||||
ports:
|
||||
- "3100:3100"
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
volumes:
|
||||
- ./loki-performance-config.yaml:/etc/loki/local-config.yaml
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:latest
|
||||
volumes:
|
||||
- ./promtail-performance-config.yaml:/etc/promtail/config.yml
|
||||
- /var/log:/var/log:ro
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
pushgateway:
|
||||
image: prom/pushgateway:latest
|
||||
ports:
|
||||
- "9091:9091"
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3001:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
|
||||
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
volumes:
|
||||
loki-data:
|
||||
grafana-data:
|
||||
prometheus-data:
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
51
monitoring/grafana/dashboards/load-test-performance.json
Normal file
51
monitoring/grafana/dashboards/load-test-performance.json
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Load Test Performance",
|
||||
"tags": [
|
||||
"load-test",
|
||||
"performance"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Service Health",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(http_requests_total[5m])",
|
||||
"legendFormat": "RPS"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
1
monitoring/grafana/dashboards/load-test-simple.json
Normal file
1
monitoring/grafana/dashboards/load-test-simple.json
Normal file
|
|
@ -0,0 +1 @@
|
|||
{"dashboard": {"id": null, "title": "Load Test Performance", "tags": ["load-test", "performance"], "panels": [{"id": 1, "title": "Service Status", "type": "stat", "targets": [{"expr": "up", "legendFormat": "{{job}}"}]}]}}
|
||||
51
monitoring/grafana/dashboards/loki-logs.json
Normal file
51
monitoring/grafana/dashboards/loki-logs.json
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Log Analysis",
|
||||
"tags": [
|
||||
"loki",
|
||||
"logs"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Error Logs",
|
||||
"type": "logs",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=~\".+\"} |= \"error\"",
|
||||
"legendFormat": "Errors"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "All Logs",
|
||||
"type": "logs",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{job=~\".+\"}",
|
||||
"legendFormat": "All logs"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-30m",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "30s"
|
||||
}
|
||||
}
|
||||
135
monitoring/grafana/dashboards/prewarm-performance.txt
Normal file
135
monitoring/grafana/dashboards/prewarm-performance.txt
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
# Grafana Dashboard: Prewarm Performance
|
||||
# Import this JSON into Grafana to create a prewarm monitoring dashboard
|
||||
|
||||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Prewarm Performance",
|
||||
"tags": ["prewarm", "performance", "worker"],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Environment Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Snapshot Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 0},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Environment Prewarm Hits vs Misses",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_env_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
|
||||
{"expr": "rate(fetchml_prewarm_env_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"yAxes": [{"unit": "reqps"}]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Snapshot Prewarm Hits vs Misses",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
|
||||
{"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"yAxes": [{"unit": "reqps"}]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Environment Build Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", "legendFormat": "build time {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"yAxes": [{"unit": "seconds"}]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Snapshot Prewarm Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", "legendFormat": "prewarm time {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||
"yAxes": [{"unit": "seconds"}]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Environment Images Built",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "increase(fetchml_prewarm_env_built_total[1h])", "legendFormat": "built {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
|
||||
"yAxes": [{"unit": "short"}]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Snapshots Prewarmed",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", "legendFormat": "prewarmed {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
|
||||
"yAxes": [{"unit": "short"}]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Prewarm Efficiency",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{"expr": "fetchml_prewarm_env_hit_total + fetchml_prewarm_snapshot_hit_total", "legendFormat": "total hits {{worker_id}}"},
|
||||
{"expr": "fetchml_prewarm_env_miss_total + fetchml_prewarm_snapshot_miss_total", "legendFormat": "total misses {{worker_id}}"}
|
||||
],
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
|
||||
"yAxes": [{"unit": "short"}]
|
||||
}
|
||||
],
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
86
monitoring/grafana/dashboards/rsync-performance.json
Normal file
86
monitoring/grafana/dashboards/rsync-performance.json
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "Rsync Performance",
|
||||
"tags": [
|
||||
"rsync",
|
||||
"sync",
|
||||
"performance"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Rsync Operations",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rsync_operations_total[5m])",
|
||||
"legendFormat": "Operations/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Data Transfer Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rsync_bytes_transferred_total[5m])",
|
||||
"legendFormat": "Bytes/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Sync Duration",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rsync_sync_duration_seconds",
|
||||
"legendFormat": "Duration"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Sync Errors",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(rsync_errors_total[5m])",
|
||||
"legendFormat": "Errors/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
51
monitoring/grafana/dashboards/system-health.json
Normal file
51
monitoring/grafana/dashboards/system-health.json
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "System Health",
|
||||
"tags": [
|
||||
"system",
|
||||
"health"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Service Status",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up",
|
||||
"legendFormat": "{{job}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Memory Usage",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "process_resident_memory_bytes",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "10s"
|
||||
}
|
||||
}
|
||||
68
monitoring/grafana/dashboards/websocket-performance.json
Normal file
68
monitoring/grafana/dashboards/websocket-performance.json
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
{
|
||||
"dashboard": {
|
||||
"id": null,
|
||||
"title": "WebSocket Performance",
|
||||
"tags": [
|
||||
"websocket",
|
||||
"performance"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "WebSocket Connections",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "websocket_connections_active",
|
||||
"legendFormat": "Active Connections"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "WebSocket Messages",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(websocket_messages_total[5m])",
|
||||
"legendFormat": "Messages/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Connection Errors",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(websocket_connection_errors_total[5m])",
|
||||
"legendFormat": "Errors/sec"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
}
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
}
|
||||
280
monitoring/grafana/dashboards/worker-resources.json
Normal file
280
monitoring/grafana/dashboards/worker-resources.json
Normal file
|
|
@ -0,0 +1,280 @@
|
|||
{
|
||||
"id": null,
|
||||
"title": "Worker Resources",
|
||||
"tags": [
|
||||
"worker",
|
||||
"resources"
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "CPU Free",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_cpu_free",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "CPU Total",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_cpu_total",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "CPU Utilization (%)",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "GPU Slots Free",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_gpu_slots_free",
|
||||
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Acquire Wait / Timeout (Totals)",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_wait_total",
|
||||
"legendFormat": "wait {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_timeout_total",
|
||||
"legendFormat": "timeout {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_total",
|
||||
"legendFormat": "total {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 8
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Avg Acquire Wait (seconds)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Acquire Wait Ratio",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 14
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Environment Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "Snapshot Prewarm Hit Rate (%)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
||||
"legendFormat": "{{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 14
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"thresholds": {
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "green", "value": 80}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Prewarm Hits vs Misses",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
|
||||
"legendFormat": "env hits {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
|
||||
"legendFormat": "env misses {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
|
||||
"legendFormat": "snapshot hits {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
|
||||
"legendFormat": "snapshot misses {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"yAxes": [
|
||||
{"unit": "reqps"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Prewarm Build Time",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
|
||||
"legendFormat": "env build {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
|
||||
"legendFormat": "snapshot prewarm {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 28
|
||||
},
|
||||
"yAxes": [
|
||||
{"unit": "seconds"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Prewarm Builds",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
|
||||
"legendFormat": "env built {{worker_id}}"
|
||||
},
|
||||
{
|
||||
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
|
||||
"legendFormat": "snapshots prewarmed {{worker_id}}"
|
||||
}
|
||||
],
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 28
|
||||
},
|
||||
"yAxes": [
|
||||
{"unit": "short"}
|
||||
]
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"refresh": "5s"
|
||||
}
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
|
|
|
|||
9
monitoring/grafana/provisioning/datasources/loki.yml
Normal file
9
monitoring/grafana/provisioning/datasources/loki.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
editable: true
|
||||
jsonData:
|
||||
maxLines: 1000
|
||||
|
|
@ -1,16 +1,10 @@
|
|||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: false
|
||||
editable: false
|
||||
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
isDefault: true
|
||||
editable: false
|
||||
editable: true
|
||||
jsonData:
|
||||
timeInterval: "5s"
|
||||
100
monitoring/health-testing.md
Normal file
100
monitoring/health-testing.md
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# Testing Health Endpoints with Monitoring Stack
|
||||
|
||||
## Verify Health Endpoints
|
||||
|
||||
```bash
|
||||
# 1. Start the monitoring stack
|
||||
cd deployments
|
||||
docker-compose -f docker-compose.dev.yml up -d
|
||||
|
||||
# 2. Wait for services to start (30 seconds)
|
||||
sleep 30
|
||||
|
||||
# 3. Test health endpoints
|
||||
curl -k https://localhost:9101/health
|
||||
# Expected: {"status":"healthy","timestamp":"...","checks":{}}
|
||||
|
||||
curl -k https://localhost:9101/health/live
|
||||
# Expected: {"status":"alive","timestamp":"..."}
|
||||
|
||||
curl -k https://localhost:9101/health/ready
|
||||
# Expected: {"status":"ready","timestamp":"...","checks":{"queue":"ok","experiments":"ok"}}
|
||||
|
||||
# 4. Check Docker health status
|
||||
docker ps | grep api-server
|
||||
# Should show: (healthy)
|
||||
|
||||
# 5. Access Grafana
|
||||
open http://localhost:3000
|
||||
# Login: admin / admin123
|
||||
|
||||
# 6. Access Prometheus
|
||||
open http://localhost:9090
|
||||
# Check targets: Status > Targets
|
||||
# Should see: api-server, api-server-health
|
||||
|
||||
# 7. Query health metrics in Prometheus
|
||||
# Go to Graph and enter: up{job="api-server-health"}
|
||||
# Should show: value=1 (service is up)
|
||||
```
|
||||
|
||||
## Health Check Integration
|
||||
|
||||
### Docker Compose
|
||||
The health check is configured in `deployments/docker-compose.dev.yml`:
|
||||
```yaml
|
||||
healthcheck:
|
||||
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
```
|
||||
|
||||
### Prometheus Monitoring
|
||||
Prometheus scrapes health status every 30s from:
|
||||
- `/health` - Overall service health
|
||||
- `/metrics` - Future Prometheus metrics (when implemented)
|
||||
|
||||
### Kubernetes (Future)
|
||||
Health endpoints ready for K8s probes:
|
||||
```yaml
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/live
|
||||
port: 9101
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/ready
|
||||
port: 9101
|
||||
scheme: HTTPS
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
```
|
||||
|
||||
## Monitoring Stack Services
|
||||
|
||||
- **Grafana** (port 3000): Dashboards and visualization
|
||||
- **Prometheus** (port 9090): Metrics collection
|
||||
- **Loki** (port 3100): Log aggregation
|
||||
- **Promtail**: Log shipping
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
```bash
|
||||
# Check API server logs
|
||||
docker logs ml-experiments-api
|
||||
|
||||
# Check Prometheus targets
|
||||
curl http://localhost:9090/api/v1/targets
|
||||
|
||||
# Check health endpoint directly
|
||||
docker exec ml-experiments-api curl -k https://localhost:9101/health
|
||||
|
||||
# Restart services
|
||||
docker-compose -f deployments/docker-compose.dev.yml restart api-server
|
||||
```
|
||||
|
|
@ -12,7 +12,7 @@ common:
|
|||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
instance_addr: 0.0.0.0
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
|
|
|
|||
|
|
@ -1,40 +0,0 @@
|
|||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
|
||||
ingester:
|
||||
lifecycler:
|
||||
address: 127.0.0.1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
replication_factor: 1
|
||||
final_sleep: 0s
|
||||
min_ready_duration: 0s
|
||||
chunk_idle_period: 1h
|
||||
max_chunk_age: 1h
|
||||
chunk_target_size: 1048576
|
||||
chunk_retain_period: 30s
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
storage_config:
|
||||
boltdb_shipper:
|
||||
active_index_directory: /loki/boltdb-shipper-active
|
||||
cache_location: /loki/boltdb-shipper-cache
|
||||
filesystem:
|
||||
directory: /loki/chunks
|
||||
|
||||
limits_config:
|
||||
reject_old_samples: true
|
||||
reject_old_samples_max_age: 168h
|
||||
allow_structured_metadata: false
|
||||
|
|
@ -5,39 +5,35 @@ global:
|
|||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# API Server metrics
|
||||
# API Server metrics and health
|
||||
- job_name: 'api-server'
|
||||
scheme: http
|
||||
static_configs:
|
||||
- targets: ['api-server:9100']
|
||||
- targets: ['api-server:9101']
|
||||
labels:
|
||||
service: 'api-server'
|
||||
metrics_path: /metrics # Future: Prometheus metrics endpoint
|
||||
|
||||
# Worker metrics (if running in docker)
|
||||
# Benchmark metrics from Pushgateway
|
||||
- job_name: 'benchmark'
|
||||
static_configs: []
|
||||
|
||||
# Worker metrics (ResourceManager + task execution)
|
||||
# For docker-compose dev on macOS/Windows, Prometheus can reach a locally running worker
|
||||
# via host.docker.internal.
|
||||
- job_name: 'worker'
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets: ['worker:9100']
|
||||
labels:
|
||||
service: 'worker'
|
||||
# Allow failures if worker not running
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
|
||||
# Benchmark metrics from Pushgateway
|
||||
- job_name: 'benchmark'
|
||||
static_configs:
|
||||
- targets: ['localhost:9091']
|
||||
labels:
|
||||
service: 'benchmark'
|
||||
target_type: 'container'
|
||||
metrics_path: /metrics
|
||||
honor_labels: true
|
||||
|
||||
# Loki metrics
|
||||
- job_name: 'loki'
|
||||
static_configs:
|
||||
- targets: ['ml-experiments-loki:3100']
|
||||
- targets: ['loki:3100']
|
||||
labels:
|
||||
service: 'loki'
|
||||
metrics_path: /metrics
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: fetchml-performance
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: fetchml-performance
|
||||
__path__: /reports/performance.log
|
||||
|
||||
pipeline_stages:
|
||||
- json:
|
||||
expressions:
|
||||
timestamp: timestamp
|
||||
git_commit: git_commit
|
||||
benchmark_name: name
|
||||
time_per_op: time_per_op_ns
|
||||
memory_per_op: memory_per_op_b
|
||||
allocs_per_op: allocs_per_op
|
||||
|
||||
- labels:
|
||||
benchmark_name:
|
||||
git_commit:
|
||||
|
||||
- output:
|
||||
source: output
|
||||
|
||||
- job_name: fetchml-performance-summary
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost
|
||||
labels:
|
||||
job: fetchml-performance
|
||||
__path__: /reports/performance_summary.log
|
||||
|
||||
pipeline_stages:
|
||||
- regex:
|
||||
expression: "=== Performance Summary ==="
|
||||
|
||||
- output:
|
||||
source: output
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
groups:
|
||||
- name: security.rules
|
||||
rules:
|
||||
# High rate of failed authentication attempts
|
||||
- alert: HighFailedAuthRate
|
||||
expr: rate(failed_auth_total[5m]) > 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High rate of failed authentication attempts"
|
||||
description: "More than 10 failed auth attempts per minute for the last 2 minutes"
|
||||
|
||||
# Potential brute force attack
|
||||
- alert: BruteForceAttack
|
||||
expr: rate(failed_auth_total[1m]) > 30
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Potential brute force attack detected"
|
||||
description: "More than 30 failed auth attempts per minute"
|
||||
|
||||
# Unusual WebSocket connection patterns
|
||||
- alert: UnusualWebSocketActivity
|
||||
expr: rate(websocket_connections_total[5m]) > 100
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Unusual WebSocket connection activity"
|
||||
description: "WebSocket connection rate is unusually high"
|
||||
|
||||
# Rate limit breaches
|
||||
- alert: RateLimitBreached
|
||||
expr: rate(rate_limit_exceeded_total[5m]) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Rate limits being exceeded"
|
||||
description: "Rate limit exceeded more than 5 times per minute"
|
||||
|
||||
# SSL certificate expiration warning
|
||||
- alert: SSLCertificateExpiring
|
||||
expr: ssl_certificate_expiry_days < 30
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SSL certificate expiring soon"
|
||||
description: "SSL certificate will expire in less than 30 days"
|
||||
|
||||
# High memory usage
|
||||
- alert: HighMemoryUsage
|
||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage detected"
|
||||
description: "Memory usage is above 90%"
|
||||
|
||||
# High CPU usage
|
||||
- alert: HighCPUUsage
|
||||
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage detected"
|
||||
description: "CPU usage is above 80%"
|
||||
|
||||
# Disk space running low
|
||||
- alert: LowDiskSpace
|
||||
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Low disk space"
|
||||
description: "Disk space is below 10%"
|
||||
|
||||
# Service down
|
||||
- alert: ServiceDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Service is down"
|
||||
description: "{{ $labels.instance }} service has been down for more than 1 minute"
|
||||
|
||||
# Unexpected error rates
|
||||
- alert: HighErrorRate
|
||||
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate detected"
|
||||
description: "Error rate is above 10%"
|
||||
|
||||
# Suspicious IP activity
|
||||
- alert: SuspiciousIPActivity
|
||||
expr: rate(requests_by_ip[5m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Suspicious IP activity"
|
||||
description: "IP address making unusually many requests"
|
||||
|
|
@ -118,7 +118,7 @@ jupyter>=1.0.0
|
|||
"allow_network": false,
|
||||
"blocked_packages": ["requests", "urllib3", "httpx"],
|
||||
"max_execution_time": 3600,
|
||||
"gpu_access": true,
|
||||
"gpu_devices": ["/dev/dri"],
|
||||
"ml_env": "ml_env",
|
||||
"package_manager": "mamba"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,6 +32,10 @@ RUN mamba install -n ml_env \
|
|||
-c pytorch -c conda-forge -y && \
|
||||
conda clean -afy
|
||||
|
||||
# Poetry (for pyproject.toml + poetry.lock projects)
|
||||
RUN mamba install -n ml_env poetry -c conda-forge -y && \
|
||||
conda clean -afy
|
||||
|
||||
# Copy security wrapper
|
||||
COPY secure_runner.py /usr/local/bin/secure_runner.py
|
||||
COPY security_policy.json /etc/ml_runner/security_policy.json
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ class SecurityPolicy:
|
|||
],
|
||||
"max_execution_time": 3600,
|
||||
"max_memory_gb": 16,
|
||||
"gpu_access": True,
|
||||
"gpu_devices": ["/dev/dri"],
|
||||
"allow_file_writes": True,
|
||||
"resource_limits": {
|
||||
"cpu_count": 4,
|
||||
|
|
@ -106,97 +106,197 @@ class CondaRunner:
|
|||
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
|
||||
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
|
||||
|
||||
def setup_environment(self, requirements_file: Path) -> bool:
|
||||
"""Setup Conda environment with mamba"""
|
||||
self.gpu_devices = self.security_policy.policy.get("gpu_devices", [])
|
||||
|
||||
def setup_environment(self, deps_file: Path) -> bool:
|
||||
"""Setup Conda environment based on a dependency manifest."""
|
||||
try:
|
||||
# Read requirements
|
||||
with open(requirements_file, "r") as f:
|
||||
requirements = [
|
||||
line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
name = deps_file.name
|
||||
|
||||
# Check each package for security
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
if not self.security_policy.check_package_safety(package_name):
|
||||
print(
|
||||
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
|
||||
)
|
||||
return False
|
||||
print(f"[MANIFEST] Using dependency manifest: {name}")
|
||||
|
||||
# Install packages with mamba (super fast!)
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
|
||||
# Check if already installed with conda
|
||||
check_cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"python",
|
||||
"-c",
|
||||
f"import {package_name.replace('-', '_')}",
|
||||
]
|
||||
result = subprocess.run(
|
||||
check_cmd, capture_output=True, text=True
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] {package_name} already installed in conda env")
|
||||
continue
|
||||
|
||||
# Try conda-forge first (faster and more reliable)
|
||||
print(
|
||||
f"[INSTALL] Installing {req} with {self.package_manager}..."
|
||||
)
|
||||
install_cmd = [
|
||||
if name in ("environment.yml", "environment.yaml"):
|
||||
print(f"[SETUP] Applying conda environment file: {deps_file}")
|
||||
cmd = [
|
||||
self.package_manager,
|
||||
"install",
|
||||
"env",
|
||||
"update",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
req,
|
||||
"-c",
|
||||
"conda-forge",
|
||||
"-f",
|
||||
str(deps_file),
|
||||
"-y",
|
||||
]
|
||||
result = subprocess.run(
|
||||
install_cmd, capture_output=True, text=True, timeout=300
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||
if result.returncode != 0:
|
||||
print(f"[ERROR] Failed to apply environment file: {result.stderr}")
|
||||
return False
|
||||
return True
|
||||
|
||||
if name == "poetry.lock":
|
||||
pyproject = self.workspace_dir / "pyproject.toml"
|
||||
if not pyproject.exists():
|
||||
print("[ERROR] poetry.lock provided but pyproject.toml is missing")
|
||||
return False
|
||||
|
||||
print(f"[SETUP] Installing dependencies from Poetry lockfile: {deps_file}")
|
||||
env = os.environ.copy()
|
||||
env.update(
|
||||
{
|
||||
"POETRY_VIRTUALENVS_CREATE": "false",
|
||||
"POETRY_NO_INTERACTION": "1",
|
||||
}
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] Installed {req} with {self.package_manager}")
|
||||
continue
|
||||
# Ensure Poetry is available in the conda env.
|
||||
check = subprocess.run(
|
||||
["conda", "run", "-n", self.conda_env, "poetry", "--version"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
)
|
||||
if check.returncode != 0:
|
||||
print("[ERROR] Poetry is not available in the container environment")
|
||||
print(check.stderr)
|
||||
return False
|
||||
|
||||
# Fallback to pip if conda fails
|
||||
print(f"[FALLBACK] Trying pip for {req}...")
|
||||
pip_cmd = [
|
||||
# Install into the conda env (no separate venv).
|
||||
install = subprocess.run(
|
||||
[
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"poetry",
|
||||
"install",
|
||||
"--no-ansi",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=900,
|
||||
cwd=str(self.workspace_dir),
|
||||
env=env,
|
||||
)
|
||||
if install.returncode != 0:
|
||||
print("[ERROR] Poetry install failed")
|
||||
print(install.stderr)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
if name == "pyproject.toml":
|
||||
# Use pip's PEP517/pyproject support (no Poetry required).
|
||||
# This installs the project itself; dependencies may be fetched as needed.
|
||||
print(f"[SETUP] Installing project from pyproject.toml: {deps_file}")
|
||||
cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"pip",
|
||||
"install",
|
||||
req,
|
||||
str(self.workspace_dir),
|
||||
"--no-cache-dir",
|
||||
]
|
||||
result = subprocess.run(
|
||||
pip_cmd, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||
if result.returncode != 0:
|
||||
print(f"[ERROR] Failed to install {req}: {result.stderr}")
|
||||
print(f"[ERROR] Failed to install project from pyproject.toml: {result.stderr}")
|
||||
return False
|
||||
return True
|
||||
|
||||
print(f"[OK] Installed {req} with pip")
|
||||
if name == "requirements.txt":
|
||||
# Read requirements
|
||||
with open(deps_file, "r") as f:
|
||||
requirements = [
|
||||
line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
|
||||
return True
|
||||
# Check each package for security
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
if not self.security_policy.check_package_safety(package_name):
|
||||
print(
|
||||
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
|
||||
)
|
||||
return False
|
||||
|
||||
# Install packages with mamba (super fast!)
|
||||
for req in requirements:
|
||||
package_name = (
|
||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||
)
|
||||
|
||||
# Check if already installed with conda
|
||||
check_cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"python",
|
||||
"-c",
|
||||
f"import {package_name.replace('-', '_')}",
|
||||
]
|
||||
result = subprocess.run(
|
||||
check_cmd, capture_output=True, text=True
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] {package_name} already installed in conda env")
|
||||
continue
|
||||
|
||||
# Try conda-forge first (faster and more reliable)
|
||||
print(
|
||||
f"[INSTALL] Installing {req} with {self.package_manager}..."
|
||||
)
|
||||
install_cmd = [
|
||||
self.package_manager,
|
||||
"install",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
req,
|
||||
"-c",
|
||||
"conda-forge",
|
||||
"-y",
|
||||
]
|
||||
result = subprocess.run(
|
||||
install_cmd, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
print(f"[OK] Installed {req} with {self.package_manager}")
|
||||
continue
|
||||
|
||||
# Fallback to pip if conda fails
|
||||
print(f"[FALLBACK] Trying pip for {req}...")
|
||||
pip_cmd = [
|
||||
"conda",
|
||||
"run",
|
||||
"-n",
|
||||
self.conda_env,
|
||||
"pip",
|
||||
"install",
|
||||
req,
|
||||
"--no-cache-dir",
|
||||
]
|
||||
result = subprocess.run(
|
||||
pip_cmd, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"[ERROR] Failed to install {req}: {result.stderr}")
|
||||
return False
|
||||
|
||||
print(f"[OK] Installed {req} with pip")
|
||||
|
||||
return True
|
||||
|
||||
print(f"[ERROR] Unsupported dependency manifest: {deps_file}")
|
||||
print("Supported: environment.yml, environment.yaml, poetry.lock (requires pyproject.toml), pyproject.toml, requirements.txt")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Environment setup failed: {e}")
|
||||
|
|
@ -217,7 +317,7 @@ class CondaRunner:
|
|||
env.update(
|
||||
{
|
||||
"CONDA_DEFAULT_ENV": self.conda_env,
|
||||
"CUDA_VISIBLE_DEVICES": "0", # Allow GPU access
|
||||
"CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", ""), # Allow GPU access
|
||||
"SECURE_MODE": "1",
|
||||
"NETWORK_ACCESS": (
|
||||
"1"
|
||||
|
|
@ -280,7 +380,7 @@ class CondaRunner:
|
|||
"stdout": stdout,
|
||||
"stderr": stderr,
|
||||
"return_code": process.returncode,
|
||||
"gpu_accessible": True,
|
||||
"gpu_accessible": len(self.gpu_devices) > 0,
|
||||
"security_mode": "enabled",
|
||||
"container_type": "conda",
|
||||
"conda_env": self.conda_env,
|
||||
|
|
@ -338,8 +438,12 @@ def main():
|
|||
parser.add_argument(
|
||||
"--workspace", default="/workspace", help="Workspace directory"
|
||||
)
|
||||
parser.add_argument("--requirements", help="Requirements file path")
|
||||
parser.add_argument("--deps", help="Dependency manifest path (environment.yml | poetry.lock | pyproject.toml | requirements.txt)")
|
||||
parser.add_argument("--requirements", help="Deprecated alias for --deps")
|
||||
parser.add_argument("--script", help="Training script path")
|
||||
parser.add_argument(
|
||||
"--prepare-only", action="store_true", help="Only prepare dependencies and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--args",
|
||||
nargs=argparse.REMAINDER,
|
||||
|
|
@ -383,17 +487,26 @@ def main():
|
|||
if args.check_gpu:
|
||||
return 0
|
||||
|
||||
deps_arg = args.deps or args.requirements
|
||||
if not deps_arg:
|
||||
print("[ERROR] Missing dependency manifest. Provide --deps.")
|
||||
return 1
|
||||
|
||||
# Setup environment
|
||||
requirements_path = Path(args.requirements)
|
||||
if not requirements_path.exists():
|
||||
print(f"[ERROR] Requirements file not found: {requirements_path}")
|
||||
deps_path = Path(deps_arg)
|
||||
if not deps_path.exists():
|
||||
print(f"[ERROR] Dependency manifest not found: {deps_path}")
|
||||
return 1
|
||||
|
||||
print("[SETUP] Setting up secure environment...")
|
||||
if not runner.setup_environment(requirements_path):
|
||||
if not runner.setup_environment(deps_path):
|
||||
print("[ERROR] Failed to setup secure environment")
|
||||
return 1
|
||||
|
||||
if args.prepare_only:
|
||||
print("[DONE] Environment prepared successfully")
|
||||
return 0
|
||||
|
||||
# Run experiment
|
||||
script_path = Path(args.script)
|
||||
if not script_path.exists():
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
],
|
||||
"max_execution_time": 3600,
|
||||
"max_memory_gb": 16,
|
||||
"gpu_access": true,
|
||||
"gpu_devices": ["/dev/dri"],
|
||||
"allow_file_writes": true,
|
||||
"resource_limits": {
|
||||
"cpu_count": 4,
|
||||
|
|
|
|||
|
|
@ -20,19 +20,12 @@ This directory contains setup and utility scripts for FetchML.
|
|||
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group
|
||||
```
|
||||
|
||||
### `validate-prod-config.sh`
|
||||
**Purpose**: Validates production configuration files
|
||||
**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]`
|
||||
**What it does**:
|
||||
- Checks config file syntax
|
||||
- Verifies base_path consistency
|
||||
- Tests Redis connectivity
|
||||
- Validates Podman setup
|
||||
- Checks directory permissions
|
||||
### Configuration validation
|
||||
Validate configs using the built-in config lint targets:
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml
|
||||
make configlint
|
||||
make worker-configlint
|
||||
```
|
||||
|
||||
## Legacy Setup Scripts (Deprecated)
|
||||
|
|
@ -44,12 +37,11 @@ The following scripts are from earlier iterations and are **deprecated** in favo
|
|||
- `auto_setup.sh` - Old automated setup (superseded)
|
||||
- `setup_common.sh` - Common functions (integrated into setup-prod.sh)
|
||||
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
|
||||
- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh)
|
||||
|
||||
|
||||
### Cleanup Recommendation
|
||||
These legacy scripts can be removed or archived. The current production setup only needs:
|
||||
- `setup-prod.sh`
|
||||
- `validate-prod-config.sh`
|
||||
|
||||
## Usage Workflow
|
||||
|
||||
|
|
@ -59,8 +51,8 @@ These legacy scripts can be removed or archived. The current production setup on
|
|||
sudo ./scripts/setup-prod.sh
|
||||
|
||||
# 2. Copy and configure
|
||||
sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml
|
||||
sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml
|
||||
sudo cp configs/api/prod.yaml /etc/fetch_ml/config.yaml
|
||||
sudo cp configs/workers/worker-prod.toml /etc/fetch_ml/worker.toml
|
||||
sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc.
|
||||
|
||||
# 3. Build and install
|
||||
|
|
@ -68,7 +60,8 @@ make prod
|
|||
sudo make install
|
||||
|
||||
# 4. Validate
|
||||
./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml
|
||||
./bin/configlint --schema configs/schema/api_server_config.yaml /etc/fetch_ml/config.yaml
|
||||
./bin/configlint --schema configs/schema/worker_config_schema.yaml /etc/fetch_ml/worker.toml
|
||||
|
||||
# 5. Start services
|
||||
sudo systemctl start fetchml-api fetchml-worker
|
||||
|
|
@ -82,7 +75,7 @@ docker-compose up -d
|
|||
|
||||
# Or run components directly
|
||||
make dev
|
||||
./bin/api-server -config configs/config-local.yaml
|
||||
./bin/api-server -config configs/api/dev.yaml
|
||||
```
|
||||
|
||||
## Script Maintenance
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ set -e
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
|
||||
RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP"
|
||||
|
||||
|
|
@ -168,14 +169,25 @@ if [ -f "$SCRIPT_DIR/cleanup-benchmarks.sh" ]; then
|
|||
"$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks
|
||||
else
|
||||
# Fallback cleanup if script not available
|
||||
echo "Cleaning old benchmark runs (keeping last 10)..."
|
||||
echo "Archiving old benchmark runs (keeping last 10)..."
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || echo "No old runs to clean"
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
|
||||
[ -n "$run" ] || continue
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
|
||||
# Clean temporary files
|
||||
echo "Cleaning temporary files..."
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
echo "Archiving temporary files..."
|
||||
tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
|
||||
mkdir -p "$tmp_archive_dir"
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
|
||||
# Clean Go build cache
|
||||
echo "Cleaning Go build cache..."
|
||||
|
|
|
|||
|
|
@ -1,49 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Create a Bitwarden item for a FetchML API user.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash>
|
||||
#
|
||||
# Requirements:
|
||||
# - Bitwarden CLI (bw) installed
|
||||
# - You are logged in and unlocked (bw login; bw unlock)
|
||||
# - jq installed
|
||||
#
|
||||
# This script does NOT run on the homelab server. Run it from your
|
||||
# own machine where you manage Bitwarden.
|
||||
|
||||
if [[ $# -ne 3 ]]; then
|
||||
echo "Usage: $0 <username> <api_key> <api_key_hash>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
USER_NAME="$1"
|
||||
API_KEY="$2"
|
||||
API_KEY_HASH="$3"
|
||||
|
||||
ITEM_NAME="FetchML API $USER_NAME"
|
||||
|
||||
# Get base item template
|
||||
TEMPLATE_JSON=$(bw get template item)
|
||||
|
||||
# Build item JSON with jq
|
||||
ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \
|
||||
--arg name "$ITEM_NAME" \
|
||||
--arg username "$USER_NAME" \
|
||||
--arg password "$API_KEY" \
|
||||
--arg hash "$API_KEY_HASH" \
|
||||
'.name = $name
|
||||
| .login.username = $username
|
||||
| .login.password = $password
|
||||
| .notes = "FetchML API key for user " + $username
|
||||
| .fields = [{"name":"api_key_hash","value":$hash,"type":1}]')
|
||||
|
||||
# Create item in Bitwarden
|
||||
# If you ever want to edit instead, you can capture the ID from this call
|
||||
# and use: bw edit item <id> <json>
|
||||
|
||||
echo "$ITEM_JSON" | bw encode | bw create item
|
||||
|
||||
echo "Created Bitwarden item: $ITEM_NAME"
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Setup auto-cleanup service for fetch_ml
|
||||
# This creates a systemd timer that runs cleanup daily
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
|
||||
# Colors
|
||||
GREEN='\033[0;32m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
log_info() {
|
||||
echo -e "${BLUE}[INFO]${NC} $1"
|
||||
}
|
||||
|
||||
log_success() {
|
||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||
}
|
||||
|
||||
log_info "Setting up auto-cleanup service..."
|
||||
|
||||
# Check if running on macOS or Linux
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
log_info "Detected macOS - setting up launchd agent"
|
||||
|
||||
# Create launchd plist
|
||||
cat > ~/Library/LaunchAgents/com.fetchml.cleanup.plist << EOF
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.fetchml.cleanup</string>
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>$PROJECT_DIR/scripts/cleanup.sh</string>
|
||||
<string>--force</string>
|
||||
</array>
|
||||
<key>StartInterval</key>
|
||||
<integer>86400</integer>
|
||||
<key>RunAtLoad</key>
|
||||
<false/>
|
||||
<key>StandardOutPath</key>
|
||||
<string>/tmp/fetchml-cleanup.log</string>
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/tmp/fetchml-cleanup.error.log</string>
|
||||
</dict>
|
||||
</plist>
|
||||
EOF
|
||||
|
||||
# Load the launchd agent
|
||||
launchctl load ~/Library/LaunchAgents/com.fetchml.cleanup.plist
|
||||
|
||||
log_success "Auto-cleanup service installed for macOS"
|
||||
log_info "Logs will be in /tmp/fetchml-cleanup.log"
|
||||
|
||||
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
|
||||
log_info "Detected Linux - setting up systemd timer"
|
||||
|
||||
# Copy service files
|
||||
sudo cp "$SCRIPT_DIR/auto-cleanup.service" /etc/systemd/system/
|
||||
sudo cp "$SCRIPT_DIR/auto-cleanup.timer" /etc/systemd/system/
|
||||
|
||||
# Reload systemd and enable timer
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable auto-cleanup.timer
|
||||
sudo systemctl start auto-cleanup.timer
|
||||
|
||||
log_success "Auto-cleanup service installed for Linux"
|
||||
log_info "Check status with: systemctl status auto-cleanup.timer"
|
||||
|
||||
else
|
||||
echo "Unsupported OS: $OSTYPE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_info "Auto-cleanup will run daily"
|
||||
log_info "To uninstall:"
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
echo " launchctl unload ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
|
||||
echo " rm ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
|
||||
else
|
||||
echo " sudo systemctl stop auto-cleanup.timer"
|
||||
echo " sudo systemctl disable auto-cleanup.timer"
|
||||
echo " sudo rm /etc/systemd/system/auto-cleanup.*"
|
||||
fi
|
||||
|
|
@ -1,275 +0,0 @@
|
|||
#!/bin/bash
# Production Monitoring Stack Setup for Linux
# Deploys Prometheus/Grafana/Loki/Promtail as Podman containers with systemd
# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc.

# Abort on the first failing command.
set -e

# ANSI color codes used by all status output below.
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[0;33m'
NC='\033[0m'  # reset

echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n"
||||
# Detect Linux distribution and package manager
|
||||
# Populate DISTRO, DISTRO_VERSION and PKG_MANAGER for the current host.
# DISTRO comes from /etc/os-release (fallback: the RedHat marker file);
# PKG_MANAGER is the first known package manager found on PATH.
detect_distro() {
    if [ -f /etc/os-release ]; then
        . /etc/os-release
        DISTRO=$ID
        DISTRO_VERSION=$VERSION_ID
    elif [ -f /etc/redhat-release ]; then
        DISTRO="rhel"
    else
        DISTRO="unknown"
    fi

    # Detect package manager: probe candidates in priority order.
    PKG_MANAGER="unknown"
    for candidate in dnf yum apt-get pacman zypper; do
        if command -v "$candidate" &>/dev/null; then
            PKG_MANAGER=$candidate
            break
        fi
    done
    # The rest of the tooling refers to apt-get by its family name.
    if [ "$PKG_MANAGER" = "apt-get" ]; then
        PKG_MANAGER="apt"
    fi
    if [ "$PKG_MANAGER" = "unknown" ]; then
        echo -e "${YELLOW}Warning: No known package manager found${NC}"
    fi

    echo "Detected distribution: $DISTRO (using $PKG_MANAGER)"
}
|
||||
|
||||
# Run the distro/package-manager detection defined above.
detect_distro

# Configuration (positional overrides: data path, runtime user, runtime group).
DATA_PATH="${1:-/data/monitoring}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"

echo "Configuration:"
echo " Monitoring data path: $DATA_PATH"
echo " User: $ML_USER"
echo " Group: $ML_GROUP"
echo ""

# Create pod for monitoring stack
POD_NAME="monitoring"

# 1. Create directories
echo -e "${BLUE}[1/6]${NC} Creating directory structure..."
sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config}
sudo mkdir -p /etc/fetch_ml/monitoring
sudo mkdir -p /var/lib/grafana/dashboards

# Quote expansions so unusual user/group/path values survive word splitting.
sudo chown -R "$ML_USER:$ML_GROUP" "$DATA_PATH"
sudo chmod 755 "$DATA_PATH"

echo -e "${GREEN}✓${NC} Directories created"

# 2. Copy configuration files
echo -e "${BLUE}[2/6]${NC} Copying configuration files..."
sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/
sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/
# Fix: options must precede operands for portable cp usage (was "cp SRC DST -r").
sudo cp -r monitoring/grafana/provisioning /etc/fetch_ml/monitoring/
sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json
sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json

sudo chown -R "$ML_USER:$ML_GROUP" /etc/fetch_ml/monitoring
sudo chown -R "$ML_USER:$ML_GROUP" /var/lib/grafana

echo -e "${GREEN}✓${NC} Configuration copied"

# 3. Create Podman pod
# Fix: outside a heredoc a line continuation is a single backslash; "\\" is an
# escaped literal backslash, which terminated the command on the first line and
# made each following option line execute as its own (failing) command.
echo -e "${BLUE}[3/6]${NC} Creating Podman pod..."
sudo -u "$ML_USER" podman pod create \
    --name "$POD_NAME" \
    -p 3000:3000 \
    -p 9090:9090 \
    -p 3100:3100 \
    || echo "Pod may already exist"

echo -e "${GREEN}✓${NC} Pod created"
|
||||
|
||||
# 4. Create systemd service for monitoring pod
# NOTE: each heredoc delimiter below is unquoted, so $VARS and $(...) expand
# while the unit file is generated, and "\\" collapses to a single "\"
# (systemd's line-continuation character) in the written file.
echo -e "${BLUE}[4/6]${NC} Creating systemd services..."

# Prometheus service
sudo tee /etc/systemd/system/prometheus.service >/dev/null <<EOF
[Unit]
Description=Prometheus Monitoring
After=network.target
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 9090:9090
ExecStart=/usr/bin/podman run --rm --name prometheus \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro \\
    -v ${DATA_PATH}/prometheus:/prometheus \\
    docker.io/prom/prometheus:latest \\
    --config.file=/etc/prometheus/prometheus.yml \\
    --storage.tsdb.path=/prometheus \\
    --web.enable-lifecycle

ExecStop=/usr/bin/podman stop -t 10 prometheus

[Install]
WantedBy=multi-user.target
EOF

# Loki service
sudo tee /etc/systemd/system/loki.service >/dev/null <<EOF
[Unit]
Description=Loki Log Aggregation
After=network.target
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 3100:3100
ExecStart=/usr/bin/podman run --rm --name loki \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro \\
    -v ${DATA_PATH}/loki:/loki \\
    docker.io/grafana/loki:latest \\
    -config.file=/etc/loki/local-config.yaml

ExecStop=/usr/bin/podman stop -t 10 loki

[Install]
WantedBy=multi-user.target
EOF

# Grafana service
# NOTE(review): in the GF_SECURITY_ADMIN_PASSWORD line below, "\$" defers the
# outer expansion but the inner $(openssl rand ...) still runs at install time,
# baking a one-off random default into the unit file; systemd does not perform
# shell-style ${VAR:-default} expansion in Environment args — confirm the
# intended fallback behavior.
sudo tee /etc/systemd/system/grafana.service >/dev/null <<EOF
[Unit]
Description=Grafana Visualization
After=network.target prometheus.service loki.service
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 3000:3000
ExecStart=/usr/bin/podman run --rm --name grafana \\
    --pod $POD_NAME \\
    -v ${DATA_PATH}/grafana:/var/lib/grafana \\
    -v /etc/fetch_ml/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro \\
    -v /var/lib/grafana/dashboards:/var/lib/grafana/dashboards:ro \\
    -e GF_SECURITY_ADMIN_PASSWORD=\${GRAFANA_ADMIN_PASSWORD:-$(openssl rand -base64 32)} \\
    -e GF_USERS_ALLOW_SIGN_UP=false \\
    -e GF_AUTH_ANONYMOUS_ENABLED=false \\
    docker.io/grafana/grafana:latest

ExecStop=/usr/bin/podman stop -t 10 grafana

[Install]
WantedBy=multi-user.target
EOF

# Promtail service (ships /var/log/fetch_ml into Loki)
sudo tee /etc/systemd/system/promtail.service >/dev/null <<EOF
[Unit]
Description=Promtail Log Collector
After=network.target loki.service
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME
ExecStart=/usr/bin/podman run --rm --name promtail \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/promtail-config.yml:/etc/promtail/config.yml:ro \\
    -v /var/log/fetch_ml:/var/log/app:ro \\
    docker.io/grafana/promtail:latest \\
    -config.file=/etc/promtail/config.yml

ExecStop=/usr/bin/podman stop -t 10 promtail

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"
|
||||
|
||||
# 5. Create monitoring pod service
echo -e "${BLUE}[5/6]${NC} Creating pod management service..."
# Fix: outside a heredoc the continuation must be a single backslash; "\\"
# escaped the backslash itself, terminating the command before the pipeline.
sudo -u "$ML_USER" podman generate systemd --new --name "$POD_NAME" \
    | sudo tee /etc/systemd/system/$POD_NAME-pod.service >/dev/null

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Pod service created"

# 6. Setup firewall rules
echo -e "${BLUE}[6/6]${NC} Configuring firewall..."
if command -v firewall-cmd &>/dev/null; then
    # RHEL/Rocky/Fedora (firewalld)
    sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana
    sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus
    sudo firewall-cmd --reload
    echo -e "${GREEN}✓${NC} Firewall configured (firewalld)"
elif command -v ufw &>/dev/null; then
    # Ubuntu/Debian (ufw)
    sudo ufw allow 3000/tcp comment 'Grafana'
    sudo ufw allow 9090/tcp comment 'Prometheus'
    echo -e "${GREEN}✓${NC} Firewall configured (ufw)"
else
    echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090"
fi

# Summary
echo ""
echo -e "${BOLD}=== Monitoring Stack Setup Complete! ===${NC}"
echo ""
echo "Services created:"
echo " - prometheus.service (Metrics collection)"
echo " - loki.service (Log aggregation)"
echo " - grafana.service (Visualization)"
echo " - promtail.service (Log shipping)"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Start services:"
echo " sudo systemctl start prometheus"
echo " sudo systemctl start loki"
echo " sudo systemctl start promtail"
echo " sudo systemctl start grafana"
echo ""
echo "2. Enable on boot:"
echo " sudo systemctl enable prometheus loki promtail grafana"
echo ""
echo "3. Access Grafana:"
echo " http://YOUR_SERVER_IP:3000"
echo " Username: admin"
echo " Password: admin (change on first login)"
echo ""
echo "4. Check logs:"
echo " sudo journalctl -u prometheus -f"
echo " sudo journalctl -u grafana -f"
echo ""
|
||||
|
|
@ -1,229 +0,0 @@
|
|||
#!/bin/bash
# Production Setup Script for Rocky Linux (Bare Metal)
# This script sets up the complete FetchML environment on bare metal

# Abort on the first failing command.
set -e

# ANSI color codes for status output.
BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'  # reset

echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n"

# Configuration (positional overrides: base path, service user, service group).
BASE_PATH="${1:-/data/ml-experiments}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"

echo "Configuration:"
echo " Base path: $BASE_PATH"
echo " ML user: $ML_USER"
echo " ML group: $ML_GROUP"
echo ""

# 1. Create system user if it doesn't exist
echo -e "${BLUE}[1/8]${NC} Creating system user..."
if id "$ML_USER" &>/dev/null; then
    echo " User $ML_USER already exists"
else
    # -r: system account; -m/-d: create a real home so tooling can run as it.
    sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER
    echo -e "${GREEN}✓${NC} Created user: $ML_USER"
fi

# 2. Create directory structure
echo -e "${BLUE}[2/8]${NC} Creating directory structure..."
sudo mkdir -p "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets}
sudo mkdir -p /var/log/fetch_ml
sudo mkdir -p /etc/fetch_ml

echo -e "${GREEN}✓${NC} Created directories:"
echo " $BASE_PATH/experiments/"
echo " $BASE_PATH/pending/"
echo " $BASE_PATH/running/"
echo " $BASE_PATH/finished/"
echo " $BASE_PATH/failed/"
echo " $BASE_PATH/datasets/"
echo " /var/log/fetch_ml/"
echo " /etc/fetch_ml/"

# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."
sudo chown -R $ML_USER:$ML_GROUP $BASE_PATH
sudo chmod 755 $BASE_PATH
sudo chmod 700 $BASE_PATH/experiments # Restrict experiment data

sudo chown -R $ML_USER:$ML_GROUP /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml

echo -e "${GREEN}✓${NC} Permissions set"

# 4. Install system dependencies (Rocky Linux)
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."
sudo dnf install -y \
    golang \
    podman \
    redis \
    git \
    make \
    gcc \
    || echo "Some packages may already be installed"

echo -e "${GREEN}✓${NC} Dependencies installed"

# 5. Configure Podman for GPU access (if NVIDIA GPU present)
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if lspci | grep -i nvidia &>/dev/null; then
    echo " NVIDIA GPU detected, configuring GPU access..."

    # Install nvidia-container-toolkit if not present
    if ! command -v nvidia-container-toolkit &>/dev/null; then
        echo " Installing nvidia-container-toolkit..."
        sudo dnf config-manager --add-repo \
            https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
        sudo dnf install -y nvidia-container-toolkit
    fi

    # Configure Podman CDI (Container Device Interface) spec for the GPUs.
    sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
    echo -e "${GREEN}✓${NC} GPU support configured"
else
    echo " No NVIDIA GPU detected, skipping GPU setup"
fi

# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
sudo systemctl enable redis
sudo systemctl start redis || echo "Redis may already be running"

# Set Redis password if not already configured
# NOTE(review): path assumes /etc/redis/redis.conf; some RHEL-family releases
# ship /etc/redis.conf instead — confirm on the target OS version.
# NOTE(review): the generated password is echoed to stdout (and any captured
# logs) below — confirm that is acceptable for this environment.
if ! sudo grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then
    REDIS_PASSWORD=$(openssl rand -base64 32)
    echo "requirepass $REDIS_PASSWORD" | sudo tee -a /etc/redis/redis.conf >/dev/null
    sudo systemctl restart redis
    echo " Generated Redis password: $REDIS_PASSWORD"
    echo " Save this password for your configuration!"
else
    echo " Redis password already configured"
fi

echo -e "${GREEN}✓${NC} Redis configured"

# 7. Setup systemd services
# The heredoc delimiters are unquoted so $ML_USER/$ML_GROUP/$BASE_PATH expand
# into the generated unit files at install time.
echo -e "${BLUE}[7/8]${NC} Creating systemd services..."

# API Server service
sudo tee /etc/systemd/system/fetchml-api.service >/dev/null <<EOF
[Unit]
Description=FetchML API Server
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-api -config /etc/fetch_ml/config.yaml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/api.log
StandardError=append:/var/log/fetch_ml/api-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF

# Worker service
sudo tee /etc/systemd/system/fetchml-worker.service >/dev/null <<EOF
[Unit]
Description=FetchML Worker
After=network.target redis.service fetchml-api.service
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetch_ml/worker.toml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/worker.log
StandardError=append:/var/log/fetch_ml/worker-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"

# 8. Setup logrotate
echo -e "${BLUE}[8/8]${NC} Configuring log rotation..."
sudo tee /etc/logrotate.d/fetchml >/dev/null <<EOF
/var/log/fetch_ml/*.log {
    daily
    rotate 14
    compress
    delaycompress
    notifempty
    missingok
    create 0640 $ML_USER $ML_GROUP
    sharedscripts
    postrotate
        systemctl reload fetchml-api >/dev/null 2>&1 || true
        systemctl reload fetchml-worker >/dev/null 2>&1 || true
    endscript
}
EOF

echo -e "${GREEN}✓${NC} Log rotation configured"

# Summary
echo ""
echo -e "${BOLD}=== Setup Complete! ===${NC}"
echo ""
echo "Directory structure created at: $BASE_PATH"
echo "Logs will be written to: /var/log/fetch_ml/"
echo "Configuration directory: /etc/fetch_ml/"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Copy your config files:"
echo " sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml"
echo " sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml"
echo ""
echo "2. Build and install binaries:"
echo " make build"
echo " sudo cp bin/api-server /usr/local/bin/fetchml-api"
echo " sudo cp bin/worker /usr/local/bin/fetchml-worker"
echo ""
echo "3. Update config files with your settings (Redis password, API keys, etc.)"
echo ""
echo "4. Start services:"
echo " sudo systemctl start fetchml-api"
echo " sudo systemctl start fetchml-worker"
echo ""
echo "5. Enable services to start on boot:"
echo " sudo systemctl enable fetchml-api"
echo " sudo systemctl enable fetchml-worker"
echo ""
echo "6. Check status:"
echo " sudo systemctl status fetchml-api"
echo " sudo systemctl status fetchml-worker"
echo " sudo journalctl -u fetchml-api -f"
echo ""
|
||||
|
|
@ -1,455 +0,0 @@
|
|||
#!/bin/bash

# Automatic Setup Script for ML Experiment Manager
# Handles complete environment setup with security features

# Strict mode: fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'  # reset

# Leveled status printers: a colored severity tag followed by the message.
print_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
|
||||
|
||||
# Map $OSTYPE onto a coarse platform label printed on stdout:
# "macos" for Darwin, "linux" for GNU/Linux, "unknown" otherwise.
detect_os() {
    case "$OSTYPE" in
        darwin*)
            echo "macos"
            ;;
        linux-gnu*)
            echo "linux"
            ;;
        *)
            echo "unknown"
            ;;
    esac
}
|
||||
|
||||
# Install the Go toolchain: Homebrew on macOS, official tarball on Linux.
# Returns 1 on macOS when Homebrew is missing; other platforms are a no-op.
install_go() {
    print_info "Installing Go..."

    local os=$(detect_os)
    local go_version="1.23.0"  # only used on the Linux tarball path below

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install go
        else
            print_error "Homebrew not found. Please install Go manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Replace any previous installation under /usr/local/go wholesale.
        wget -q "https://go.dev/dl/go${go_version}.linux-amd64.tar.gz"
        sudo rm -rf /usr/local/go
        sudo tar -C /usr/local -xzf "go${go_version}.linux-amd64.tar.gz"
        rm "go${go_version}.linux-amd64.tar.gz"

        # Add to PATH (persisted for future shells, exported for this one).
        # NOTE(review): appends unconditionally, so repeated runs duplicate the
        # line in ~/.bashrc.
        echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
        export PATH=$PATH:/usr/local/go/bin
    fi

    print_success "Go installed"
}
|
||||
|
||||
# Install the Zig compiler: Homebrew on macOS, upstream binary tarball on Linux.
# Returns 1 on macOS when Homebrew is missing.
install_zig() {
    print_info "Installing Zig..."

    local os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install zig
        else
            print_error "Homebrew not found. Please install Zig manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Download Zig binary
        local zig_version="0.13.0"
        wget -q "https://ziglang.org/download/${zig_version}/zig-linux-x86_64-${zig_version}.tar.xz"
        tar -xf "zig-linux-x86_64-${zig_version}.tar.xz"
        # NOTE(review): only the `zig` executable is kept; the extracted
        # tree's sibling lib/ directory is deleted below — confirm the
        # standalone binary works for this project's builds.
        sudo mv "zig-linux-x86_64-${zig_version}/zig" /usr/local/bin/
        rm -rf "zig-linux-x86_64-${zig_version}.tar.xz" "zig-linux-x86_64-${zig_version}"
    fi

    print_success "Zig installed"
}
|
||||
|
||||
# Install Docker: Desktop cask on macOS, get.docker.com convenience script on
# Linux (plus docker group membership and systemd enablement).
install_docker() {
    print_info "Installing Docker..."

    local os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install --cask docker
            print_warning "Docker Desktop installed. Please start it manually."
        else
            print_error "Homebrew not found. Please install Docker manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Install Docker using official script
        curl -fsSL https://get.docker.com -o get-docker.sh
        sudo sh get-docker.sh
        # Allow the invoking user to run docker without sudo (needs re-login).
        sudo usermod -aG docker $USER
        rm get-docker.sh

        # Start Docker
        sudo systemctl enable docker
        sudo systemctl start docker

        print_success "Docker installed. You may need to log out and log back in."
    fi
}
|
||||
|
||||
# Install and start Redis: Homebrew services on macOS, apt + systemd on Linux.
# Returns 1 on macOS when Homebrew is missing.
install_redis() {
    print_info "Installing Redis..."

    local os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install redis
            brew services start redis
        else
            print_error "Homebrew not found. Please install Redis manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # NOTE(review): assumes a Debian/Ubuntu host (apt-get); dnf/pacman
        # systems supported elsewhere in this repo are not handled here —
        # confirm the intended target hosts.
        sudo apt-get update
        sudo apt-get install -y redis-server
        sudo systemctl enable redis-server
        sudo systemctl start redis-server
    fi

    print_success "Redis installed and started"
}
|
||||
|
||||
# Install auxiliary tooling: openssl/curl/jq (plus build-essential on Linux)
# and, when Go is available, the golangci-lint and goimports Go tools.
install_dependencies() {
    print_info "Installing dependencies..."

    local os=$(detect_os)

    # Install basic tools
    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install openssl curl jq
        fi
    elif [[ "$os" == "linux" ]]; then
        # NOTE(review): apt-get only, same Debian/Ubuntu assumption as
        # install_redis above.
        sudo apt-get update
        sudo apt-get install -y openssl curl jq build-essential
    fi

    # Install Go tools
    if command -v go &> /dev/null; then
        go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
        go install golang.org/x/tools/cmd/goimports@latest
    fi

    print_success "Dependencies installed"
}
|
||||
|
||||
# Prepare the working tree: create the runtime directories, then build every
# binary — via the Makefile when available, otherwise one `go build` per
# cmd/ target (and the Zig CLI when a Zig toolchain is present).
setup_project() {
    print_info "Setting up project..."

    # Create directories
    for dir in bin data logs db ssl configs scripts; do
        mkdir -p "$dir"
    done

    # Build project
    if command -v make &> /dev/null; then
        make build
        if command -v zig &> /dev/null; then
            make cli-build
        fi
    else
        print_warning "Make not found, building manually..."
        for target in worker tui data_manager user_manager api-server; do
            go build -o "bin/${target}" "./cmd/${target}"
        done

        if command -v zig &> /dev/null; then
            cd cli && zig build && cd ..
        fi
    fi

    print_success "Project setup completed"
}
|
||||
|
||||
# Generate development security material: a self-signed TLS certificate pair
# under ssl/, a secured API config (configs/security-config.yaml), and a
# .env.dev with generated Redis/JWT/Grafana secrets.
setup_security() {
    print_info "Setting up security features..."

    # Generate SSL certificates
    # Self-signed, 1-year, localhost-only; failure is non-fatal (warning only).
    if command -v openssl &> /dev/null; then
        openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
            print_warning "Failed to generate SSL certificates"
        }
        # NOTE(review): this success message prints even when the || branch
        # above fired — the two are not mutually exclusive.
        print_success "SSL certificates generated"
    fi

    # Generate secure configuration (hard-coded dev fallbacks if openssl fails).
    local redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
    local jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")

    # Unquoted heredoc: $(...) and ${...} below expand at generation time, so
    # the API-key hash and secrets are baked into the written YAML.
    cat > configs/security-config.yaml << EOF
base_path: "/data/ml-experiments"

auth:
  enabled: true
  api_keys:
    test_user:
      hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
      admin: true
      roles: ["data_scientist", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
    min_version: "1.3"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
    ip_whitelist:
      - "127.0.0.1"
      - "::1"
      - "10.0.0.0/8"
      - "192.168.0.0/16"
      - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 5
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"
  password: "${redis_password}"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  audit_log: "logs/audit.log"
EOF

    cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF

    print_success "Security configuration created"
}
|
||||
|
||||
# Smoke-test the installation. Counts pass/total per tool; optional tools
# (Zig, Docker, Redis) that are entirely absent are removed from the
# denominator (total is decremented after having been incremented), so they
# do not drag down the reported success rate.
test_installation() {
    print_info "Testing installation..."

    local tests_passed=0
    local tests_total=0

    # Test Go (required)
    tests_total=$((tests_total + 1))
    if command -v go &> /dev/null; then
        print_success "Go: Installed"
        tests_passed=$((tests_passed + 1))
    else
        print_error "Go: Not found"
    fi

    # Test Zig (optional: absence excluded from the total)
    tests_total=$((tests_total + 1))
    if command -v zig &> /dev/null; then
        print_success "Zig: Installed"
        tests_passed=$((tests_passed + 1))
    else
        print_warning "Zig: Not found (optional)"
        tests_total=$((tests_total - 1))
    fi

    # Test Docker (optional)
    tests_total=$((tests_total + 1))
    if command -v docker &> /dev/null; then
        print_success "Docker: Installed"
        tests_passed=$((tests_passed + 1))
    else
        print_warning "Docker: Not found (optional)"
        tests_total=$((tests_total - 1))
    fi

    # Test Redis (optional when absent; counted but failed when installed
    # and not answering PONG)
    tests_total=$((tests_total + 1))
    if command -v redis-cli &> /dev/null; then
        if redis-cli ping | grep -q "PONG"; then
            print_success "Redis: Running"
            tests_passed=$((tests_passed + 1))
        else
            print_warning "Redis: Not running"
        fi
    else
        print_warning "Redis: Not found (optional)"
        tests_total=$((tests_total - 1))
    fi

    # Test binaries (only when the api-server binary was built)
    if [[ -f "bin/api-server" ]]; then
        tests_total=$((tests_total + 1))
        if ./bin/api-server --help > /dev/null 2>&1; then
            print_success "API Server: Built"
            tests_passed=$((tests_passed + 1))
        else
            print_error "API Server: Build failed"
        fi
    fi

    # Report an integer success percentage; guard against division by zero.
    if [[ $tests_total -gt 0 ]]; then
        local success_rate=$((tests_passed * 100 / tests_total))
        print_info "Tests: $tests_passed/$tests_total passed ($success_rate%)"
    fi

    print_success "Installation testing completed"
}
|
||||
|
||||
# Print the post-setup usage guide (pure output; no side effects).
show_next_steps() {
    print_success "Automatic setup completed!"
    echo
    echo "Next Steps:"
    echo "==========="
    echo ""
    echo "1. Load environment variables:"
    echo " source .env.dev"
    echo ""
    echo "2. Start the API server:"
    echo " ./bin/api-server -config configs/config.yaml"
    echo ""
    echo "3. Test the Zig CLI (if installed):"
    echo " ./cli/zig-out/bin/ml --help"
    echo ""
    echo "4. Deploy with Docker (optional):"
    echo " make docker-run"
    echo ""
    echo "5. Docker Compose deployment:"
    echo " docker-compose up -d"
    echo ""
    echo "Configuration Files:"
    echo " configs/config.yaml # Main configuration"
    echo " configs/config_local.yaml # Local development"
    echo " ssl/cert.pem, ssl/key.pem # TLS certificates"
    echo ""
    echo "Documentation:"
    echo " docs/DEPLOYMENT.md # Deployment guide"
    echo ""
    echo "Quick Commands:"
    echo " make help # Show all commands"
    echo " make test # Run tests"
    echo " docker-compose up -d # Start services"
    echo ""
    print_success "Ready to use ML Experiment Manager!"
}
|
||||
|
||||
# Main setup function
|
||||
# Entry point for the full setup: installs each missing toolchain piece,
# then installs auxiliary dependencies, builds the project, generates the
# security material, smoke-tests the result, and prints the usage guide.
main() {
    echo "ML Experiment Manager Automatic Setup"
    echo "====================================="
    echo ""

    print_info "Starting automatic setup..."
    echo ""

    # Check and install dependencies (each installer runs only when the
    # corresponding command is absent from PATH).
    command -v go &> /dev/null || { print_info "Go not found, installing..."; install_go; }
    command -v zig &> /dev/null || { print_info "Zig not found, installing..."; install_zig; }
    command -v docker &> /dev/null || { print_info "Docker not found, installing..."; install_docker; }
    command -v redis-cli &> /dev/null || { print_info "Redis not found, installing..."; install_redis; }

    # Install additional dependencies
    install_dependencies

    # Setup project
    setup_project

    # Setup security
    setup_security

    # Test installation
    test_installation

    # Show next steps
    show_next_steps
}
|
||||
|
||||
# Handle command line arguments
# Dispatch on the first argument (defaulting to the full "setup" run);
# unknown commands exit non-zero after pointing at the help text.
case "${1:-setup}" in
    "setup")
        main
        ;;
    "deps")
        install_dependencies
        ;;
    "test")
        test_installation
        ;;
    "help"|"-h"|"--help")
        echo "Automatic Setup Script"
        echo "Usage: $0 {setup|deps|test|help}"
        echo ""
        echo "Commands:"
        echo " setup - Run full automatic setup"
        echo " deps - Install dependencies only"
        echo " test - Test installation"
        echo " help - Show this help"
        ;;
    *)
        print_error "Unknown command: $1"
        echo "Use '$0 help' for usage information"
        exit 1
        ;;
esac
|
||||
|
|
@ -1,314 +0,0 @@
|
|||
#!/usr/bin/env bash

# Fetch ML Quick Start Script with Security
# Sets up development environment with security features and creates test user

set -euo pipefail

# ANSI color codes consumed by the print_* helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
||||
|
||||
# Print an informational message with a colored [INFO] prefix.
print_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
|
||||
|
||||
# Print a success message with a colored [SUCCESS] prefix.
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
|
||||
|
||||
# Print a warning message with a colored [WARNING] prefix.
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
|
||||
|
||||
# Print an error message with a colored [ERROR] prefix.
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||
|
||||
# Verify the tooling the quick start relies on. Go is mandatory; the
# remaining tools only produce warnings when absent.
check_prerequisites() {
    print_info "Checking prerequisites..."

    # Go is the only hard requirement.
    if ! command -v go &> /dev/null; then
        print_error "Go is not installed. Please install Go 1.25 or later."
        exit 1
    fi
    local go_version
    go_version=$(go version | awk '{print $3}' | sed 's/go//')
    print_info "Go version: $go_version"

    # Optional: Zig (CLI), Docker (containers), Redis, OpenSSL (TLS).
    if command -v zig &> /dev/null; then
        local zig_version
        zig_version=$(zig version)
        print_info "Zig version: $zig_version"
    else
        print_warning "Zig is not installed. CLI features will not be available."
    fi

    if ! command -v docker &> /dev/null; then
        print_warning "Docker is not installed. Container features will not work."
    fi

    if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then
        print_warning "Redis is not installed. Starting local Redis..."
    fi

    if ! command -v openssl &> /dev/null; then
        print_warning "OpenSSL is not installed. TLS certificates will not be generated."
    fi

    print_success "Prerequisites checked"
}
|
||||
|
||||
# Create the working directory skeleton used by the rest of the setup.
setup_project() {
    print_info "Setting up Fetch ML project..."

    # mkdir -p is idempotent, so a rerun is harmless.
    mkdir -p bin data logs db ssl configs

    print_success "Project directories created"
}
|
||||
|
||||
# Compile the Go services and, when a Zig toolchain exists, the CLI.
build_project() {
    print_info "Building Fetch ML..."

    # Build Go binaries
    make build

    # Build Zig CLI if available
    if command -v zig &> /dev/null; then
        make cli-build
        print_success "Zig CLI built"
    fi

    print_success "Build completed"
}
|
||||
|
||||
# Generate a self-signed localhost certificate pair under ssl/ for
# development. Skipped (with a warning) when openssl is unavailable.
generate_ssl_certificates() {
    print_info "Generating SSL certificates..."

    if ! command -v openssl &> /dev/null; then
        print_warning "OpenSSL not available, skipping SSL certificates"
        return
    fi

    # Self-signed, valid one year, with a SAN for localhost/127.0.0.1.
    openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
        -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
        -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
        print_warning "Failed to generate SSL certificates"
        return 1
    }

    print_success "SSL certificates generated in ssl/"
    print_info "Certificates are self-signed (development only)"
}
|
||||
|
||||
# Start a local Redis daemon on the default port when one is installed
# and not already running.
setup_redis() {
    print_info "Setting up Redis..."

    if ! command -v redis-server &> /dev/null; then
        print_warning "Redis not available, some features may be limited"
        return
    fi

    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
    else
        redis-server --daemonize yes --port 6379
        print_success "Redis started"
    fi
}
|
||||
|
||||
# Write configs/config.yaml and .env.dev with freshly generated secrets,
# falling back to fixed development values when openssl is missing.
# NOTE(review): the redis password written here is not applied to the
# Redis instance started by setup_redis — confirm before relying on it.
create_secure_config() {
    print_info "Creating secure development configuration..."

    # Generate secure passwords and secrets
    local redis_password jwt_secret
    redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
    jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")

    # Create development config
    cat > configs/config.yaml << EOF
base_path: "/data/ml-experiments"

auth:
  enabled: true
  api_keys:
    test_user:
      hash: "$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)"
      admin: true
      roles: ["data_scientist", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
    min_version: "1.3"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
    ip_whitelist:
      - "127.0.0.1"
      - "::1"
      - "10.0.0.0/8"
      - "192.168.0.0/16"
      - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 5
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"
  password: "${redis_password}"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  audit_log: "logs/audit.log"
EOF

    # Create environment file
    cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF

    print_success "Secure configuration created"
    print_warning "Using development certificates and passwords"
}
|
||||
|
||||
# Print the credentials of the built-in development user. The key hash
# here must match the one embedded in configs/config.yaml.
create_test_user() {
    print_info "Creating test user..."

    # Generate API key for test user
    local api_key="dev_test_api_key_12345"
    local api_key_hash
    api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1)

    print_success "Test user created successfully"
    echo "Username: test_user"
    echo "API Key: $api_key"
    echo "API Key Hash: $api_key_hash"
    echo "Store this key safely!"
    echo ""
    echo "Environment variables in .env.dev"
    echo "Run: source .env.dev"
}
|
||||
|
||||
# Smoke-test the build artifacts, Redis connectivity, and the TLS
# certificates. Every probe is best-effort: failures warn, never abort.
test_setup() {
    print_info "Testing setup..."

    # Go binaries: just confirm they execute at all.
    if [[ -f "bin/api-server" ]]; then
        ./bin/api-server --help > /dev/null 2>&1 || true
        print_success "API server binary OK"
    fi
    if [[ -f "bin/worker" ]]; then
        ./bin/worker --help > /dev/null 2>&1 || true
        print_success "Worker binary OK"
    fi

    # Zig CLI
    if [[ -f "cli/zig-out/bin/ml" ]]; then
        ./cli/zig-out/bin/ml --help > /dev/null 2>&1 || true
        print_success "Zig CLI binary OK"
    fi

    # Redis connectivity
    if command -v redis-cli &> /dev/null; then
        if redis-cli ping > /dev/null 2>&1; then
            print_success "Redis connection OK"
        else
            print_warning "Redis not responding"
        fi
    fi

    # TLS certificates: -checkend 86400 = still valid for 24h.
    if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then
        if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then
            print_success "SSL certificates valid"
        else
            print_warning "SSL certificates expired or invalid"
        fi
    fi
}
|
||||
|
||||
# Print the post-setup checklist. The body is a quoted heredoc so the
# text is emitted verbatim, with no shell expansion.
show_next_steps() {
    print_success "Secure quick start completed!"
    cat <<'EOF'

Next steps:
1. Load environment variables:
 source .env.dev

2. Start API server:
 ./bin/api-server -config configs/config.yaml

3. Test Zig CLI:
 ./cli/zig-out/bin/ml --help

4. Test with curl (HTTPS):
 curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health

5. Deploy with Docker:
 docker-compose up -d

Features Enabled:
 ✅ HTTPS/TLS encryption
 ✅ API key authentication
 ✅ Rate limiting
 ✅ IP whitelisting
 ✅ Security headers
 ✅ Audit logging

Configuration Files:
 configs/config.yaml # Main configuration
 .env.dev # Environment variables
 ssl/cert.pem, ssl/key.pem # TLS certificates

Documentation:
 docs/DEPLOYMENT.md # Deployment guide

EOF
    print_success "Ready to run ML experiments!"
}
|
||||
|
||||
# Main function: run the whole quick start in order — prerequisites,
# build, TLS, Redis, config, test user, smoke test, next steps.
main() {
    echo "Fetch ML Quick Start Script (with Security & Zig CLI)"
    echo "===================================================="
    echo ""

    check_prerequisites
    setup_project
    build_project
    generate_ssl_certificates
    setup_redis
    create_secure_config
    create_test_user
    test_setup
    show_next_steps
}

# Run main function
main "$@"
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
#!/usr/bin/env bash

# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky)
set -euo pipefail

# ANSI color codes for the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration defaults shared by all platform setup scripts.
FETCH_ML_USER="fetchml"
FETCH_ML_HOME="/opt/fetchml"
SERVICE_DIR="/etc/systemd/system"
LOG_DIR="/var/log/fetchml"
DATA_DIR="/var/lib/fetchml"
CONFIG_DIR="$FETCH_ML_HOME/configs"
|
||||
|
||||
# Leveled, colorized log helpers shared by every setup script.
log_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
log_error()   { echo -e "${RED}[ERROR]${NC} $1"; }
|
||||
|
||||
# Download file with checksum verification
|
||||
# Args: url, checksum, dest
|
||||
# Download a file over HTTPS and verify its SHA-256 checksum.
# Args: $1 url, $2 expected sha256 hex digest, $3 destination path.
# On mismatch the partial file is removed and the script exits.
secure_download() {
    local url="$1" checksum="$2" dest="$3"
    curl -fsSL "$url" -o "$dest"
    # BUG FIX: GNU `sha256sum --check` requires TWO spaces (or " *")
    # between the digest and the filename. With a single space the line
    # is rejected as improperly formatted, so every download failed
    # verification even when the checksum was correct.
    echo "$checksum  $dest" | sha256sum --check --status || {
        log_error "Checksum verification failed for $dest"
        rm -f "$dest"
        exit 1
    }
}
|
||||
|
||||
# Remove any temp files registered in TMP_FILES (space-separated list).
# Installed as an EXIT trap so downloads are cleaned up on any exit path.
cleanup_temp() {
    if [[ -n "${TMP_FILES:-}" ]]; then
        # Intentionally unquoted: TMP_FILES may hold several paths.
        rm -f $TMP_FILES || true
    fi
}
trap cleanup_temp EXIT
|
||||
|
||||
# Create the service account if missing and add it to the podman group.
# Idempotent; the usermod is best-effort (group may not exist yet).
ensure_user() {
    id "$FETCH_ML_USER" &>/dev/null || \
        useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
    usermod -aG podman "$FETCH_ML_USER" || true
}
|
||||
|
||||
# Create the runtime directory tree and hand ownership to the service user.
create_directories() {
    mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" "$FETCH_ML_HOME/bin" "$CONFIG_DIR"
    chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"
}
|
||||
|
||||
# Render and install a systemd unit for one Fetch ML service.
# Args: $1 service name (unit file becomes <name>.service), $2 ExecStart line.
# ${name^} capitalizes the first letter for the unit Description.
setup_systemd_service() {
    local name="$1" exec="$2"
    cat > "$SERVICE_DIR/${name}.service" <<EOF
[Unit]
Description=Fetch ML ${name^} Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$exec
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_${name}

[Install]
WantedBy=multi-user.target
EOF
}
|
||||
|
||||
# Install the logrotate policy for the shared Fetch ML log directory
# (quoted heredoc: written verbatim, no expansion).
setup_logrotate() {
    cat > /etc/logrotate.d/fetch_ml <<'EOF'
/var/log/fetchml/*.log {
    daily
    missingok
    rotate 14
    compress
    delaycompress
    notifempty
    create 0640 fetchml fetchml
}
EOF
}
|
||||
|
||||
# Raise file-descriptor limits for the service user (guarded so reruns
# do not duplicate lines) and enable automatic security updates where
# the distribution supports them.
hardening_steps() {
    if ! grep -q fetchml /etc/security/limits.conf; then
        cat >> /etc/security/limits.conf <<'EOF'
fetchml soft nofile 65536
fetchml hard nofile 65536
EOF
    fi

    # Enable unattended security upgrades if available
    if command -v apt-get &>/dev/null; then
        apt-get install -y unattended-upgrades >/dev/null || true
    elif command -v dnf &>/dev/null; then
        dnf install -y dnf-automatic >/dev/null || true
    fi
}
|
||||
|
||||
# Print advice for running under SELinux. Purely informational; makes
# no changes to the system.
selinux_guidance() {
    command -v getenforce &>/dev/null || return 0
    local mode
    mode=$(getenforce)
    log_info "SELinux mode: $mode"
    if [[ "$mode" == "Enforcing" ]]; then
        log_info "Ensure systemd units and directories have proper contexts. Example:"
        echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'"
        echo " restorecon -Rv $FETCH_ML_HOME/bin"
    fi
}
|
||||
|
|
@ -1,417 +0,0 @@
|
|||
#!/usr/bin/env bash

# Fetch ML Rocky Linux Setup Script
# Optimized for ML experiments on Rocky Linux 8/9

set -euo pipefail

# Pull in the shared log_*, download, user, and unit helpers.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
|
||||
|
||||
# Abort unless running with root privileges.
check_root() {
    if [[ $EUID -ne 0 ]]; then
        log_error "This script must be run as root"
        exit 1
    fi
}
|
||||
|
||||
# Abort unless this is a dnf/yum based system; report the Rocky release
# and select the package manager. Sets the global PKG_MANAGER used by
# every later install step (dnf preferred over yum).
check_rocky() {
    if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
        log_error "This script is designed for Rocky Linux systems"
        exit 1
    fi

    # FIX: /etc/rocky-release does not exist on other RHEL derivatives
    # that still pass the dnf/yum check; guard the read so the version
    # report degrades gracefully instead of spewing a cat error.
    local rocky_version="unknown"
    if [[ -r /etc/rocky-release ]]; then
        rocky_version=$(grep -oE '[0-9]+\.[0-9]+' /etc/rocky-release | head -1)
    fi
    log_info "Rocky Linux version: $rocky_version"

    # Use dnf for Rocky 9+, yum for Rocky 8
    if command -v dnf &> /dev/null; then
        PKG_MANAGER="dnf"
    else
        PKG_MANAGER="yum"
    fi
}
|
||||
|
||||
# Refresh/upgrade packages and install the basic download tooling.
update_system() {
    log_info "Updating system packages..."
    $PKG_MANAGER update -y
    $PKG_MANAGER upgrade -y
    $PKG_MANAGER install -y curl wget gnupg2
}
|
||||
|
||||
# Enable the EPEL repository plus the builder repo many EPEL packages
# depend on. Idempotent: returns early when EPEL is already configured.
enable_epel() {
    log_info "Enabling EPEL repository..."

    if $PKG_MANAGER repolist | grep -q "epel"; then
        log_info "EPEL already enabled"
        return
    fi

    $PKG_MANAGER install -y epel-release
    # FIX: the repo is named "powertools" on Rocky 8 but "crb" on
    # Rocky 9; hard-coding powertools broke the script on Rocky 9.
    $PKG_MANAGER config-manager --set-enabled crb 2>/dev/null || \
        $PKG_MANAGER config-manager --set-enabled powertools

    log_success "EPEL repository enabled"
}
|
||||
|
||||
# Install Go 1.25 from the official tarball (checksum-verified) unless
# a Go toolchain is already on PATH.
install_go() {
    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        local go_version
        go_version=$(go version | awk '{print $3}' | sed 's/go//')
        log_info "Go already installed: $go_version"
        return
    fi

    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    cd /tmp
    TMP_FILES="$tarball" # registered for the EXIT cleanup trap
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
        "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
        "$tarball"
    tar -C /usr/local -xzf "$tarball"

    # Make the toolchain visible to login shells and to this process.
    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
||||
|
||||
# Install Podman with netavark/crun configuration and enable rootless
# user namespaces. Skipped when podman is already installed.
install_podman() {
    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    # Install Podman and related tools
    $PKG_MANAGER install -y podman podman-compose containernetworking-plugins

    # Configure Podman
    mkdir -p /etc/containers
    cat > /etc/containers/containers.conf << EOF
[containers]
user_namespace_enable = 1
runtime = "crun"

[network]
network_backend = "netavark"

[engine]
cgroup_manager = "systemd"
EOF

    # Enable user namespaces: persist (only once, so reruns do not
    # duplicate the line) and apply immediately.
    # FIX: the original ran `sysctl -p user.max_user_namespaces=15000`,
    # but -p treats its argument as a *file name* and always fails;
    # -w is the flag that sets a key at runtime.
    if ! grep -q '^user.max_user_namespaces=' /etc/sysctl.conf; then
        echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf
    fi
    sysctl -w user.max_user_namespaces=15000

    log_success "Podman installed"
}
|
||||
|
||||
# Install Redis from the distro repos, bind it to loopback only, and
# run it under systemd. Skipped when redis-server already exists.
install_redis() {
    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    $PKG_MANAGER install -y redis

    # systemd supervision + loopback-only bind (Rocky keeps the config
    # at /etc/redis.conf).
    sed -i 's/supervised no/supervised systemd/' /etc/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis.conf

    systemctl enable redis
    systemctl start redis

    log_success "Redis installed and configured"
}
|
||||
|
||||
# Detect an NVIDIA GPU and, when present, install the driver and CUDA
# toolkit from NVIDIA's RHEL repository. A reboot is required before
# the freshly installed driver is usable.
install_nvidia_drivers() {
    log_info "Checking for NVIDIA GPU..."

    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    if ! lspci | grep -i nvidia &> /dev/null; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Enable NVIDIA repository (rpm -E %rhel expands the major version).
    $PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo

    # Clean and install
    $PKG_MANAGER clean all
    $PKG_MANAGER module enable -y nvidia-driver:latest-dkms
    $PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit

    # Probe GPU access through Podman (only works before reboot on some
    # systems, hence best-effort).
    if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
        log_success "NVIDIA drivers installed and GPU access verified"
    else
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    fi

    # Reboot required
    log_warning "System reboot required for NVIDIA drivers"
    log_info "Run: reboot"
}
|
||||
|
||||
# Install Python, the system libraries ML builds commonly need, and a
# baseline set of Python packages (CPU-only PyTorch wheels).
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."

    # Python and ML packages
    $PKG_MANAGER install -y python3 python3-pip python3-devel

    # System dependencies for ML
    $PKG_MANAGER groupinstall -y "Development Tools"
    $PKG_MANAGER install -y cmake git pkgconfig
    $PKG_MANAGER install -y libjpeg-turbo-devel libpng-devel libtiff-devel
    $PKG_MANAGER install -y mesa-libGL-devel mesa-libGLU-devel
    $PKG_MANAGER install -y gtk3-devel
    $PKG_MANAGER install -y atlas-devel blas-devel lapack-devel

    # Install common ML libraries
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
||||
|
||||
# Create the fetchml service account plus its home/log/data directories.
# NOTE(review): overlaps with ensure_user/create_directories from
# setup_common.sh; main() calls those helpers, not this function.
create_user() {
    log_info "Creating fetchml user..."

    if id "$FETCH_ML_USER" &>/dev/null; then
        log_info "User $FETCH_ML_USER already exists"
        return
    fi

    useradd -m -d $FETCH_ML_HOME -s /bin/bash $FETCH_ML_USER
    usermod -aG podman $FETCH_ML_USER

    # Create directories
    mkdir -p $FETCH_ML_HOME/.config/containers
    mkdir -p $FETCH_ML_HOME/go/bin
    mkdir -p $LOG_DIR
    mkdir -p $DATA_DIR

    chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME $LOG_DIR $DATA_DIR

    log_success "User $FETCH_ML_USER created"
}
|
||||
|
||||
# Open the service ports via firewalld (skipped when not installed).
# NOTE(review): 6379 is opened although install_redis binds Redis to
# 127.0.0.1 only — confirm the Redis port really needs exposing.
setup_firewall() {
    log_info "Configuring firewall..."

    if ! command -v firewall-cmd &> /dev/null; then
        log_warning "Firewalld not available, skipping firewall configuration"
        return
    fi

    systemctl enable firewalld
    systemctl start firewalld

    firewall-cmd --permanent --add-service=ssh
    firewall-cmd --permanent --add-port=8080/tcp # Worker API
    firewall-cmd --permanent --add-port=8081/tcp # Data manager API
    firewall-cmd --permanent --add-port=6379/tcp # Redis
    firewall-cmd --reload

    firewall-cmd --list-all
}
|
||||
|
||||
# Write and enable the worker and data-manager systemd units. These
# Rocky units differ from the shared template (Restart=always and a
# FETCH_ML_HOME environment entry), so they are written inline here.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    # Fetch ML Worker service
    cat > $SERVICE_DIR/fetch_ml_worker.service << EOF
[Unit]
Description=Fetch ML Worker Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_worker

[Install]
WantedBy=multi-user.target
EOF

    # Fetch ML Data Manager service
    cat > $SERVICE_DIR/fetch_ml_data_manager.service << EOF
[Unit]
Description=Fetch ML Data Manager Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_data_manager

[Install]
WantedBy=multi-user.target
EOF

    # Enable services
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
||||
|
||||
# Write a Rocky-specific logrotate policy for $LOG_DIR.
# NOTE(review): main() calls setup_logrotate (from setup_common.sh)
# instead of this function, so this variant is currently unused.
setup_log_rotation() {
    log_info "Setting up log rotation..."

    cat > /etc/logrotate.d/fetch_ml << EOF
$LOG_DIR/*.log {
    daily
    missingok
    rotate 30
    compress
    delaycompress
    notifempty
    create 0644 $FETCH_ML_USER $FETCH_ML_USER
    postrotate
        systemctl reload fetch_ml_worker || true
        systemctl reload fetch_ml_data_manager || true
    endscript
}
EOF

    log_success "Log rotation configured"
}
|
||||
|
||||
# Tune the host for ML workloads: file-descriptor limits, network/VM
# sysctls, GPU persistence mode. FIX: appends are now guarded so
# re-running the script does not stack duplicate lines into
# /etc/security/limits.conf and /etc/sysctl.conf.
optimize_system() {
    log_info "Optimizing system for ML workloads..."

    # Increase file limits (only once).
    if ! grep -q '^\* soft nofile 65536' /etc/security/limits.conf; then
        echo "* soft nofile 65536" >> /etc/security/limits.conf
        echo "* hard nofile 65536" >> /etc/security/limits.conf
    fi

    # Optimize kernel parameters for ML (only once, keyed on the
    # marker comment).
    if ! grep -q '^# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi

    sysctl -p

    # Configure GPU persistence mode if NVIDIA available
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    # Disable SELinux for better container compatibility (optional)
    if [[ -f /etc/selinux/config ]]; then
        log_warning "Consider setting SELinux to permissive mode for better container compatibility"
        log_info "Edit /etc/selinux/config and set SELINUX=permissive"
    fi

    log_success "System optimized for ML workloads"
}
|
||||
|
||||
# Build Fetch ML from a pre-cloned checkout and install binaries and
# config under $FETCH_ML_HOME. No-op (with a hint) if the repo is absent.
install_fetch_ml() {
    log_info "Installing Fetch ML..."

    cd $FETCH_ML_HOME

    if [[ ! -d "fetch_ml" ]]; then
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml

    # Build
    export PATH=$PATH:/usr/local/go/bin
    make build

    # Copy binaries
    cp bin/* $FETCH_ML_HOME/bin/
    chmod +x $FETCH_ML_HOME/bin/*

    # Copy configs
    mkdir -p $FETCH_ML_HOME/configs
    cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml

    # Set permissions
    chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME

    log_success "Fetch ML installed"
}
|
||||
|
||||
# Main entry point: run every setup phase in dependency order, then
# print the post-install checklist.
main() {
    log_info "Starting Fetch ML Rocky Linux server setup..."

    check_root
    check_rocky

    update_system
    enable_epel
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    selinux_guidance
    install_fetch_ml

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
    echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
    echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
    echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
    echo "5. View logs: journalctl -u fetch_ml_worker -f"
    echo
    log_info "Services will be available at:"
    echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
    echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}

# Run main function
main "$@"
|
||||
|
|
@ -1,294 +0,0 @@
|
|||
#!/usr/bin/env bash

# Fetch ML Ubuntu Server Setup Script
# Optimized for ML experiments on Ubuntu 20.04/22.04

set -euo pipefail

# Pull in the shared log_*, download, user, and unit helpers.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
|
||||
|
||||
# Abort unless running with root privileges.
check_root() {
    if [[ $EUID -ne 0 ]]; then
        log_error "This script must be run as root"
        exit 1
    fi
}
|
||||
|
||||
# Abort unless this is an apt based system; report the Ubuntu release
# and warn (without aborting) on releases older than 20.04.
check_ubuntu() {
    if ! command -v apt-get &> /dev/null; then
        log_error "This script is designed for Ubuntu systems"
        exit 1
    fi

    local ubuntu_version
    ubuntu_version=$(lsb_release -rs)
    log_info "Ubuntu version: $ubuntu_version"

    # FIX: the original piped to `bc`, which is not part of a minimal
    # Ubuntu install, and compared release strings as floats. dpkg's
    # version comparison is always present on apt systems and handles
    # dotted versions correctly.
    if dpkg --compare-versions "$ubuntu_version" lt "20.04"; then
        log_warning "Ubuntu version < 20.04 may not support all features"
    fi
}
|
||||
|
||||
# Refresh/upgrade packages and install the base tooling later steps need.
update_system() {
    log_info "Updating system packages..."
    apt-get update -y
    apt-get upgrade -y
    apt-get install -y curl wget gnupg lsb-release software-properties-common
}
|
||||
|
||||
# Install Go 1.25 from the official tarball (checksum-verified) unless
# a Go toolchain is already on PATH.
install_go() {
    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        local go_version
        go_version=$(go version | awk '{print $3}' | sed 's/go//')
        log_info "Go already installed: $go_version"
        return
    fi

    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    cd /tmp
    TMP_FILES="$tarball" # registered for the EXIT cleanup trap
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
        "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
        "$tarball"
    tar -C /usr/local -xzf "$tarball"

    # Make the toolchain visible to login shells and to this process.
    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
||||
|
||||
# Install Podman from the Kubic repository and enable rootless settings.
# NOTE(review): apt-key is deprecated (removed after Ubuntu 22.04); a
# signed-by keyring should replace it when this repo setup is revisited.
install_podman() {
    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    # Add official Podman repository
    echo "deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list
    curl -L "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_$(lsb_release -rs)/Release.key" | apt-key add -

    apt-get update -y
    apt-get install -y podman podman-compose

    # Configure Podman for rootless operation.
    # FIX: /etc/containers may not exist before podman's first run, in
    # which case the appends below failed under `set -e`.
    mkdir -p /etc/containers
    echo "user_namespace_enable = 1" >> /etc/containers/containers.conf
    echo "runtime = \"crun\"" >> /etc/containers/containers.conf

    log_success "Podman installed"
}
|
||||
|
||||
# Install Redis, bind it to loopback only, and run it under systemd.
# Skipped when redis-server already exists.
install_redis() {
    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    apt-get install -y redis-server

    # systemd supervision + loopback-only bind (Ubuntu keeps the
    # config at /etc/redis/redis.conf).
    sed -i 's/supervised no/supervised systemd/' /etc/redis/redis.conf
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' /etc/redis/redis.conf

    systemctl enable redis-server
    systemctl start redis-server

    log_success "Redis installed and configured"
}
|
||||
|
||||
# Detect an NVIDIA GPU and, when present, install the 535 driver and
# CUDA toolkit via NVIDIA's keyring package (checksum-verified).
install_nvidia_drivers() {
    log_info "Checking for NVIDIA GPU..."

    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    if ! lspci | grep -i nvidia &> /dev/null; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Add NVIDIA repository via the cuda-keyring package.
    TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb"
    secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. -f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb"
    dpkg -i /tmp/cuda-keyring_1.1-1_all.deb
    apt-get update -y

    # Install drivers
    apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit

    # Probe GPU access through Podman (best-effort; a reboot may be
    # needed before the driver is usable).
    if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
        log_success "NVIDIA drivers installed and GPU access verified"
    else
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    fi
}
|
||||
|
||||
# Install Python, the system libraries ML builds commonly need, and a
# baseline set of Python packages (CPU-only PyTorch wheels).
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."

    # Python and ML packages
    apt-get install -y python3 python3-pip python3-venv

    # System dependencies for ML
    apt-get install -y build-essential cmake git pkg-config
    apt-get install -y libjpeg-dev libpng-dev libtiff-dev
    apt-get install -y libavcodec-dev libavformat-dev libswscale-dev
    apt-get install -y libgtk2.0-dev libcanberra-gtk-module
    apt-get install -y libxvidcore-dev libx264-dev
    apt-get install -y libatlas-base-dev gfortran

    # Install common ML libraries
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
||||
|
||||
# Create the fetchml user and directory tree via the shared helpers
# from setup_common.sh.
create_user() {
    log_info "Creating fetchml user..."
    ensure_user
    create_directories
    log_success "User $FETCH_ML_USER and directories created"
}
|
||||
|
||||
# Open the service ports via ufw (skipped when ufw is not installed).
# NOTE(review): 6379 is opened although install_redis binds Redis to
# 127.0.0.1 only — confirm the Redis port really needs exposing.
setup_firewall() {
    log_info "Configuring firewall..."

    if ! command -v ufw &> /dev/null; then
        log_warning "UFW not available, skipping firewall configuration"
        return
    fi

    ufw --force enable
    ufw allow ssh
    ufw allow 8080/tcp # Worker API
    ufw allow 8081/tcp # Data manager API
    ufw allow 6379/tcp # Redis
    ufw status
}
|
||||
|
||||
# Install and enable the worker and data-manager units using the
# shared unit template from setup_common.sh.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    setup_systemd_service "fetch_ml_worker" "$FETCH_ML_HOME/bin/worker --config $FETCH_ML_HOME/configs/config-local.yaml"
    setup_systemd_service "fetch_ml_data_manager" "$FETCH_ML_HOME/bin/data_manager --config $FETCH_ML_HOME/configs/config-local.yaml"

    # Enable services
    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
||||
|
||||
# Thin wrapper around the shared logrotate helper, with logging.
setup_log_rotation() {
    log_info "Setting up log rotation..."
    setup_logrotate
    log_success "Log rotation configured"
}
|
||||
|
||||
# Tune the host for ML workloads: shared hardening, network/VM sysctls,
# GPU persistence mode. FIX: the sysctl append is now guarded so
# re-running the script does not stack duplicate blocks into
# /etc/sysctl.conf.
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    hardening_steps

    # Optimize kernel parameters for ML (only once, keyed on the
    # marker comment).
    if ! grep -q '^# ML Optimization' /etc/sysctl.conf; then
        cat >> /etc/sysctl.conf << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    fi

    sysctl -p

    # Configure GPU persistence mode if NVIDIA available
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    log_success "System optimized for ML workloads"
}
|
||||
|
||||
# Build Fetch ML from a pre-cloned checkout and install binaries and
# config under $FETCH_ML_HOME. No-op (with a hint) if the repo is absent.
install_fetch_ml() {
    log_info "Installing Fetch ML..."

    cd $FETCH_ML_HOME

    if [[ ! -d "fetch_ml" ]]; then
        # This would be replaced with actual repository URL
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml

    # Build
    export PATH=$PATH:/usr/local/go/bin
    make build

    # Copy binaries
    cp bin/* $FETCH_ML_HOME/bin/
    chmod +x $FETCH_ML_HOME/bin/*

    # Copy configs
    mkdir -p $FETCH_ML_HOME/configs
    cp configs/config-local.yaml.example $FETCH_ML_HOME/configs/config-local.yaml

    # Set permissions
    chown -R $FETCH_ML_USER:$FETCH_ML_USER $FETCH_ML_HOME

    log_success "Fetch ML installed"
}
|
||||
|
||||
# Entry point: provision an Ubuntu host end to end, then print follow-up
# instructions for the operator.  The step order matters: packages and
# runtimes first, then the service account and host configuration, and
# the source build last.
main() {
    log_info "Starting Fetch ML Ubuntu server setup..."

    # Preconditions: must run as root on a supported Ubuntu release.
    check_root
    check_ubuntu

    # System packages and runtimes.
    update_system
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools

    # Service account, directory layout, and host configuration.
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    install_fetch_ml

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    cat << EOF
1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml
2. Configure: $FETCH_ML_HOME/configs/config-local.yaml
3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager
4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager
5. View logs: journalctl -u fetch_ml_worker -f
EOF
    echo
    log_info "Services will be available at:"
    cat << EOF
- Worker API: http://$(hostname -I | awk '{print $1}'):8080
- Data Manager: http://$(hostname -I | awk '{print $1}'):8081
EOF
}

# Run main function
main "$@"
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
#!/bin/bash
set -e

echo "=== Test Tools Harness ==="

# Single EXIT trap for all cleanup.  Fixes over the previous version:
# the smoke-test API server is now killed even when a later step fails
# (previously `set -e` could exit between the server start and the
# explicit `kill`, leaking the background process), and the temporary
# Redis shutdown shares this one trap instead of installing a second
# trap that would have replaced it.
STARTED_REDIS=false
API_PID=""
cleanup() {
    if [ -n "$API_PID" ]; then
        kill "$API_PID" 2>/dev/null || true
    fi
    if [ "$STARTED_REDIS" = true ]; then
        echo "Stopping temporary Redis..."
        redis-cli shutdown || true
    fi
}
trap cleanup EXIT

# Check if Redis is running; start a temporary instance if needed.
ensure_redis() {
    if ! redis-cli ping >/dev/null 2>&1; then
        echo "Starting temporary Redis instance..."
        redis-server --daemonize yes --port 6379
        sleep 2
        if ! redis-cli ping >/dev/null 2>&1; then
            echo "Failed to start Redis"
            exit 1
        fi
        echo "Redis started successfully"
        STARTED_REDIS=true
    else
        echo "Redis is already running"
    fi
}

# Step 1: Build Go binaries
echo "Building Go binaries..."
go build -o bin/api-server ./cmd/api-server
go build -o bin/worker ./cmd/worker
go build -o bin/data_manager ./cmd/data_manager
go build -o bin/user_manager ./cmd/user_manager

# Step 2: Build Zig CLI
echo "Building Zig CLI..."
cd cli
zig build
cd ..

# Step 3: Ensure Redis is running
ensure_redis

# Step 4: Run Go tests
echo "Running Go tests..."
go test ./...

# Step 5: Run Zig tests
# NOTE(review): bare `zig test` without a file argument may fail or do
# nothing depending on the zig version — confirm `zig build test` isn't
# what was intended here.
echo "Running Zig CLI tests..."
cd cli
zig test
cd ..

# Step 6: Run Go E2E tests (Redis is already available)
echo "Running Go E2E tests..."
go test ./tests/e2e/...

# Step 7: Smoke test API server and CLI
echo "Running smoke test..."
# Start API server in background on a non-default port.
./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 &
API_PID=$!
sleep 2

# Test CLI status (the server is killed by the EXIT trap on success or failure).
./cli/zig-out/bin/ml status -server http://localhost:19101

echo "=== All tests completed successfully ==="
|
||||
|
|
@ -5,7 +5,7 @@ Requires=docker.service
|
|||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/cleanup.sh --force
|
||||
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/maintenance/cleanup.sh --dry-run
|
||||
User=jfraeys
|
||||
Group=staff
|
||||
StandardOutput=journal
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ set -e
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
|
|
@ -43,22 +44,34 @@ cleanup_benchmark_artifacts() {
|
|||
|
||||
case "${1:-keep-10}" in
|
||||
"all")
|
||||
print_status "Removing ALL benchmark artifacts..."
|
||||
rm -rf "$LOCAL_ARTIFACTS_DIR"
|
||||
print_success "Removed all artifacts (was $size_before)"
|
||||
print_status "Archiving ALL benchmark artifacts..."
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
|
||||
print_success "Archived all artifacts (was $size_before)"
|
||||
;;
|
||||
"keep-5")
|
||||
print_status "Keeping last 5 runs, removing older ones..."
|
||||
print_status "Keeping last 5 runs, archiving older ones..."
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +6 | xargs rm -rf 2>/dev/null || true
|
||||
ls -1t run_* 2>/dev/null | tail -n +6 | while read -r run; do
|
||||
[ -n "$run" ] || continue
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
||||
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
||||
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
||||
;;
|
||||
"keep-10")
|
||||
print_status "Keeping last 10 runs, removing older ones..."
|
||||
print_status "Keeping last 10 runs, archiving older ones..."
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || true
|
||||
ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
|
||||
[ -n "$run" ] || continue
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
||||
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
||||
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
||||
|
|
@ -80,12 +93,18 @@ cleanup_temp_files() {
|
|||
# Clean temp directories
|
||||
local temp_cleaned=0
|
||||
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
local tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
|
||||
mkdir -p "$tmp_archive_dir"
|
||||
|
||||
# /tmp cleanup
|
||||
if [ -d "/tmp" ]; then
|
||||
local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||
if [ "$tmp_files" -gt 0 ]; then
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
print_success "Cleaned $tmp_files temporary files from /tmp"
|
||||
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
print_success "Archived $tmp_files temporary files from /tmp"
|
||||
temp_cleaned=$((temp_cleaned + tmp_files))
|
||||
fi
|
||||
fi
|
||||
|
|
@ -94,8 +113,10 @@ cleanup_temp_files() {
|
|||
if [ -d "/var/tmp" ]; then
|
||||
local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||
if [ "$vartmp_files" -gt 0 ]; then
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
print_success "Cleaned $vartmp_files temporary files from /var/tmp"
|
||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
print_success "Archived $vartmp_files temporary files from /var/tmp"
|
||||
temp_cleaned=$((temp_cleaned + vartmp_files))
|
||||
fi
|
||||
fi
|
||||
|
|
@ -104,8 +125,10 @@ cleanup_temp_files() {
|
|||
if [ -d "$HOME/tmp" ]; then
|
||||
local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||
if [ "$user_tmp_files" -gt 0 ]; then
|
||||
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
||||
print_success "Cleaned $user_tmp_files temporary files from ~/tmp"
|
||||
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
print_success "Archived $user_tmp_files temporary files from ~/tmp"
|
||||
temp_cleaned=$((temp_cleaned + user_tmp_files))
|
||||
fi
|
||||
fi
|
||||
|
|
@ -177,9 +200,16 @@ cleanup_logs() {
|
|||
for log_dir in "${log_dirs[@]}"; do
|
||||
if [ -d "$log_dir" ]; then
|
||||
local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
||||
# Remove log files older than 7 days
|
||||
find "$log_dir" -name "*.log" -type f -mtime +7 -delete 2>/dev/null || true
|
||||
find "$log_dir" -name "*.log.*" -type f -mtime +7 -delete 2>/dev/null || true
|
||||
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
local log_archive_dir="$log_dir/archive/$stamp"
|
||||
mkdir -p "$log_archive_dir"
|
||||
# Move log files older than 7 days to archive
|
||||
find "$log_dir" -name "*.log" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$log_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
find "$log_dir" -name "*.log.*" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||
mv "$f" "$log_archive_dir/" 2>/dev/null || true
|
||||
done
|
||||
local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
||||
if [ "$log_size_before" != "$log_size_after" ]; then
|
||||
print_success "Cleaned old logs in $log_dir: $log_size_before → $log_size_after"
|
||||
|
|
|
|||
|
|
@ -144,12 +144,12 @@ else
|
|||
log_info "No running containers found"
|
||||
fi
|
||||
|
||||
# Remove containers
|
||||
# Remove containers
|
||||
log_info "Removing containers..."
|
||||
containers=$(docker ps -aq --filter "name=ml-")
|
||||
if [ -n "$containers" ]; then
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
echo "$containers" | xargs docker rm -f
|
||||
echo "$containers" | xargs docker rm
|
||||
log_success "Containers removed"
|
||||
fi
|
||||
else
|
||||
|
|
@ -168,9 +168,9 @@ else
|
|||
log_info "No networks found"
|
||||
fi
|
||||
|
||||
# Remove volumes (with caution)
|
||||
log_warning "Removing volumes (this will delete data)..."
|
||||
if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
|
||||
# Remove volumes (with caution)
|
||||
log_warning "Skipping volumes by default (use --all to remove them)"
|
||||
if [ "$ALL" = true ]; then
|
||||
volumes=$(docker volume ls -q --filter "name=ml-")
|
||||
if [ -n "$volumes" ]; then
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
|
|
@ -181,16 +181,16 @@ if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
|
|||
log_info "No volumes found"
|
||||
fi
|
||||
else
|
||||
log_info "Skipping volumes (use --force or --all to remove them)"
|
||||
log_info "Skipping volumes"
|
||||
fi
|
||||
|
||||
# Remove images if requested
|
||||
# Remove images if requested
|
||||
if [ "$ALL" = true ]; then
|
||||
log_info "Removing images..."
|
||||
images=$(docker images -q --filter "reference=fetch_ml-*")
|
||||
if [ -n "$images" ]; then
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
echo "$images" | xargs docker rmi -f
|
||||
echo "$images" | xargs docker rmi
|
||||
log_success "Images removed"
|
||||
fi
|
||||
else
|
||||
|
|
@ -200,11 +200,15 @@ else
|
|||
log_info "Skipping images (use --all to remove them)"
|
||||
fi
|
||||
|
||||
# General Docker cleanup
|
||||
log_info "Running general Docker cleanup..."
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
docker system prune -f
|
||||
log_success "General cleanup completed"
|
||||
# General Docker cleanup
|
||||
if [ "$ALL" = true ]; then
|
||||
log_info "Running general Docker cleanup (docker system prune)..."
|
||||
if [ "$DRY_RUN" = false ]; then
|
||||
docker system prune -f
|
||||
log_success "General cleanup completed"
|
||||
fi
|
||||
else
|
||||
log_info "Skipping docker system prune (use --all to enable)"
|
||||
fi
|
||||
|
||||
# Show final state
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ set -e
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||
|
||||
# Create artifacts directory if it doesn't exist
|
||||
mkdir -p "$LOCAL_ARTIFACTS_DIR"
|
||||
|
|
@ -41,17 +42,21 @@ case "${1:-help}" in
|
|||
echo "=== Cleaning Artifacts ==="
|
||||
case "${2:-all}" in
|
||||
"all")
|
||||
echo "Removing all artifacts..."
|
||||
rm -rf "$LOCAL_ARTIFACTS_DIR"
|
||||
echo "All artifacts removed"
|
||||
echo "Archiving all artifacts..."
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
|
||||
echo "All artifacts archived"
|
||||
;;
|
||||
"old")
|
||||
keep_count="${3:-10}"
|
||||
echo "Keeping last $keep_count runs, removing older ones..."
|
||||
echo "Keeping last $keep_count runs, archiving older ones..."
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
cd "$LOCAL_ARTIFACTS_DIR"
|
||||
ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do
|
||||
echo "Removing: $run"
|
||||
rm -rf "$run"
|
||||
echo "Archiving: $run"
|
||||
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
done
|
||||
;;
|
||||
"run")
|
||||
|
|
@ -64,8 +69,10 @@ case "${1:-help}" in
|
|||
fi
|
||||
run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id"
|
||||
if [ -d "$run_dir" ]; then
|
||||
echo "Removing run: $run_id"
|
||||
rm -rf "$run_dir"
|
||||
echo "Archiving run: $run_id"
|
||||
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||
mv "$run_dir" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||
else
|
||||
echo "Run not found: $run_id"
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -1,169 +0,0 @@
|
|||
#!/bin/bash

# Secure Homelab Setup Script for Fetch ML
# This script generates secure API keys and TLS certificates
#
# Fixes over the previous version:
#  - hash_key falls back to `shasum -a 256` so the script also works on
#    macOS, where coreutils' sha256sum is not installed by default;
#  - the .gitignore check uses grep -F (previously "." matched any
#    character) and suppresses the error a missing .gitignore printed;
#  - CONFIG_DIR is created instead of being assumed to exist.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
CONFIG_DIR="$PROJECT_ROOT/configs/environments"
SSL_DIR="$PROJECT_ROOT/ssl"

echo "🔒 Setting up secure homelab configuration..."

# Create output directories up front.
mkdir -p "$SSL_DIR" "$CONFIG_DIR"

# Generate TLS certificates (self-signed, 1 year, localhost + hostname SANs).
echo "📜 Generating TLS certificates..."
if [[ ! -f "$SSL_DIR/cert.pem" ]] || [[ ! -f "$SSL_DIR/key.pem" ]]; then
    openssl req -x509 -newkey rsa:4096 -keyout "$SSL_DIR/key.pem" -out "$SSL_DIR/cert.pem" -days 365 -nodes \
        -subj "/C=US/ST=Homelab/L=Local/O=FetchML/OU=Homelab/CN=localhost" \
        -addext "subjectAltName=DNS:localhost,DNS:$(hostname),IP:127.0.0.1"
    chmod 600 "$SSL_DIR/key.pem"
    chmod 644 "$SSL_DIR/cert.pem"
    echo "✅ TLS certificates generated in $SSL_DIR/"
else
    echo "ℹ️ TLS certificates already exist, skipping generation"
fi

# Generate secure API keys
echo "🔑 Generating secure API keys..."
generate_api_key() {
    openssl rand -hex 32
}

# SHA-256 hex digest of the argument; portable across Linux and macOS.
hash_key() {
    if command -v sha256sum >/dev/null 2>&1; then
        echo -n "$1" | sha256sum | cut -d' ' -f1
    else
        echo -n "$1" | shasum -a 256 | cut -d' ' -f1
    fi
}

# Generate keys
ADMIN_KEY=$(generate_api_key)
USER_KEY=$(generate_api_key)
ADMIN_HASH=$(hash_key "$ADMIN_KEY")
USER_HASH=$(hash_key "$USER_KEY")

# Create secure config (unquoted heredoc: hashes and paths are expanded now).
echo "⚙️ Creating secure configuration..."
cat > "$CONFIG_DIR/config-homelab-secure.yaml" << EOF
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!

redis:
  url: "redis://localhost:6379"
  max_connections: 10

auth:
  enabled: true
  api_keys:
    homelab_admin:
      hash: $ADMIN_HASH
      admin: true
      roles:
        - admin
      permissions:
        '*': true
    homelab_user:
      hash: $USER_HASH
      admin: false
      roles:
        - researcher
      permissions:
        'experiments': true
        'datasets': true
        'jupyter': true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "$SSL_DIR/cert.pem"
    key_file: "$SSL_DIR/key.pem"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "localhost"
    - "192.168.1.0/24" # Adjust to your network
    - "10.0.0.0/8"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  console: true

resources:
  cpu_limit: "2"
  memory_limit: "4Gi"
  gpu_limit: 0
  disk_limit: "10Gi"

# Prometheus metrics
metrics:
  enabled: true
  listen_addr: ":9100"
  tls:
    enabled: false
EOF

# Save API keys to a secure file (mode 600, git-ignored below).
echo "🔐 Saving API keys..."
cat > "$PROJECT_ROOT/.api-keys" << EOF
# Fetch ML Homelab API Keys
# IMPORTANT: Keep this file secure and never commit to version control!

ADMIN_API_KEY: $ADMIN_KEY
USER_API_KEY: $USER_KEY

# Usage examples:
# curl -H "X-API-Key: $ADMIN_KEY" https://localhost:9101/health
# curl -H "X-API-Key: $USER_KEY" https://localhost:9101/api/jupyter/services
EOF

chmod 600 "$PROJECT_ROOT/.api-keys"

# Create environment file for JWT secret
JWT_SECRET=$(generate_api_key)
cat > "$PROJECT_ROOT/.env.secure" << EOF
# Secure environment variables for Fetch ML
# IMPORTANT: Keep this file secure and never commit to version control!

JWT_SECRET=$JWT_SECRET

# Source this file before running the server:
# source .env.secure
EOF

chmod 600 "$PROJECT_ROOT/.env.secure"

# Update .gitignore to exclude sensitive files (creates it if absent).
echo "📝 Updating .gitignore..."
if ! grep -qF ".api-keys" "$PROJECT_ROOT/.gitignore" 2>/dev/null; then
    echo -e "\n# Security files\n.api-keys\n.env.secure\nssl/\n*.pem\n*.key" >> "$PROJECT_ROOT/.gitignore"
fi

# NOTE(review): the generated keys are echoed to the terminal below by
# design for homelab use — avoid running this on a shared/recorded shell.
echo ""
echo "🎉 Secure homelab setup complete!"
echo ""
echo "📋 Next steps:"
echo "1. Review and adjust the IP whitelist in config-homelab-secure.yaml"
echo "2. Start the server with: ./api-server -config configs/environments/config-homelab-secure.yaml"
echo "3. Source the environment: source .env.secure"
echo "4. Your API keys are saved in .api-keys"
echo ""
echo "🔐 API Keys:"
echo " Admin: $ADMIN_KEY"
echo " User: $USER_KEY"
echo ""
echo "⚠️ IMPORTANT:"
echo " - Never share your API keys"
echo " - Never commit .api-keys or .env.secure to version control"
echo " - Backup your SSL certificates and API keys securely"
echo " - Consider using a password manager for storing keys"
|
||||
311
scripts/setup.sh
311
scripts/setup.sh
|
|
@ -1,311 +0,0 @@
|
|||
#!/bin/bash
# setup.sh: One-shot homelab setup (security + core services)
# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity

set -euo pipefail

# ANSI color palette used by the print_* helpers below.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Shared formatter: colored [TAG] prefix followed by the message.
_print_tagged() {
    echo -e "${1}[${2}]${NC} ${3}"
}

print_info()    { _print_tagged "$BLUE" "INFO" "$1"; }
print_success() { _print_tagged "$GREEN" "SUCCESS" "$1"; }
print_warning() { _print_tagged "$YELLOW" "WARNING" "$1"; }
print_error()   { _print_tagged "$RED" "ERROR" "$1"; }
|
||||
|
||||
# Simple dependency check
|
||||
# Verify that every required tool is on PATH; on any miss, print install
# hints for macOS/Ubuntu and exit non-zero.
check_deps() {
    print_info "Checking dependencies..."

    local missing=()
    local tool
    for tool in go zig redis-server docker; do
        if ! command -v "$tool" &> /dev/null; then
            missing+=("$tool")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        print_error "Missing dependencies: ${missing[*]}"
        echo ""
        echo "Install with:"
        echo " macOS: brew install ${missing[*]}"
        echo " Ubuntu: sudo apt-get install ${missing[*]}"
        exit 1
    fi

    print_success "Dependencies OK"
}
|
||||
|
||||
# Simple setup
|
||||
# Create the directory layout, a self-signed TLS pair, and the default
# config file for the homelab deployment.
#
# Fix over the previous version: the certificate check now also verifies
# ssl/key.pem, so a missing private key (with a leftover cert.pem) is
# regenerated instead of leaving the server unable to start TLS.
setup_project() {
    print_info "Setting up project..."

    # Create essential directories
    mkdir -p ssl logs configs data monitoring

    # Generate a simple self-signed cert when either half is missing.
    if [[ ! -f "ssl/cert.pem" || ! -f "ssl/key.pem" ]]; then
        openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null
        print_success "SSL certificates generated"
    fi

    # Create balanced config.
    # NOTE(review): the shipped hash is sha256("password") with admin
    # rights — operators must replace it before exposing the service.
    cat > configs/config.yaml << 'EOF'
base_path: "./data/experiments"

auth:
  enabled: true
  api_keys:
    homelab_user:
      hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
      admin: true
      roles: ["user", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 30
    burst_size: 5
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "192.168.0.0/16"
    - "10.0.0.0/8"
    - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 3
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"

logging:
  level: "info"
  file: "./logs/app.log"
  audit_log: "./logs/audit.log"
  access_log: "./logs/access.log"

monitoring:
  enabled: true
  metrics_port: 9090
  health_check_interval: "30s"
EOF

    print_success "Configuration created"
}
|
||||
|
||||
# Simple build
|
||||
# Compile the Go services and the Zig CLI.
build_project() {
    print_info "Building project..."

    # Go binaries.
    go build -o bin/api-server ./cmd/api-server
    go build -o bin/worker ./cmd/worker
    go build -o bin/tui ./cmd/tui

    # Zig CLI — built in a subshell so the caller's working directory is
    # untouched even on failure.
    (cd cli && zig build)

    print_success "Build completed"
}
|
||||
|
||||
# Setup Fail2Ban
|
||||
# Install Fail2Ban jails and filters for SSH and the ML API, if Fail2Ban
# is present.  Config files are staged in /tmp, copied in with sudo, and
# the staged copies are removed afterwards; a copy failure (no sudo
# rights) degrades to a warning rather than aborting the setup.
setup_fail2ban() {
    print_info "Setting up Fail2Ban..."

    if ! command -v fail2ban-server &> /dev/null; then
        print_warning "Fail2Ban not installed, skipping..."
        return
    fi

    # Create Fail2Ban configuration directory (tolerate missing sudo).
    sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true

    # Jail definitions.
    # NOTE(review): logpath is relative (./logs/audit.log) — fail2ban
    # runs as a daemon with its own working directory, so this likely
    # needs an absolute path; confirm before relying on these jails.
    cat > /tmp/ml-experiments-jail.conf << 'EOF'
[DEFAULT]
bantime = 3600
findtime = 600
maxretry = 3
backend = systemd

[sshd]
enabled = true
port = ssh
logpath = /var/log/auth.log
maxretry = 3

[ml-experiments-api]
enabled = true
port = 9101
filter = ml-experiments-api
logpath = ./logs/audit.log
maxretry = 5
bantime = 7200

[ml-experiments-auth]
enabled = true
filter = ml-experiments-auth
logpath = ./logs/audit.log
maxretry = 3
bantime = 3600
EOF

    # Filter: any 401/403 response in the audit log counts as a failure.
    cat > /tmp/ml-experiments-api.conf << 'EOF'
[Definition]
failregex = ^.*<HOST>.*"status":40[13].*$
ignoreregex =
EOF

    # Filter: structured failed_login audit events.
    cat > /tmp/ml-experiments-auth.conf << 'EOF'
[Definition]
failregex = ^.*"event":"failed_login".*"client_ip":"<HOST>".*$
ignoreregex =
EOF

    # Try to install configurations; restart is best-effort.
    if sudo cp /tmp/ml-experiments-jail.conf /etc/fail2ban/jail.d/ 2>/dev/null; then
        sudo cp /tmp/ml-experiments-*.conf /etc/fail2ban/filter.d/ 2>/dev/null || true
        sudo systemctl restart fail2ban 2>/dev/null || true
        print_success "Fail2Ban configured"
    else
        print_warning "Could not configure Fail2Ban (requires sudo)"
    fi

    rm -f /tmp/ml-experiments-*.conf
}
|
||||
|
||||
# Setup Redis
|
||||
# Start a local Redis daemon on the default port unless one is already
# running (detected by process name).
setup_redis() {
    print_info "Setting up Redis..."

    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
    else
        redis-server --daemonize yes --port 6379
        print_success "Redis started"
    fi
}
|
||||
|
||||
# Create simple management script
|
||||
# Write ./manage.sh, a minimal start/stop/status/logs/test helper for the
# services configured by this setup.  The heredoc delimiter is quoted
# ('EOF') so nothing inside is expanded at generation time — the script
# is emitted verbatim.
create_manage_script() {
    cat > manage.sh << 'EOF'
#!/bin/bash

# Simple management script

case "${1:-status}" in
    "start")
        echo "Starting services..."
        redis-server --daemonize yes --port 6379 2>/dev/null || true
        ./bin/api-server -config configs/config.yaml &
        echo "Services started"
        ;;
    "stop")
        echo "Stopping services..."
        pkill -f "api-server" || true
        redis-cli shutdown 2>/dev/null || true
        echo "Services stopped"
        ;;
    "status")
        echo "=== Status ==="
        if pgrep -f "redis-server" > /dev/null; then
            echo "✅ Redis: Running"
        else
            echo "❌ Redis: Stopped"
        fi

        if pgrep -f "api-server" > /dev/null; then
            echo "✅ API Server: Running"
        else
            echo "❌ API Server: Stopped"
        fi
        ;;
    "logs")
        echo "=== Recent Logs ==="
        tail -20 logs/app.log 2>/dev/null || echo "No logs yet"
        ;;
    "test")
        echo "=== Testing ==="
        curl -k -s https://localhost:9101/health || echo "API server not responding"
        ;;
    *)
        echo "Usage: $0 {start|stop|status|logs|test}"
        ;;
esac
EOF

    chmod +x manage.sh
    print_success "Management script created"
}
|
||||
|
||||
# Show next steps
|
||||
# Print the post-setup checklist for the operator.
#
# Fix over the previous version: the instructions referred to
# ./tools/manage.sh, but create_manage_script writes the helper as
# ./manage.sh in the current directory — the printed paths now match.
show_next_steps() {
    print_success "Setup completed!"
    echo ""
    echo "🎉 Setup complete!"
    echo ""
    echo "Next steps:"
    echo " 1. Start services: ./manage.sh start"
    echo " 2. Check status: ./manage.sh status"
    echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health"
    echo ""
    echo "Configuration: configs/config.yaml"
    echo "Logs: logs/app.log and logs/audit.log"
    echo ""
    print_success "Ready for homelab use!"
}
|
||||
|
||||
# Main setup
|
||||
# Orchestrate the full homelab setup.
#
# Fix over the previous version: setup_fail2ban was defined but never
# invoked, contradicting the file header's promise to keep Fail2Ban; it
# is now called (the function itself degrades to a warning when
# fail2ban or sudo rights are unavailable).
main() {
    echo "ML Experiment Manager - Homelab Setup"
    echo "====================================="
    echo ""

    check_deps
    setup_project
    build_project
    setup_fail2ban
    setup_redis
    create_manage_script
    show_next_steps
}

main "$@"
|
||||
62
scripts/setup_monitoring.py
Normal file
62
scripts/setup_monitoring.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
#!/usr/bin/env python3
"""Generate Grafana provisioning files for the monitoring stack.

Writes prometheus.yml and loki.yml under
monitoring/grafana/provisioning/datasources and dashboards.yml under
monitoring/grafana/provisioning/dashboards (relative to the repo root,
which is taken to be the parent of this script's directory), creating
the directories as needed.
"""
import os

# Resolve <repo>/monitoring/grafana relative to this script's location.
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
grafana_dir = os.path.join(repo_root, 'monitoring', 'grafana')

datasources_dir = os.path.join(grafana_dir, 'provisioning', 'datasources')
providers_dir = os.path.join(grafana_dir, 'provisioning', 'dashboards')

for target_dir in (datasources_dir, providers_dir):
    os.makedirs(target_dir, exist_ok=True)

_PROMETHEUS_DS = """apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: "5s"
"""

_LOKI_DS = """apiVersion: 1
datasources:
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: true
    jsonData:
      maxLines: 1000
"""

_DASHBOARD_PROVIDER = """apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards
"""

# Map each output path directly to its content: datasource definitions go
# under datasources/, the dashboard provider under dashboards/.
outputs = {
    os.path.join(datasources_dir, 'prometheus.yml'): _PROMETHEUS_DS,
    os.path.join(datasources_dir, 'loki.yml'): _LOKI_DS,
    os.path.join(providers_dir, 'dashboards.yml'): _DASHBOARD_PROVIDER,
}

for path, content in outputs.items():
    with open(path, 'w') as f:
        f.write(content)

print("Monitoring setup completed!")
|
||||
111
scripts/smoke-test.sh
Normal file
111
scripts/smoke-test.sh
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
set -euo pipefail;
|
||||
|
||||
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
export FETCHML_REPO_ROOT="$repo_root"
|
||||
|
||||
env="${1:-dev}";
|
||||
if [ "$env" != "dev" ] && [ "$env" != "prod" ]; then
|
||||
echo "usage: $0 [dev|prod]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Probe an HTTPS health endpoint over a raw openssl s_client session and
# succeed (exit 0) only when the first response line is an HTTP 200.
# Used for the prod stack where curl may reject the self-signed chain.
# NOTE(review): -tls1_2 pins the probe to TLS 1.2 — confirm the target
# stack still negotiates that version.
probe_https_health_openssl() {
    host="$1"
    port="$2"
    path="$3"

    # Hand-built HTTP/1.1 request; %b expands the \r\n escapes.
    req="GET ${path} HTTP/1.1\r\nHost: ${host}\r\nConnection: close\r\n\r\n"
    resp=$(printf "%b" "$req" | openssl s_client -connect "127.0.0.1:${port}" -servername "${host}" -tls1_2 -quiet 2>/dev/null || true)
    # Strip CRs and check only the status line.
    printf "%s" "$resp" | tr -d '\r' | head -n 1 | grep -Eq '^HTTP/1\.[01] 200'
}
|
||||
|
||||
compose_cmd="docker-compose";
|
||||
if ! command -v docker-compose >/dev/null 2>&1; then
|
||||
compose_cmd="docker compose";
|
||||
fi
|
||||
|
||||
compose_files=()
|
||||
compose_project_args=("--project-directory" "$repo_root")
|
||||
api_base=""
|
||||
prometheus_base=""
|
||||
stack_name=""
|
||||
|
||||
if [ "$env" = "dev" ]; then
|
||||
mkdir -p \
|
||||
"$repo_root/data/dev/redis" \
|
||||
"$repo_root/data/dev/minio" \
|
||||
"$repo_root/data/dev/prometheus" \
|
||||
"$repo_root/data/dev/grafana" \
|
||||
"$repo_root/data/dev/loki" \
|
||||
"$repo_root/data/dev/logs" \
|
||||
"$repo_root/data/dev/experiments" \
|
||||
"$repo_root/data/dev/active" \
|
||||
"$repo_root/data/dev/workspaces"
|
||||
|
||||
stack_name="dev"
|
||||
compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml")
|
||||
api_base="https://localhost:9101"
|
||||
if ! curl -skf "$api_base/health" >/dev/null 2>&1; then
|
||||
api_base="http://localhost:9101"
|
||||
fi
|
||||
prometheus_base="http://localhost:9090"
|
||||
else
|
||||
mkdir -p \
|
||||
"$repo_root/data/prod-smoke/caddy/data" \
|
||||
"$repo_root/data/prod-smoke/caddy/config" \
|
||||
"$repo_root/data/prod-smoke/redis" \
|
||||
"$repo_root/data/prod-smoke/logs" \
|
||||
"$repo_root/data/prod-smoke/experiments" \
|
||||
"$repo_root/data/prod-smoke/active"
|
||||
|
||||
stack_name="prod"
|
||||
compose_files=("-f" "$repo_root/deployments/docker-compose.prod.smoke.yml")
|
||||
api_base="https://localhost:8443"
|
||||
export FETCHML_DOMAIN=localhost
|
||||
export CADDY_EMAIL=smoke@example.invalid
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
status=$?;
|
||||
if [ "$status" -ne 0 ]; then
|
||||
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" logs --no-color || true;
|
||||
fi
|
||||
if [ "${KEEP_STACK:-0}" != "1" ]; then
|
||||
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" down -v >/dev/null 2>&1 || true;
|
||||
fi
|
||||
exit "$status";
|
||||
}
|
||||
|
||||
trap cleanup EXIT;
|
||||
echo "Starting $stack_name stack for smoke test...";
|
||||
|
||||
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null;
|
||||
echo "Waiting for API to become healthy...";
|
||||
|
||||
deadline=$(($(date +%s) + 90));
|
||||
while true; do
|
||||
if [ "$env" = "dev" ]; then
|
||||
if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi;
|
||||
else
|
||||
if probe_https_health_openssl "localhost" "8443" "/health"; then break; fi;
|
||||
fi
|
||||
if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for $api_base/health"; exit 1; fi;
|
||||
sleep 2;
|
||||
done;
|
||||
|
||||
if [ "$env" = "dev" ]; then
|
||||
echo "Checking metrics endpoint...";
|
||||
curl -skf "$api_base/metrics" >/dev/null;
|
||||
|
||||
echo "Waiting for Prometheus target api-server to be up...";
|
||||
deadline=$(($(date +%s) + 90));
|
||||
query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D";
|
||||
|
||||
while true; do
|
||||
resp=$(curl -sf "$query_url" || true);
|
||||
resp_compact=$(printf "%s" "$resp" | tr -d '\n' | tr -d '\r');
|
||||
if echo "$resp_compact" | grep -Fq '"instance":"api-server:9101"' && echo "$resp_compact" | grep -Fq ',"1"]'; then break; fi;
|
||||
if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for Prometheus api-server target to be up"; echo "$resp"; exit 1; fi;
|
||||
sleep 2;
|
||||
done;
|
||||
fi
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Homelab Secure Test Environment Script
|
||||
set -e
|
||||
|
||||
echo "Starting Homelab Secure Production Environment..."
|
||||
|
||||
# Clean up any existing containers
|
||||
echo "Cleaning up existing containers..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v
|
||||
|
||||
# Create necessary directories with proper permissions
|
||||
echo "Creating directories..."
|
||||
mkdir -p data logs
|
||||
chmod 750 data logs
|
||||
|
||||
# Build and start services
|
||||
echo "Building and starting services..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up --build -d
|
||||
|
||||
# Wait for services to be healthy
|
||||
echo "Waiting for services to be healthy..."
|
||||
sleep 20
|
||||
|
||||
# Check service health
|
||||
echo "Checking service health..."
|
||||
docker-compose -f deployments/docker-compose.homelab-secure.yml ps
|
||||
|
||||
# Test API server with TLS
|
||||
echo "Testing API server..."
|
||||
curl -k -s https://localhost:9104/health || echo "API health check failed"
|
||||
|
||||
# Test Redis with authentication
|
||||
echo "Testing Redis with authentication..."
|
||||
docker exec ml-homelab-redis redis-cli -a "HomelabRedis2024!" ping || echo "Redis health check failed"
|
||||
|
||||
# Test SSH connectivity with security
|
||||
echo "Testing SSH connectivity..."
|
||||
docker exec -u worker ml-homelab-worker ssh -o StrictHostKeyChecking=no -o Port=2222 worker@localhost "echo 'SSH OK'" || echo "SSH test failed"
|
||||
|
||||
# Test fail2ban status
|
||||
echo "Testing fail2ban..."
|
||||
docker exec ml-homelab-api fail2ban-client status sshd || echo "fail2ban check failed"
|
||||
|
||||
echo ""
|
||||
echo "Homelab secure production environment is ready!"
|
||||
echo ""
|
||||
echo "Services:"
|
||||
echo " - API Server: https://localhost:9104"
|
||||
echo " - SSH: localhost:2223 (worker user)"
|
||||
echo " - Redis: localhost:6379 (with password)"
|
||||
echo " - Metrics: http://localhost:9101"
|
||||
echo ""
|
||||
echo "Security Features:"
|
||||
echo " ✓ Strong TLS 1.3 with modern ciphers"
|
||||
echo " ✓ SSH with fail2ban protection"
|
||||
echo " ✓ Redis with password authentication"
|
||||
echo " ✓ SQLite database with encryption"
|
||||
echo " ✓ Container security hardening"
|
||||
echo " ✓ Rate limiting and CORS protection"
|
||||
echo " ✓ Security headers and CSRF protection"
|
||||
echo " ✓ Podman sandboxed job execution"
|
||||
echo " ✓ Audit logging and monitoring"
|
||||
echo ""
|
||||
echo "Credentials:"
|
||||
echo " - API User: homelab_user / password"
|
||||
echo " - SSH User: worker / HomelabWorker2024!"
|
||||
echo " - Redis Password: HomelabRedis2024!"
|
||||
echo ""
|
||||
echo "To test with CLI:"
|
||||
echo " ./cli/zig-out/bin/ml queue homelab-secure-test"
|
||||
echo " ./cli/zig-out/bin/ml status"
|
||||
echo ""
|
||||
echo "To view logs:"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f api-server"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f worker"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"
|
||||
echo ""
|
||||
echo "To stop:"
|
||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"
|
||||
64
scripts/track_performance.sh
Executable file
64
scripts/track_performance.sh
Executable file
|
|
@ -0,0 +1,64 @@
|
|||
#!/bin/bash
|
||||
# Simple performance tracking script
|
||||
|
||||
RESULTS_DIR="test_results/performance"
|
||||
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
||||
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
|
||||
|
||||
mkdir -p "$RESULTS_DIR"
|
||||
|
||||
echo "Running load test performance tracking..."
|
||||
echo "Timestamp: $TIMESTAMP"
|
||||
|
||||
# Run tests and capture results
|
||||
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RESULTS_DIR/raw_$TIMESTAMP.log"
|
||||
|
||||
# Extract key metrics
|
||||
{
|
||||
echo "{"
|
||||
echo " \"timestamp\": \"$TIMESTAMP\","
|
||||
echo " \"tests\": ["
|
||||
|
||||
# Parse light load
|
||||
LIGHT_RPS=$(grep -A1 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
|
||||
LIGHT_ERROR=$(grep -A2 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
|
||||
LIGHT_P99=$(grep -A4 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
|
||||
|
||||
echo " {"
|
||||
echo " \"name\": \"LightLoad\","
|
||||
echo " \"throughput_rps\": $LIGHT_RPS,"
|
||||
echo " \"error_rate_percent\": $LIGHT_ERROR,"
|
||||
echo " \"p99_latency_ms\": \"$LIGHT_P99\""
|
||||
echo " },"
|
||||
|
||||
# Parse medium load
|
||||
MEDIUM_RPS=$(grep -A1 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
|
||||
MEDIUM_ERROR=$(grep -A2 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
|
||||
MEDIUM_P99=$(grep -A4 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
|
||||
|
||||
echo " {"
|
||||
echo " \"name\": \"MediumLoad\","
|
||||
echo " \"throughput_rps\": $MEDIUM_RPS,"
|
||||
echo " \"error_rate_percent\": $MEDIUM_ERROR,"
|
||||
echo " \"p99_latency_ms\": \"$MEDIUM_P99\""
|
||||
echo " }"
|
||||
echo " ]"
|
||||
echo "}"
|
||||
} > "$RESULTS_FILE"
|
||||
|
||||
echo "Results saved to: $RESULTS_FILE"
|
||||
echo "Raw logs: $RESULTS_DIR/raw_$TIMESTAMP.log"
|
||||
|
||||
# Show comparison with previous run if exists
|
||||
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json | sed -n '2p')
|
||||
if [ -n "$PREV_FILE" ]; then
|
||||
echo ""
|
||||
echo "=== Comparison with previous run ==="
|
||||
echo "Previous: $(basename $PREV_FILE)"
|
||||
echo "Current: $(basename $RESULTS_FILE)"
|
||||
echo ""
|
||||
echo "Light Load Throughput:"
|
||||
echo " Previous: $(jq -r '.tests[0].throughput_rps' "$PREV_FILE") RPS"
|
||||
echo " Current: $(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") RPS"
|
||||
echo " Change: $(echo "$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") - $(jq -r '.tests[0].throughput_rps' "$PREV_FILE")" | bc -l) RPS"
|
||||
fi
|
||||
|
|
@ -1,204 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Production Configuration Validator
|
||||
# Verifies all paths and configs are consistent for experiment lifecycle
|
||||
|
||||
set -e
|
||||
|
||||
BOLD='\033[1m'
|
||||
GREEN='\033[0;32m'
|
||||
RED='\033[0;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo -e "${BOLD}=== FetchML Production Configuration Validator ===${NC}\n"
|
||||
|
||||
# Configuration file paths
|
||||
API_CONFIG="${1:-configs/config-prod.yaml}"
|
||||
WORKER_CONFIG="${2:-configs/worker-prod.toml}"
|
||||
|
||||
errors=0
|
||||
warnings=0
|
||||
|
||||
check_pass() {
|
||||
echo -e "${GREEN}✓${NC} $1"
|
||||
}
|
||||
|
||||
check_fail() {
|
||||
echo -e "${RED}✗${NC} $1"
|
||||
((errors++))
|
||||
}
|
||||
|
||||
check_warn() {
|
||||
echo -e "${YELLOW}⚠${NC} $1"
|
||||
((warnings++))
|
||||
}
|
||||
|
||||
# 1. Check API server config exists
|
||||
echo -e "${BOLD}Checking API Server Configuration${NC}"
|
||||
if [ ! -f "$API_CONFIG" ]; then
|
||||
check_fail "API config not found: $API_CONFIG"
|
||||
else
|
||||
check_pass "API config found: $API_CONFIG"
|
||||
|
||||
# Extract base_path from API config
|
||||
API_BASE_PATH=$(grep 'base_path:' "$API_CONFIG" | head -1 | awk '{print $2}' | tr -d '"')
|
||||
echo " Base path: $API_BASE_PATH"
|
||||
|
||||
# Check if path is absolute
|
||||
if [[ "$API_BASE_PATH" != /* ]]; then
|
||||
check_fail "base_path must be absolute: $API_BASE_PATH"
|
||||
else
|
||||
check_pass "base_path is absolute"
|
||||
fi
|
||||
|
||||
# Check Redis config
|
||||
if grep -q 'redis:' "$API_CONFIG"; then
|
||||
check_pass "Redis configuration present"
|
||||
else
|
||||
check_fail "Redis configuration missing"
|
||||
fi
|
||||
|
||||
# Check auth enabled
|
||||
if grep -q 'enabled: true' "$API_CONFIG"; then
|
||||
check_pass "Authentication enabled"
|
||||
else
|
||||
check_warn "Authentication disabled (not recommended for production)"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 2. Check Worker config (if provided)
|
||||
if [ -f "$WORKER_CONFIG" ]; then
|
||||
echo -e "${BOLD}Checking Worker Configuration${NC}"
|
||||
check_pass "Worker config found: $WORKER_CONFIG"
|
||||
|
||||
# Extract base_path from worker config
|
||||
WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
||||
echo " Base path: $WORKER_BASE_PATH"
|
||||
|
||||
# Compare paths
|
||||
if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then
|
||||
check_pass "API and Worker base_path match"
|
||||
else
|
||||
check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH"
|
||||
fi
|
||||
|
||||
# Check podman_image configured
|
||||
if grep -q 'podman_image' "$WORKER_CONFIG"; then
|
||||
PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
||||
check_pass "Podman image configured: $PODMAN_IMAGE"
|
||||
else
|
||||
check_fail "podman_image not configured"
|
||||
fi
|
||||
else
|
||||
check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 3. Check directory structure (if base_path exists)
|
||||
if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then
|
||||
echo -e "${BOLD}Checking Directory Structure${NC}"
|
||||
check_pass "Base directory exists: $API_BASE_PATH"
|
||||
|
||||
# Check subdirectories
|
||||
for dir in experiments pending running finished failed; do
|
||||
if [ -d "$API_BASE_PATH/$dir" ]; then
|
||||
check_pass "$dir/ directory exists"
|
||||
else
|
||||
check_warn "$dir/ directory missing (will be created automatically)"
|
||||
fi
|
||||
done
|
||||
|
||||
# Check permissions
|
||||
if [ -w "$API_BASE_PATH" ]; then
|
||||
check_pass "Base directory is writable"
|
||||
else
|
||||
check_fail "Base directory is not writable (check permissions)"
|
||||
fi
|
||||
|
||||
elif [ -n "$API_BASE_PATH" ]; then
|
||||
check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 4. Check Redis connectivity (if server is running)
|
||||
echo -e "${BOLD}Checking Redis Connectivity${NC}"
|
||||
if command -v redis-cli &> /dev/null; then
|
||||
if redis-cli ping &> /dev/null; then
|
||||
check_pass "Redis server is running and accessible"
|
||||
|
||||
# Check queue
|
||||
QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0")
|
||||
echo " Queue size: $QUEUE_SIZE tasks"
|
||||
else
|
||||
check_warn "Redis server not accessible (start with: redis-server)"
|
||||
fi
|
||||
else
|
||||
check_warn "redis-cli not installed (cannot verify Redis connectivity)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 5. Check Podman (if worker config exists)
|
||||
if [ -f "$WORKER_CONFIG" ]; then
|
||||
echo -e "${BOLD}Checking Podman${NC}"
|
||||
if command -v podman &> /dev/null; then
|
||||
check_pass "Podman is installed"
|
||||
|
||||
# Check if image exists
|
||||
if [ -n "$PODMAN_IMAGE" ]; then
|
||||
if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then
|
||||
check_pass "Podman image exists: $PODMAN_IMAGE"
|
||||
else
|
||||
check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check GPU access (if configured)
|
||||
if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then
|
||||
if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then
|
||||
check_pass "GPU access working"
|
||||
else
|
||||
check_warn "GPU access configured but not working (check nvidia-container-toolkit)"
|
||||
fi
|
||||
fi
|
||||
else
|
||||
check_fail "Podman not installed (required for worker)"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 6. Check CLI config consistency
|
||||
echo -e "${BOLD}Checking CLI Configuration${NC}"
|
||||
CLI_CONFIG="$HOME/.ml/config.toml"
|
||||
if [ -f "$CLI_CONFIG" ]; then
|
||||
check_pass "CLI config found: $CLI_CONFIG"
|
||||
|
||||
CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
||||
if [ "$CLI_BASE" = "$API_BASE_PATH" ]; then
|
||||
check_pass "CLI worker_base matches server base_path"
|
||||
else
|
||||
check_warn "CLI worker_base ($CLI_BASE) differs from server ($API_BASE_PATH)"
|
||||
fi
|
||||
else
|
||||
check_warn "CLI config not found (run: ml init)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Summary
|
||||
echo -e "${BOLD}=== Summary ===${NC}"
|
||||
if [ $errors -eq 0 ] && [ $warnings -eq 0 ]; then
|
||||
echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}"
|
||||
exit 0
|
||||
elif [ $errors -eq 0 ]; then
|
||||
echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}"
|
||||
exit 0
|
||||
else
|
||||
echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
148
scripts/verify_release.sh
Normal file
148
scripts/verify_release.sh
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage:
|
||||
scripts/verify_release.sh --dir <release_dir> [--repo <org>/<repo>]
|
||||
|
||||
What it does:
|
||||
- Verifies checksums.txt signature (keyless cosign) if cosign + checksums.txt.sig/.cert are present
|
||||
- Verifies *.tar.gz files against checksums.txt
|
||||
|
||||
Notes:
|
||||
- --repo enables strict Sigstore identity checking against the release workflow.
|
||||
- Without cosign, the script still verifies SHA256 hashes.
|
||||
|
||||
Examples:
|
||||
scripts/verify_release.sh --dir ./release --repo jfraeys/fetch_ml
|
||||
scripts/verify_release.sh --dir .
|
||||
EOF
|
||||
}
|
||||
|
||||
release_dir=""
|
||||
repo=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--dir)
|
||||
release_dir="${2:-}"
|
||||
shift 2
|
||||
;;
|
||||
--repo)
|
||||
repo="${2:-}"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "unknown argument: $1" >&2
|
||||
usage >&2
|
||||
exit 2
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [[ -z "$release_dir" ]]; then
|
||||
echo "missing --dir" >&2
|
||||
usage >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if [[ ! -d "$release_dir" ]]; then
|
||||
echo "directory not found: $release_dir" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
cd "$release_dir"
|
||||
|
||||
if [[ ! -f checksums.txt ]]; then
|
||||
echo "missing checksums.txt in $release_dir" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
has_cosign=false
|
||||
if command -v cosign >/dev/null 2>&1; then
|
||||
has_cosign=true
|
||||
fi
|
||||
|
||||
verify_sigstore() {
|
||||
if [[ ! -f checksums.txt.sig ]] || [[ ! -f checksums.txt.cert ]]; then
|
||||
echo "[verify] cosign available, but checksums.txt.sig/.cert not found; skipping signature verification" >&2
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [[ -z "$repo" ]]; then
|
||||
echo "[verify] verifying signature (no repo identity pin; pass --repo to pin identity)" >&2
|
||||
COSIGN_YES=true cosign verify-blob \
|
||||
--certificate checksums.txt.cert \
|
||||
--signature checksums.txt.sig \
|
||||
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
|
||||
checksums.txt >/dev/null
|
||||
echo "[ok] checksums.txt signature verified (un-pinned identity)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
local identity
|
||||
identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$"
|
||||
|
||||
COSIGN_YES=true cosign verify-blob \
|
||||
--certificate checksums.txt.cert \
|
||||
--signature checksums.txt.sig \
|
||||
--certificate-identity-regexp "$identity" \
|
||||
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
|
||||
checksums.txt >/dev/null
|
||||
|
||||
echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)"
|
||||
}
|
||||
|
||||
verify_hashes() {
|
||||
local failures=0
|
||||
|
||||
local has_sha256sum=false
|
||||
if command -v sha256sum >/dev/null 2>&1; then
|
||||
has_sha256sum=true
|
||||
fi
|
||||
|
||||
while IFS= read -r expected file; do
|
||||
[[ -z "${expected}" ]] && continue
|
||||
[[ -z "${file}" ]] && continue
|
||||
|
||||
if [[ ! -f "$file" ]]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
local actual
|
||||
if [[ "$has_sha256sum" == true ]]; then
|
||||
actual="$(sha256sum "$file" | awk '{print $1}')"
|
||||
else
|
||||
actual="$(shasum -a 256 "$file" | awk '{print $1}')"
|
||||
fi
|
||||
|
||||
if [[ "$actual" != "$expected" ]]; then
|
||||
echo "[fail] $file" >&2
|
||||
echo " expected: $expected" >&2
|
||||
echo " actual: $actual" >&2
|
||||
failures=$((failures+1))
|
||||
fi
|
||||
done < <(awk '{print $1, $2}' checksums.txt)
|
||||
|
||||
if [[ $failures -gt 0 ]]; then
|
||||
echo "[fail] checksum verification failed ($failures file(s))" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "[ok] all available artifacts match checksums.txt"
|
||||
}
|
||||
|
||||
if [[ "$has_cosign" == true ]]; then
|
||||
verify_sigstore
|
||||
else
|
||||
echo "[verify] cosign not installed; skipping signature verification" >&2
|
||||
fi
|
||||
|
||||
verify_hashes
|
||||
|
||||
echo "[ok] release verification complete"
|
||||
|
|
@ -5,6 +5,10 @@
|
|||
|
||||
set -euo pipefail
|
||||
|
||||
make_target_exists() {
|
||||
make -n "$1" >/dev/null 2>&1
|
||||
}
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
|
|
@ -45,7 +49,7 @@ show_status() {
|
|||
|
||||
# Check Go apps
|
||||
print_app "Go Applications:"
|
||||
local go_apps=("api-server" "worker" "tui" "data_manager" "user_manager")
|
||||
local go_apps=("api-server" "worker" "tui")
|
||||
for app in "${go_apps[@]}"; do
|
||||
if [[ -f "bin/$app" ]]; then
|
||||
echo " ✅ $app: Built"
|
||||
|
|
@ -85,7 +89,7 @@ show_status() {
|
|||
|
||||
# Check configuration
|
||||
print_app "Configuration:"
|
||||
if [[ -f "configs/config-local.yaml" ]]; then
|
||||
if [[ -f "configs/api/dev.yaml" ]]; then
|
||||
echo " ✅ Security config: Found"
|
||||
else
|
||||
echo " ⚠️ Security config: Not found"
|
||||
|
|
@ -110,14 +114,14 @@ build_all() {
|
|||
echo "============================="
|
||||
echo ""
|
||||
|
||||
print_info "Building Go applications..."
|
||||
make build
|
||||
|
||||
if command -v zig &> /dev/null; then
|
||||
print_info "Building Zig CLI..."
|
||||
make cli-build
|
||||
print_info "Building all components (Go + Zig CLI)..."
|
||||
make build
|
||||
else
|
||||
print_warning "Zig not found, skipping CLI build"
|
||||
print_warning "Zig not found, building Go components only"
|
||||
go build -o bin/api-server cmd/api-server/main.go
|
||||
go build -o bin/worker cmd/worker/worker_server.go
|
||||
go build -o bin/tui ./cmd/tui
|
||||
fi
|
||||
|
||||
print_success "Build completed!"
|
||||
|
|
@ -128,11 +132,13 @@ test_all() {
|
|||
echo "===================="
|
||||
echo ""
|
||||
|
||||
print_info "Running main test suite..."
|
||||
make test
|
||||
|
||||
print_info "Running comprehensive tests..."
|
||||
make test-all
|
||||
if make_target_exists test-full; then
|
||||
print_info "Running full test suite..."
|
||||
make test-full
|
||||
else
|
||||
print_info "Running test suite..."
|
||||
make test
|
||||
fi
|
||||
|
||||
print_success "All tests completed!"
|
||||
}
|
||||
|
|
@ -156,8 +162,8 @@ start_services() {
|
|||
# Start API server if built
|
||||
if [[ -f "bin/api-server" ]]; then
|
||||
print_info "Starting API server..."
|
||||
if [[ -f "configs/config-local.yaml" ]]; then
|
||||
./bin/api-server --config configs/config-local.yaml &
|
||||
if [[ -f "configs/api/dev.yaml" ]]; then
|
||||
./bin/api-server --config configs/api/dev.yaml &
|
||||
else
|
||||
print_warning "No config found, using defaults"
|
||||
./bin/api-server &
|
||||
|
|
@ -187,13 +193,25 @@ check_health() {
|
|||
print_info "Port 9101 is open, checking API health endpoint..."
|
||||
|
||||
# Try the health endpoint
|
||||
response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null)
|
||||
local api_key_header=""
|
||||
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
|
||||
api_key_header="-H X-API-Key: ${FETCH_ML_API_KEY}"
|
||||
fi
|
||||
|
||||
response=$(curl -s --max-time 3 ${api_key_header} http://localhost:9101/health 2>/dev/null || true)
|
||||
if [[ -z "$response" ]]; then
|
||||
response=$(curl -k -s --max-time 3 ${api_key_header} https://localhost:9101/health 2>/dev/null || true)
|
||||
fi
|
||||
|
||||
if [[ "$response" == "OK" ]]; then
|
||||
print_success "API is healthy: $response"
|
||||
elif [[ "$response" == *"IP not whitelisted"* ]]; then
|
||||
print_warning "API running but IP not whitelisted (expected behavior)"
|
||||
print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health"
|
||||
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
|
||||
print_info "Try: curl -k -H 'X-API-Key: $FETCH_ML_API_KEY' https://localhost:9101/health"
|
||||
else
|
||||
print_info "Try: curl -k https://localhost:9101/health"
|
||||
fi
|
||||
else
|
||||
print_error "Unexpected response: $response"
|
||||
fi
|
||||
|
|
@ -229,19 +247,36 @@ run_security() {
|
|||
case "${1:-check}" in
|
||||
"check")
|
||||
print_info "Running security checks..."
|
||||
make security-check
|
||||
if make_target_exists security-check; then
|
||||
make security-check
|
||||
else
|
||||
print_warning "No 'security-check' Make target found"
|
||||
print_info "Try: make ci-local"
|
||||
fi
|
||||
;;
|
||||
"monitor")
|
||||
print_info "Starting security monitoring..."
|
||||
make security-monitor
|
||||
if make_target_exists security-monitor; then
|
||||
make security-monitor
|
||||
else
|
||||
print_warning "No 'security-monitor' Make target found"
|
||||
fi
|
||||
;;
|
||||
"deploy")
|
||||
print_info "Deploying with security..."
|
||||
make security-deploy
|
||||
if make_target_exists security-deploy; then
|
||||
make security-deploy
|
||||
else
|
||||
print_warning "No 'security-deploy' Make target found"
|
||||
fi
|
||||
;;
|
||||
"audit")
|
||||
print_info "Running security audit..."
|
||||
make security-audit
|
||||
if make_target_exists security-audit; then
|
||||
make security-audit
|
||||
else
|
||||
print_warning "No 'security-audit' Make target found"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 security {check|monitor|deploy|audit}"
|
||||
|
|
@ -258,15 +293,22 @@ run_development() {
|
|||
case "${1:-setup}" in
|
||||
"setup")
|
||||
print_info "Setting up development environment..."
|
||||
./scripts/auto_setup.sh
|
||||
print_warning "Legacy setup scripts were removed; using Makefile/deployments instead"
|
||||
print_info "Try: make dev"
|
||||
print_info "Or: ./deployments/deploy.sh dev up"
|
||||
;;
|
||||
"quick")
|
||||
print_info "Running quick start..."
|
||||
./scripts/quick_start.sh
|
||||
print_warning "Legacy quick start script was removed; using deployments instead"
|
||||
print_info "Try: ./deployments/deploy.sh dev up"
|
||||
;;
|
||||
"deps")
|
||||
print_info "Installing dependencies..."
|
||||
make install-deps
|
||||
if make_target_exists install-deps; then
|
||||
make install-deps
|
||||
else
|
||||
print_warning "No 'install-deps' Make target found"
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 dev {setup|quick|deps}"
|
||||
|
|
@ -309,7 +351,7 @@ cleanup() {
|
|||
echo ""
|
||||
|
||||
print_info "Cleaning project artifacts..."
|
||||
make clean-all
|
||||
make clean
|
||||
|
||||
print_info "Stopping services..."
|
||||
stop_services
|
||||
|
|
@ -330,7 +372,7 @@ show_help() {
|
|||
echo " start - Start all services"
|
||||
echo " stop - Stop all services"
|
||||
echo " health - Check API health endpoint"
|
||||
echo " security - Security management (check|monitor|deploy|audit)"
|
||||
echo " security - Security management (check|monitor|deploy|audit)"
|
||||
echo " dev - Development environment (setup|quick|deps)"
|
||||
echo " logs - Show application logs"
|
||||
echo " cleanup - Clean project artifacts and stop services"
|
||||
|
|
|
|||
|
|
@ -47,7 +47,10 @@ type Improvement struct {
|
|||
}
|
||||
|
||||
// NewPerformanceRegressionDetector creates a new detector instance
|
||||
func NewPerformanceRegressionDetector(baselineFile string, threshold float64) *PerformanceRegressionDetector {
|
||||
func NewPerformanceRegressionDetector(
|
||||
baselineFile string,
|
||||
threshold float64,
|
||||
) *PerformanceRegressionDetector {
|
||||
return &PerformanceRegressionDetector{
|
||||
BaselineFile: baselineFile,
|
||||
Threshold: threshold,
|
||||
|
|
@ -74,7 +77,9 @@ func (prd *PerformanceRegressionDetector) LoadBaseline() ([]BenchmarkResult, err
|
|||
}
|
||||
|
||||
// AnalyzeResults analyzes current results against baseline
|
||||
func (prd *PerformanceRegressionDetector) AnalyzeResults(current []BenchmarkResult) (*RegressionReport, error) {
|
||||
func (prd *PerformanceRegressionDetector) AnalyzeResults(
|
||||
current []BenchmarkResult,
|
||||
) (*RegressionReport, error) {
|
||||
baseline, err := prd.LoadBaseline()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load baseline: %w", err)
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue