chore(ops): reorganize deployments/monitoring and remove legacy scripts
This commit is contained in:
parent
5ef24e4c6d
commit
f726806770
101 changed files with 3598 additions and 4982 deletions
56
configs/api/dev.yaml
Normal file
56
configs/api/dev.yaml
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
base_path: "/data/experiments"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
server:
|
||||||
|
address: "0.0.0.0:9101"
|
||||||
|
tls:
|
||||||
|
enabled: false
|
||||||
|
cert_file: "/app/ssl/cert.pem"
|
||||||
|
key_file: "/app/ssl/key.pem"
|
||||||
|
|
||||||
|
security:
|
||||||
|
production_mode: false
|
||||||
|
allowed_origins:
|
||||||
|
- "http://localhost:3000"
|
||||||
|
api_key_rotation_days: 90
|
||||||
|
audit_logging:
|
||||||
|
enabled: true
|
||||||
|
log_path: "/tmp/fetchml-audit.log"
|
||||||
|
rate_limit:
|
||||||
|
enabled: false
|
||||||
|
requests_per_minute: 60
|
||||||
|
burst_size: 10
|
||||||
|
ip_whitelist: []
|
||||||
|
|
||||||
|
monitoring:
|
||||||
|
prometheus:
|
||||||
|
enabled: true
|
||||||
|
port: 9101
|
||||||
|
path: "/metrics"
|
||||||
|
health_checks:
|
||||||
|
enabled: true
|
||||||
|
interval: "30s"
|
||||||
|
|
||||||
|
redis:
|
||||||
|
addr: "redis:6379"
|
||||||
|
password: ""
|
||||||
|
db: 0
|
||||||
|
|
||||||
|
database:
|
||||||
|
type: "sqlite"
|
||||||
|
connection: "/tmp/fetchml.sqlite"
|
||||||
|
|
||||||
|
logging:
|
||||||
|
level: "info"
|
||||||
|
file: ""
|
||||||
|
audit_log: ""
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
71
configs/api/homelab-secure.yaml
Normal file
71
configs/api/homelab-secure.yaml
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
base_path: "/data/experiments"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
enabled: true
|
||||||
|
api_keys:
|
||||||
|
homelab_admin:
|
||||||
|
hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY"
|
||||||
|
admin: true
|
||||||
|
roles:
|
||||||
|
- admin
|
||||||
|
permissions:
|
||||||
|
"*": true
|
||||||
|
homelab_user:
|
||||||
|
hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY"
|
||||||
|
admin: false
|
||||||
|
roles:
|
||||||
|
- researcher
|
||||||
|
permissions:
|
||||||
|
experiments: true
|
||||||
|
datasets: true
|
||||||
|
jupyter: true
|
||||||
|
|
||||||
|
server:
|
||||||
|
address: ":9101"
|
||||||
|
tls:
|
||||||
|
enabled: false
|
||||||
|
cert_file: "/app/ssl/cert.pem"
|
||||||
|
key_file: "/app/ssl/key.pem"
|
||||||
|
|
||||||
|
security:
|
||||||
|
production_mode: true
|
||||||
|
allowed_origins:
|
||||||
|
- "https://ml-experiments.example.com"
|
||||||
|
rate_limit:
|
||||||
|
enabled: true
|
||||||
|
requests_per_minute: 60
|
||||||
|
burst_size: 10
|
||||||
|
ip_whitelist:
|
||||||
|
- "127.0.0.1"
|
||||||
|
- "192.168.0.0/16"
|
||||||
|
|
||||||
|
monitoring:
|
||||||
|
prometheus:
|
||||||
|
enabled: true
|
||||||
|
port: 9101
|
||||||
|
path: "/metrics"
|
||||||
|
health_checks:
|
||||||
|
enabled: true
|
||||||
|
interval: "30s"
|
||||||
|
|
||||||
|
redis:
|
||||||
|
url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379"
|
||||||
|
password: ""
|
||||||
|
db: 0
|
||||||
|
|
||||||
|
database:
|
||||||
|
type: "sqlite"
|
||||||
|
connection: "/data/experiments/fetch_ml.sqlite"
|
||||||
|
|
||||||
|
logging:
|
||||||
|
level: "info"
|
||||||
|
file: "/logs/fetch_ml.log"
|
||||||
|
audit_log: ""
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
74
configs/api/multi-user.yaml
Normal file
74
configs/api/multi-user.yaml
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
base_path: "/app/data/experiments"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
enabled: true
|
||||||
|
api_keys:
|
||||||
|
admin_user:
|
||||||
|
hash: "CHANGE_ME_SHA256_ADMIN_USER_KEY"
|
||||||
|
admin: true
|
||||||
|
roles: ["user", "admin"]
|
||||||
|
permissions:
|
||||||
|
"*": true
|
||||||
|
researcher1:
|
||||||
|
hash: "CHANGE_ME_SHA256_RESEARCHER1_KEY"
|
||||||
|
admin: false
|
||||||
|
roles: ["user", "researcher"]
|
||||||
|
permissions:
|
||||||
|
"jobs:read": true
|
||||||
|
"jobs:create": true
|
||||||
|
"jobs:update": true
|
||||||
|
"jobs:delete": false
|
||||||
|
analyst1:
|
||||||
|
hash: "CHANGE_ME_SHA256_ANALYST1_KEY"
|
||||||
|
admin: false
|
||||||
|
roles: ["user", "analyst"]
|
||||||
|
permissions:
|
||||||
|
"jobs:read": true
|
||||||
|
"jobs:create": false
|
||||||
|
"jobs:update": false
|
||||||
|
"jobs:delete": false
|
||||||
|
|
||||||
|
server:
|
||||||
|
address: ":9101"
|
||||||
|
tls:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
security:
|
||||||
|
production_mode: false
|
||||||
|
allowed_origins: []
|
||||||
|
rate_limit:
|
||||||
|
enabled: true
|
||||||
|
requests_per_minute: 60
|
||||||
|
burst_size: 20
|
||||||
|
ip_whitelist: []
|
||||||
|
|
||||||
|
monitoring:
|
||||||
|
prometheus:
|
||||||
|
enabled: true
|
||||||
|
port: 9101
|
||||||
|
path: "/metrics"
|
||||||
|
health_checks:
|
||||||
|
enabled: true
|
||||||
|
interval: "30s"
|
||||||
|
|
||||||
|
redis:
|
||||||
|
url: "redis://redis:6379"
|
||||||
|
password: ""
|
||||||
|
db: 0
|
||||||
|
|
||||||
|
database:
|
||||||
|
type: "sqlite"
|
||||||
|
connection: "/app/data/experiments/fetch_ml.sqlite"
|
||||||
|
|
||||||
|
logging:
|
||||||
|
level: "info"
|
||||||
|
file: "/logs/app.log"
|
||||||
|
audit_log: ""
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 3
|
||||||
|
desired_rps_per_worker: 3
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
59
configs/api/prod.yaml
Normal file
59
configs/api/prod.yaml
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
base_path: "/app/data/experiments"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
enabled: true
|
||||||
|
api_keys:
|
||||||
|
admin:
|
||||||
|
hash: "replace-with-sha256-of-your-api-key"
|
||||||
|
admin: true
|
||||||
|
roles:
|
||||||
|
- admin
|
||||||
|
permissions:
|
||||||
|
"*": true
|
||||||
|
|
||||||
|
server:
|
||||||
|
address: ":9101"
|
||||||
|
tls:
|
||||||
|
enabled: true
|
||||||
|
cert_file: "/app/ssl/cert.pem"
|
||||||
|
key_file: "/app/ssl/key.pem"
|
||||||
|
|
||||||
|
security:
|
||||||
|
production_mode: false
|
||||||
|
allowed_origins: []
|
||||||
|
rate_limit:
|
||||||
|
enabled: true
|
||||||
|
requests_per_minute: 60
|
||||||
|
burst_size: 10
|
||||||
|
ip_whitelist: []
|
||||||
|
|
||||||
|
monitoring:
|
||||||
|
prometheus:
|
||||||
|
enabled: true
|
||||||
|
port: 9101
|
||||||
|
path: "/metrics"
|
||||||
|
health_checks:
|
||||||
|
enabled: true
|
||||||
|
interval: "30s"
|
||||||
|
|
||||||
|
redis:
|
||||||
|
addr: "redis:6379"
|
||||||
|
password: ""
|
||||||
|
db: 0
|
||||||
|
|
||||||
|
database:
|
||||||
|
type: "sqlite"
|
||||||
|
connection: "/app/data/experiments/fetch_ml.sqlite"
|
||||||
|
|
||||||
|
logging:
|
||||||
|
level: "info"
|
||||||
|
file: "/logs/fetch_ml.log"
|
||||||
|
audit_log: ""
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 2
|
||||||
|
desired_rps_per_worker: 5
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
# Local development config (TOML)
|
|
||||||
# Used by both CLI and TUI when no overrides are set
|
|
||||||
|
|
||||||
worker_host = "127.0.0.1"
|
|
||||||
worker_user = "dev_user"
|
|
||||||
worker_base = "/tmp/ml-experiments"
|
|
||||||
worker_port = 9101
|
|
||||||
api_key = "your-api-key-here"
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
dev_user:
|
|
||||||
hash: "replace-with-sha256-of-your-api-key"
|
|
||||||
admin: true
|
|
||||||
roles:
|
|
||||||
- admin
|
|
||||||
permissions:
|
|
||||||
'*': true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: info
|
|
||||||
console: true
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
base_path: "/app/data/experiments"
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
|
|
||||||
database:
|
|
||||||
type: "sqlite"
|
|
||||||
connection: "/app/data/experiments/fetch_ml.db"
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "debug"
|
|
||||||
|
|
@ -1,46 +0,0 @@
|
||||||
base_path: "/app/data/experiments"
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
homelab_user:
|
|
||||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
|
||||||
admin: true
|
|
||||||
roles: ["user", "admin"]
|
|
||||||
permissions:
|
|
||||||
read: true
|
|
||||||
write: true
|
|
||||||
delete: true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: true
|
|
||||||
cert_file: "/app/ssl/cert.pem"
|
|
||||||
key_file: "/app/ssl/key.pem"
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 30
|
|
||||||
ip_whitelist: []
|
|
||||||
|
|
||||||
# SQLite database for persistence
|
|
||||||
database:
|
|
||||||
type: "sqlite"
|
|
||||||
connection: "/app/data/fetch_ml.db"
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 10
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "/app/logs/app.log"
|
|
||||||
audit_file: "/app/logs/audit.log"
|
|
||||||
|
|
||||||
resources:
|
|
||||||
max_workers: 1
|
|
||||||
desired_rps_per_worker: 2
|
|
||||||
podman_cpus: "2"
|
|
||||||
podman_memory: "8g"
|
|
||||||
|
|
@ -1,39 +0,0 @@
|
||||||
base_path: "/app/data/experiments"
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
homelab_user:
|
|
||||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
|
||||||
admin: true
|
|
||||||
roles: ["user", "admin"]
|
|
||||||
permissions:
|
|
||||||
read: true
|
|
||||||
write: true
|
|
||||||
delete: true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: true
|
|
||||||
cert_file: "/app/ssl/cert.pem"
|
|
||||||
key_file: "/app/ssl/key.pem"
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 30
|
|
||||||
ip_whitelist:
|
|
||||||
- "127.0.0.1"
|
|
||||||
- "::1"
|
|
||||||
- "192.168.0.0/16"
|
|
||||||
- "10.0.0.0/8"
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 10
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "/app/logs/app.log"
|
|
||||||
audit_file: "/app/logs/audit.log"
|
|
||||||
|
|
@ -1,58 +0,0 @@
|
||||||
# Secure Homelab Configuration
|
|
||||||
# IMPORTANT: Keep your API keys safe and never share them!
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 10
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
homelab_admin:
|
|
||||||
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
|
|
||||||
admin: true
|
|
||||||
roles:
|
|
||||||
- admin
|
|
||||||
permissions:
|
|
||||||
'*': true
|
|
||||||
homelab_user:
|
|
||||||
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
|
|
||||||
admin: false
|
|
||||||
roles:
|
|
||||||
- researcher
|
|
||||||
permissions:
|
|
||||||
'experiments': true
|
|
||||||
'datasets': true
|
|
||||||
'jupyter': true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: true
|
|
||||||
key_file: "/app/ssl/key.pem"
|
|
||||||
cert_file: "/app/ssl/cert.pem"
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 60
|
|
||||||
burst_size: 10
|
|
||||||
ip_whitelist: []
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "logs/fetch_ml.log"
|
|
||||||
console: true
|
|
||||||
|
|
||||||
resources:
|
|
||||||
cpu_limit: "2"
|
|
||||||
memory_limit: "4Gi"
|
|
||||||
gpu_limit: 0
|
|
||||||
disk_limit: "10Gi"
|
|
||||||
|
|
||||||
# Prometheus metrics
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
listen_addr: ":9100"
|
|
||||||
tls:
|
|
||||||
enabled: false
|
|
||||||
|
|
@ -1,49 +0,0 @@
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 10
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
homelab_admin:
|
|
||||||
hash: b444f7d99edd0e32c838d900c4f0dfab86690b55871b587b730f3bc84812dd5f
|
|
||||||
admin: true
|
|
||||||
roles:
|
|
||||||
- admin
|
|
||||||
permissions:
|
|
||||||
'*': true
|
|
||||||
homelab_user:
|
|
||||||
hash: 5badb9721b0cb19f5be512854885cadbc7490afc0de1f62db5ae3144c6cc294c
|
|
||||||
admin: false
|
|
||||||
roles:
|
|
||||||
- researcher
|
|
||||||
permissions:
|
|
||||||
'experiments': true
|
|
||||||
'datasets': true
|
|
||||||
'jupyter': true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: true
|
|
||||||
cert_file: "/app/ssl/cert.pem"
|
|
||||||
key_file: "/app/ssl/key.pem"
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 60
|
|
||||||
burst_size: 10
|
|
||||||
ip_whitelist:
|
|
||||||
- "127.0.0.1"
|
|
||||||
- "::1"
|
|
||||||
- "172.21.0.1" # Docker gateway
|
|
||||||
|
|
||||||
# Prometheus metrics
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
listen_addr: ":9100"
|
|
||||||
tls:
|
|
||||||
enabled: true
|
|
||||||
cert_file: "/app/ssl/cert.pem"
|
|
||||||
key_file: "/app/ssl/key.pem"
|
|
||||||
|
|
@ -1,78 +0,0 @@
|
||||||
base_path: "/app/data/experiments"
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
admin_user:
|
|
||||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
|
||||||
admin: true
|
|
||||||
roles: ["user", "admin"]
|
|
||||||
permissions:
|
|
||||||
read: true
|
|
||||||
write: true
|
|
||||||
delete: true
|
|
||||||
researcher1:
|
|
||||||
hash: "ef92b778ba7a6c8f2150019a5678047b6a9a2b95cef8189518f9b35c54d2e3ae" # "research123"
|
|
||||||
admin: false
|
|
||||||
roles: ["user", "researcher"]
|
|
||||||
permissions:
|
|
||||||
jobs:read: true
|
|
||||||
jobs:create: true
|
|
||||||
jobs:update: true
|
|
||||||
jobs:delete: false
|
|
||||||
analyst1:
|
|
||||||
hash: "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3" # "analyst123"
|
|
||||||
admin: false
|
|
||||||
roles: ["user", "analyst"]
|
|
||||||
permissions:
|
|
||||||
jobs:read: true
|
|
||||||
jobs:create: false
|
|
||||||
jobs:update: false
|
|
||||||
jobs:delete: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: false
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 60
|
|
||||||
burst_size: 20
|
|
||||||
ip_whitelist: []
|
|
||||||
cors:
|
|
||||||
enabled: true
|
|
||||||
allowed_origins: ["https://localhost:9103", "https://localhost:3000"]
|
|
||||||
allowed_methods: ["GET", "POST", "PUT", "DELETE", "OPTIONS"]
|
|
||||||
allowed_headers: ["Content-Type", "Authorization"]
|
|
||||||
|
|
||||||
database:
|
|
||||||
type: "sqlite"
|
|
||||||
connection: "/app/data/experiments/fetch_ml.db"
|
|
||||||
max_connections: 20
|
|
||||||
connection_timeout: "30s"
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 15
|
|
||||||
connection_timeout: "10s"
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "/app/logs/app.log"
|
|
||||||
max_size: "100MB"
|
|
||||||
max_backups: 5
|
|
||||||
compress: true
|
|
||||||
|
|
||||||
resources:
|
|
||||||
max_workers: 3
|
|
||||||
desired_rps_per_worker: 3
|
|
||||||
podman_cpus: "2"
|
|
||||||
podman_memory: "4g"
|
|
||||||
job_timeout: "30m"
|
|
||||||
|
|
||||||
monitoring:
|
|
||||||
enabled: true
|
|
||||||
metrics_path: "/metrics"
|
|
||||||
health_check_interval: "30s"
|
|
||||||
|
|
@ -1,59 +0,0 @@
|
||||||
base_path: "./data/ml-experiments"
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
apikeys:
|
|
||||||
homelab_user:
|
|
||||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
|
||||||
admin: true
|
|
||||||
roles: ["admin"]
|
|
||||||
permissions:
|
|
||||||
read: true
|
|
||||||
write: true
|
|
||||||
delete: true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: false # Disabled for local testing
|
|
||||||
cert_file: "./ssl/cert.pem"
|
|
||||||
key_file: "./ssl/key.pem"
|
|
||||||
min_version: "1.3"
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 60
|
|
||||||
burst_size: 10
|
|
||||||
ip_whitelist:
|
|
||||||
- "127.0.0.1"
|
|
||||||
- "::1"
|
|
||||||
- "localhost"
|
|
||||||
- "10.0.0.0/8"
|
|
||||||
- "192.168.0.0/16"
|
|
||||||
- "172.16.0.0/12"
|
|
||||||
failed_login_lockout:
|
|
||||||
enabled: true
|
|
||||||
max_attempts: 5
|
|
||||||
lockout_duration: "15m"
|
|
||||||
|
|
||||||
# SQLite database for production
|
|
||||||
database:
|
|
||||||
type: "sqlite"
|
|
||||||
connection: "data/fetch_ml.db"
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://localhost:6379"
|
|
||||||
addr: "localhost:6379"
|
|
||||||
password: "JZVd2Y6IDaLNaYLBOFgQ7ae4Ox5t37NTIyPMQlLJD4k="
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "logs/fetch_ml.log"
|
|
||||||
audit_log: "logs/audit.log"
|
|
||||||
|
|
||||||
resources:
|
|
||||||
max_workers: 2
|
|
||||||
desired_rps_per_worker: 5
|
|
||||||
podman_cpus: "8"
|
|
||||||
podman_memory: "32g"
|
|
||||||
|
|
@ -1,13 +1,17 @@
|
||||||
# Fetch ML Configuration Example for PostgreSQL
|
# Fetch ML Configuration Example for PostgreSQL
|
||||||
# This example shows how to configure Fetch ML to use PostgreSQL as the database
|
# This example shows how to configure Fetch ML to use PostgreSQL as the database
|
||||||
|
|
||||||
|
base_path: "./data/experiments"
|
||||||
|
|
||||||
auth:
|
auth:
|
||||||
enabled: true
|
enabled: true
|
||||||
apikeys:
|
api_keys:
|
||||||
admin:
|
admin:
|
||||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password"
|
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password"
|
||||||
admin: true
|
admin: true
|
||||||
roles: ["admin"]
|
roles: ["admin"]
|
||||||
|
permissions:
|
||||||
|
"*": true
|
||||||
|
|
||||||
server:
|
server:
|
||||||
address: ":9101"
|
address: ":9101"
|
||||||
|
|
@ -25,40 +29,34 @@ database:
|
||||||
# connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
|
# connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable"
|
||||||
|
|
||||||
redis:
|
redis:
|
||||||
host: "localhost"
|
addr: "localhost:6379"
|
||||||
port: 6379
|
|
||||||
password: ""
|
password: ""
|
||||||
db: 0
|
db: 0
|
||||||
pool_size: 10
|
|
||||||
max_retries: 3
|
|
||||||
|
|
||||||
logging:
|
logging:
|
||||||
level: "info"
|
level: "info"
|
||||||
console: true
|
file: ""
|
||||||
format: "text"
|
audit_log: ""
|
||||||
|
|
||||||
security:
|
security:
|
||||||
secret_key: "your-secret-key-here-at-least-16-characters"
|
production_mode: false
|
||||||
jwt_expiry: "24h"
|
|
||||||
rate_limit:
|
rate_limit:
|
||||||
enabled: false
|
enabled: false
|
||||||
requests_per_minute: 60
|
requests_per_minute: 60
|
||||||
burst_size: 10
|
burst_size: 10
|
||||||
|
ip_whitelist: []
|
||||||
|
|
||||||
containers:
|
monitoring:
|
||||||
runtime: "podman"
|
prometheus:
|
||||||
registry: "docker.io"
|
|
||||||
pull_policy: "missing"
|
|
||||||
resources:
|
|
||||||
cpu_limit: "2"
|
|
||||||
memory_limit: "4Gi"
|
|
||||||
gpu_limit: 1
|
|
||||||
|
|
||||||
storage:
|
|
||||||
data_path: "data"
|
|
||||||
results_path: "results"
|
|
||||||
temp_path: "/tmp/fetch_ml"
|
|
||||||
cleanup:
|
|
||||||
enabled: true
|
enabled: true
|
||||||
max_age_hours: 168
|
port: 9101
|
||||||
max_size_gb: 10
|
path: "/metrics"
|
||||||
|
health_checks:
|
||||||
|
enabled: true
|
||||||
|
interval: "30s"
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
# Fetch ML Configuration Example
|
# Fetch ML Configuration Example
|
||||||
# Copy this file to config.yaml and customize for your environment
|
# Copy this file to config.yaml and customize for your environment
|
||||||
|
|
||||||
|
base_path: "./data/experiments"
|
||||||
|
|
||||||
auth:
|
auth:
|
||||||
enabled: true
|
enabled: true
|
||||||
api_keys:
|
api_keys:
|
||||||
|
|
@ -13,54 +15,43 @@ auth:
|
||||||
"*": true
|
"*": true
|
||||||
|
|
||||||
server:
|
server:
|
||||||
host: "localhost"
|
address: ":9101"
|
||||||
port: 8080
|
tls:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
database:
|
database:
|
||||||
type: "sqlite"
|
type: "sqlite"
|
||||||
connection: "data/fetch_ml.db"
|
connection: "data/fetch_ml.db"
|
||||||
host: ""
|
|
||||||
port: 5432
|
|
||||||
username: ""
|
|
||||||
password: ""
|
|
||||||
database: "fetch_ml"
|
|
||||||
|
|
||||||
redis:
|
redis:
|
||||||
url: "redis://localhost:6379"
|
addr: "localhost:6379"
|
||||||
host: "localhost"
|
|
||||||
port: 6379
|
|
||||||
password: ""
|
password: ""
|
||||||
db: 0
|
db: 0
|
||||||
pool_size: 10
|
|
||||||
max_retries: 3
|
|
||||||
|
|
||||||
logging:
|
logging:
|
||||||
level: "info"
|
level: "info"
|
||||||
file: "logs/fetch_ml.log"
|
file: "logs/fetch_ml.log"
|
||||||
format: "text"
|
audit_log: "logs/audit.log"
|
||||||
console: true
|
|
||||||
|
|
||||||
security:
|
security:
|
||||||
secret_key: "your-secret-key-at-least-16-chars"
|
|
||||||
jwt_expiry: "24h"
|
|
||||||
rate_limit:
|
rate_limit:
|
||||||
enabled: false
|
enabled: false
|
||||||
requests_per_minute: 60
|
requests_per_minute: 60
|
||||||
|
burst_size: 10
|
||||||
|
ip_whitelist: []
|
||||||
|
production_mode: false
|
||||||
|
|
||||||
containers:
|
monitoring:
|
||||||
runtime: "podman"
|
prometheus:
|
||||||
registry: "docker.io"
|
|
||||||
pull_policy: "missing"
|
|
||||||
resources:
|
|
||||||
cpu_limit: "2"
|
|
||||||
memory_limit: "4Gi"
|
|
||||||
gpu_limit: 1
|
|
||||||
|
|
||||||
storage:
|
|
||||||
data_path: "data"
|
|
||||||
results_path: "results"
|
|
||||||
temp_path: "/tmp/fetch_ml"
|
|
||||||
cleanup:
|
|
||||||
enabled: true
|
enabled: true
|
||||||
max_age_hours: 168
|
port: 9101
|
||||||
max_size_gb: 10
|
path: "/metrics"
|
||||||
|
health_checks:
|
||||||
|
enabled: true
|
||||||
|
interval: "30s"
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,10 @@ properties:
|
||||||
type: string
|
type: string
|
||||||
description: Base path for experiment data
|
description: Base path for experiment data
|
||||||
default: "/tmp/ml-experiments"
|
default: "/tmp/ml-experiments"
|
||||||
|
data_dir:
|
||||||
|
type: string
|
||||||
|
description: Data directory (datasets/snapshots) for integrity validation
|
||||||
|
default: "/data/active"
|
||||||
auth:
|
auth:
|
||||||
type: object
|
type: object
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
|
|
@ -40,7 +44,6 @@ properties:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
type: string
|
type: string
|
||||||
enum: [admin, data_scientist, data_engineer, viewer, operator]
|
|
||||||
permissions:
|
permissions:
|
||||||
type: object
|
type: object
|
||||||
additionalProperties:
|
additionalProperties:
|
||||||
|
|
@ -64,9 +67,30 @@ properties:
|
||||||
type: string
|
type: string
|
||||||
key_file:
|
key_file:
|
||||||
type: string
|
type: string
|
||||||
min_version:
|
monitoring:
|
||||||
|
type: object
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
prometheus:
|
||||||
|
type: object
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
enabled:
|
||||||
|
type: boolean
|
||||||
|
port:
|
||||||
|
type: integer
|
||||||
|
minimum: 1
|
||||||
|
maximum: 65535
|
||||||
|
path:
|
||||||
|
type: string
|
||||||
|
health_checks:
|
||||||
|
type: object
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
enabled:
|
||||||
|
type: boolean
|
||||||
|
interval:
|
||||||
type: string
|
type: string
|
||||||
description: Minimum TLS version (e.g. "1.3")
|
|
||||||
database:
|
database:
|
||||||
type: object
|
type: object
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
|
|
@ -99,58 +123,56 @@ properties:
|
||||||
addr:
|
addr:
|
||||||
type: string
|
type: string
|
||||||
description: Optional host:port shorthand for Redis
|
description: Optional host:port shorthand for Redis
|
||||||
host:
|
|
||||||
type: string
|
|
||||||
default: "localhost"
|
|
||||||
port:
|
|
||||||
type: integer
|
|
||||||
minimum: 1
|
|
||||||
maximum: 65535
|
|
||||||
default: 6379
|
|
||||||
password:
|
password:
|
||||||
type: string
|
type: string
|
||||||
db:
|
db:
|
||||||
type: integer
|
type: integer
|
||||||
minimum: 0
|
minimum: 0
|
||||||
default: 0
|
default: 0
|
||||||
pool_size:
|
queue:
|
||||||
type: integer
|
type: object
|
||||||
minimum: 1
|
additionalProperties: false
|
||||||
default: 10
|
properties:
|
||||||
max_retries:
|
backend:
|
||||||
type: integer
|
type: string
|
||||||
minimum: 0
|
enum: [redis, sqlite]
|
||||||
default: 3
|
default: redis
|
||||||
|
sqlite_path:
|
||||||
|
type: string
|
||||||
logging:
|
logging:
|
||||||
type: object
|
type: object
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
properties:
|
properties:
|
||||||
level:
|
level:
|
||||||
type: string
|
type: string
|
||||||
enum: [debug, info, warn, error, fatal]
|
enum: [debug, info, warn, error]
|
||||||
default: "info"
|
default: "info"
|
||||||
file:
|
file:
|
||||||
type: string
|
type: string
|
||||||
audit_log:
|
audit_log:
|
||||||
type: string
|
type: string
|
||||||
format:
|
|
||||||
type: string
|
|
||||||
enum: [text, json]
|
|
||||||
default: "text"
|
|
||||||
console:
|
|
||||||
type: boolean
|
|
||||||
default: true
|
|
||||||
security:
|
security:
|
||||||
type: object
|
type: object
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
properties:
|
properties:
|
||||||
secret_key:
|
production_mode:
|
||||||
type: string
|
type: boolean
|
||||||
minLength: 16
|
default: false
|
||||||
jwt_expiry:
|
allowed_origins:
|
||||||
type: string
|
type: array
|
||||||
pattern: "^\\d+[smhd]$"
|
items:
|
||||||
default: "24h"
|
type: string
|
||||||
|
api_key_rotation_days:
|
||||||
|
type: integer
|
||||||
|
minimum: 0
|
||||||
|
audit_logging:
|
||||||
|
type: object
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
enabled:
|
||||||
|
type: boolean
|
||||||
|
log_path:
|
||||||
|
type: string
|
||||||
ip_whitelist:
|
ip_whitelist:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
|
|
@ -183,23 +205,23 @@ properties:
|
||||||
minimum: 1
|
minimum: 1
|
||||||
resources:
|
resources:
|
||||||
type: object
|
type: object
|
||||||
description: Resource configuration defaults
|
description: Resource configuration
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
properties:
|
properties:
|
||||||
cpu_limit:
|
max_workers:
|
||||||
type: string
|
|
||||||
description: Default CPU limit (e.g., "2" or "500m")
|
|
||||||
default: "2"
|
|
||||||
memory_limit:
|
|
||||||
type: string
|
|
||||||
description: Default memory limit (e.g., "1Gi" or "512Mi")
|
|
||||||
default: "4Gi"
|
|
||||||
gpu_limit:
|
|
||||||
type: integer
|
type: integer
|
||||||
description: Default GPU limit
|
minimum: 1
|
||||||
minimum: 0
|
default: 1
|
||||||
default: 0
|
desired_rps_per_worker:
|
||||||
disk_limit:
|
type: integer
|
||||||
|
minimum: 1
|
||||||
|
requests_per_sec:
|
||||||
|
type: integer
|
||||||
|
minimum: 1
|
||||||
|
podman_cpus:
|
||||||
type: string
|
type: string
|
||||||
description: Default disk limit
|
podman_memory:
|
||||||
default: "10Gi"
|
type: string
|
||||||
|
request_burst:
|
||||||
|
type: integer
|
||||||
|
minimum: 0
|
||||||
|
|
|
||||||
|
|
@ -2,10 +2,28 @@ $schema: "http://json-schema.org/draft-07/schema#"
|
||||||
title: "Fetch ML Worker Configuration"
|
title: "Fetch ML Worker Configuration"
|
||||||
type: object
|
type: object
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
|
allOf:
|
||||||
|
# forbid both index and UUID at once (allow zero or one)
|
||||||
|
- not:
|
||||||
|
required: [gpu_visible_devices, gpu_visible_device_ids]
|
||||||
|
- if:
|
||||||
|
properties:
|
||||||
|
queue:
|
||||||
|
properties:
|
||||||
|
backend:
|
||||||
|
const: sqlite
|
||||||
|
required: [queue]
|
||||||
|
then:
|
||||||
|
properties:
|
||||||
|
queue:
|
||||||
|
required: [sqlite_path]
|
||||||
|
else:
|
||||||
|
anyOf:
|
||||||
|
- required: [redis_addr]
|
||||||
|
- required: [redis_url]
|
||||||
required:
|
required:
|
||||||
- base_path
|
- base_path
|
||||||
- worker_id
|
- worker_id
|
||||||
- redis_addr
|
|
||||||
- podman_image
|
- podman_image
|
||||||
- container_workspace
|
- container_workspace
|
||||||
- container_results
|
- container_results
|
||||||
|
|
@ -31,6 +49,9 @@ properties:
|
||||||
train_script:
|
train_script:
|
||||||
type: string
|
type: string
|
||||||
description: Path to training script
|
description: Path to training script
|
||||||
|
redis_url:
|
||||||
|
type: string
|
||||||
|
description: Legacy Redis URL (if set, redis_addr/password/db are derived)
|
||||||
redis_addr:
|
redis_addr:
|
||||||
type: string
|
type: string
|
||||||
description: Redis server address
|
description: Redis server address
|
||||||
|
|
@ -42,6 +63,18 @@ properties:
|
||||||
minimum: 0
|
minimum: 0
|
||||||
default: 0
|
default: 0
|
||||||
description: Redis database number
|
description: Redis database number
|
||||||
|
queue:
|
||||||
|
type: object
|
||||||
|
description: Queue backend configuration (optional; defaults to redis)
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
backend:
|
||||||
|
type: string
|
||||||
|
enum: [redis, sqlite]
|
||||||
|
default: redis
|
||||||
|
sqlite_path:
|
||||||
|
type: string
|
||||||
|
description: Path to queue.db (sqlite backend only)
|
||||||
known_hosts:
|
known_hosts:
|
||||||
type: string
|
type: string
|
||||||
description: Path to SSH known hosts file
|
description: Path to SSH known hosts file
|
||||||
|
|
@ -116,6 +149,48 @@ properties:
|
||||||
type: string
|
type: string
|
||||||
description: Dataset cache TTL duration
|
description: Dataset cache TTL duration
|
||||||
default: "30m"
|
default: "30m"
|
||||||
|
snapshot_store:
|
||||||
|
type: object
|
||||||
|
description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id)
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
enabled:
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
endpoint:
|
||||||
|
type: string
|
||||||
|
description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
|
||||||
|
secure:
|
||||||
|
type: boolean
|
||||||
|
default: true
|
||||||
|
region:
|
||||||
|
type: string
|
||||||
|
bucket:
|
||||||
|
type: string
|
||||||
|
prefix:
|
||||||
|
type: string
|
||||||
|
description: Object key prefix where snapshots are stored
|
||||||
|
access_key:
|
||||||
|
type: string
|
||||||
|
description: Optional static access key (otherwise uses env credentials)
|
||||||
|
secret_key:
|
||||||
|
type: string
|
||||||
|
description: Optional static secret key (otherwise uses env credentials)
|
||||||
|
session_token:
|
||||||
|
type: string
|
||||||
|
description: Optional session token for temporary credentials
|
||||||
|
timeout:
|
||||||
|
type: string
|
||||||
|
description: Duration string (e.g., "10m")
|
||||||
|
default: "10m"
|
||||||
|
max_retries:
|
||||||
|
type: integer
|
||||||
|
minimum: 0
|
||||||
|
default: 3
|
||||||
|
prewarm_enabled:
|
||||||
|
type: boolean
|
||||||
|
description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off.
|
||||||
|
default: false
|
||||||
podman_image:
|
podman_image:
|
||||||
type: string
|
type: string
|
||||||
minLength: 1
|
minLength: 1
|
||||||
|
|
@ -126,10 +201,40 @@ properties:
|
||||||
container_results:
|
container_results:
|
||||||
type: string
|
type: string
|
||||||
description: Container results path
|
description: Container results path
|
||||||
gpu_access:
|
gpu_devices:
|
||||||
type: boolean
|
type: array
|
||||||
default: false
|
description: GPU device paths to expose to the container (e.g. ["/dev/dri"]).
|
||||||
description: Enable GPU access
|
items:
|
||||||
|
type: string
|
||||||
|
gpu_vendor:
|
||||||
|
type: string
|
||||||
|
enum: [nvidia, amd, apple, none]
|
||||||
|
description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none).
|
||||||
|
default: "none"
|
||||||
|
gpu_visible_devices:
|
||||||
|
type: array
|
||||||
|
description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
gpu_visible_device_ids:
|
||||||
|
type: array
|
||||||
|
description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
apple_gpu:
|
||||||
|
type: object
|
||||||
|
description: Apple M-series GPU configuration
|
||||||
|
additionalProperties: false
|
||||||
|
properties:
|
||||||
|
enabled:
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
metal_device:
|
||||||
|
type: string
|
||||||
|
description: Path to Metal device node (e.g. /dev/metal)
|
||||||
|
mps_runtime:
|
||||||
|
type: string
|
||||||
|
description: Path to MPS runtime device node (e.g. /dev/mps)
|
||||||
task_lease_duration:
|
task_lease_duration:
|
||||||
type: string
|
type: string
|
||||||
description: Task lease duration
|
description: Task lease duration
|
||||||
|
|
|
||||||
58
configs/workers/docker-dev.yaml
Normal file
58
configs/workers/docker-dev.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
worker_id: "docker-worker"
|
||||||
|
base_path: "/data/experiments"
|
||||||
|
train_script: "train.py"
|
||||||
|
|
||||||
|
redis_url: "redis://redis:6379/0"
|
||||||
|
|
||||||
|
local_mode: true
|
||||||
|
|
||||||
|
prewarm_enabled: true
|
||||||
|
|
||||||
|
max_workers: 1
|
||||||
|
poll_interval_seconds: 2
|
||||||
|
|
||||||
|
auto_fetch_data: false
|
||||||
|
|
||||||
|
data_manager_path: "./data_manager"
|
||||||
|
dataset_cache_ttl: "30m"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
snapshot_store:
|
||||||
|
enabled: true
|
||||||
|
endpoint: "minio:9000"
|
||||||
|
secure: false
|
||||||
|
bucket: "fetchml-snapshots"
|
||||||
|
prefix: "snapshots"
|
||||||
|
timeout: "2m"
|
||||||
|
max_retries: 3
|
||||||
|
|
||||||
|
podman_image: "python:3.9-slim"
|
||||||
|
container_workspace: "/workspace"
|
||||||
|
container_results: "/results"
|
||||||
|
gpu_devices:
|
||||||
|
- "/dev/dri"
|
||||||
|
gpu_vendor: "apple"
|
||||||
|
gpu_visible_devices: []
|
||||||
|
|
||||||
|
# Apple M-series GPU configuration
|
||||||
|
apple_gpu:
|
||||||
|
enabled: true
|
||||||
|
metal_device: "/dev/metal"
|
||||||
|
mps_runtime: "/dev/mps"
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
listen_addr: ":9100"
|
||||||
|
metrics_flush_interval: "500ms"
|
||||||
|
|
||||||
|
task_lease_duration: "30m"
|
||||||
|
heartbeat_interval: "1m"
|
||||||
|
max_retries: 3
|
||||||
|
graceful_timeout: "5m"
|
||||||
50
configs/workers/docker-prod.yaml
Normal file
50
configs/workers/docker-prod.yaml
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
worker_id: "docker-worker"
|
||||||
|
base_path: "/tmp/fetchml-jobs"
|
||||||
|
train_script: "train.py"
|
||||||
|
|
||||||
|
redis_url: "redis://redis:6379/0"
|
||||||
|
|
||||||
|
local_mode: true
|
||||||
|
|
||||||
|
max_workers: 1
|
||||||
|
poll_interval_seconds: 2
|
||||||
|
|
||||||
|
auto_fetch_data: false
|
||||||
|
|
||||||
|
data_manager_path: "./data_manager"
|
||||||
|
dataset_cache_ttl: "30m"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
snapshot_store:
|
||||||
|
enabled: true
|
||||||
|
endpoint: "minio:9000"
|
||||||
|
secure: false
|
||||||
|
bucket: "fetchml-snapshots"
|
||||||
|
prefix: "snapshots"
|
||||||
|
timeout: "5m"
|
||||||
|
max_retries: 3
|
||||||
|
|
||||||
|
podman_image: "python:3.9-slim"
|
||||||
|
container_workspace: "/workspace"
|
||||||
|
container_results: "/results"
|
||||||
|
gpu_vendor: "nvidia"
|
||||||
|
gpu_visible_devices: [0]
|
||||||
|
gpu_devices: ["/dev/nvidia0"]
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
listen_addr: ":9100"
|
||||||
|
metrics_flush_interval: "500ms"
|
||||||
|
|
||||||
|
task_lease_duration: "30m"
|
||||||
|
heartbeat_interval: "1m"
|
||||||
|
max_retries: 3
|
||||||
|
graceful_timeout: "5m"
|
||||||
43
configs/workers/docker.yaml
Normal file
43
configs/workers/docker.yaml
Normal file
|
|
@ -0,0 +1,43 @@
|
||||||
|
worker_id: "docker-worker"
|
||||||
|
base_path: "/tmp/fetchml-jobs"
|
||||||
|
train_script: "train.py"
|
||||||
|
|
||||||
|
redis_addr: "redis:6379"
|
||||||
|
redis_password: ""
|
||||||
|
redis_db: 0
|
||||||
|
|
||||||
|
local_mode: true
|
||||||
|
|
||||||
|
max_workers: 1
|
||||||
|
poll_interval_seconds: 5
|
||||||
|
|
||||||
|
auto_fetch_data: false
|
||||||
|
|
||||||
|
data_manager_path: "./data_manager"
|
||||||
|
dataset_cache_ttl: "30m"
|
||||||
|
|
||||||
|
snapshot_store:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
podman_image: "python:3.9-slim"
|
||||||
|
container_workspace: "/workspace"
|
||||||
|
container_results: "/results"
|
||||||
|
gpu_devices: []
|
||||||
|
gpu_vendor: "none"
|
||||||
|
gpu_visible_devices: []
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
listen_addr: ":9100"
|
||||||
|
metrics_flush_interval: "500ms"
|
||||||
|
|
||||||
|
task_lease_duration: "30m"
|
||||||
|
heartbeat_interval: "1m"
|
||||||
|
max_retries: 3
|
||||||
|
graceful_timeout: "5m"
|
||||||
27
configs/workers/examples/prewarm-worker.yaml
Normal file
27
configs/workers/examples/prewarm-worker.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
worker_id: "test-prewarm-worker"
|
||||||
|
host: "localhost"
|
||||||
|
port: 8081
|
||||||
|
base_path: "/tmp/fetch-ml-test"
|
||||||
|
data_dir: "/tmp/fetch-ml-test/data"
|
||||||
|
max_workers: 2
|
||||||
|
local_mode: true
|
||||||
|
auto_fetch_data: true
|
||||||
|
prewarm_enabled: true
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
listen_addr: ":9102"
|
||||||
|
train_script: "train.py"
|
||||||
|
snapshot_store:
|
||||||
|
enabled: false
|
||||||
|
endpoint: ""
|
||||||
|
secure: false
|
||||||
|
region: ""
|
||||||
|
bucket: ""
|
||||||
|
prefix: ""
|
||||||
|
access_key: ""
|
||||||
|
secret_key: ""
|
||||||
|
session_token: ""
|
||||||
|
max_retries: 3
|
||||||
|
timeout: 0s
|
||||||
|
gpu_devices: []
|
||||||
|
gpu_access: "none"
|
||||||
47
configs/workers/homelab-secure.yaml
Normal file
47
configs/workers/homelab-secure.yaml
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
worker_id: "homelab-worker"
|
||||||
|
base_path: "/tmp/fetchml-jobs"
|
||||||
|
train_script: "train.py"
|
||||||
|
|
||||||
|
redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0"
|
||||||
|
|
||||||
|
local_mode: true
|
||||||
|
|
||||||
|
max_workers: 1
|
||||||
|
poll_interval_seconds: 2
|
||||||
|
|
||||||
|
auto_fetch_data: false
|
||||||
|
|
||||||
|
data_manager_path: "./data_manager"
|
||||||
|
dataset_cache_ttl: "30m"
|
||||||
|
|
||||||
|
data_dir: "/data/active"
|
||||||
|
|
||||||
|
snapshot_store:
|
||||||
|
enabled: true
|
||||||
|
endpoint: "minio:9000"
|
||||||
|
secure: false
|
||||||
|
bucket: "fetchml-snapshots"
|
||||||
|
prefix: "snapshots"
|
||||||
|
timeout: "5m"
|
||||||
|
max_retries: 3
|
||||||
|
|
||||||
|
podman_image: "python:3.9-slim"
|
||||||
|
container_workspace: "/workspace"
|
||||||
|
container_results: "/results"
|
||||||
|
gpu_devices: []
|
||||||
|
|
||||||
|
resources:
|
||||||
|
max_workers: 1
|
||||||
|
desired_rps_per_worker: 2
|
||||||
|
podman_cpus: "2"
|
||||||
|
podman_memory: "4Gi"
|
||||||
|
|
||||||
|
metrics:
|
||||||
|
enabled: true
|
||||||
|
listen_addr: ":9100"
|
||||||
|
metrics_flush_interval: "500ms"
|
||||||
|
|
||||||
|
task_lease_duration: "30m"
|
||||||
|
heartbeat_interval: "1m"
|
||||||
|
max_retries: 3
|
||||||
|
graceful_timeout: "5m"
|
||||||
|
|
@ -1,51 +0,0 @@
|
||||||
# Worker configuration for Docker production-like testing
|
|
||||||
worker_id: "docker-test-worker-1"
|
|
||||||
|
|
||||||
# Redis configuration
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 10
|
|
||||||
|
|
||||||
# Local mode settings
|
|
||||||
local_mode: false # Use Podman for containerized job execution
|
|
||||||
|
|
||||||
# Job paths
|
|
||||||
base_path: "/tmp/fetchml-jobs"
|
|
||||||
|
|
||||||
# Container workspace (not used in local mode)
|
|
||||||
container_workspace: "/workspace"
|
|
||||||
container_results: "/results"
|
|
||||||
|
|
||||||
# Podman settings (not used in local mode)
|
|
||||||
podman_image: "python:3.9-slim"
|
|
||||||
podman_cpus: "2"
|
|
||||||
podman_memory: "4g"
|
|
||||||
|
|
||||||
# Worker configuration
|
|
||||||
heartbeat_interval: "30s"
|
|
||||||
lease_duration: "5m"
|
|
||||||
max_concurrent_tasks: 1
|
|
||||||
|
|
||||||
# Data manager settings
|
|
||||||
data_manager:
|
|
||||||
enabled: false
|
|
||||||
base_path: "/data"
|
|
||||||
|
|
||||||
# SSH settings for Podman communication
|
|
||||||
ssh:
|
|
||||||
enabled: true
|
|
||||||
host: "localhost"
|
|
||||||
port: 2222
|
|
||||||
user: "worker"
|
|
||||||
password: "SecureWorkerPass2024!"
|
|
||||||
key_path: "/home/worker/.ssh/id_rsa"
|
|
||||||
|
|
||||||
# Logging
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "/logs/worker.log"
|
|
||||||
|
|
||||||
# Metrics
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
endpoint: ":9100"
|
|
||||||
|
|
@ -1,79 +0,0 @@
|
||||||
# Worker configuration for Homelab secure environment
|
|
||||||
worker_id: "homelab-secure-worker-1"
|
|
||||||
|
|
||||||
# Redis configuration with connection pooling
|
|
||||||
redis:
|
|
||||||
url: "redis://redis:6379"
|
|
||||||
max_connections: 10
|
|
||||||
connection_timeout: "10s"
|
|
||||||
read_timeout: "5s"
|
|
||||||
write_timeout: "5s"
|
|
||||||
|
|
||||||
# Local mode disabled for containerized execution
|
|
||||||
local_mode: false
|
|
||||||
|
|
||||||
# Job paths with security considerations
|
|
||||||
base_path: "/tmp/fetchml-jobs"
|
|
||||||
container_workspace: "/workspace"
|
|
||||||
container_results: "/results"
|
|
||||||
|
|
||||||
# Podman settings with resource limits
|
|
||||||
podman_image: "python:3.11-slim"
|
|
||||||
podman_cpus: "2"
|
|
||||||
podman_memory: "4g"
|
|
||||||
podman_network: "ml-job-network"
|
|
||||||
podman_timeout: "30m"
|
|
||||||
|
|
||||||
# Worker configuration with security
|
|
||||||
heartbeat_interval: "30s"
|
|
||||||
lease_duration: "5m"
|
|
||||||
max_concurrent_tasks: 2
|
|
||||||
task_timeout: "30m"
|
|
||||||
|
|
||||||
# Data manager settings
|
|
||||||
data_manager:
|
|
||||||
enabled: true
|
|
||||||
base_path: "/data"
|
|
||||||
encryption_enabled: true
|
|
||||||
backup_enabled: true
|
|
||||||
|
|
||||||
# SSH settings with secure configuration
|
|
||||||
ssh:
|
|
||||||
enabled: true
|
|
||||||
host: "localhost"
|
|
||||||
port: 2222
|
|
||||||
user: "worker"
|
|
||||||
password: "HomelabWorker2024!"
|
|
||||||
key_path: "/home/worker/.ssh/id_rsa"
|
|
||||||
max_retries: 3
|
|
||||||
connection_timeout: "30s"
|
|
||||||
strict_host_key_checking: false
|
|
||||||
|
|
||||||
# Logging with rotation and security
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "/logs/worker.log"
|
|
||||||
max_size: "50MB"
|
|
||||||
max_backups: 5
|
|
||||||
compress: true
|
|
||||||
audit_enabled: true
|
|
||||||
|
|
||||||
# Metrics and monitoring
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
endpoint: ":9100"
|
|
||||||
path: "/metrics"
|
|
||||||
|
|
||||||
# Security settings
|
|
||||||
security:
|
|
||||||
enable_job_isolation: true
|
|
||||||
sandbox_enabled: true
|
|
||||||
resource_monitoring: true
|
|
||||||
audit_commands: true
|
|
||||||
|
|
||||||
# Health check configuration
|
|
||||||
health_check:
|
|
||||||
enabled: true
|
|
||||||
interval: "30s"
|
|
||||||
timeout: "10s"
|
|
||||||
failure_threshold: 3
|
|
||||||
|
|
@ -4,7 +4,7 @@ max_workers = 4
|
||||||
|
|
||||||
# Redis connection
|
# Redis connection
|
||||||
redis_addr = "localhost:6379"
|
redis_addr = "localhost:6379"
|
||||||
redis_password = "your-redis-password"
|
redis_password = "CHANGE_ME_REDIS_PASSWORD"
|
||||||
redis_db = 0
|
redis_db = 0
|
||||||
|
|
||||||
# SSH connection (for remote operations)
|
# SSH connection (for remote operations)
|
||||||
|
|
@ -15,17 +15,13 @@ ssh_key = "~/.ssh/id_rsa"
|
||||||
|
|
||||||
# Podman configuration
|
# Podman configuration
|
||||||
podman_image = "ml-training:latest"
|
podman_image = "ml-training:latest"
|
||||||
gpu_access = true
|
gpu_vendor = "none"
|
||||||
|
gpu_visible_devices = []
|
||||||
|
gpu_devices = []
|
||||||
container_workspace = "/workspace"
|
container_workspace = "/workspace"
|
||||||
container_results = "/results"
|
container_results = "/results"
|
||||||
train_script = "train.py"
|
train_script = "train.py"
|
||||||
|
|
||||||
[resources]
|
|
||||||
max_workers = 4
|
|
||||||
desired_rps_per_worker = 2
|
|
||||||
podman_cpus = "4"
|
|
||||||
podman_memory = "16g"
|
|
||||||
|
|
||||||
# Dataset management
|
# Dataset management
|
||||||
auto_fetch_data = true
|
auto_fetch_data = true
|
||||||
data_dir = "/data/datasets"
|
data_dir = "/data/datasets"
|
||||||
|
|
@ -36,10 +32,16 @@ dataset_cache_ttl = "24h"
|
||||||
task_lease_duration = "1h"
|
task_lease_duration = "1h"
|
||||||
heartbeat_interval = "30s"
|
heartbeat_interval = "30s"
|
||||||
graceful_timeout = "5m"
|
graceful_timeout = "5m"
|
||||||
poll_interval = "100ms"
|
poll_interval_seconds = 1
|
||||||
metrics_flush_interval = "10s"
|
metrics_flush_interval = "10s"
|
||||||
|
|
||||||
|
[resources]
|
||||||
|
max_workers = 4
|
||||||
|
desired_rps_per_worker = 2
|
||||||
|
podman_cpus = "4"
|
||||||
|
podman_memory = "16g"
|
||||||
|
|
||||||
# Metrics exporter
|
# Metrics exporter
|
||||||
[metrics]
|
[metrics]
|
||||||
enabled = true
|
enabled = true
|
||||||
listen_addr = ":9090"
|
listen_addr = ":9100"
|
||||||
|
|
|
||||||
45
deployments/Caddyfile.dev
Normal file
45
deployments/Caddyfile.dev
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
{
|
||||||
|
auto_https off
|
||||||
|
admin off
|
||||||
|
servers {
|
||||||
|
protocols h1 h2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
http://localhost {
|
||||||
|
handle /health {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /ws* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /api/* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle {
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
https://localhost {
|
||||||
|
tls internal
|
||||||
|
|
||||||
|
handle /health {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /ws* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /api/* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle {
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
}
|
||||||
44
deployments/Caddyfile.homelab-secure
Normal file
44
deployments/Caddyfile.homelab-secure
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
{
|
||||||
|
admin off
|
||||||
|
servers {
|
||||||
|
protocols h1 h2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{$FETCHML_DOMAIN} {
|
||||||
|
encode gzip
|
||||||
|
|
||||||
|
tls /etc/caddy/ssl/cert.pem /etc/caddy/ssl/key.pem
|
||||||
|
|
||||||
|
header {
|
||||||
|
-Server
|
||||||
|
X-Frame-Options "DENY"
|
||||||
|
X-Content-Type-Options "nosniff"
|
||||||
|
Referrer-Policy "strict-origin-when-cross-origin"
|
||||||
|
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
|
||||||
|
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
|
||||||
|
}
|
||||||
|
|
||||||
|
@admin path /admin/*
|
||||||
|
@admin_private remote_ip private_ranges
|
||||||
|
handle @admin {
|
||||||
|
respond @admin_private 404
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /health {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /ws* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /api/* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle {
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
}
|
||||||
47
deployments/Caddyfile.prod
Normal file
47
deployments/Caddyfile.prod
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
{
|
||||||
|
email {$CADDY_EMAIL}
|
||||||
|
admin off
|
||||||
|
servers {
|
||||||
|
protocols h1 h2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{$FETCHML_DOMAIN} {
|
||||||
|
encode gzip
|
||||||
|
|
||||||
|
request_body {
|
||||||
|
max_size 10MB
|
||||||
|
}
|
||||||
|
|
||||||
|
header {
|
||||||
|
-Server
|
||||||
|
X-Frame-Options "DENY"
|
||||||
|
X-Content-Type-Options "nosniff"
|
||||||
|
Referrer-Policy "strict-origin-when-cross-origin"
|
||||||
|
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
|
||||||
|
Content-Security-Policy "default-src 'self'; base-uri 'self'; frame-ancestors 'none'"
|
||||||
|
}
|
||||||
|
|
||||||
|
@admin path /admin/*
|
||||||
|
@admin_private remote_ip private_ranges
|
||||||
|
handle @admin {
|
||||||
|
respond @admin_private 404
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /health {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /ws* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /api/* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle {
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
}
|
||||||
23
deployments/Caddyfile.smoke
Normal file
23
deployments/Caddyfile.smoke
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
{
|
||||||
|
auto_https off
|
||||||
|
}
|
||||||
|
|
||||||
|
localhost {
|
||||||
|
tls internal
|
||||||
|
|
||||||
|
handle /health {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /ws* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle /api/* {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
|
||||||
|
handle {
|
||||||
|
respond 404
|
||||||
|
}
|
||||||
|
}
|
||||||
76
deployments/Makefile
Normal file
76
deployments/Makefile
Normal file
|
|
@ -0,0 +1,76 @@
|
||||||
|
# Docker Compose Deployment Management
|
||||||
|
.PHONY: help dev-up dev-down dev-logs dev-restart homelab-secure-up homelab-secure-down prod-up prod-down status clean
|
||||||
|
|
||||||
|
# Default target
|
||||||
|
help: ## Show this help message
|
||||||
|
@echo "Available commands:"
|
||||||
|
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
|
||||||
|
|
||||||
|
# Development environment
|
||||||
|
dev-up: ## Start development environment
|
||||||
|
@echo "Starting development environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||||
|
@echo "Services: Caddy (8080/8443), Redis (6379), Prometheus (9090), Grafana (3000)"
|
||||||
|
|
||||||
|
dev-down: ## Stop development environment
|
||||||
|
@echo "Stopping development environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml down
|
||||||
|
|
||||||
|
dev-logs: ## Show development logs
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml logs -f
|
||||||
|
|
||||||
|
dev-restart: ## Restart development environment
|
||||||
|
@echo "Restarting development environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml restart
|
||||||
|
|
||||||
|
|
||||||
|
# Homelab environment
|
||||||
|
homelab-secure-up: ## Start secure homelab environment
|
||||||
|
@echo "Starting secure homelab environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
||||||
|
|
||||||
|
homelab-secure-down: ## Stop secure homelab environment
|
||||||
|
@echo "Stopping secure homelab environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.homelab-secure.yml down
|
||||||
|
|
||||||
|
# Production environment
|
||||||
|
prod-up: ## Start production environment
|
||||||
|
@echo "Starting production environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.prod.yml up -d
|
||||||
|
|
||||||
|
prod-down: ## Stop production environment
|
||||||
|
@echo "Stopping production environment..."
|
||||||
|
docker-compose -f deployments/docker-compose.prod.yml down
|
||||||
|
|
||||||
|
# Utility commands
|
||||||
|
status: ## Show status of all environments
|
||||||
|
@echo "=== Development Status ==="
|
||||||
|
@if [ -f deployments/docker-compose.dev.yml ]; then \
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml ps; \
|
||||||
|
fi
|
||||||
|
@echo ""
|
||||||
|
@echo "=== Homelab Secure Status ==="
|
||||||
|
@if [ -f deployments/docker-compose.homelab-secure.yml ]; then \
|
||||||
|
docker-compose -f deployments/docker-compose.homelab-secure.yml ps 2>/dev/null || echo "Not running"; \
|
||||||
|
fi
|
||||||
|
@echo ""
|
||||||
|
@echo "=== Production Status ==="
|
||||||
|
@if [ -f deployments/docker-compose.prod.yml ]; then \
|
||||||
|
docker-compose -f deployments/docker-compose.prod.yml ps 2>/dev/null || echo "Not running"; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
clean: ## Clean up all containers and volumes
|
||||||
|
@echo "Cleaning up all Docker resources..."
|
||||||
|
@echo "This will remove all containers and volumes. Continue? [y/N]"
|
||||||
|
@read -r confirm && [ "$$confirm" = "y" ] || exit 1
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml down -v 2>/dev/null || true
|
||||||
|
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v 2>/dev/null || true
|
||||||
|
docker-compose -f deployments/docker-compose.prod.yml down -v 2>/dev/null || true
|
||||||
|
docker system prune -f
|
||||||
|
@echo "Cleanup complete."
|
||||||
|
|
||||||
|
# Quick aliases
|
||||||
|
up: dev-up ## Alias for dev-up
|
||||||
|
down: dev-down ## Alias for dev-down
|
||||||
|
logs: dev-logs ## Alias for dev-logs
|
||||||
|
restart: dev-restart ## Alias for dev-restart
|
||||||
|
|
@ -2,33 +2,123 @@
|
||||||
|
|
||||||
This directory contains Docker Compose configurations for different deployment environments.
|
This directory contains Docker Compose configurations for different deployment environments.
|
||||||
|
|
||||||
## Files
|
## Environment Configurations
|
||||||
|
|
||||||
- `docker-compose.homelab-secure.yml` - Secure homelab deployment with TLS and authentication
|
### Development (`docker-compose.dev.yml`)
|
||||||
- `docker-compose.prod.yml` - Production deployment configuration
|
- Full development stack with monitoring
|
||||||
|
- Includes: API, Worker, Redis, MinIO (snapshots), Prometheus, Grafana, Loki, Promtail
|
||||||
|
- Optimized for local development and testing
|
||||||
|
- **Usage**: `docker-compose -f deployments/docker-compose.dev.yml up -d`
|
||||||
|
|
||||||
## Usage
|
### Homelab - Secure (`docker-compose.homelab-secure.yml`)
|
||||||
|
- Secure homelab deployment with authentication and a Caddy reverse proxy
|
||||||
|
- TLS is terminated at the reverse proxy (Approach A)
|
||||||
|
- Includes: API, Redis (password protected), Caddy reverse proxy
|
||||||
|
- **Usage**: `docker-compose -f deployments/docker-compose.homelab-secure.yml up -d`
|
||||||
|
|
||||||
|
### Production (`docker-compose.prod.yml`)
|
||||||
|
- Production deployment configuration
|
||||||
|
- Optimized for performance and security
|
||||||
|
- External services assumed (Redis, monitoring)
|
||||||
|
- **Usage**: `docker-compose -f deployments/docker-compose.prod.yml up -d`
|
||||||
|
|
||||||
|
Note: `docker-compose.prod.yml` is a reproducible staging/testing harness. Real production deployments do not require Docker; you can run the Go services directly (systemd) and use Caddy for TLS/WSS termination.
|
||||||
|
|
||||||
|
## TLS / WSS Policy
|
||||||
|
|
||||||
|
- The Zig CLI currently supports `ws://` only (native `wss://` is not implemented).
|
||||||
|
- Production deployments terminate TLS/WSS at a reverse proxy (Caddy in `docker-compose.prod.yml`) and keep the API server on internal `ws://`.
|
||||||
|
- Homelab deployments terminate TLS/WSS at a reverse proxy (Caddy) and keep the API server on internal `ws://`.
|
||||||
|
- Health checks in compose files should use `http://localhost:9101/health` when `server.tls.enabled: false`.
|
||||||
|
|
||||||
|
## Required Volume Mounts
|
||||||
|
|
||||||
|
- `base_path` (experiments) must be writable by the API server.
|
||||||
|
- `data_dir` should be mounted if you want snapshot/dataset integrity validation via `ml validate`.
|
||||||
|
|
||||||
|
For the default configs:
|
||||||
|
|
||||||
|
- `base_path`: `/data/experiments` (dev/homelab configs) or `/app/data/experiments` (prod configs)
|
||||||
|
- `data_dir`: `/data/active`
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
### Development
|
|
||||||
```bash
|
```bash
|
||||||
# Use the main docker-compose.yml in project root
|
# Development (most common)
|
||||||
docker-compose up -d
|
docker-compose -f deployments/docker-compose.dev.yml up -d
|
||||||
|
|
||||||
|
# Check status
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml ps
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml logs -f api-server
|
||||||
|
|
||||||
|
# Stop services
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml down
|
||||||
```
|
```
|
||||||
|
|
||||||
### Homelab (Secure)
|
## Dev: MinIO-backed snapshots (smoke test)
|
||||||
```bash
|
|
||||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
### Production
|
The dev compose file provisions a MinIO bucket and uploads a small example snapshot object at:
|
||||||
```bash
|
|
||||||
docker-compose -f deployments/docker-compose.prod.yml up -d
|
`s3://fetchml-snapshots/snapshots/snap-1.tar.gz`
|
||||||
```
|
|
||||||
|
To queue a task that forces the worker to pull the snapshot from MinIO:
|
||||||
|
|
||||||
|
1. Start the dev stack:
|
||||||
|
`docker-compose -f deployments/docker-compose.dev.yml up -d`
|
||||||
|
|
||||||
|
2. Read the `snapshot_sha256` printed by the init job:
|
||||||
|
`docker-compose -f deployments/docker-compose.dev.yml logs minio-init`
|
||||||
|
|
||||||
|
3. Queue a job using the snapshot fields:
|
||||||
|
`ml queue <job-name> --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||||
|
|
||||||
|
## Smoke tests
|
||||||
|
|
||||||
|
- `make dev-smoke` runs the development stack smoke test.
|
||||||
|
- `make prod-smoke` runs a Docker-based staging smoke test for the production stack, using a localhost-only Caddy configuration.
|
||||||
|
|
||||||
|
Note: `ml queue` by itself will generate a random commit ID. For full provenance enforcement (manifest + dependency manifest), use `ml sync ./your-project --queue` so the server has real code + dependency files.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
- `ml queue train-mnist --priority 3 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||||
|
- `ml queue train-a train-b train-c --priority 5 --snapshot-id snap-1 --snapshot-sha256 <snapshot_sha256>`
|
||||||
|
|
||||||
## Environment Variables
|
## Environment Variables
|
||||||
|
|
||||||
Each deployment may require specific environment variables. Refer to the individual compose files for requirements.
|
Create a `.env` file in the project root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Grafana
|
||||||
|
GRAFANA_ADMIN_PASSWORD=your_secure_password
|
||||||
|
|
||||||
|
# API Configuration
|
||||||
|
LOG_LEVEL=info
|
||||||
|
|
||||||
|
# TLS (for secure deployments)
|
||||||
|
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||||
|
TLS_KEY_PATH=/app/ssl/key.pem
|
||||||
|
```
|
||||||
|
|
||||||
|
## Service Ports
|
||||||
|
|
||||||
|
| Service | Development | Homelab | Production |
|
||||||
|
|---------|-------------|---------|------------|
|
||||||
|
| API Server | 9101 | 9101 | 9101 |
|
||||||
|
| Redis | 6379 | 6379 | - |
|
||||||
|
| Prometheus | 9090 | - | - |
|
||||||
|
| Grafana | 3000 | - | - |
|
||||||
|
| Loki | 3100 | - | - |
|
||||||
|
|
||||||
## Monitoring
|
## Monitoring
|
||||||
|
|
||||||
Performance monitoring configurations are in `monitoring/docker-compose.performance.yml`
|
- **Development**: Full monitoring stack included
|
||||||
|
- **Homelab**: Basic monitoring (configurable)
|
||||||
|
- **Production**: External monitoring assumed
|
||||||
|
|
||||||
|
## Security Notes
|
||||||
|
|
||||||
|
- If you need HTTPS externally, terminate TLS at a reverse proxy.
|
||||||
|
- API keys should be managed via environment variables
|
||||||
|
- Database credentials should use secrets management in production
|
||||||
|
|
|
||||||
162
deployments/deploy.sh
Executable file
162
deployments/deploy.sh
Executable file
|
|
@ -0,0 +1,162 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Quick deployment script for fetch_ml
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Function to print colored output
|
||||||
|
print_status() {
|
||||||
|
echo -e "${BLUE}[INFO]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
print_success() {
|
||||||
|
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
print_warning() {
|
||||||
|
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
print_error() {
|
||||||
|
echo -e "${RED}[ERROR]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Function to show usage
|
||||||
|
show_usage() {
|
||||||
|
echo "Usage: $0 [ENVIRONMENT] [ACTION]"
|
||||||
|
echo ""
|
||||||
|
echo "Environments:"
|
||||||
|
echo " dev Development environment"
|
||||||
|
echo " secure Secure homelab environment"
|
||||||
|
echo " prod Production environment"
|
||||||
|
echo ""
|
||||||
|
echo "Actions:"
|
||||||
|
echo " up Start services"
|
||||||
|
echo " down Stop services"
|
||||||
|
echo " restart Restart services"
|
||||||
|
echo " logs Show logs"
|
||||||
|
echo " status Show status"
|
||||||
|
echo ""
|
||||||
|
echo "Examples:"
|
||||||
|
echo " $0 dev up # Start development environment"
|
||||||
|
echo " $0 prod down # Stop production environment"
|
||||||
|
echo " $0 secure logs # Show secure environment logs"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Function to check if docker-compose file exists
|
||||||
|
check_compose_file() {
|
||||||
|
local env=$1
|
||||||
|
local compose_file=""
|
||||||
|
|
||||||
|
case $env in
|
||||||
|
"dev")
|
||||||
|
compose_file="deployments/docker-compose.dev.yml"
|
||||||
|
;;
|
||||||
|
"secure")
|
||||||
|
compose_file="deployments/docker-compose.homelab-secure.yml"
|
||||||
|
;;
|
||||||
|
"prod")
|
||||||
|
compose_file="deployments/docker-compose.prod.yml"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
print_error "Unknown environment: $env"
|
||||||
|
show_usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if [ ! -f "$compose_file" ]; then
|
||||||
|
print_error "Docker Compose file not found: $compose_file"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "$compose_file"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Function to check if .env file exists
|
||||||
|
check_env_file() {
|
||||||
|
local env=$1
|
||||||
|
|
||||||
|
if [ ! -f ".env" ]; then
|
||||||
|
print_warning ".env file not found. Creating from example..."
|
||||||
|
if [ "$env" = "dev" ]; then
|
||||||
|
cp deployments/env.dev.example .env
|
||||||
|
elif [ "$env" = "prod" ]; then
|
||||||
|
cp deployments/env.prod.example .env
|
||||||
|
else
|
||||||
|
cp deployments/env.dev.example .env
|
||||||
|
fi
|
||||||
|
print_warning "Please edit .env file with your configuration"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Main script
|
||||||
|
main() {
|
||||||
|
if [ $# -ne 2 ]; then
|
||||||
|
show_usage
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
local environment=$1
|
||||||
|
local action=$2
|
||||||
|
|
||||||
|
print_status "Environment: $environment"
|
||||||
|
print_status "Action: $action"
|
||||||
|
|
||||||
|
# Check compose file
|
||||||
|
compose_file=$(check_compose_file "$environment")
|
||||||
|
print_status "Using: $compose_file"
|
||||||
|
|
||||||
|
# Check .env file
|
||||||
|
check_env_file "$environment"
|
||||||
|
|
||||||
|
# Execute action
|
||||||
|
case $action in
|
||||||
|
"up")
|
||||||
|
print_status "Starting $environment environment..."
|
||||||
|
docker-compose -f "$compose_file" up -d
|
||||||
|
print_success "$environment environment started successfully!"
|
||||||
|
|
||||||
|
# Show service URLs
|
||||||
|
echo ""
|
||||||
|
print_status "Service URLs:"
|
||||||
|
echo " API Server: http://localhost:9101"
|
||||||
|
if [ "$environment" = "dev" ]; then
|
||||||
|
echo " Grafana: http://localhost:3000 (admin/admin123)"
|
||||||
|
echo " Prometheus: http://localhost:9090"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
"down")
|
||||||
|
print_status "Stopping $environment environment..."
|
||||||
|
docker-compose -f "$compose_file" down
|
||||||
|
print_success "$environment environment stopped successfully!"
|
||||||
|
;;
|
||||||
|
"restart")
|
||||||
|
print_status "Restarting $environment environment..."
|
||||||
|
docker-compose -f "$compose_file" restart
|
||||||
|
print_success "$environment environment restarted successfully!"
|
||||||
|
;;
|
||||||
|
"logs")
|
||||||
|
print_status "Showing logs for $environment environment..."
|
||||||
|
docker-compose -f "$compose_file" logs -f
|
||||||
|
;;
|
||||||
|
"status")
|
||||||
|
print_status "Status of $environment environment:"
|
||||||
|
docker-compose -f "$compose_file" ps
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
print_error "Unknown action: $action"
|
||||||
|
show_usage
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
|
# Run main function
|
||||||
|
main "$@"
|
||||||
225
deployments/docker-compose.dev.yml
Normal file
225
deployments/docker-compose.dev.yml
Normal file
|
|
@ -0,0 +1,225 @@
|
||||||
|
# Homelab Docker Compose with Centralized Monitoring
|
||||||
|
# Includes: API, Redis, Prometheus, Grafana, Loki
|
||||||
|
|
||||||
|
services:
|
||||||
|
caddy:
|
||||||
|
image: caddy:2-alpine
|
||||||
|
container_name: ml-dev-caddy
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "8080:80"
|
||||||
|
- "8443:443"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.dev:/etc/caddy/Caddyfile:ro
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/data:/data
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/caddy/config:/config
|
||||||
|
depends_on:
|
||||||
|
api-server:
|
||||||
|
condition: service_healthy
|
||||||
|
|
||||||
|
redis:
|
||||||
|
image: redis:7-alpine
|
||||||
|
container_name: ml-experiments-redis
|
||||||
|
user: "999:999"
|
||||||
|
ports:
|
||||||
|
- "6379:6379"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/redis:/data
|
||||||
|
restart: unless-stopped
|
||||||
|
command: redis-server --appendonly yes
|
||||||
|
healthcheck:
|
||||||
|
test: [ "CMD", "redis-cli", "ping" ]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
api-server:
|
||||||
|
build:
|
||||||
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||||
|
container_name: ml-experiments-api
|
||||||
|
user: "0:0"
|
||||||
|
expose:
|
||||||
|
- "9101" # API and health endpoints (internal; external access via Caddy)
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl
|
||||||
|
depends_on:
|
||||||
|
- redis
|
||||||
|
restart: unless-stopped
|
||||||
|
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||||
|
environment:
|
||||||
|
- LOG_LEVEL=info
|
||||||
|
healthcheck:
|
||||||
|
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 40s
|
||||||
|
labels:
|
||||||
|
logging: "promtail"
|
||||||
|
job: "api-server"
|
||||||
|
|
||||||
|
minio:
|
||||||
|
image: minio/minio:latest
|
||||||
|
container_name: ml-experiments-minio
|
||||||
|
ports:
|
||||||
|
- "9000:9000"
|
||||||
|
- "9001:9001"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/minio:/data
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_USER=minioadmin
|
||||||
|
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||||
|
command: ["server", "/data", "--console-address", ":9001"]
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
minio-init:
|
||||||
|
image: alpine:3.19
|
||||||
|
container_name: ml-experiments-minio-init
|
||||||
|
depends_on:
|
||||||
|
minio:
|
||||||
|
condition: service_healthy
|
||||||
|
entrypoint: ["/bin/sh", "-c"]
|
||||||
|
command:
|
||||||
|
- |
|
||||||
|
set -eu
|
||||||
|
apk add --no-cache ca-certificates curl tar gzip
|
||||||
|
ARCH=$$(uname -m)
|
||||||
|
MC_ARCH=amd64
|
||||||
|
if [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then
|
||||||
|
MC_ARCH=arm64
|
||||||
|
fi
|
||||||
|
curl -fsSL -o /usr/local/bin/mc "https://dl.min.io/client/mc/release/linux-$$MC_ARCH/mc"
|
||||||
|
chmod +x /usr/local/bin/mc
|
||||||
|
i=0
|
||||||
|
while ! mc alias set local http://minio:9000 minioadmin minioadmin123; do
|
||||||
|
i=$$((i+1))
|
||||||
|
if [ $$i -ge 30 ]; then
|
||||||
|
echo "minio not ready after 30 attempts" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "waiting for minio... ($$i/30)"
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
mc mb -p local/fetchml-snapshots || true
|
||||||
|
mkdir -p /tmp/snapshots/snap-1
|
||||||
|
echo -n "hello" > /tmp/snapshots/snap-1/hello.txt
|
||||||
|
tar -C /tmp/snapshots/snap-1 -czf /tmp/snap-1.tar.gz .
|
||||||
|
mc cp /tmp/snap-1.tar.gz local/fetchml-snapshots/snapshots/snap-1.tar.gz
|
||||||
|
FILE_SHA=$$(sha256sum /tmp/snapshots/snap-1/hello.txt | cut -d' ' -f1)
|
||||||
|
SNAP_SHA=$$(echo -n "$$FILE_SHA" | sha256sum | cut -d' ' -f1)
|
||||||
|
echo "snapshot_id=snap-1 snapshot_sha256=$$SNAP_SHA"
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
|
worker:
|
||||||
|
build:
|
||||||
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||||
|
container_name: ml-experiments-worker
|
||||||
|
user: "0:0"
|
||||||
|
ports:
|
||||||
|
- "8888:8888"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/active:/data/active
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/experiments:/data/experiments
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/workspaces:/data/active/workspaces:delegated
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-dev.yaml:/app/configs/worker.yaml
|
||||||
|
- /sys/fs/cgroup:/sys/fs/cgroup:rw
|
||||||
|
depends_on:
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
api-server:
|
||||||
|
condition: service_healthy
|
||||||
|
minio-init:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
- LOG_LEVEL=info
|
||||||
|
- MINIO_ROOT_USER=minioadmin
|
||||||
|
- MINIO_ROOT_PASSWORD=minioadmin123
|
||||||
|
- FETCHML_JUPYTER_DEFAULT_IMAGE=quay.io/jupyter/base-notebook:latest
|
||||||
|
- FETCHML_JUPYTER_CONDA_ENV=base
|
||||||
|
- FETCHML_JUPYTER_KERNEL_NAME=python
|
||||||
|
- FETCHML_PODMAN_CGROUPS=disabled
|
||||||
|
privileged: true
|
||||||
|
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||||
|
|
||||||
|
# Prometheus - Metrics collection
|
||||||
|
prometheus:
|
||||||
|
image: prom/prometheus:latest
|
||||||
|
container_name: ml-experiments-prometheus
|
||||||
|
ports:
|
||||||
|
- "9090:9090"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||||
|
- prometheus_data:/prometheus
|
||||||
|
command:
|
||||||
|
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||||
|
- '--storage.tsdb.path=/prometheus'
|
||||||
|
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
||||||
|
- '--web.console.templates=/etc/prometheus/consoles'
|
||||||
|
- '--web.enable-lifecycle'
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Grafana - Visualization
|
||||||
|
grafana:
|
||||||
|
image: grafana/grafana:latest
|
||||||
|
container_name: ml-experiments-grafana
|
||||||
|
ports:
|
||||||
|
- "3000:3000"
|
||||||
|
volumes:
|
||||||
|
- grafana_data:/var/lib/grafana
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards
|
||||||
|
environment:
|
||||||
|
- GF_SECURITY_ADMIN_PASSWORD=admin123
|
||||||
|
- GF_USERS_ALLOW_SIGN_UP=false
|
||||||
|
restart: unless-stopped
|
||||||
|
depends_on:
|
||||||
|
- prometheus
|
||||||
|
- loki
|
||||||
|
|
||||||
|
# Loki - Log aggregation
|
||||||
|
loki:
|
||||||
|
image: grafana/loki:latest
|
||||||
|
container_name: ml-experiments-loki
|
||||||
|
ports:
|
||||||
|
- "3100:3100"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/monitoring/loki-config.yml:/etc/loki/local-config.yaml
|
||||||
|
- loki_data:/loki
|
||||||
|
command: -config.file=/etc/loki/local-config.yaml
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Promtail - Log collector
|
||||||
|
promtail:
|
||||||
|
image: grafana/promtail:latest
|
||||||
|
container_name: ml-experiments-promtail
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/monitoring/promtail-config.yml:/etc/promtail/config.yml
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/dev/logs:/var/log/app
|
||||||
|
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
command: -config.file=/etc/promtail/config.yml
|
||||||
|
restart: unless-stopped
|
||||||
|
depends_on:
|
||||||
|
- loki
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
prometheus_data:
|
||||||
|
driver: local
|
||||||
|
grafana_data:
|
||||||
|
driver: local
|
||||||
|
loki_data:
|
||||||
|
driver: local
|
||||||
|
|
@ -1,104 +1,152 @@
|
||||||
# Homelab Secure Docker Environment
|
# Secure Homelab Docker Compose Configuration
|
||||||
services:
|
# Use with: docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
|
||||||
redis:
|
|
||||||
image: redis:7-alpine
|
|
||||||
container_name: ml-homelab-redis
|
|
||||||
ports:
|
|
||||||
- "6379:6379"
|
|
||||||
volumes:
|
|
||||||
- redis_homelab_data:/data
|
|
||||||
restart: unless-stopped
|
|
||||||
command: >
|
|
||||||
redis-server
|
|
||||||
--appendonly yes
|
|
||||||
--requirepass "HomelabRedis2024!"
|
|
||||||
--maxmemory 512mb
|
|
||||||
--maxmemory-policy allkeys-lru
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "redis-cli", "-a", "HomelabRedis2024!", "ping"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 3
|
|
||||||
networks:
|
|
||||||
- ml-homelab-network
|
|
||||||
|
|
||||||
|
services:
|
||||||
api-server:
|
api-server:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
dockerfile: build/docker/homelab-secure.Dockerfile
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||||
container_name: ml-homelab-api
|
container_name: ml-experiments-api
|
||||||
ports:
|
ports:
|
||||||
- "9104:9101" # API server port
|
- "9101:9101"
|
||||||
- "2223:2222" # Secure SSH port
|
- "9100:9100" # Prometheus metrics endpoint
|
||||||
- "9101:9100" # Prometheus metrics
|
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/app/data/experiments
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/data/experiments
|
||||||
- ./logs:/logs
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
|
||||||
- ./configs/config-homelab-secure.yaml:/app/configs/config.yaml
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/ssl:/app/ssl:ro
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/api/homelab-secure.yaml:/app/configs/api/prod.yaml:ro
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/.env.secure:/app/.env.secure:ro
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
|
|
||||||
- LOG_LEVEL=info
|
- LOG_LEVEL=info
|
||||||
- TZ=America/New_York
|
# Load secure environment variables
|
||||||
|
- JWT_SECRET_FILE=/app/.env.secure
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-k", "-f", "https://localhost:9101/health"]
|
test: ["CMD", "curl", "-f", "http://localhost:9101/health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
start_period: 40s
|
start_period: 40s
|
||||||
command: >
|
labels:
|
||||||
sh -c "
|
logging: "promtail"
|
||||||
sudo /app/start-security.sh &
|
job: "api-server"
|
||||||
/usr/local/bin/api-server -config /app/configs/config.yaml
|
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||||
"
|
|
||||||
networks:
|
networks:
|
||||||
- ml-homelab-network
|
- ml-experiments-network
|
||||||
|
# Add internal network for secure communication
|
||||||
|
- ml-backend-network
|
||||||
|
|
||||||
|
minio:
|
||||||
|
image: minio/minio:latest
|
||||||
|
container_name: ml-experiments-minio
|
||||||
|
ports:
|
||||||
|
- "9000:9000"
|
||||||
|
- "9001:9001"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/minio:/data
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||||
|
command: ["server", "/data", "--console-address", ":9001"]
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- ml-backend-network
|
||||||
|
|
||||||
|
minio-init:
|
||||||
|
image: alpine:3.19
|
||||||
|
container_name: ml-experiments-minio-init
|
||||||
|
depends_on:
|
||||||
|
- minio
|
||||||
|
entrypoint: ["/bin/sh", "-c"]
|
||||||
|
command:
|
||||||
|
- |
|
||||||
|
apk add --no-cache ca-certificates curl >/dev/null
|
||||||
|
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||||
|
chmod +x /usr/local/bin/mc
|
||||||
|
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||||
|
mc mb -p local/fetchml-snapshots || true
|
||||||
|
restart: "no"
|
||||||
|
networks:
|
||||||
|
- ml-backend-network
|
||||||
|
|
||||||
worker:
|
worker:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
dockerfile: build/docker/homelab-secure.Dockerfile
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||||
container_name: ml-homelab-worker
|
container_name: ml-experiments-worker
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/app/data/experiments
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/experiments:/app/data/experiments
|
||||||
- ./logs:/logs
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/active:/data/active
|
||||||
- ./configs/worker-homelab-secure.yaml:/app/configs/worker.yaml
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/workers/homelab-secure.yaml:/app/configs/worker.yaml
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
api-server:
|
api-server:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
minio-init:
|
||||||
|
condition: service_started
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- REDIS_URL=redis://:HomelabRedis2024!@redis:6379
|
|
||||||
- LOG_LEVEL=info
|
- LOG_LEVEL=info
|
||||||
- TZ=America/New_York
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
privileged: true # Required for Podman
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||||
security_opt:
|
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||||
- no-new-privileges:true
|
privileged: true
|
||||||
cap_drop:
|
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||||
- ALL
|
|
||||||
cap_add:
|
|
||||||
- NET_ADMIN
|
|
||||||
- SYS_ADMIN
|
|
||||||
command: >
|
|
||||||
sh -c "
|
|
||||||
sudo /app/start-security.sh &
|
|
||||||
/usr/local/bin/worker -config /app/configs/worker.yaml
|
|
||||||
"
|
|
||||||
networks:
|
networks:
|
||||||
- ml-homelab-network
|
- ml-backend-network
|
||||||
|
|
||||||
volumes:
|
caddy:
|
||||||
redis_homelab_data:
|
image: caddy:2-alpine
|
||||||
driver: local
|
container_name: ml-experiments-caddy
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "80:80"
|
||||||
|
- "443:443"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/deployments/Caddyfile.homelab-secure:/etc/caddy/Caddyfile:ro
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/ssl:/etc/caddy/ssl:ro
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/data:/data
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/caddy/config:/config
|
||||||
|
environment:
|
||||||
|
- FETCHML_DOMAIN=${FETCHML_DOMAIN:-ml.local}
|
||||||
|
depends_on:
|
||||||
|
api-server:
|
||||||
|
condition: service_healthy
|
||||||
|
networks:
|
||||||
|
- ml-experiments-network
|
||||||
|
|
||||||
|
# Redis with authentication
|
||||||
|
redis:
|
||||||
|
image: redis:7-alpine
|
||||||
|
container_name: ml-experiments-redis
|
||||||
|
user: "999:999"
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:6379:6379" # Bind to localhost only
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/homelab/redis:/data
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/redis/redis-secure.conf:/usr/local/etc/redis/redis.conf:ro
|
||||||
|
restart: unless-stopped
|
||||||
|
command: redis-server /usr/local/etc/redis/redis.conf --requirepass ${REDIS_PASSWORD}
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "redis-cli", "--no-auth-warning", "-a", "${REDIS_PASSWORD}", "ping"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
networks:
|
||||||
|
- ml-backend-network
|
||||||
|
environment:
|
||||||
|
- REDIS_PASSWORD=${REDIS_PASSWORD}
|
||||||
|
|
||||||
|
volumes: {}
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
ml-homelab-network:
|
ml-experiments-network:
|
||||||
|
driver: bridge
|
||||||
|
ml-backend-network:
|
||||||
driver: bridge
|
driver: bridge
|
||||||
ipam:
|
|
||||||
config:
|
|
||||||
- subnet: 172.25.0.0/16
|
|
||||||
|
|
|
||||||
75
deployments/docker-compose.prod.smoke.yml
Normal file
75
deployments/docker-compose.prod.smoke.yml
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
services:
|
||||||
|
caddy:
|
||||||
|
image: caddy:2-alpine
|
||||||
|
environment:
|
||||||
|
- FETCHML_DOMAIN=localhost
|
||||||
|
- CADDY_EMAIL=smoke@example.invalid
|
||||||
|
ports:
|
||||||
|
- "8080:80"
|
||||||
|
- "8443:443"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/data:/data
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/caddy/config:/config
|
||||||
|
command:
|
||||||
|
- /bin/sh
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
cat > /etc/caddy/Caddyfile <<'EOF'
|
||||||
|
{
|
||||||
|
debug
|
||||||
|
servers {
|
||||||
|
protocols h1 h2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
https://localhost {
|
||||||
|
tls internal {
|
||||||
|
protocols tls1.2 tls1.3
|
||||||
|
}
|
||||||
|
|
||||||
|
handle {
|
||||||
|
reverse_proxy api-server:9101
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EOF
|
||||||
|
exec caddy run --config /etc/caddy/Caddyfile
|
||||||
|
|
||||||
|
redis:
|
||||||
|
image: redis:7-alpine
|
||||||
|
user: "999:999"
|
||||||
|
restart: unless-stopped
|
||||||
|
expose:
|
||||||
|
- "6379"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/redis:/data
|
||||||
|
command: redis-server --appendonly yes
|
||||||
|
healthcheck:
|
||||||
|
test: [ "CMD", "redis-cli", "ping" ]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
|
||||||
|
api-server:
|
||||||
|
build:
|
||||||
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||||
|
user: "0:0"
|
||||||
|
restart: unless-stopped
|
||||||
|
expose:
|
||||||
|
- "9101"
|
||||||
|
depends_on:
|
||||||
|
redis:
|
||||||
|
condition: service_healthy
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/experiments:/data/experiments
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/active:/data/active
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod-smoke/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/api/dev.yaml:/app/configs/api/dev.yaml:ro
|
||||||
|
command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]
|
||||||
|
healthcheck:
|
||||||
|
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 10
|
||||||
|
|
||||||
|
volumes: {}
|
||||||
|
|
@ -1,12 +1,31 @@
|
||||||
# Full Production Docker Environment with Podman and SQLite
|
# Full Production Docker Environment with Podman and SQLite
|
||||||
services:
|
services:
|
||||||
|
caddy:
|
||||||
|
image: caddy:2-alpine
|
||||||
|
container_name: ml-prod-caddy
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "80:80"
|
||||||
|
- "443:443"
|
||||||
|
volumes:
|
||||||
|
- ./Caddyfile.prod:/etc/caddy/Caddyfile:ro
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/data:/data
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/caddy/config:/config
|
||||||
|
environment:
|
||||||
|
- FETCHML_DOMAIN=${FETCHML_DOMAIN}
|
||||||
|
- CADDY_EMAIL=${CADDY_EMAIL}
|
||||||
|
depends_on:
|
||||||
|
api-server:
|
||||||
|
condition: service_healthy
|
||||||
|
|
||||||
redis:
|
redis:
|
||||||
image: redis:7-alpine
|
image: redis:7-alpine
|
||||||
container_name: ml-prod-redis
|
container_name: ml-prod-redis
|
||||||
ports:
|
user: "999:999"
|
||||||
- "6379:6379"
|
expose:
|
||||||
|
- "6379"
|
||||||
volumes:
|
volumes:
|
||||||
- redis_prod_data:/data
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/redis:/data
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
command: redis-server --appendonly yes
|
command: redis-server --appendonly yes
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
|
@ -17,57 +36,87 @@ services:
|
||||||
|
|
||||||
api-server:
|
api-server:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
dockerfile: build/docker/secure-prod.Dockerfile
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/secure-prod.Dockerfile
|
||||||
container_name: ml-prod-api
|
container_name: ml-prod-api
|
||||||
ports:
|
expose:
|
||||||
- "9103:9101" # API server port
|
- "9101" # API server port (internal; external access via Caddy)
|
||||||
- "2222:2222" # Secure SSH port for Podman communication
|
- "2222" # Secure SSH port for Podman communication (internal)
|
||||||
- "9100:9100" # Prometheus metrics
|
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/app/data/experiments
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
|
||||||
- ./logs:/logs
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
|
||||||
- ./configs/config-multi-user.yaml:/app/configs/config.yaml
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/api/multi-user.yaml:/app/configs/api/prod.yaml
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- REDIS_URL=redis://redis:6379
|
|
||||||
- LOG_LEVEL=info
|
- LOG_LEVEL=info
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
|
test: [ "CMD", "curl", "-f", "http://localhost:9101/health" ]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
start_period: 40s
|
start_period: 40s
|
||||||
# Start SSH daemon for Podman communication
|
# Start API server (ensure data_dir exists for snapshot/dataset validation)
|
||||||
command: ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
|
command: ["/bin/sh", "-c", "mkdir -p /data/active/datasets /data/active/snapshots && exec /usr/local/bin/api-server -config /app/configs/api/prod.yaml"]
|
||||||
|
|
||||||
|
minio:
|
||||||
|
image: minio/minio:latest
|
||||||
|
container_name: ml-prod-minio
|
||||||
|
expose:
|
||||||
|
- "9000"
|
||||||
|
- "9001"
|
||||||
|
volumes:
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/minio:/data
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||||
|
command: ["server", "/data", "--console-address", ":9001"]
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
minio-init:
|
||||||
|
image: alpine:3.19
|
||||||
|
container_name: ml-prod-minio-init
|
||||||
|
depends_on:
|
||||||
|
- minio
|
||||||
|
entrypoint: ["/bin/sh", "-c"]
|
||||||
|
command:
|
||||||
|
- |
|
||||||
|
apk add --no-cache ca-certificates curl >/dev/null
|
||||||
|
curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc
|
||||||
|
chmod +x /usr/local/bin/mc
|
||||||
|
mc alias set local http://minio:9000 ${MINIO_ROOT_USER:-minioadmin} ${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||||
|
mc mb -p local/fetchml-snapshots || true
|
||||||
|
restart: "no"
|
||||||
|
|
||||||
worker:
|
worker:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: ${FETCHML_REPO_ROOT:-.}
|
||||||
dockerfile: build/docker/secure-prod.Dockerfile
|
dockerfile: ${FETCHML_REPO_ROOT:-.}/build/docker/simple.Dockerfile
|
||||||
container_name: ml-prod-worker
|
container_name: ml-prod-worker
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/app/data/experiments
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/experiments:/app/data/experiments
|
||||||
- ./logs:/logs
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/active:/data/active
|
||||||
- ./configs/worker-docker.yaml:/app/configs/worker.yaml
|
- ${FETCHML_REPO_ROOT:-.}/data/prod/logs:/logs
|
||||||
|
- ${FETCHML_REPO_ROOT:-.}/configs/workers/docker-prod.yaml:/app/configs/worker.yaml
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
api-server:
|
api-server:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
minio-init:
|
||||||
|
condition: service_started
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
- REDIS_URL=redis://redis:6379
|
|
||||||
- LOG_LEVEL=info
|
- LOG_LEVEL=info
|
||||||
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin123}
|
||||||
privileged: true # Required for Podman to work in Docker
|
privileged: true # Required for Podman to work in Docker
|
||||||
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]
|
||||||
|
|
||||||
volumes:
|
volumes: {}
|
||||||
redis_prod_data:
|
|
||||||
driver: local
|
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
default:
|
default:
|
||||||
|
|
|
||||||
17
deployments/env.dev.example
Normal file
17
deployments/env.dev.example
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
# Development Environment Variables
|
||||||
|
# Copy this file to .env and modify as needed
|
||||||
|
|
||||||
|
# Grafana
|
||||||
|
GRAFANA_ADMIN_PASSWORD=admin123
|
||||||
|
|
||||||
|
# API Configuration
|
||||||
|
LOG_LEVEL=info
|
||||||
|
|
||||||
|
# TLS (development uses self-signed certs)
|
||||||
|
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||||
|
TLS_KEY_PATH=/app/ssl/key.pem
|
||||||
|
|
||||||
|
# Development-specific
|
||||||
|
ENVIRONMENT=development
|
||||||
|
DEBUG=true
|
||||||
|
API_KEY=development_key_only
|
||||||
28
deployments/env.prod.example
Normal file
28
deployments/env.prod.example
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
# Production Environment Variables
|
||||||
|
# Copy this file to .env and modify as needed
|
||||||
|
|
||||||
|
# Grafana (if using)
|
||||||
|
GRAFANA_ADMIN_PASSWORD=CHANGE_ME_SECURE_PASSWORD
|
||||||
|
|
||||||
|
# API Configuration
|
||||||
|
LOG_LEVEL=warn
|
||||||
|
|
||||||
|
# TLS (production should use CA-signed certs)
|
||||||
|
TLS_CERT_PATH=/app/ssl/cert.pem
|
||||||
|
TLS_KEY_PATH=/app/ssl/key.pem
|
||||||
|
|
||||||
|
# Caddy (TLS/WSS termination)
|
||||||
|
FETCHML_DOMAIN=ml.example.com
|
||||||
|
CADDY_EMAIL=admin@example.com
|
||||||
|
|
||||||
|
# Production-specific
|
||||||
|
ENVIRONMENT=production
|
||||||
|
DEBUG=false
|
||||||
|
|
||||||
|
# Security
|
||||||
|
API_KEY=CHANGE_ME_SECURE_API_KEY
|
||||||
|
ALLOWED_ORIGINS=https://yourdomain.com
|
||||||
|
|
||||||
|
# External services (if applicable)
|
||||||
|
EXTERNAL_REDIS_URL=redis://external-redis:6379
|
||||||
|
EXTERNAL_PROMETHEUS_URL=http://external-prometheus:9090
|
||||||
112
deployments/setup.sh
Normal file
112
deployments/setup.sh
Normal file
|
|
@ -0,0 +1,112 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
cat <<'EOF'
|
||||||
|
Usage: ./deployments/setup.sh
|
||||||
|
|
||||||
|
This script DOES NOT install dependencies.
|
||||||
|
It prints the manual steps and required/optional dependencies for a real (non-Docker) production deployment.
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
|
||||||
|
usage
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
cat <<'EOF'
|
||||||
|
== FetchML production setup (non-Docker) ==
|
||||||
|
|
||||||
|
Required (core):
|
||||||
|
- Go-built binaries: api-server, worker
|
||||||
|
- Redis (reachable from api-server + worker)
|
||||||
|
- A writable base_path for experiments
|
||||||
|
- A writable data_dir if you want snapshot/dataset staging + integrity validation
|
||||||
|
|
||||||
|
Required (TLS/WSS):
|
||||||
|
- Caddy (recommended) OR another reverse proxy that can terminate TLS and proxy WebSockets
|
||||||
|
|
||||||
|
Optional:
|
||||||
|
- systemd (recommended) for service supervision
|
||||||
|
- MinIO / S3-compatible storage (only if you use remote snapshot_store)
|
||||||
|
- Podman (only if your worker executes jobs in Podman)
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- The Zig CLI currently supports ws:// only. In production, keep the API server internal on ws:// and terminate TLS/WSS at Caddy.
|
||||||
|
- This script is informational; it will not modify your system.
|
||||||
|
|
||||||
|
---
|
||||||
|
1) Build binaries
|
||||||
|
|
||||||
|
make prod
|
||||||
|
|
||||||
|
Artifacts:
|
||||||
|
./bin/api-server
|
||||||
|
./bin/worker
|
||||||
|
|
||||||
|
---
|
||||||
|
2) Create a dedicated user (recommended)
|
||||||
|
|
||||||
|
useradd --system --create-home --shell /usr/sbin/nologin fetchml
|
||||||
|
|
||||||
|
---
|
||||||
|
3) Create directories (example paths)
|
||||||
|
|
||||||
|
mkdir -p /var/lib/fetchml/experiments
|
||||||
|
mkdir -p /var/lib/fetchml/active/datasets /var/lib/fetchml/active/snapshots
|
||||||
|
mkdir -p /var/log/fetchml
|
||||||
|
|
||||||
|
Ensure ownership:
|
||||||
|
chown -R fetchml:fetchml /var/lib/fetchml /var/log/fetchml
|
||||||
|
|
||||||
|
---
|
||||||
|
4) Configure the API server
|
||||||
|
|
||||||
|
- Start from: configs/api/prod.yaml (or your multi-user config)
|
||||||
|
- For real production, keep server.tls.enabled: false
|
||||||
|
- Ensure monitoring.health_checks.enabled is set appropriately
|
||||||
|
|
||||||
|
Example flags:
|
||||||
|
./bin/api-server -config /etc/fetchml/api.yaml
|
||||||
|
|
||||||
|
---
|
||||||
|
5) Configure Caddy (TLS/WSS termination)
|
||||||
|
|
||||||
|
- Recommended: use deployments/Caddyfile.prod as a baseline.
|
||||||
|
- Caddy should listen on 443 and reverse proxy to the API server (internal) on 9101.
|
||||||
|
|
||||||
|
Example layout:
|
||||||
|
/etc/caddy/Caddyfile
|
||||||
|
/var/lib/caddy
|
||||||
|
|
||||||
|
---
|
||||||
|
6) Configure Redis
|
||||||
|
|
||||||
|
- Use Redis AUTH in production.
|
||||||
|
- Ensure the api-server + worker can reach it.
|
||||||
|
|
||||||
|
---
|
||||||
|
7) Run under systemd (recommended)
|
||||||
|
|
||||||
|
Create unit files (example):
|
||||||
|
/etc/systemd/system/fetchml-api.service
|
||||||
|
/etc/systemd/system/fetchml-worker.service
|
||||||
|
/etc/systemd/system/caddy.service (if not already provided)
|
||||||
|
|
||||||
|
Then:
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl enable --now fetchml-api
|
||||||
|
systemctl enable --now fetchml-worker
|
||||||
|
systemctl enable --now caddy
|
||||||
|
|
||||||
|
---
|
||||||
|
8) Smoke check
|
||||||
|
|
||||||
|
Internal health (no TLS):
|
||||||
|
curl -f http://127.0.0.1:9101/health
|
||||||
|
|
||||||
|
External health (through Caddy TLS termination):
|
||||||
|
curl -f https://YOUR_DOMAIN/health
|
||||||
|
|
||||||
|
EOF
|
||||||
|
|
@ -1,13 +1,52 @@
|
||||||
# Centralized Monitoring Stack
|
# Monitoring Stack
|
||||||
|
|
||||||
|
## Directory Structure (Canonical)
|
||||||
|
|
||||||
|
All monitoring configuration lives under `monitoring/`.
|
||||||
|
|
||||||
|
```text
|
||||||
|
monitoring/
|
||||||
|
prometheus/
|
||||||
|
prometheus.yml # Prometheus scrape configuration
|
||||||
|
grafana/
|
||||||
|
dashboards/ # Grafana dashboards (JSON)
|
||||||
|
provisioning/
|
||||||
|
datasources/ # Grafana data sources (Prometheus/Loki)
|
||||||
|
dashboards/ # Grafana dashboard provider (points at dashboards/)
|
||||||
|
loki-config.yml # Loki configuration
|
||||||
|
promtail-config.yml # Promtail configuration
|
||||||
|
```
|
||||||
|
|
||||||
|
### What is "Grafana provisioning"?
|
||||||
|
|
||||||
|
Grafana provisioning is how Grafana auto-configures itself on startup (no clicking in the UI):
|
||||||
|
|
||||||
|
- **`grafana/provisioning/datasources/*.yml`**
|
||||||
|
- Defines where Grafana reads data from (e.g. Prometheus at `http://prometheus:9090`, Loki at `http://loki:3100`).
|
||||||
|
- **`grafana/provisioning/dashboards/*.yml`**
|
||||||
|
- Tells Grafana to load dashboard JSON files from `/var/lib/grafana/dashboards`.
|
||||||
|
- **`grafana/dashboards/*.json`**
|
||||||
|
- The dashboards themselves.
|
||||||
|
|
||||||
|
### Source of truth
|
||||||
|
|
||||||
|
- **Dashboards**: edit/add JSON in `monitoring/grafana/dashboards/`.
|
||||||
|
- **Grafana provisioning**: edit files in `monitoring/grafana/provisioning/`.
|
||||||
|
- **Prometheus scrape config**: edit `monitoring/prometheus/prometheus.yml`.
|
||||||
|
|
||||||
|
`scripts/setup_monitoring.py` is intentionally **provisioning-only**:
|
||||||
|
|
||||||
|
- It (re)writes Grafana **datasources** and the **dashboard provider**.
|
||||||
|
- It does **not** create or overwrite any dashboard JSON files.
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Start everything
|
# Start deployment
|
||||||
docker-compose up -d
|
make deploy-up
|
||||||
|
|
||||||
# Access services
|
# Access services
|
||||||
open http://localhost:3000 # Grafana (admin/admin)
|
open http://localhost:3000 # Grafana (admin/admin123)
|
||||||
open http://localhost:9090 # Prometheus
|
open http://localhost:9090 # Prometheus
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -15,137 +54,80 @@ open http://localhost:9090 # Prometheus
|
||||||
|
|
||||||
### Grafana (Port 3000)
|
### Grafana (Port 3000)
|
||||||
**Main monitoring dashboard**
|
**Main monitoring dashboard**
|
||||||
- Username: `admin`
|
- Username: `admin`
|
||||||
- Password: `admin`
|
- Password: `admin123`
|
||||||
- Pre-configured datasources: Prometheus + Loki
|
- Data source: Prometheus (http://localhost:9090)
|
||||||
- Pre-loaded ML Queue dashboard
|
|
||||||
|
|
||||||
### Prometheus (Port 9090)
|
### Prometheus (Port 9090)
|
||||||
**Metrics collection**
|
**Metrics collection and storage**
|
||||||
- Scrapes metrics from API server (`:9100/metrics`)
|
|
||||||
- 15s scrape interval
|
|
||||||
- Data retention: 15 days (default)
|
|
||||||
|
|
||||||
### Loki (Port 3100)
|
### Loki (Port 3100)
|
||||||
**Log aggregation**
|
**Log aggregation**
|
||||||
- Collects logs from all containers
|
|
||||||
- Collects application logs from `./logs/`
|
|
||||||
- Retention: 7 days
|
|
||||||
|
|
||||||
### Promtail
|
## Dashboards
|
||||||
**Log shipping**
|
|
||||||
- Watches Docker container logs
|
|
||||||
- Watches `./logs/*.log`
|
|
||||||
- Sends to Loki
|
|
||||||
|
|
||||||
## Viewing Data
|
Available dashboard configurations in `grafana/dashboards/`:
|
||||||
|
|
||||||
### Metrics
|
- `load-test-performance.json` - Load test metrics
|
||||||
1. Open Grafana: http://localhost:3000
|
- `websocket-performance.json` - WebSocket performance
|
||||||
2. Go to "ML Task Queue Monitoring" dashboard
|
- `system-health.json` - System health monitoring
|
||||||
3. See: queue depth, task duration, error rates, etc.
|
- `rsync-performance.json` - Rsync performance metrics
|
||||||
|
|
||||||
### Logs
|
### Importing Dashboards
|
||||||
1. Open Grafana → Explore
|
|
||||||
2. Select "Loki" datasource
|
|
||||||
3. Query examples:
|
|
||||||
```logql
|
|
||||||
{job="app_logs"} # All app logs
|
|
||||||
{job="docker",service="api-server"} # API server logs
|
|
||||||
{job="docker"} |= "error" # All errors
|
|
||||||
```
|
|
||||||
|
|
||||||
## Architecture
|
1. Go to Grafana → "+" → "Import"
|
||||||
|
2. Upload JSON files from `grafana/dashboards/` directory
|
||||||
```
|
3. Select Prometheus data source
|
||||||
┌─────────────┐
|
|
||||||
│ API Server │──┐
|
|
||||||
└─────────────┘ │
|
|
||||||
├──► Prometheus ──► Grafana
|
|
||||||
┌─────────────┐ │ ▲
|
|
||||||
│ Worker │──┘ │
|
|
||||||
└─────────────┘ │
|
|
||||||
│
|
|
||||||
┌─────────────┐ │
|
|
||||||
│ App Logs │──┐ │
|
|
||||||
└─────────────┘ │ │
|
|
||||||
├──► Promtail ──► Loki ┘
|
|
||||||
┌─────────────┐ │
|
|
||||||
│Docker Logs │──┘
|
|
||||||
└─────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration Files
|
## Configuration Files
|
||||||
|
|
||||||
- `prometheus.yml` - Metrics scraping config
|
- `prometheus/prometheus.yml` - Prometheus configuration
|
||||||
- `loki-config.yml` - Log storage config
|
- `loki-config.yml` - Loki configuration
|
||||||
- `promtail-config.yml` - Log collection config
|
- `promtail-config.yml` - Promtail configuration
|
||||||
- `grafana/provisioning/` - Auto-configuration
|
- `security_rules.yml` - Security rules
|
||||||
|
|
||||||
## Customization
|
## Usage
|
||||||
|
|
||||||
### Add More Scrapers
|
1. Start monitoring stack: `make deploy-up`
|
||||||
Edit `monitoring/prometheus.yml`:
|
2. Access Grafana: http://localhost:3000 (admin/admin123)
|
||||||
```yaml
|
3. Import dashboards from `grafana/dashboards/` directory
|
||||||
scrape_configs:
|
4. View metrics and test results in real-time
|
||||||
- job_name: 'my-service'
|
|
||||||
static_configs:
|
|
||||||
- targets: ['my-service:9100']
|
|
||||||
```
|
|
||||||
|
|
||||||
### Change Retention
|
## Health Endpoints
|
||||||
**Prometheus:** Add to command in docker-compose:
|
|
||||||
```yaml
|
|
||||||
- '--storage.tsdb.retention.time=30d'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Loki:** Edit `loki-config.yml`:
|
The API server provides health check endpoints for monitoring:
|
||||||
```yaml
|
|
||||||
limits_config:
|
|
||||||
retention_period: 720h # 30 days
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
- **`/health`** - Overall service health (for Docker healthcheck)
|
||||||
|
- **`/health/live`** - Liveness probe (is the service running?)
|
||||||
|
- **`/health/ready`** - Readiness probe (can the service accept traffic?)
|
||||||
|
|
||||||
**No metrics showing:**
|
### Testing Health Endpoints
|
||||||
```bash
|
|
||||||
# Check if Prometheus can reach targets
|
|
||||||
curl http://localhost:9090/api/v1/targets
|
|
||||||
|
|
||||||
# Check if API exposes metrics
|
|
||||||
curl http://localhost:9100/metrics
|
|
||||||
```
|
|
||||||
|
|
||||||
**No logs showing:**
|
|
||||||
```bash
|
|
||||||
# Check Promtail status
|
|
||||||
docker logs ml-experiments-promtail
|
|
||||||
|
|
||||||
# Verify Loki is receiving logs
|
|
||||||
curl http://localhost:3100/ready
|
|
||||||
```
|
|
||||||
|
|
||||||
**Grafana can't connect to datasources:**
|
|
||||||
```bash
|
|
||||||
# Restart Grafana
|
|
||||||
docker-compose restart grafana
|
|
||||||
```
|
|
||||||
|
|
||||||
## Profiling Quick Start
|
|
||||||
|
|
||||||
To capture CPU profiles while exercising real workloads:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# HTTP LoadTestSuite (MediumLoad scenario)
|
# Basic health check
|
||||||
make profile-load
|
curl -k https://localhost:9101/health
|
||||||
|
|
||||||
# WebSocket → Redis queue → worker integration
|
# Liveness check (for K8s or monitoring)
|
||||||
make profile-ws-queue
|
curl -k https://localhost:9101/health/live
|
||||||
|
|
||||||
|
# Readiness check (verifies dependencies)
|
||||||
|
curl -k https://localhost:9101/health/ready
|
||||||
```
|
```
|
||||||
|
|
||||||
Then inspect profiles with:
|
See `health-testing.md` for detailed testing procedures.
|
||||||
|
|
||||||
```bash
|
## Prometheus Integration
|
||||||
go tool pprof cpu_load.out # HTTP load
|
|
||||||
go tool pprof cpu_ws.out # WebSocket/queue/worker
|
Prometheus scrapes the following endpoints:
|
||||||
```
|
- `api-server:9101/metrics` - Application metrics (future)
|
||||||
|
- `api-server:9101/health` - Health status monitoring
|
||||||
|
- `host.docker.internal:9100/metrics` - Worker metrics (when the worker runs on the host)
|
||||||
|
- `worker:9100/metrics` - Worker metrics (when the worker runs as a container in the compose network)
|
||||||
|
|
||||||
|
## Cleanup (deprecated paths)
|
||||||
|
|
||||||
|
These legacy paths may still exist in the repo but are **not used** by the current dev compose config:
|
||||||
|
|
||||||
|
- `monitoring/dashboards/` (old dashboards location)
|
||||||
|
- `monitoring/prometheus.yml` (old Prometheus config location)
|
||||||
|
- `monitoring/grafana/provisioning/dashboards/dashboard.yml` (duplicate of `dashboards.yml`)
|
||||||
|
|
@ -1,147 +0,0 @@
|
||||||
{
|
|
||||||
"dashboard": {
|
|
||||||
"title": "ML Task Queue Monitoring",
|
|
||||||
"tags": [
|
|
||||||
"ml",
|
|
||||||
"queue",
|
|
||||||
"fetch_ml"
|
|
||||||
],
|
|
||||||
"timezone": "browser",
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"title": "Queue Depth",
|
|
||||||
"type": "graph",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 0,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "fetch_ml_queue_depth",
|
|
||||||
"legendFormat": "Queue Depth"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Active Tasks",
|
|
||||||
"type": "graph",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 12,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "sum(fetch_ml_active_tasks) by (worker_id)",
|
|
||||||
"legendFormat": "{{worker_id}}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Task Duration (p50, p95, p99)",
|
|
||||||
"type": "graph",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 24,
|
|
||||||
"x": 0,
|
|
||||||
"y": 8
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
|
||||||
"legendFormat": "p50"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "histogram_quantile(0.95, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
|
||||||
"legendFormat": "p95"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
|
|
||||||
"legendFormat": "p99"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Task Completion Rate",
|
|
||||||
"type": "graph",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 0,
|
|
||||||
"y": 16
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "rate(fetch_ml_tasks_completed_total[5m])",
|
|
||||||
"legendFormat": "{{status}}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Failure Rate by Error Category",
|
|
||||||
"type": "graph",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 12,
|
|
||||||
"y": 16
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "rate(fetch_ml_task_failures_total[5m])",
|
|
||||||
"legendFormat": "{{error_category}}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Retry Rate",
|
|
||||||
"type": "graph",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 0,
|
|
||||||
"y": 24
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "rate(fetch_ml_task_retries_total[5m])",
|
|
||||||
"legendFormat": "{{error_category}}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Dead Letter Queue Size",
|
|
||||||
"type": "stat",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 6,
|
|
||||||
"x": 12,
|
|
||||||
"y": 24
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "fetch_ml_dlq_size"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Lease Expirations",
|
|
||||||
"type": "stat",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 6,
|
|
||||||
"x": 18,
|
|
||||||
"y": 24
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "fetch_ml_lease_expirations_total"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,278 +0,0 @@
|
||||||
{
|
|
||||||
"dashboard": {
|
|
||||||
"title": "Application Logs",
|
|
||||||
"tags": [
|
|
||||||
"logs",
|
|
||||||
"loki",
|
|
||||||
"fetch_ml"
|
|
||||||
],
|
|
||||||
"timezone": "browser",
|
|
||||||
"editable": true,
|
|
||||||
"graphTooltip": 1,
|
|
||||||
"time": {
|
|
||||||
"from": "now-1h",
|
|
||||||
"to": "now"
|
|
||||||
},
|
|
||||||
"timepicker": {
|
|
||||||
"refresh_intervals": [
|
|
||||||
"5s",
|
|
||||||
"10s",
|
|
||||||
"30s",
|
|
||||||
"1m",
|
|
||||||
"5m",
|
|
||||||
"15m",
|
|
||||||
"30m",
|
|
||||||
"1h"
|
|
||||||
],
|
|
||||||
"time_options": [
|
|
||||||
"5m",
|
|
||||||
"15m",
|
|
||||||
"1h",
|
|
||||||
"6h",
|
|
||||||
"12h",
|
|
||||||
"24h",
|
|
||||||
"2d",
|
|
||||||
"7d",
|
|
||||||
"30d"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"title": "Log Stream",
|
|
||||||
"type": "logs",
|
|
||||||
"gridPos": {
|
|
||||||
"x": 0,
|
|
||||||
"y": 0,
|
|
||||||
"w": 24,
|
|
||||||
"h": 12
|
|
||||||
},
|
|
||||||
"id": 1,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "{job=\"app_logs\"}",
|
|
||||||
"refId": "A",
|
|
||||||
"datasource": "Loki"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"showTime": true,
|
|
||||||
"showLabels": true,
|
|
||||||
"showCommonLabels": false,
|
|
||||||
"wrapLogMessage": false,
|
|
||||||
"prettifyLogMessage": false,
|
|
||||||
"enableLogDetails": true,
|
|
||||||
"dedupStrategy": "none",
|
|
||||||
"sortOrder": "Descending"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Log Level Distribution",
|
|
||||||
"type": "bargauge",
|
|
||||||
"gridPos": {
|
|
||||||
"x": 0,
|
|
||||||
"y": 12,
|
|
||||||
"w": 8,
|
|
||||||
"h": 8
|
|
||||||
},
|
|
||||||
"id": 2,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "sum by (level) (count_over_time({job=\"app_logs\"} | logfmt | level != \"\" [5m]))",
|
|
||||||
"refId": "A",
|
|
||||||
"datasource": "Loki",
|
|
||||||
"legendFormat": "{{level}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"orientation": "horizontal",
|
|
||||||
"displayMode": "gradient",
|
|
||||||
"showUnfilled": true
|
|
||||||
},
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "palette-classic"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"overrides": [
|
|
||||||
{
|
|
||||||
"matcher": {
|
|
||||||
"id": "byName",
|
|
||||||
"options": "INFO"
|
|
||||||
},
|
|
||||||
"properties": [
|
|
||||||
{
|
|
||||||
"id": "color",
|
|
||||||
"value": {
|
|
||||||
"mode": "fixed",
|
|
||||||
"fixedColor": "green"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"matcher": {
|
|
||||||
"id": "byName",
|
|
||||||
"options": "WARN"
|
|
||||||
},
|
|
||||||
"properties": [
|
|
||||||
{
|
|
||||||
"id": "color",
|
|
||||||
"value": {
|
|
||||||
"mode": "fixed",
|
|
||||||
"fixedColor": "yellow"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"matcher": {
|
|
||||||
"id": "byName",
|
|
||||||
"options": "ERROR"
|
|
||||||
},
|
|
||||||
"properties": [
|
|
||||||
{
|
|
||||||
"id": "color",
|
|
||||||
"value": {
|
|
||||||
"mode": "fixed",
|
|
||||||
"fixedColor": "red"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Error Logs (Last Hour)",
|
|
||||||
"type": "table",
|
|
||||||
"gridPos": {
|
|
||||||
"x": 8,
|
|
||||||
"y": 12,
|
|
||||||
"w": 16,
|
|
||||||
"h": 8
|
|
||||||
},
|
|
||||||
"id": 3,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "{job=\"app_logs\"} | logfmt | level=\"ERROR\"",
|
|
||||||
"refId": "A",
|
|
||||||
"datasource": "Loki"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"showHeader": true
|
|
||||||
},
|
|
||||||
"transformations": [
|
|
||||||
{
|
|
||||||
"id": "labelsToFields",
|
|
||||||
"options": {}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Logs by Component",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {
|
|
||||||
"x": 0,
|
|
||||||
"y": 20,
|
|
||||||
"w": 12,
|
|
||||||
"h": 8
|
|
||||||
},
|
|
||||||
"id": 4,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "sum by (component) (rate({job=\"app_logs\"} | logfmt [1m]))",
|
|
||||||
"refId": "A",
|
|
||||||
"datasource": "Loki",
|
|
||||||
"legendFormat": "{{component}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"custom": {
|
|
||||||
"drawStyle": "line",
|
|
||||||
"lineInterpolation": "smooth",
|
|
||||||
"fillOpacity": 10,
|
|
||||||
"spanNulls": false,
|
|
||||||
"showPoints": "never",
|
|
||||||
"stacking": {
|
|
||||||
"mode": "none"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Warning Logs Timeline",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {
|
|
||||||
"x": 12,
|
|
||||||
"y": 20,
|
|
||||||
"w": 12,
|
|
||||||
"h": 8
|
|
||||||
},
|
|
||||||
"id": 5,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "sum(count_over_time({job=\"app_logs\"} | logfmt | level=\"WARN\" [1m]))",
|
|
||||||
"refId": "A",
|
|
||||||
"datasource": "Loki",
|
|
||||||
"legendFormat": "Warnings"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"custom": {
|
|
||||||
"drawStyle": "bars",
|
|
||||||
"fillOpacity": 50
|
|
||||||
},
|
|
||||||
"color": {
|
|
||||||
"mode": "fixed",
|
|
||||||
"fixedColor": "yellow"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Search Logs",
|
|
||||||
"type": "logs",
|
|
||||||
"gridPos": {
|
|
||||||
"x": 0,
|
|
||||||
"y": 28,
|
|
||||||
"w": 24,
|
|
||||||
"h": 10
|
|
||||||
},
|
|
||||||
"id": 6,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "{job=\"app_logs\"} |= \"$search_term\"",
|
|
||||||
"refId": "A",
|
|
||||||
"datasource": "Loki"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"showTime": true,
|
|
||||||
"showLabels": true,
|
|
||||||
"wrapLogMessage": true,
|
|
||||||
"enableLogDetails": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"templating": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"name": "search_term",
|
|
||||||
"type": "textbox",
|
|
||||||
"label": "Search Term",
|
|
||||||
"current": {
|
|
||||||
"value": "",
|
|
||||||
"text": ""
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"refresh": "30s"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,157 +0,0 @@
|
||||||
{
|
|
||||||
"annotations": {
|
|
||||||
"list": [
|
|
||||||
{
|
|
||||||
"builtIn": 1,
|
|
||||||
"datasource": "-- Grafana --",
|
|
||||||
"enable": true,
|
|
||||||
"hide": true,
|
|
||||||
"iconColor": "rgba(0, 211, 255, 1)",
|
|
||||||
"name": "Annotations & Alerts",
|
|
||||||
"type": "dashboard"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"editable": true,
|
|
||||||
"gnetId": null,
|
|
||||||
"graphTooltip": 0,
|
|
||||||
"id": null,
|
|
||||||
"links": [],
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"aliasColors": {},
|
|
||||||
"bars": false,
|
|
||||||
"dashLength": 10,
|
|
||||||
"dashes": false,
|
|
||||||
"datasource": "loki",
|
|
||||||
"fill": 1,
|
|
||||||
"fillGradient": 0,
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 0,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"hiddenSeries": false,
|
|
||||||
"id": 1,
|
|
||||||
"legend": {
|
|
||||||
"avg": false,
|
|
||||||
"current": false,
|
|
||||||
"max": false,
|
|
||||||
"min": false,
|
|
||||||
"show": true,
|
|
||||||
"total": false,
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"lines": true,
|
|
||||||
"linewidth": 1,
|
|
||||||
"nullPointMode": "null",
|
|
||||||
"options": {
|
|
||||||
"dataLinks": []
|
|
||||||
},
|
|
||||||
"percentage": false,
|
|
||||||
"pointradius": 2,
|
|
||||||
"points": false,
|
|
||||||
"renderer": "flot",
|
|
||||||
"seriesOverrides": [],
|
|
||||||
"spaceLength": 10,
|
|
||||||
"stack": false,
|
|
||||||
"steppedLine": false,
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkAPIServerCreateJobSimple\"",
|
|
||||||
"legendFormat": "API Job Creation",
|
|
||||||
"refId": "A"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkMLExperimentExecution/SmallExperiment\"",
|
|
||||||
"legendFormat": "ML Small Experiment",
|
|
||||||
"refId": "B"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "{job=\"fetchml-performance\"} |= \"BenchmarkDatasetOperations/DatasetCreation\"",
|
|
||||||
"legendFormat": "Dataset Creation",
|
|
||||||
"refId": "C"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"thresholds": [],
|
|
||||||
"timeFrom": null,
|
|
||||||
"timeRegions": [],
|
|
||||||
"timeShift": null,
|
|
||||||
"title": "API Performance Trends",
|
|
||||||
"tooltip": {
|
|
||||||
"shared": true,
|
|
||||||
"sort": 0,
|
|
||||||
"value_type": "individual"
|
|
||||||
},
|
|
||||||
"type": "graph",
|
|
||||||
"xaxis": {
|
|
||||||
"buckets": null,
|
|
||||||
"mode": "time",
|
|
||||||
"name": null,
|
|
||||||
"show": true,
|
|
||||||
"values": []
|
|
||||||
},
|
|
||||||
"yaxes": [
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": "Time (ns/op)",
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": null,
|
|
||||||
"show": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"format": "short",
|
|
||||||
"label": null,
|
|
||||||
"logBase": 1,
|
|
||||||
"max": null,
|
|
||||||
"min": null,
|
|
||||||
"show": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"yaxis": {
|
|
||||||
"align": false,
|
|
||||||
"alignLevel": null
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": "loki",
|
|
||||||
"gridPos": {
|
|
||||||
"h": 8,
|
|
||||||
"w": 12,
|
|
||||||
"x": 12,
|
|
||||||
"y": 0
|
|
||||||
},
|
|
||||||
"id": 2,
|
|
||||||
"options": {
|
|
||||||
"showLabels": true
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "{job=\"fetchml-performance\"} |= \"Performance Summary\"",
|
|
||||||
"legendFormat": "{{timestamp}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"title": "Latest Performance Summary",
|
|
||||||
"type": "logs"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"refresh": "30s",
|
|
||||||
"schemaVersion": 27,
|
|
||||||
"style": "dark",
|
|
||||||
"tags": ["fetchml", "performance"],
|
|
||||||
"templating": {
|
|
||||||
"list": []
|
|
||||||
},
|
|
||||||
"time": {
|
|
||||||
"from": "now-1h",
|
|
||||||
"to": "now"
|
|
||||||
},
|
|
||||||
"timepicker": {},
|
|
||||||
"timezone": "",
|
|
||||||
"title": "Fetch ML Performance Dashboard",
|
|
||||||
"uid": "fetchml-performance",
|
|
||||||
"version": 1
|
|
||||||
}
|
|
||||||
|
|
@ -1,64 +0,0 @@
|
||||||
services:
|
|
||||||
prometheus:
|
|
||||||
image: prom/prometheus:latest
|
|
||||||
ports:
|
|
||||||
- "9090:9090"
|
|
||||||
command:
|
|
||||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
|
||||||
- '--storage.tsdb.path=/prometheus'
|
|
||||||
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
|
||||||
- '--web.console.templates=/etc/prometheus/consoles'
|
|
||||||
- '--web.enable-lifecycle'
|
|
||||||
volumes:
|
|
||||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
|
||||||
- prometheus-data:/prometheus
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
|
|
||||||
loki:
|
|
||||||
image: grafana/loki:2.9.0
|
|
||||||
ports:
|
|
||||||
- "3100:3100"
|
|
||||||
command: -config.file=/etc/loki/local-config.yaml
|
|
||||||
volumes:
|
|
||||||
- ./loki-performance-config.yaml:/etc/loki/local-config.yaml
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
|
|
||||||
promtail:
|
|
||||||
image: grafana/promtail:latest
|
|
||||||
volumes:
|
|
||||||
- ./promtail-performance-config.yaml:/etc/promtail/config.yml
|
|
||||||
- /var/log:/var/log:ro
|
|
||||||
command: -config.file=/etc/promtail/config.yml
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
|
|
||||||
pushgateway:
|
|
||||||
image: prom/pushgateway:latest
|
|
||||||
ports:
|
|
||||||
- "9091:9091"
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
|
|
||||||
grafana:
|
|
||||||
image: grafana/grafana:latest
|
|
||||||
ports:
|
|
||||||
- "3001:3000"
|
|
||||||
environment:
|
|
||||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
|
||||||
volumes:
|
|
||||||
- grafana-data:/var/lib/grafana
|
|
||||||
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
|
|
||||||
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
loki-data:
|
|
||||||
grafana-data:
|
|
||||||
prometheus-data:
|
|
||||||
|
|
||||||
networks:
|
|
||||||
monitoring:
|
|
||||||
driver: bridge
|
|
||||||
51
monitoring/grafana/dashboards/load-test-performance.json
Normal file
51
monitoring/grafana/dashboards/load-test-performance.json
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"id": null,
|
||||||
|
"title": "Load Test Performance",
|
||||||
|
"tags": [
|
||||||
|
"load-test",
|
||||||
|
"performance"
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Service Health",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "up",
|
||||||
|
"legendFormat": "{{job}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Request Rate",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(http_requests_total[5m])",
|
||||||
|
"legendFormat": "RPS"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"refresh": "5s"
|
||||||
|
}
|
||||||
|
}
|
||||||
1
monitoring/grafana/dashboards/load-test-simple.json
Normal file
1
monitoring/grafana/dashboards/load-test-simple.json
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
{"dashboard": {"id": null, "title": "Load Test Performance", "tags": ["load-test", "performance"], "panels": [{"id": 1, "title": "Service Status", "type": "stat", "targets": [{"expr": "up", "legendFormat": "{{job}}"}]}]}}
|
||||||
51
monitoring/grafana/dashboards/loki-logs.json
Normal file
51
monitoring/grafana/dashboards/loki-logs.json
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"id": null,
|
||||||
|
"title": "Log Analysis",
|
||||||
|
"tags": [
|
||||||
|
"loki",
|
||||||
|
"logs"
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Error Logs",
|
||||||
|
"type": "logs",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{job=~\".+\"} |= \"error\"",
|
||||||
|
"legendFormat": "Errors"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "All Logs",
|
||||||
|
"type": "logs",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "{job=~\".+\"}",
|
||||||
|
"legendFormat": "All logs"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {
|
||||||
|
"from": "now-30m",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"refresh": "30s"
|
||||||
|
}
|
||||||
|
}
|
||||||
135
monitoring/grafana/dashboards/prewarm-performance.txt
Normal file
135
monitoring/grafana/dashboards/prewarm-performance.txt
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
# Grafana Dashboard: Prewarm Performance
|
||||||
|
# Import this JSON into Grafana to create a prewarm monitoring dashboard
|
||||||
|
|
||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"id": null,
|
||||||
|
"title": "Prewarm Performance",
|
||||||
|
"tags": ["prewarm", "performance", "worker"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Environment Prewarm Hit Rate (%)",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 0},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"thresholds": {
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": 0},
|
||||||
|
{"color": "yellow", "value": 50},
|
||||||
|
{"color": "green", "value": 80}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Snapshot Prewarm Hit Rate (%)",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 0},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"thresholds": {
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": 0},
|
||||||
|
{"color": "yellow", "value": 50},
|
||||||
|
{"color": "green", "value": 80}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Environment Prewarm Hits vs Misses",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "rate(fetchml_prewarm_env_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
|
||||||
|
{"expr": "rate(fetchml_prewarm_env_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||||
|
"yAxes": [{"unit": "reqps"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Snapshot Prewarm Hits vs Misses",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])", "legendFormat": "hits {{worker_id}}"},
|
||||||
|
{"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])", "legendFormat": "misses {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||||
|
"yAxes": [{"unit": "reqps"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Environment Build Time",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])", "legendFormat": "build time {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||||
|
"yAxes": [{"unit": "seconds"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Snapshot Prewarm Time",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])", "legendFormat": "prewarm time {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||||
|
"yAxes": [{"unit": "seconds"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Environment Images Built",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "increase(fetchml_prewarm_env_built_total[1h])", "legendFormat": "built {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 16},
|
||||||
|
"yAxes": [{"unit": "short"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Snapshots Prewarmed",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])", "legendFormat": "prewarmed {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 16},
|
||||||
|
"yAxes": [{"unit": "short"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Prewarm Efficiency",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{"expr": "fetchml_prewarm_env_hit_total + fetchml_prewarm_snapshot_hit_total", "legendFormat": "total hits {{worker_id}}"},
|
||||||
|
{"expr": "fetchml_prewarm_env_miss_total + fetchml_prewarm_snapshot_miss_total", "legendFormat": "total misses {{worker_id}}"}
|
||||||
|
],
|
||||||
|
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
|
||||||
|
"yAxes": [{"unit": "short"}]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {"from": "now-1h", "to": "now"},
|
||||||
|
"refresh": "5s"
|
||||||
|
}
|
||||||
|
}
|
||||||
86
monitoring/grafana/dashboards/rsync-performance.json
Normal file
86
monitoring/grafana/dashboards/rsync-performance.json
Normal file
|
|
@ -0,0 +1,86 @@
|
||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"id": null,
|
||||||
|
"title": "Rsync Performance",
|
||||||
|
"tags": [
|
||||||
|
"rsync",
|
||||||
|
"sync",
|
||||||
|
"performance"
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Rsync Operations",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(rsync_operations_total[5m])",
|
||||||
|
"legendFormat": "Operations/sec"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Data Transfer Rate",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(rsync_bytes_transferred_total[5m])",
|
||||||
|
"legendFormat": "Bytes/sec"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Sync Duration",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rsync_sync_duration_seconds",
|
||||||
|
"legendFormat": "Duration"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Sync Errors",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(rsync_errors_total[5m])",
|
||||||
|
"legendFormat": "Errors/sec"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 8
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"refresh": "5s"
|
||||||
|
}
|
||||||
|
}
|
||||||
51
monitoring/grafana/dashboards/system-health.json
Normal file
51
monitoring/grafana/dashboards/system-health.json
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"id": null,
|
||||||
|
"title": "System Health",
|
||||||
|
"tags": [
|
||||||
|
"system",
|
||||||
|
"health"
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Service Status",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "up",
|
||||||
|
"legendFormat": "{{job}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Memory Usage",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "process_resident_memory_bytes",
|
||||||
|
"legendFormat": "Memory"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"refresh": "10s"
|
||||||
|
}
|
||||||
|
}
|
||||||
68
monitoring/grafana/dashboards/websocket-performance.json
Normal file
68
monitoring/grafana/dashboards/websocket-performance.json
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"id": null,
|
||||||
|
"title": "WebSocket Performance",
|
||||||
|
"tags": [
|
||||||
|
"websocket",
|
||||||
|
"performance"
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "WebSocket Connections",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "websocket_connections_active",
|
||||||
|
"legendFormat": "Active Connections"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "WebSocket Messages",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(websocket_messages_total[5m])",
|
||||||
|
"legendFormat": "Messages/sec"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "Connection Errors",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(websocket_connection_errors_total[5m])",
|
||||||
|
"legendFormat": "Errors/sec"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"refresh": "5s"
|
||||||
|
}
|
||||||
|
}
|
||||||
280
monitoring/grafana/dashboards/worker-resources.json
Normal file
280
monitoring/grafana/dashboards/worker-resources.json
Normal file
|
|
@ -0,0 +1,280 @@
|
||||||
|
{
|
||||||
|
"id": null,
|
||||||
|
"title": "Worker Resources",
|
||||||
|
"tags": [
|
||||||
|
"worker",
|
||||||
|
"resources"
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "CPU Free",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_cpu_free",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 6,
|
||||||
|
"w": 6,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "CPU Total",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_cpu_total",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 6,
|
||||||
|
"w": 6,
|
||||||
|
"x": 6,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "CPU Utilization (%)",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "GPU Slots Free",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_gpu_slots_free",
|
||||||
|
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 6
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "Acquire Wait / Timeout (Totals)",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_acquire_wait_total",
|
||||||
|
"legendFormat": "wait {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_acquire_timeout_total",
|
||||||
|
"legendFormat": "timeout {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_acquire_total",
|
||||||
|
"legendFormat": "total {{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 8
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "Avg Acquire Wait (seconds)",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 6,
|
||||||
|
"w": 6,
|
||||||
|
"x": 0,
|
||||||
|
"y": 14
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"title": "Acquire Wait Ratio",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 6,
|
||||||
|
"w": 6,
|
||||||
|
"x": 6,
|
||||||
|
"y": 14
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"title": "Environment Prewarm Hit Rate (%)",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 6,
|
||||||
|
"w": 6,
|
||||||
|
"x": 12,
|
||||||
|
"y": 14
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"thresholds": {
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": 0},
|
||||||
|
{"color": "yellow", "value": 50},
|
||||||
|
{"color": "green", "value": 80}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"title": "Snapshot Prewarm Hit Rate (%)",
|
||||||
|
"type": "stat",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
||||||
|
"legendFormat": "{{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 6,
|
||||||
|
"w": 6,
|
||||||
|
"x": 18,
|
||||||
|
"y": 14
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent",
|
||||||
|
"thresholds": {
|
||||||
|
"steps": [
|
||||||
|
{"color": "red", "value": 0},
|
||||||
|
{"color": "yellow", "value": 50},
|
||||||
|
{"color": "green", "value": 80}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"title": "Prewarm Hits vs Misses",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
|
||||||
|
"legendFormat": "env hits {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
|
||||||
|
"legendFormat": "env misses {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
|
||||||
|
"legendFormat": "snapshot hits {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
|
||||||
|
"legendFormat": "snapshot misses {{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 20
|
||||||
|
},
|
||||||
|
"yAxes": [
|
||||||
|
{"unit": "reqps"}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"title": "Prewarm Build Time",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
|
||||||
|
"legendFormat": "env build {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
|
||||||
|
"legendFormat": "snapshot prewarm {{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 28
|
||||||
|
},
|
||||||
|
"yAxes": [
|
||||||
|
{"unit": "seconds"}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"title": "Prewarm Builds",
|
||||||
|
"type": "graph",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
|
||||||
|
"legendFormat": "env built {{worker_id}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
|
||||||
|
"legendFormat": "snapshots prewarmed {{worker_id}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 28
|
||||||
|
},
|
||||||
|
"yAxes": [
|
||||||
|
{"unit": "short"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"refresh": "5s"
|
||||||
|
}
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
|
|
||||||
providers:
|
providers:
|
||||||
- name: 'default'
|
- name: 'default'
|
||||||
orgId: 1
|
orgId: 1
|
||||||
|
|
|
||||||
9
monitoring/grafana/provisioning/datasources/loki.yml
Normal file
9
monitoring/grafana/provisioning/datasources/loki.yml
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
apiVersion: 1
|
||||||
|
datasources:
|
||||||
|
- name: Loki
|
||||||
|
type: loki
|
||||||
|
access: proxy
|
||||||
|
url: http://loki:3100
|
||||||
|
editable: true
|
||||||
|
jsonData:
|
||||||
|
maxLines: 1000
|
||||||
|
|
@ -1,16 +1,10 @@
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
|
|
||||||
datasources:
|
datasources:
|
||||||
- name: Prometheus
|
- name: Prometheus
|
||||||
type: prometheus
|
type: prometheus
|
||||||
access: proxy
|
access: proxy
|
||||||
url: http://prometheus:9090
|
url: http://prometheus:9090
|
||||||
isDefault: false
|
|
||||||
editable: false
|
|
||||||
|
|
||||||
- name: Loki
|
|
||||||
type: loki
|
|
||||||
access: proxy
|
|
||||||
url: http://loki:3100
|
|
||||||
isDefault: true
|
isDefault: true
|
||||||
editable: false
|
editable: true
|
||||||
|
jsonData:
|
||||||
|
timeInterval: "5s"
|
||||||
100
monitoring/health-testing.md
Normal file
100
monitoring/health-testing.md
Normal file
|
|
@ -0,0 +1,100 @@
|
||||||
|
# Testing Health Endpoints with Monitoring Stack
|
||||||
|
|
||||||
|
## Verify Health Endpoints
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Start the monitoring stack
|
||||||
|
cd deployments
|
||||||
|
docker-compose -f docker-compose.dev.yml up -d
|
||||||
|
|
||||||
|
# 2. Wait for services to start (30 seconds)
|
||||||
|
sleep 30
|
||||||
|
|
||||||
|
# 3. Test health endpoints
|
||||||
|
curl -k https://localhost:9101/health
|
||||||
|
# Expected: {"status":"healthy","timestamp":"...","checks":{}}
|
||||||
|
|
||||||
|
curl -k https://localhost:9101/health/live
|
||||||
|
# Expected: {"status":"alive","timestamp":"..."}
|
||||||
|
|
||||||
|
curl -k https://localhost:9101/health/ready
|
||||||
|
# Expected: {"status":"ready","timestamp":"...","checks":{"queue":"ok","experiments":"ok"}}
|
||||||
|
|
||||||
|
# 4. Check Docker health status
|
||||||
|
docker ps | grep api-server
|
||||||
|
# Should show: (healthy)
|
||||||
|
|
||||||
|
# 5. Access Grafana
|
||||||
|
open http://localhost:3000
|
||||||
|
# Login: admin / admin123
|
||||||
|
|
||||||
|
# 6. Access Prometheus
|
||||||
|
open http://localhost:9090
|
||||||
|
# Check targets: Status > Targets
|
||||||
|
# Should see: api-server, api-server-health
|
||||||
|
|
||||||
|
# 7. Query health metrics in Prometheus
|
||||||
|
# Go to Graph and enter: up{job="api-server-health"}
|
||||||
|
# Should show: value=1 (service is up)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Health Check Integration
|
||||||
|
|
||||||
|
### Docker Compose
|
||||||
|
The health check is configured in `deployments/docker-compose.dev.yml`:
|
||||||
|
```yaml
|
||||||
|
healthcheck:
|
||||||
|
test: [ "CMD", "curl", "-k", "https://localhost:9101/health" ]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 40s
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prometheus Monitoring
|
||||||
|
Prometheus scrapes health status every 30s from:
|
||||||
|
- `/health` - Overall service health
|
||||||
|
- `/metrics` - Future Prometheus metrics (when implemented)
|
||||||
|
|
||||||
|
### Kubernetes (Future)
|
||||||
|
Health endpoints ready for K8s probes:
|
||||||
|
```yaml
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health/live
|
||||||
|
port: 9101
|
||||||
|
scheme: HTTPS
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 10
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health/ready
|
||||||
|
port: 9101
|
||||||
|
scheme: HTTPS
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 5
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring Stack Services
|
||||||
|
|
||||||
|
- **Grafana** (port 3000): Dashboards and visualization
|
||||||
|
- **Prometheus** (port 9090): Metrics collection
|
||||||
|
- **Loki** (port 3100): Log aggregation
|
||||||
|
- **Promtail**: Log shipping
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check API server logs
|
||||||
|
docker logs ml-experiments-api
|
||||||
|
|
||||||
|
# Check Prometheus targets
|
||||||
|
curl http://localhost:9090/api/v1/targets
|
||||||
|
|
||||||
|
# Check health endpoint directly
|
||||||
|
docker exec ml-experiments-api curl -k https://localhost:9101/health
|
||||||
|
|
||||||
|
# Restart services
|
||||||
|
docker-compose -f deployments/docker-compose.dev.yml restart api-server
|
||||||
|
```
|
||||||
|
|
@ -12,7 +12,7 @@ common:
|
||||||
rules_directory: /loki/rules
|
rules_directory: /loki/rules
|
||||||
replication_factor: 1
|
replication_factor: 1
|
||||||
ring:
|
ring:
|
||||||
instance_addr: 127.0.0.1
|
instance_addr: 0.0.0.0
|
||||||
kvstore:
|
kvstore:
|
||||||
store: inmemory
|
store: inmemory
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,40 +0,0 @@
|
||||||
auth_enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
http_listen_port: 3100
|
|
||||||
|
|
||||||
ingester:
|
|
||||||
lifecycler:
|
|
||||||
address: 127.0.0.1
|
|
||||||
ring:
|
|
||||||
kvstore:
|
|
||||||
store: inmemory
|
|
||||||
replication_factor: 1
|
|
||||||
final_sleep: 0s
|
|
||||||
min_ready_duration: 0s
|
|
||||||
chunk_idle_period: 1h
|
|
||||||
max_chunk_age: 1h
|
|
||||||
chunk_target_size: 1048576
|
|
||||||
chunk_retain_period: 30s
|
|
||||||
|
|
||||||
schema_config:
|
|
||||||
configs:
|
|
||||||
- from: 2020-10-24
|
|
||||||
store: boltdb-shipper
|
|
||||||
object_store: filesystem
|
|
||||||
schema: v11
|
|
||||||
index:
|
|
||||||
prefix: index_
|
|
||||||
period: 24h
|
|
||||||
|
|
||||||
storage_config:
|
|
||||||
boltdb_shipper:
|
|
||||||
active_index_directory: /loki/boltdb-shipper-active
|
|
||||||
cache_location: /loki/boltdb-shipper-cache
|
|
||||||
filesystem:
|
|
||||||
directory: /loki/chunks
|
|
||||||
|
|
||||||
limits_config:
|
|
||||||
reject_old_samples: true
|
|
||||||
reject_old_samples_max_age: 168h
|
|
||||||
allow_structured_metadata: false
|
|
||||||
|
|
@ -5,39 +5,35 @@ global:
|
||||||
evaluation_interval: 15s
|
evaluation_interval: 15s
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
# API Server metrics
|
# API Server metrics and health
|
||||||
- job_name: 'api-server'
|
- job_name: 'api-server'
|
||||||
|
scheme: http
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['api-server:9100']
|
- targets: ['api-server:9101']
|
||||||
labels:
|
labels:
|
||||||
service: 'api-server'
|
service: 'api-server'
|
||||||
|
metrics_path: /metrics # Future: Prometheus metrics endpoint
|
||||||
|
|
||||||
# Worker metrics (if running in docker)
|
# Benchmark metrics from Pushgateway
|
||||||
|
- job_name: 'benchmark'
|
||||||
|
static_configs: []
|
||||||
|
|
||||||
|
# Worker metrics (ResourceManager + task execution)
|
||||||
|
# For docker-compose dev on macOS/Windows, Prometheus can reach a locally running worker
|
||||||
|
# via host.docker.internal.
|
||||||
- job_name: 'worker'
|
- job_name: 'worker'
|
||||||
|
scrape_interval: 15s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['worker:9100']
|
- targets: ['worker:9100']
|
||||||
labels:
|
labels:
|
||||||
service: 'worker'
|
service: 'worker'
|
||||||
# Allow failures if worker not running
|
target_type: 'container'
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__address__]
|
|
||||||
target_label: __param_target
|
|
||||||
- source_labels: [__param_target]
|
|
||||||
target_label: instance
|
|
||||||
|
|
||||||
# Benchmark metrics from Pushgateway
|
|
||||||
- job_name: 'benchmark'
|
|
||||||
static_configs:
|
|
||||||
- targets: ['localhost:9091']
|
|
||||||
labels:
|
|
||||||
service: 'benchmark'
|
|
||||||
metrics_path: /metrics
|
metrics_path: /metrics
|
||||||
honor_labels: true
|
|
||||||
|
|
||||||
# Loki metrics
|
# Loki metrics
|
||||||
- job_name: 'loki'
|
- job_name: 'loki'
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ['ml-experiments-loki:3100']
|
- targets: ['loki:3100']
|
||||||
labels:
|
labels:
|
||||||
service: 'loki'
|
service: 'loki'
|
||||||
metrics_path: /metrics
|
metrics_path: /metrics
|
||||||
|
|
@ -1,50 +0,0 @@
|
||||||
server:
|
|
||||||
http_listen_port: 9080
|
|
||||||
grpc_listen_port: 0
|
|
||||||
|
|
||||||
positions:
|
|
||||||
filename: /tmp/positions.yaml
|
|
||||||
|
|
||||||
clients:
|
|
||||||
- url: http://loki:3100/loki/api/v1/push
|
|
||||||
|
|
||||||
scrape_configs:
|
|
||||||
- job_name: fetchml-performance
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- localhost
|
|
||||||
labels:
|
|
||||||
job: fetchml-performance
|
|
||||||
__path__: /reports/performance.log
|
|
||||||
|
|
||||||
pipeline_stages:
|
|
||||||
- json:
|
|
||||||
expressions:
|
|
||||||
timestamp: timestamp
|
|
||||||
git_commit: git_commit
|
|
||||||
benchmark_name: name
|
|
||||||
time_per_op: time_per_op_ns
|
|
||||||
memory_per_op: memory_per_op_b
|
|
||||||
allocs_per_op: allocs_per_op
|
|
||||||
|
|
||||||
- labels:
|
|
||||||
benchmark_name:
|
|
||||||
git_commit:
|
|
||||||
|
|
||||||
- output:
|
|
||||||
source: output
|
|
||||||
|
|
||||||
- job_name: fetchml-performance-summary
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- localhost
|
|
||||||
labels:
|
|
||||||
job: fetchml-performance
|
|
||||||
__path__: /reports/performance_summary.log
|
|
||||||
|
|
||||||
pipeline_stages:
|
|
||||||
- regex:
|
|
||||||
expression: "=== Performance Summary ==="
|
|
||||||
|
|
||||||
- output:
|
|
||||||
source: output
|
|
||||||
|
|
@ -1,112 +0,0 @@
|
||||||
groups:
|
|
||||||
- name: security.rules
|
|
||||||
rules:
|
|
||||||
# High rate of failed authentication attempts
|
|
||||||
- alert: HighFailedAuthRate
|
|
||||||
expr: rate(failed_auth_total[5m]) > 10
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High rate of failed authentication attempts"
|
|
||||||
description: "More than 10 failed auth attempts per minute for the last 2 minutes"
|
|
||||||
|
|
||||||
# Potential brute force attack
|
|
||||||
- alert: BruteForceAttack
|
|
||||||
expr: rate(failed_auth_total[1m]) > 30
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Potential brute force attack detected"
|
|
||||||
description: "More than 30 failed auth attempts per minute"
|
|
||||||
|
|
||||||
# Unusual WebSocket connection patterns
|
|
||||||
- alert: UnusualWebSocketActivity
|
|
||||||
expr: rate(websocket_connections_total[5m]) > 100
|
|
||||||
for: 3m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Unusual WebSocket connection activity"
|
|
||||||
description: "WebSocket connection rate is unusually high"
|
|
||||||
|
|
||||||
# Rate limit breaches
|
|
||||||
- alert: RateLimitBreached
|
|
||||||
expr: rate(rate_limit_exceeded_total[5m]) > 5
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Rate limits being exceeded"
|
|
||||||
description: "Rate limit exceeded more than 5 times per minute"
|
|
||||||
|
|
||||||
# SSL certificate expiration warning
|
|
||||||
- alert: SSLCertificateExpiring
|
|
||||||
expr: ssl_certificate_expiry_days < 30
|
|
||||||
for: 1h
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "SSL certificate expiring soon"
|
|
||||||
description: "SSL certificate will expire in less than 30 days"
|
|
||||||
|
|
||||||
# High memory usage
|
|
||||||
- alert: HighMemoryUsage
|
|
||||||
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High memory usage detected"
|
|
||||||
description: "Memory usage is above 90%"
|
|
||||||
|
|
||||||
# High CPU usage
|
|
||||||
- alert: HighCPUUsage
|
|
||||||
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High CPU usage detected"
|
|
||||||
description: "CPU usage is above 80%"
|
|
||||||
|
|
||||||
# Disk space running low
|
|
||||||
- alert: LowDiskSpace
|
|
||||||
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Low disk space"
|
|
||||||
description: "Disk space is below 10%"
|
|
||||||
|
|
||||||
# Service down
|
|
||||||
- alert: ServiceDown
|
|
||||||
expr: up == 0
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Service is down"
|
|
||||||
description: "{{ $labels.instance }} service has been down for more than 1 minute"
|
|
||||||
|
|
||||||
# Unexpected error rates
|
|
||||||
- alert: HighErrorRate
|
|
||||||
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High error rate detected"
|
|
||||||
description: "Error rate is above 10%"
|
|
||||||
|
|
||||||
# Suspicious IP activity
|
|
||||||
- alert: SuspiciousIPActivity
|
|
||||||
expr: rate(requests_by_ip[5m]) > 1000
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Suspicious IP activity"
|
|
||||||
description: "IP address making unusually many requests"
|
|
||||||
|
|
@ -118,7 +118,7 @@ jupyter>=1.0.0
|
||||||
"allow_network": false,
|
"allow_network": false,
|
||||||
"blocked_packages": ["requests", "urllib3", "httpx"],
|
"blocked_packages": ["requests", "urllib3", "httpx"],
|
||||||
"max_execution_time": 3600,
|
"max_execution_time": 3600,
|
||||||
"gpu_access": true,
|
"gpu_devices": ["/dev/dri"],
|
||||||
"ml_env": "ml_env",
|
"ml_env": "ml_env",
|
||||||
"package_manager": "mamba"
|
"package_manager": "mamba"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,10 @@ RUN mamba install -n ml_env \
|
||||||
-c pytorch -c conda-forge -y && \
|
-c pytorch -c conda-forge -y && \
|
||||||
conda clean -afy
|
conda clean -afy
|
||||||
|
|
||||||
|
# Poetry (for pyproject.toml + poetry.lock projects)
|
||||||
|
RUN mamba install -n ml_env poetry -c conda-forge -y && \
|
||||||
|
conda clean -afy
|
||||||
|
|
||||||
# Copy security wrapper
|
# Copy security wrapper
|
||||||
COPY secure_runner.py /usr/local/bin/secure_runner.py
|
COPY secure_runner.py /usr/local/bin/secure_runner.py
|
||||||
COPY security_policy.json /etc/ml_runner/security_policy.json
|
COPY security_policy.json /etc/ml_runner/security_policy.json
|
||||||
|
|
|
||||||
|
|
@ -45,7 +45,7 @@ class SecurityPolicy:
|
||||||
],
|
],
|
||||||
"max_execution_time": 3600,
|
"max_execution_time": 3600,
|
||||||
"max_memory_gb": 16,
|
"max_memory_gb": 16,
|
||||||
"gpu_access": True,
|
"gpu_devices": ["/dev/dri"],
|
||||||
"allow_file_writes": True,
|
"allow_file_writes": True,
|
||||||
"resource_limits": {
|
"resource_limits": {
|
||||||
"cpu_count": 4,
|
"cpu_count": 4,
|
||||||
|
|
@ -106,97 +106,197 @@ class CondaRunner:
|
||||||
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
|
self.conda_prefix = os.environ.get("CONDA_PREFIX", "/opt/conda")
|
||||||
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
|
self.env_path = f"{self.conda_prefix}/envs/{self.conda_env}"
|
||||||
|
|
||||||
def setup_environment(self, requirements_file: Path) -> bool:
|
self.gpu_devices = self.security_policy.policy.get("gpu_devices", [])
|
||||||
"""Setup Conda environment with mamba"""
|
|
||||||
|
def setup_environment(self, deps_file: Path) -> bool:
|
||||||
|
"""Setup Conda environment based on a dependency manifest."""
|
||||||
try:
|
try:
|
||||||
# Read requirements
|
name = deps_file.name
|
||||||
with open(requirements_file, "r") as f:
|
|
||||||
requirements = [
|
|
||||||
line.strip()
|
|
||||||
for line in f
|
|
||||||
if line.strip() and not line.startswith("#")
|
|
||||||
]
|
|
||||||
|
|
||||||
# Check each package for security
|
print(f"[MANIFEST] Using dependency manifest: {name}")
|
||||||
for req in requirements:
|
|
||||||
package_name = (
|
|
||||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
|
||||||
)
|
|
||||||
if not self.security_policy.check_package_safety(package_name):
|
|
||||||
print(
|
|
||||||
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Install packages with mamba (super fast!)
|
if name in ("environment.yml", "environment.yaml"):
|
||||||
for req in requirements:
|
print(f"[SETUP] Applying conda environment file: {deps_file}")
|
||||||
package_name = (
|
cmd = [
|
||||||
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if already installed with conda
|
|
||||||
check_cmd = [
|
|
||||||
"conda",
|
|
||||||
"run",
|
|
||||||
"-n",
|
|
||||||
self.conda_env,
|
|
||||||
"python",
|
|
||||||
"-c",
|
|
||||||
f"import {package_name.replace('-', '_')}",
|
|
||||||
]
|
|
||||||
result = subprocess.run(
|
|
||||||
check_cmd, capture_output=True, text=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode == 0:
|
|
||||||
print(f"[OK] {package_name} already installed in conda env")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Try conda-forge first (faster and more reliable)
|
|
||||||
print(
|
|
||||||
f"[INSTALL] Installing {req} with {self.package_manager}..."
|
|
||||||
)
|
|
||||||
install_cmd = [
|
|
||||||
self.package_manager,
|
self.package_manager,
|
||||||
"install",
|
"env",
|
||||||
|
"update",
|
||||||
"-n",
|
"-n",
|
||||||
self.conda_env,
|
self.conda_env,
|
||||||
req,
|
"-f",
|
||||||
"-c",
|
str(deps_file),
|
||||||
"conda-forge",
|
|
||||||
"-y",
|
"-y",
|
||||||
]
|
]
|
||||||
result = subprocess.run(
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||||
install_cmd, capture_output=True, text=True, timeout=300
|
if result.returncode != 0:
|
||||||
|
print(f"[ERROR] Failed to apply environment file: {result.stderr}")
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name == "poetry.lock":
|
||||||
|
pyproject = self.workspace_dir / "pyproject.toml"
|
||||||
|
if not pyproject.exists():
|
||||||
|
print("[ERROR] poetry.lock provided but pyproject.toml is missing")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print(f"[SETUP] Installing dependencies from Poetry lockfile: {deps_file}")
|
||||||
|
env = os.environ.copy()
|
||||||
|
env.update(
|
||||||
|
{
|
||||||
|
"POETRY_VIRTUALENVS_CREATE": "false",
|
||||||
|
"POETRY_NO_INTERACTION": "1",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.returncode == 0:
|
# Ensure Poetry is available in the conda env.
|
||||||
print(f"[OK] Installed {req} with {self.package_manager}")
|
check = subprocess.run(
|
||||||
continue
|
["conda", "run", "-n", self.conda_env, "poetry", "--version"],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
)
|
||||||
|
if check.returncode != 0:
|
||||||
|
print("[ERROR] Poetry is not available in the container environment")
|
||||||
|
print(check.stderr)
|
||||||
|
return False
|
||||||
|
|
||||||
# Fallback to pip if conda fails
|
# Install into the conda env (no separate venv).
|
||||||
print(f"[FALLBACK] Trying pip for {req}...")
|
install = subprocess.run(
|
||||||
pip_cmd = [
|
[
|
||||||
|
"conda",
|
||||||
|
"run",
|
||||||
|
"-n",
|
||||||
|
self.conda_env,
|
||||||
|
"poetry",
|
||||||
|
"install",
|
||||||
|
"--no-ansi",
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=900,
|
||||||
|
cwd=str(self.workspace_dir),
|
||||||
|
env=env,
|
||||||
|
)
|
||||||
|
if install.returncode != 0:
|
||||||
|
print("[ERROR] Poetry install failed")
|
||||||
|
print(install.stderr)
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name == "pyproject.toml":
|
||||||
|
# Use pip's PEP517/pyproject support (no Poetry required).
|
||||||
|
# This installs the project itself; dependencies may be fetched as needed.
|
||||||
|
print(f"[SETUP] Installing project from pyproject.toml: {deps_file}")
|
||||||
|
cmd = [
|
||||||
"conda",
|
"conda",
|
||||||
"run",
|
"run",
|
||||||
"-n",
|
"-n",
|
||||||
self.conda_env,
|
self.conda_env,
|
||||||
"pip",
|
"pip",
|
||||||
"install",
|
"install",
|
||||||
req,
|
str(self.workspace_dir),
|
||||||
"--no-cache-dir",
|
"--no-cache-dir",
|
||||||
]
|
]
|
||||||
result = subprocess.run(
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
|
||||||
pip_cmd, capture_output=True, text=True, timeout=300
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
print(f"[ERROR] Failed to install {req}: {result.stderr}")
|
print(f"[ERROR] Failed to install project from pyproject.toml: {result.stderr}")
|
||||||
return False
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
print(f"[OK] Installed {req} with pip")
|
if name == "requirements.txt":
|
||||||
|
# Read requirements
|
||||||
|
with open(deps_file, "r") as f:
|
||||||
|
requirements = [
|
||||||
|
line.strip()
|
||||||
|
for line in f
|
||||||
|
if line.strip() and not line.startswith("#")
|
||||||
|
]
|
||||||
|
|
||||||
return True
|
# Check each package for security
|
||||||
|
for req in requirements:
|
||||||
|
package_name = (
|
||||||
|
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||||
|
)
|
||||||
|
if not self.security_policy.check_package_safety(package_name):
|
||||||
|
print(
|
||||||
|
f"[SECURITY] Package '{package_name}' is blocked for security reasons"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Install packages with mamba (super fast!)
|
||||||
|
for req in requirements:
|
||||||
|
package_name = (
|
||||||
|
req.split("==")[0].split(">=")[0].split("<=")[0].strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if already installed with conda
|
||||||
|
check_cmd = [
|
||||||
|
"conda",
|
||||||
|
"run",
|
||||||
|
"-n",
|
||||||
|
self.conda_env,
|
||||||
|
"python",
|
||||||
|
"-c",
|
||||||
|
f"import {package_name.replace('-', '_')}",
|
||||||
|
]
|
||||||
|
result = subprocess.run(
|
||||||
|
check_cmd, capture_output=True, text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode == 0:
|
||||||
|
print(f"[OK] {package_name} already installed in conda env")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Try conda-forge first (faster and more reliable)
|
||||||
|
print(
|
||||||
|
f"[INSTALL] Installing {req} with {self.package_manager}..."
|
||||||
|
)
|
||||||
|
install_cmd = [
|
||||||
|
self.package_manager,
|
||||||
|
"install",
|
||||||
|
"-n",
|
||||||
|
self.conda_env,
|
||||||
|
req,
|
||||||
|
"-c",
|
||||||
|
"conda-forge",
|
||||||
|
"-y",
|
||||||
|
]
|
||||||
|
result = subprocess.run(
|
||||||
|
install_cmd, capture_output=True, text=True, timeout=300
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode == 0:
|
||||||
|
print(f"[OK] Installed {req} with {self.package_manager}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Fallback to pip if conda fails
|
||||||
|
print(f"[FALLBACK] Trying pip for {req}...")
|
||||||
|
pip_cmd = [
|
||||||
|
"conda",
|
||||||
|
"run",
|
||||||
|
"-n",
|
||||||
|
self.conda_env,
|
||||||
|
"pip",
|
||||||
|
"install",
|
||||||
|
req,
|
||||||
|
"--no-cache-dir",
|
||||||
|
]
|
||||||
|
result = subprocess.run(
|
||||||
|
pip_cmd, capture_output=True, text=True, timeout=300
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"[ERROR] Failed to install {req}: {result.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print(f"[OK] Installed {req} with pip")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
print(f"[ERROR] Unsupported dependency manifest: {deps_file}")
|
||||||
|
print("Supported: environment.yml, environment.yaml, poetry.lock (requires pyproject.toml), pyproject.toml, requirements.txt")
|
||||||
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[ERROR] Environment setup failed: {e}")
|
print(f"[ERROR] Environment setup failed: {e}")
|
||||||
|
|
@ -217,7 +317,7 @@ class CondaRunner:
|
||||||
env.update(
|
env.update(
|
||||||
{
|
{
|
||||||
"CONDA_DEFAULT_ENV": self.conda_env,
|
"CONDA_DEFAULT_ENV": self.conda_env,
|
||||||
"CUDA_VISIBLE_DEVICES": "0", # Allow GPU access
|
"CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", ""), # Allow GPU access
|
||||||
"SECURE_MODE": "1",
|
"SECURE_MODE": "1",
|
||||||
"NETWORK_ACCESS": (
|
"NETWORK_ACCESS": (
|
||||||
"1"
|
"1"
|
||||||
|
|
@ -280,7 +380,7 @@ class CondaRunner:
|
||||||
"stdout": stdout,
|
"stdout": stdout,
|
||||||
"stderr": stderr,
|
"stderr": stderr,
|
||||||
"return_code": process.returncode,
|
"return_code": process.returncode,
|
||||||
"gpu_accessible": True,
|
"gpu_accessible": len(self.gpu_devices) > 0,
|
||||||
"security_mode": "enabled",
|
"security_mode": "enabled",
|
||||||
"container_type": "conda",
|
"container_type": "conda",
|
||||||
"conda_env": self.conda_env,
|
"conda_env": self.conda_env,
|
||||||
|
|
@ -338,8 +438,12 @@ def main():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--workspace", default="/workspace", help="Workspace directory"
|
"--workspace", default="/workspace", help="Workspace directory"
|
||||||
)
|
)
|
||||||
parser.add_argument("--requirements", help="Requirements file path")
|
parser.add_argument("--deps", help="Dependency manifest path (environment.yml | poetry.lock | pyproject.toml | requirements.txt)")
|
||||||
|
parser.add_argument("--requirements", help="Deprecated alias for --deps")
|
||||||
parser.add_argument("--script", help="Training script path")
|
parser.add_argument("--script", help="Training script path")
|
||||||
|
parser.add_argument(
|
||||||
|
"--prepare-only", action="store_true", help="Only prepare dependencies and exit"
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--args",
|
"--args",
|
||||||
nargs=argparse.REMAINDER,
|
nargs=argparse.REMAINDER,
|
||||||
|
|
@ -383,17 +487,26 @@ def main():
|
||||||
if args.check_gpu:
|
if args.check_gpu:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
deps_arg = args.deps or args.requirements
|
||||||
|
if not deps_arg:
|
||||||
|
print("[ERROR] Missing dependency manifest. Provide --deps.")
|
||||||
|
return 1
|
||||||
|
|
||||||
# Setup environment
|
# Setup environment
|
||||||
requirements_path = Path(args.requirements)
|
deps_path = Path(deps_arg)
|
||||||
if not requirements_path.exists():
|
if not deps_path.exists():
|
||||||
print(f"[ERROR] Requirements file not found: {requirements_path}")
|
print(f"[ERROR] Dependency manifest not found: {deps_path}")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
print("[SETUP] Setting up secure environment...")
|
print("[SETUP] Setting up secure environment...")
|
||||||
if not runner.setup_environment(requirements_path):
|
if not runner.setup_environment(deps_path):
|
||||||
print("[ERROR] Failed to setup secure environment")
|
print("[ERROR] Failed to setup secure environment")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
if args.prepare_only:
|
||||||
|
print("[DONE] Environment prepared successfully")
|
||||||
|
return 0
|
||||||
|
|
||||||
# Run experiment
|
# Run experiment
|
||||||
script_path = Path(args.script)
|
script_path = Path(args.script)
|
||||||
if not script_path.exists():
|
if not script_path.exists():
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@
|
||||||
],
|
],
|
||||||
"max_execution_time": 3600,
|
"max_execution_time": 3600,
|
||||||
"max_memory_gb": 16,
|
"max_memory_gb": 16,
|
||||||
"gpu_access": true,
|
"gpu_devices": ["/dev/dri"],
|
||||||
"allow_file_writes": true,
|
"allow_file_writes": true,
|
||||||
"resource_limits": {
|
"resource_limits": {
|
||||||
"cpu_count": 4,
|
"cpu_count": 4,
|
||||||
|
|
|
||||||
|
|
@ -20,19 +20,12 @@ This directory contains setup and utility scripts for FetchML.
|
||||||
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group
|
sudo ./scripts/setup-prod.sh /data/ml-experiments ml-user ml-group
|
||||||
```
|
```
|
||||||
|
|
||||||
### `validate-prod-config.sh`
|
### Configuration validation
|
||||||
**Purpose**: Validates production configuration files
|
Validate configs using the built-in config lint targets:
|
||||||
**Usage**: `./scripts/validate-prod-config.sh [api-config] [worker-config]`
|
|
||||||
**What it does**:
|
|
||||||
- Checks config file syntax
|
|
||||||
- Verifies base_path consistency
|
|
||||||
- Tests Redis connectivity
|
|
||||||
- Validates Podman setup
|
|
||||||
- Checks directory permissions
|
|
||||||
|
|
||||||
**Example**:
|
|
||||||
```bash
|
```bash
|
||||||
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml
|
make configlint
|
||||||
|
make worker-configlint
|
||||||
```
|
```
|
||||||
|
|
||||||
## Legacy Setup Scripts (Deprecated)
|
## Legacy Setup Scripts (Deprecated)
|
||||||
|
|
@ -44,12 +37,11 @@ The following scripts are from earlier iterations and are **deprecated** in favo
|
||||||
- `auto_setup.sh` - Old automated setup (superseded)
|
- `auto_setup.sh` - Old automated setup (superseded)
|
||||||
- `setup_common.sh` - Common functions (integrated into setup-prod.sh)
|
- `setup_common.sh` - Common functions (integrated into setup-prod.sh)
|
||||||
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
|
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
|
||||||
- `test_tools.sh` - Tool testing (integrated into validate-prod-config.sh)
|
|
||||||
|
|
||||||
### Cleanup Recommendation
|
### Cleanup Recommendation
|
||||||
These legacy scripts can be removed or archived. The current production setup only needs:
|
These legacy scripts can be removed or archived. The current production setup only needs:
|
||||||
- `setup-prod.sh`
|
- `setup-prod.sh`
|
||||||
- `validate-prod-config.sh`
|
|
||||||
|
|
||||||
## Usage Workflow
|
## Usage Workflow
|
||||||
|
|
||||||
|
|
@ -59,8 +51,8 @@ These legacy scripts can be removed or archived. The current production setup on
|
||||||
sudo ./scripts/setup-prod.sh
|
sudo ./scripts/setup-prod.sh
|
||||||
|
|
||||||
# 2. Copy and configure
|
# 2. Copy and configure
|
||||||
sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml
|
sudo cp configs/api/prod.yaml /etc/fetch_ml/config.yaml
|
||||||
sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml
|
sudo cp configs/workers/worker-prod.toml /etc/fetch_ml/worker.toml
|
||||||
sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc.
|
sudo vim /etc/fetch_ml/config.yaml # Update API keys, etc.
|
||||||
|
|
||||||
# 3. Build and install
|
# 3. Build and install
|
||||||
|
|
@ -68,7 +60,8 @@ make prod
|
||||||
sudo make install
|
sudo make install
|
||||||
|
|
||||||
# 4. Validate
|
# 4. Validate
|
||||||
./scripts/validate-prod-config.sh /etc/fetch_ml/config.yaml /etc/fetch_ml/worker.toml
|
./bin/configlint --schema configs/schema/api_server_config.yaml /etc/fetch_ml/config.yaml
|
||||||
|
./bin/configlint --schema configs/schema/worker_config_schema.yaml /etc/fetch_ml/worker.toml
|
||||||
|
|
||||||
# 5. Start services
|
# 5. Start services
|
||||||
sudo systemctl start fetchml-api fetchml-worker
|
sudo systemctl start fetchml-api fetchml-worker
|
||||||
|
|
@ -82,7 +75,7 @@ docker-compose up -d
|
||||||
|
|
||||||
# Or run components directly
|
# Or run components directly
|
||||||
make dev
|
make dev
|
||||||
./bin/api-server -config configs/config-local.yaml
|
./bin/api-server -config configs/api/dev.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
## Script Maintenance
|
## Script Maintenance
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ set -e
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||||
|
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||||
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
|
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
|
||||||
RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP"
|
RUN_DIR="$LOCAL_ARTIFACTS_DIR/run_$TIMESTAMP"
|
||||||
|
|
||||||
|
|
@ -168,14 +169,25 @@ if [ -f "$SCRIPT_DIR/cleanup-benchmarks.sh" ]; then
|
||||||
"$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks
|
"$SCRIPT_DIR/cleanup-benchmarks.sh" benchmarks
|
||||||
else
|
else
|
||||||
# Fallback cleanup if script not available
|
# Fallback cleanup if script not available
|
||||||
echo "Cleaning old benchmark runs (keeping last 10)..."
|
echo "Archiving old benchmark runs (keeping last 10)..."
|
||||||
|
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
cd "$LOCAL_ARTIFACTS_DIR"
|
cd "$LOCAL_ARTIFACTS_DIR"
|
||||||
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || echo "No old runs to clean"
|
ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
|
||||||
|
[ -n "$run" ] || continue
|
||||||
|
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
|
||||||
# Clean temporary files
|
# Clean temporary files
|
||||||
echo "Cleaning temporary files..."
|
echo "Archiving temporary files..."
|
||||||
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
|
||||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
mkdir -p "$tmp_archive_dir"
|
||||||
|
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
|
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
|
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
|
||||||
# Clean Go build cache
|
# Clean Go build cache
|
||||||
echo "Cleaning Go build cache..."
|
echo "Cleaning Go build cache..."
|
||||||
|
|
|
||||||
|
|
@ -1,49 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Create a Bitwarden item for a FetchML API user.
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
# ./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash>
|
|
||||||
#
|
|
||||||
# Requirements:
|
|
||||||
# - Bitwarden CLI (bw) installed
|
|
||||||
# - You are logged in and unlocked (bw login; bw unlock)
|
|
||||||
# - jq installed
|
|
||||||
#
|
|
||||||
# This script does NOT run on the homelab server. Run it from your
|
|
||||||
# own machine where you manage Bitwarden.
|
|
||||||
|
|
||||||
if [[ $# -ne 3 ]]; then
|
|
||||||
echo "Usage: $0 <username> <api_key> <api_key_hash>" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
USER_NAME="$1"
|
|
||||||
API_KEY="$2"
|
|
||||||
API_KEY_HASH="$3"
|
|
||||||
|
|
||||||
ITEM_NAME="FetchML API $USER_NAME"
|
|
||||||
|
|
||||||
# Get base item template
|
|
||||||
TEMPLATE_JSON=$(bw get template item)
|
|
||||||
|
|
||||||
# Build item JSON with jq
|
|
||||||
ITEM_JSON=$(echo "$TEMPLATE_JSON" | jq \
|
|
||||||
--arg name "$ITEM_NAME" \
|
|
||||||
--arg username "$USER_NAME" \
|
|
||||||
--arg password "$API_KEY" \
|
|
||||||
--arg hash "$API_KEY_HASH" \
|
|
||||||
'.name = $name
|
|
||||||
| .login.username = $username
|
|
||||||
| .login.password = $password
|
|
||||||
| .notes = "FetchML API key for user " + $username
|
|
||||||
| .fields = [{"name":"api_key_hash","value":$hash,"type":1}]')
|
|
||||||
|
|
||||||
# Create item in Bitwarden
|
|
||||||
# If you ever want to edit instead, you can capture the ID from this call
|
|
||||||
# and use: bw edit item <id> <json>
|
|
||||||
|
|
||||||
echo "$ITEM_JSON" | bw encode | bw create item
|
|
||||||
|
|
||||||
echo "Created Bitwarden item: $ITEM_NAME"
|
|
||||||
|
|
@ -1,90 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Setup auto-cleanup service for fetch_ml
|
|
||||||
# This creates a systemd timer that runs cleanup daily
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
|
||||||
|
|
||||||
# Colors
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
BLUE='\033[0;34m'
|
|
||||||
NC='\033[0m'
|
|
||||||
|
|
||||||
log_info() {
|
|
||||||
echo -e "${BLUE}[INFO]${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
log_success() {
|
|
||||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
log_info "Setting up auto-cleanup service..."
|
|
||||||
|
|
||||||
# Check if running on macOS or Linux
|
|
||||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
|
||||||
log_info "Detected macOS - setting up launchd agent"
|
|
||||||
|
|
||||||
# Create launchd plist
|
|
||||||
cat > ~/Library/LaunchAgents/com.fetchml.cleanup.plist << EOF
|
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
|
||||||
<plist version="1.0">
|
|
||||||
<dict>
|
|
||||||
<key>Label</key>
|
|
||||||
<string>com.fetchml.cleanup</string>
|
|
||||||
<key>ProgramArguments</key>
|
|
||||||
<array>
|
|
||||||
<string>$PROJECT_DIR/scripts/cleanup.sh</string>
|
|
||||||
<string>--force</string>
|
|
||||||
</array>
|
|
||||||
<key>StartInterval</key>
|
|
||||||
<integer>86400</integer>
|
|
||||||
<key>RunAtLoad</key>
|
|
||||||
<false/>
|
|
||||||
<key>StandardOutPath</key>
|
|
||||||
<string>/tmp/fetchml-cleanup.log</string>
|
|
||||||
<key>StandardErrorPath</key>
|
|
||||||
<string>/tmp/fetchml-cleanup.error.log</string>
|
|
||||||
</dict>
|
|
||||||
</plist>
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Load the launchd agent
|
|
||||||
launchctl load ~/Library/LaunchAgents/com.fetchml.cleanup.plist
|
|
||||||
|
|
||||||
log_success "Auto-cleanup service installed for macOS"
|
|
||||||
log_info "Logs will be in /tmp/fetchml-cleanup.log"
|
|
||||||
|
|
||||||
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
|
|
||||||
log_info "Detected Linux - setting up systemd timer"
|
|
||||||
|
|
||||||
# Copy service files
|
|
||||||
sudo cp "$SCRIPT_DIR/auto-cleanup.service" /etc/systemd/system/
|
|
||||||
sudo cp "$SCRIPT_DIR/auto-cleanup.timer" /etc/systemd/system/
|
|
||||||
|
|
||||||
# Reload systemd and enable timer
|
|
||||||
sudo systemctl daemon-reload
|
|
||||||
sudo systemctl enable auto-cleanup.timer
|
|
||||||
sudo systemctl start auto-cleanup.timer
|
|
||||||
|
|
||||||
log_success "Auto-cleanup service installed for Linux"
|
|
||||||
log_info "Check status with: systemctl status auto-cleanup.timer"
|
|
||||||
|
|
||||||
else
|
|
||||||
echo "Unsupported OS: $OSTYPE"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
log_info "Auto-cleanup will run daily"
|
|
||||||
log_info "To uninstall:"
|
|
||||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
|
||||||
echo " launchctl unload ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
|
|
||||||
echo " rm ~/Library/LaunchAgents/com.fetchml.cleanup.plist"
|
|
||||||
else
|
|
||||||
echo " sudo systemctl stop auto-cleanup.timer"
|
|
||||||
echo " sudo systemctl disable auto-cleanup.timer"
|
|
||||||
echo " sudo rm /etc/systemd/system/auto-cleanup.*"
|
|
||||||
fi
|
|
||||||
|
|
@ -1,275 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# Production Monitoring Stack Setup for Linux
|
|
||||||
# Deploys Prometheus/Grafana/Loki/Promtail as Podman containers with systemd
|
|
||||||
# Compatible with: Rocky/RHEL/CentOS, Ubuntu/Debian, Arch, SUSE, etc.
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
BOLD='\033[1m'
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
BLUE='\033[0;34m'
|
|
||||||
YELLOW='\033[0;33m'
|
|
||||||
NC='\033[0m'
|
|
||||||
|
|
||||||
echo -e "${BOLD}=== FetchML Monitoring Stack Setup (Linux) ===${NC}\n"
|
|
||||||
|
|
||||||
# Detect Linux distribution and package manager
|
|
||||||
detect_distro() {
|
|
||||||
if [ -f /etc/os-release ]; then
|
|
||||||
. /etc/os-release
|
|
||||||
DISTRO=$ID
|
|
||||||
DISTRO_VERSION=$VERSION_ID
|
|
||||||
elif [ -f /etc/redhat-release ]; then
|
|
||||||
DISTRO="rhel"
|
|
||||||
else
|
|
||||||
DISTRO="unknown"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Detect package manager
|
|
||||||
if command -v dnf &>/dev/null; then
|
|
||||||
PKG_MANAGER="dnf"
|
|
||||||
elif command -v yum &>/dev/null; then
|
|
||||||
PKG_MANAGER="yum"
|
|
||||||
elif command -v apt-get &>/dev/null; then
|
|
||||||
PKG_MANAGER="apt"
|
|
||||||
elif command -v pacman &>/dev/null; then
|
|
||||||
PKG_MANAGER="pacman"
|
|
||||||
elif command -v zypper &>/dev/null; then
|
|
||||||
PKG_MANAGER="zypper"
|
|
||||||
else
|
|
||||||
echo -e "${YELLOW}Warning: No known package manager found${NC}"
|
|
||||||
PKG_MANAGER="unknown"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Detected distribution: $DISTRO (using $PKG_MANAGER)"
|
|
||||||
}
|
|
||||||
|
|
||||||
detect_distro
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
DATA_PATH="${1:-/data/monitoring}"
|
|
||||||
ML_USER="${2:-ml-user}"
|
|
||||||
ML_GROUP="${3:-ml-group}"
|
|
||||||
|
|
||||||
echo "Configuration:"
|
|
||||||
echo " Monitoring data path: $DATA_PATH"
|
|
||||||
echo " User: $ML_USER"
|
|
||||||
echo " Group: $ML_GROUP"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Create pod for monitoring stack
|
|
||||||
POD_NAME="monitoring"
|
|
||||||
|
|
||||||
# 1. Create directories
|
|
||||||
echo -e "${BLUE}[1/6]${NC} Creating directory structure..."
|
|
||||||
sudo mkdir -p "${DATA_PATH}"/{prometheus,grafana,loki,promtail-config}
|
|
||||||
sudo mkdir -p /etc/fetch_ml/monitoring
|
|
||||||
sudo mkdir -p /var/lib/grafana/dashboards
|
|
||||||
|
|
||||||
sudo chown -R $ML_USER:$ML_GROUP $DATA_PATH
|
|
||||||
sudo chmod 755 $DATA_PATH
|
|
||||||
|
|
||||||
echo -e "${GREEN}✓${NC} Directories created"
|
|
||||||
|
|
||||||
# 2. Copy configuration files
|
|
||||||
echo -e "${BLUE}[2/6]${NC} Copying configuration files..."
|
|
||||||
sudo cp monitoring/prometheus.yml /etc/fetch_ml/monitoring/
|
|
||||||
sudo cp monitoring/loki-config.yml /etc/fetch_ml/monitoring/
|
|
||||||
sudo cp monitoring/promtail-config.yml /etc/fetch_ml/monitoring/
|
|
||||||
sudo cp monitoring/grafana/provisioning /etc/fetch_ml/monitoring/ -r
|
|
||||||
sudo cp monitoring/grafana-dashboard.json /var/lib/grafana/dashboards/ml-queue.json
|
|
||||||
sudo cp monitoring/logs-dashboard.json /var/lib/grafana/dashboards/logs.json
|
|
||||||
|
|
||||||
sudo chown -R $ML_USER:$ML_GROUP /etc/fetch_ml/monitoring
|
|
||||||
sudo chown -R $ML_USER:$ML_GROUP /var/lib/grafana
|
|
||||||
|
|
||||||
echo -e "${GREEN}✓${NC} Configuration copied"
|
|
||||||
|
|
||||||
# 3. Create Podman pod
|
|
||||||
echo -e "${BLUE}[3/6]${NC} Creating Podman pod..."
|
|
||||||
sudo -u $ML_USER podman pod create \\
|
|
||||||
--name $POD_NAME \\
|
|
||||||
-p 3000:3000 \\
|
|
||||||
-p 9090:9090 \\
|
|
||||||
-p 3100:3100 \\
|
|
||||||
|| echo "Pod may already exist"
|
|
||||||
|
|
||||||
echo -e "${GREEN}✓${NC} Pod created"
|
|
||||||
|
|
||||||
# 4. Create systemd service for monitoring pod
|
|
||||||
echo -e "${BLUE}[4/6]${NC} Creating systemd services..."
|
|
||||||
|
|
||||||
# Prometheus service
|
|
||||||
sudo tee /etc/systemd/system/prometheus.service >/dev/null <<EOF
|
|
||||||
[Unit]
|
|
||||||
Description=Prometheus Monitoring
|
|
||||||
After=network.target
|
|
||||||
PartOf=$POD_NAME-pod.service
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=simple
|
|
||||||
User=$ML_USER
|
|
||||||
Group=$ML_GROUP
|
|
||||||
Restart=always
|
|
||||||
RestartSec=10
|
|
||||||
|
|
||||||
ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 9090:9090
|
|
||||||
ExecStart=/usr/bin/podman run --rm --name prometheus \\
|
|
||||||
--pod $POD_NAME \\
|
|
||||||
-v /etc/fetch_ml/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro \\
|
|
||||||
-v ${DATA_PATH}/prometheus:/prometheus \\
|
|
||||||
docker.io/prom/prometheus:latest \\
|
|
||||||
--config.file=/etc/prometheus/prometheus.yml \\
|
|
||||||
--storage.tsdb.path=/prometheus \\
|
|
||||||
--web.enable-lifecycle
|
|
||||||
|
|
||||||
ExecStop=/usr/bin/podman stop -t 10 prometheus
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Loki service -------------------------------------------------------------
# Unit content is written verbatim; shell variables ($POD_NAME, $ML_USER,
# $ML_GROUP, $DATA_PATH) expand now, at install time.  "\\" at end of a
# heredoc line emits a literal "\" so systemd sees a line continuation.
sudo tee /etc/systemd/system/loki.service >/dev/null <<EOF
[Unit]
Description=Loki Log Aggregation
After=network.target
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 3100:3100
ExecStart=/usr/bin/podman run --rm --name loki \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/loki-config.yml:/etc/loki/local-config.yaml:ro \\
    -v ${DATA_PATH}/loki:/loki \\
    docker.io/grafana/loki:latest \\
    -config.file=/etc/loki/local-config.yaml

ExecStop=/usr/bin/podman stop -t 10 loki

[Install]
WantedBy=multi-user.target
EOF

# Grafana service -----------------------------------------------------------
# Resolve the admin password ONCE, here, so it can be baked into the unit
# file AND reported to the operator.  The previous version escaped the outer
# expansion but not the inner $(openssl ...), producing a literal, unusable
# "\${GRAFANA_ADMIN_PASSWORD:-<random>}" string in the unit file and a
# password nobody ever saw.
GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-$(openssl rand -base64 32)}"
sudo tee /etc/systemd/system/grafana.service >/dev/null <<EOF
[Unit]
Description=Grafana Visualization
After=network.target prometheus.service loki.service
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME -p 3000:3000
ExecStart=/usr/bin/podman run --rm --name grafana \\
    --pod $POD_NAME \\
    -v ${DATA_PATH}/grafana:/var/lib/grafana \\
    -v /etc/fetch_ml/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro \\
    -v /var/lib/grafana/dashboards:/var/lib/grafana/dashboards:ro \\
    -e GF_SECURITY_ADMIN_PASSWORD=$GRAFANA_ADMIN_PASSWORD \\
    -e GF_USERS_ALLOW_SIGN_UP=false \\
    -e GF_AUTH_ANONYMOUS_ENABLED=false \\
    docker.io/grafana/grafana:latest

ExecStop=/usr/bin/podman stop -t 10 grafana

[Install]
WantedBy=multi-user.target
EOF
# The unit file now embeds a secret; keep it root-only readable.
sudo chmod 600 /etc/systemd/system/grafana.service

# Promtail service ----------------------------------------------------------
# NOTE(review): this ExecStartPre creates the pod WITHOUT port mappings; if
# promtail happens to start first, the pod will lack ports 3000/3100/9090.
# Start order below (prometheus/loki first) avoids this in practice.
sudo tee /etc/systemd/system/promtail.service >/dev/null <<EOF
[Unit]
Description=Promtail Log Collector
After=network.target loki.service
PartOf=$POD_NAME-pod.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
Restart=always
RestartSec=10

ExecStartPre=/usr/bin/podman pod exists $POD_NAME || /usr/bin/podman pod create --name $POD_NAME
ExecStart=/usr/bin/podman run --rm --name promtail \\
    --pod $POD_NAME \\
    -v /etc/fetch_ml/monitoring/promtail-config.yml:/etc/promtail/config.yml:ro \\
    -v /var/log/fetch_ml:/var/log/app:ro \\
    docker.io/grafana/promtail:latest \\
    -config.file=/etc/promtail/config.yml

ExecStop=/usr/bin/podman stop -t 10 promtail

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"

# 5. Create monitoring pod service
echo -e "${BLUE}[5/6]${NC} Creating pod management service..."
sudo -u $ML_USER podman generate systemd --new --name $POD_NAME \
    | sudo tee /etc/systemd/system/$POD_NAME-pod.service >/dev/null

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Pod service created"

# 6. Setup firewall rules
echo -e "${BLUE}[6/6]${NC} Configuring firewall..."
if command -v firewall-cmd &>/dev/null; then
    # RHEL/Rocky/Fedora (firewalld)
    sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana
    sudo firewall-cmd --permanent --add-port=9090/tcp # Prometheus
    sudo firewall-cmd --reload
    echo -e "${GREEN}✓${NC} Firewall configured (firewalld)"
elif command -v ufw &>/dev/null; then
    # Ubuntu/Debian (ufw)
    sudo ufw allow 3000/tcp comment 'Grafana'
    sudo ufw allow 9090/tcp comment 'Prometheus'
    echo -e "${GREEN}✓${NC} Firewall configured (ufw)"
else
    echo -e "${YELLOW}!${NC} No firewall detected. You may need to manually open ports 3000 and 9090"
fi

# Summary
echo ""
echo -e "${BOLD}=== Monitoring Stack Setup Complete! ===${NC}"
echo ""
echo "Services created:"
echo " - prometheus.service (Metrics collection)"
echo " - loki.service (Log aggregation)"
echo " - grafana.service (Visualization)"
echo " - promtail.service (Log shipping)"
echo ""
echo -e "${BOLD}Next steps:${NC}"
echo "1. Start services:"
echo " sudo systemctl start prometheus"
echo " sudo systemctl start loki"
echo " sudo systemctl start promtail"
echo " sudo systemctl start grafana"
echo ""
echo "2. Enable on boot:"
echo " sudo systemctl enable prometheus loki promtail grafana"
echo ""
echo "3. Access Grafana:"
echo " http://YOUR_SERVER_IP:3000"
echo " Username: admin"
# Report the real generated password instead of the old, wrong "admin" hint.
echo " Password: $GRAFANA_ADMIN_PASSWORD (generated above; override via GRAFANA_ADMIN_PASSWORD)"
echo ""
echo "4. Check logs:"
echo " sudo journalctl -u prometheus -f"
echo " sudo journalctl -u grafana -f"
echo ""
@ -1,229 +0,0 @@
|
||||||
#!/bin/bash
# Production Setup Script for Rocky Linux (Bare Metal)
# This script sets up the complete FetchML environment on bare metal
#
# Usage: setup.sh [BASE_PATH] [ML_USER] [ML_GROUP]

set -e

BOLD='\033[1m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
NC='\033[0m'

echo -e "${BOLD}=== FetchML Production Setup (Rocky Linux Bare Metal) ===${NC}\n"

# Positional parameters override the defaults below.
BASE_PATH="${1:-/data/ml-experiments}"
ML_USER="${2:-ml-user}"
ML_GROUP="${3:-ml-group}"

echo "Configuration:"
echo " Base path: $BASE_PATH"
echo " ML user: $ML_USER"
echo " ML group: $ML_GROUP"
echo ""

# 1. Create system user if it doesn't exist
echo -e "${BLUE}[1/8]${NC} Creating system user..."
if id "$ML_USER" &>/dev/null; then
    echo " User $ML_USER already exists"
else
    sudo useradd -r -s /bin/bash -m -d /home/$ML_USER -c "ML System User" $ML_USER
    echo -e "${GREEN}✓${NC} Created user: $ML_USER"
fi

# 2. Create directory structure (experiment state dirs, logs, config)
echo -e "${BLUE}[2/8]${NC} Creating directory structure..."
sudo mkdir -p \
    "${BASE_PATH}"/{experiments,pending,running,finished,failed,datasets} \
    /var/log/fetch_ml \
    /etc/fetch_ml

echo -e "${GREEN}✓${NC} Created directories:"
for sub in experiments pending running finished failed datasets; do
    echo " $BASE_PATH/$sub/"
done
echo " /var/log/fetch_ml/"
echo " /etc/fetch_ml/"

# 3. Set ownership and permissions
echo -e "${BLUE}[3/8]${NC} Setting permissions..."
sudo chown -R $ML_USER:$ML_GROUP $BASE_PATH
sudo chmod 755 $BASE_PATH
sudo chmod 700 $BASE_PATH/experiments # Restrict experiment data

sudo chown -R $ML_USER:$ML_GROUP /var/log/fetch_ml
sudo chmod 755 /var/log/fetch_ml

echo -e "${GREEN}✓${NC} Permissions set"
# 4. Install system dependencies (Rocky Linux)
echo -e "${BLUE}[4/8]${NC} Installing system dependencies..."
sudo dnf install -y \
    golang \
    podman \
    redis \
    git \
    make \
    gcc \
    || echo "Some packages may already be installed"

echo -e "${GREEN}✓${NC} Dependencies installed"

# 5. Configure Podman for GPU access (if NVIDIA GPU present)
echo -e "${BLUE}[5/8]${NC} Configuring Podman..."
if lspci | grep -i nvidia &>/dev/null; then
    echo " NVIDIA GPU detected, configuring GPU access..."

    # Install nvidia-container-toolkit if not present
    if ! command -v nvidia-container-toolkit &>/dev/null; then
        echo " Installing nvidia-container-toolkit..."
        sudo dnf config-manager --add-repo \
            https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
        sudo dnf install -y nvidia-container-toolkit
    fi

    # Generate the CDI spec so Podman containers can request the GPU.
    sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
    echo -e "${GREEN}✓${NC} GPU support configured"
else
    echo " No NVIDIA GPU detected, skipping GPU setup"
fi

# 6. Configure Redis
echo -e "${BLUE}[6/8]${NC} Configuring Redis..."
sudo systemctl enable redis
sudo systemctl start redis || echo "Redis may already be running"

# Locate redis.conf: RHEL-family releases ship it either at
# /etc/redis/redis.conf or /etc/redis.conf.  The previous version
# hard-coded the first path and, when it did not exist, appended the
# password to a brand-new file Redis never reads.
REDIS_CONF=""
for candidate in /etc/redis/redis.conf /etc/redis.conf; do
    if sudo test -f "$candidate"; then
        REDIS_CONF="$candidate"
        break
    fi
done

if [ -z "$REDIS_CONF" ]; then
    echo " WARNING: redis.conf not found; skipping password setup"
elif ! sudo grep -q "^requirepass" "$REDIS_CONF" 2>/dev/null; then
    REDIS_PASSWORD=$(openssl rand -base64 32)
    echo "requirepass $REDIS_PASSWORD" | sudo tee -a "$REDIS_CONF" >/dev/null
    sudo systemctl restart redis
    echo " Generated Redis password: $REDIS_PASSWORD"
    echo " Save this password for your configuration!"
else
    echo " Redis password already configured"
fi

echo -e "${GREEN}✓${NC} Redis configured"
# 7. Setup systemd services
echo -e "${BLUE}[7/8]${NC} Creating systemd services..."

# Both units run as the unprivileged ML user under systemd sandboxing
# (NoNewPrivileges, ProtectSystem=strict, explicit ReadWritePaths).
# Shell variables in the heredocs expand now, at install time.

# API Server service
sudo tee /etc/systemd/system/fetchml-api.service >/dev/null <<EOF
[Unit]
Description=FetchML API Server
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-api -config /etc/fetch_ml/config.yaml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/api.log
StandardError=append:/var/log/fetch_ml/api-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF

# Worker service
sudo tee /etc/systemd/system/fetchml-worker.service >/dev/null <<EOF
[Unit]
Description=FetchML Worker
After=network.target redis.service fetchml-api.service
Wants=redis.service

[Service]
Type=simple
User=$ML_USER
Group=$ML_GROUP
WorkingDirectory=/opt/fetch_ml
ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetch_ml/worker.toml
Restart=always
RestartSec=10
StandardOutput=append:/var/log/fetch_ml/worker.log
StandardError=append:/var/log/fetch_ml/worker-error.log

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths=$BASE_PATH /var/log/fetch_ml

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
echo -e "${GREEN}✓${NC} Systemd services created"

# 8. Setup logrotate (daily, two weeks retained, owned by the ML user)
echo -e "${BLUE}[8/8]${NC} Configuring log rotation..."
sudo tee /etc/logrotate.d/fetchml >/dev/null <<EOF
/var/log/fetch_ml/*.log {
    daily
    rotate 14
    compress
    delaycompress
    notifempty
    missingok
    create 0640 $ML_USER $ML_GROUP
    sharedscripts
    postrotate
        systemctl reload fetchml-api >/dev/null 2>&1 || true
        systemctl reload fetchml-worker >/dev/null 2>&1 || true
    endscript
}
EOF

echo -e "${GREEN}✓${NC} Log rotation configured"

# Summary
echo ""
echo -e "${BOLD}=== Setup Complete! ===${NC}"
echo ""
cat <<EOF
Directory structure created at: $BASE_PATH
Logs will be written to: /var/log/fetch_ml/
Configuration directory: /etc/fetch_ml/

EOF
echo -e "${BOLD}Next steps:${NC}"
cat <<'EOF'
1. Copy your config files:
   sudo cp configs/config-prod.yaml /etc/fetch_ml/config.yaml
   sudo cp configs/worker-prod.toml /etc/fetch_ml/worker.toml

2. Build and install binaries:
   make build
   sudo cp bin/api-server /usr/local/bin/fetchml-api
   sudo cp bin/worker /usr/local/bin/fetchml-worker

3. Update config files with your settings (Redis password, API keys, etc.)

4. Start services:
   sudo systemctl start fetchml-api
   sudo systemctl start fetchml-worker

5. Enable services to start on boot:
   sudo systemctl enable fetchml-api
   sudo systemctl enable fetchml-worker

6. Check status:
   sudo systemctl status fetchml-api
   sudo systemctl status fetchml-worker
   sudo journalctl -u fetchml-api -f

EOF
|
|
@ -1,455 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Automatic Setup Script for ML Experiment Manager
|
|
||||||
# Handles complete environment setup with security features
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Colors
|
|
||||||
RED='\033[0;31m'
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
YELLOW='\033[1;33m'
|
|
||||||
BLUE='\033[0;34m'
|
|
||||||
NC='\033[0m'
|
|
||||||
|
|
||||||
# Logging helpers: colour-coded, tagged status lines on stdout.
# Colour variables (BLUE/GREEN/YELLOW/RED/NC) are defined at file scope.
print_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error()   { echo -e "${RED}[ERROR]${NC} $1"; }
|
|
||||||
|
|
||||||
# detect_os: classify the host from bash's $OSTYPE.
# Prints exactly one of: "macos", "linux", "unknown".
detect_os() {
    case "$OSTYPE" in
        darwin*)    echo "macos" ;;
        linux-gnu*) echo "linux" ;;
        *)          echo "unknown" ;;
    esac
}
|
|
||||||
|
|
||||||
# install_go: install the Go toolchain.
#   macOS -> Homebrew; Linux -> official tarball into /usr/local/go.
# Returns non-zero when no supported install path exists.
install_go() {
    print_info "Installing Go..."

    local os
    os=$(detect_os)
    local go_version="1.23.0"

    if [[ "$os" == "macos" ]]; then
        if ! command -v brew &> /dev/null; then
            print_error "Homebrew not found. Please install Go manually."
            return 1
        fi
        brew install go
    elif [[ "$os" == "linux" ]]; then
        local tarball="go${go_version}.linux-amd64.tar.gz"
        wget -q "https://go.dev/dl/${tarball}"
        # Replace any previous toolchain, per the official install docs.
        sudo rm -rf /usr/local/go
        sudo tar -C /usr/local -xzf "$tarball"
        rm "$tarball"

        # Persist PATH, but only once: the old version appended a fresh
        # export line to ~/.bashrc on every re-run.
        if ! grep -qF 'export PATH=$PATH:/usr/local/go/bin' ~/.bashrc 2>/dev/null; then
            echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
        fi
        export PATH=$PATH:/usr/local/go/bin
    else
        # The old version fell through here and reported success
        # without installing anything.
        print_error "Unsupported OS for automatic Go install."
        return 1
    fi

    print_success "Go installed"
}
|
|
||||||
|
|
||||||
# install_zig: install the Zig toolchain.
#   macOS -> Homebrew; Linux -> official binary tarball.
install_zig() {
    print_info "Installing Zig..."

    local os
    os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if ! command -v brew &> /dev/null; then
            print_error "Homebrew not found. Please install Zig manually."
            return 1
        fi
        brew install zig
    elif [[ "$os" == "linux" ]]; then
        local zig_version="0.13.0"
        local dist="zig-linux-x86_64-${zig_version}"
        wget -q "https://ziglang.org/download/${zig_version}/${dist}.tar.xz"
        tar -xf "${dist}.tar.xz"
        # Install the whole distribution and symlink the binary: zig
        # resolves its std/lib tree relative to the executable, so the
        # old "mv only the binary to /usr/local/bin" broke `zig build`.
        sudo rm -rf "/usr/local/${dist}"
        sudo mv "$dist" /usr/local/
        sudo ln -sf "/usr/local/${dist}/zig" /usr/local/bin/zig
        rm -f "${dist}.tar.xz"
    fi

    print_success "Zig installed"
}
|
|
||||||
|
|
||||||
# install_docker: install Docker via Homebrew (macOS) or the official
# convenience script (Linux).  On Linux the invoking user is added to
# the docker group and the daemon is enabled and started.
install_docker() {
    print_info "Installing Docker..."

    local os
    os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if ! command -v brew &> /dev/null; then
            print_error "Homebrew not found. Please install Docker manually."
            return 1
        fi
        brew install --cask docker
        print_warning "Docker Desktop installed. Please start it manually."
    elif [[ "$os" == "linux" ]]; then
        curl -fsSL https://get.docker.com -o get-docker.sh
        sudo sh get-docker.sh
        sudo usermod -aG docker $USER
        rm get-docker.sh

        sudo systemctl enable docker
        sudo systemctl start docker

        print_success "Docker installed. You may need to log out and log back in."
    fi
}
|
|
||||||
|
|
||||||
# install_redis: install and start Redis.
#   macOS -> Homebrew service
#   Linux -> apt-get (Debian/Ubuntu) or dnf (RHEL/Rocky/Fedora)
install_redis() {
    print_info "Installing Redis..."

    local os
    os=$(detect_os)

    if [[ "$os" == "macos" ]]; then
        if command -v brew &> /dev/null; then
            brew install redis
            brew services start redis
        else
            print_error "Homebrew not found. Please install Redis manually."
            return 1
        fi
    elif [[ "$os" == "linux" ]]; then
        # Pick whichever package manager exists; the old version assumed
        # apt-get and failed on the RHEL-family hosts this repo targets.
        if command -v apt-get &> /dev/null; then
            sudo apt-get update
            sudo apt-get install -y redis-server
            sudo systemctl enable redis-server
            sudo systemctl start redis-server
        elif command -v dnf &> /dev/null; then
            sudo dnf install -y redis
            sudo systemctl enable redis
            sudo systemctl start redis
        else
            print_error "No supported package manager found (apt-get or dnf)."
            return 1
        fi
    fi

    print_success "Redis installed and started"
}
|
|
||||||
|
|
||||||
# install_dependencies: OS packages (openssl/curl/jq + build toolchain)
# plus Go developer tools (golangci-lint, goimports) when Go is present.
install_dependencies() {
    print_info "Installing dependencies..."

    case "$(detect_os)" in
        macos)
            if command -v brew &> /dev/null; then
                brew install openssl curl jq
            fi
            ;;
        linux)
            sudo apt-get update
            sudo apt-get install -y openssl curl jq build-essential
            ;;
    esac

    if command -v go &> /dev/null; then
        go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest
        go install golang.org/x/tools/cmd/goimports@latest
    fi

    print_success "Dependencies installed"
}
|
|
||||||
|
|
||||||
# setup_project: create the workspace layout and build the binaries,
# preferring the Makefile and falling back to direct `go build`.
setup_project() {
    print_info "Setting up project..."

    # Workspace layout expected by the server, worker and CLI.
    mkdir -p bin data logs db ssl configs scripts

    if command -v make &> /dev/null; then
        make build
        if command -v zig &> /dev/null; then
            make cli-build
        fi
    else
        print_warning "Make not found, building manually..."
        go build -o bin/worker ./cmd/worker
        go build -o bin/tui ./cmd/tui
        go build -o bin/data_manager ./cmd/data_manager
        go build -o bin/user_manager ./cmd/user_manager
        go build -o bin/api-server ./cmd/api-server

        if command -v zig &> /dev/null; then
            # Subshell: the old `cd cli && zig build && cd ..` left the
            # caller stranded in cli/ when the build failed.
            (cd cli && zig build)
        fi
    fi

    print_success "Project setup completed"
}
|
|
||||||
|
|
||||||
# setup_security: generate dev TLS certificates, a security-hardened
# YAML config, and a .env.dev file with generated secrets.
# NOTE(review): YAML nesting below was reconstructed from the flattened
# source; confirm against configs/api/*.yaml before shipping.
setup_security() {
    print_info "Setting up security features..."

    # Self-signed localhost certificate; failure is non-fatal in dev.
    if command -v openssl &> /dev/null; then
        openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
            -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
            -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
            print_warning "Failed to generate SSL certificates"
        }
        print_success "SSL certificates generated"
    fi

    # Generated secrets, with static dev fallbacks when openssl is absent.
    local redis_password
    redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
    local jwt_secret
    jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")

    # Hash the dev API key portably: macOS ships shasum, not sha256sum.
    # (The old inline $(... | sha256sum ...) broke on macOS.)
    local api_key_hash
    if command -v sha256sum &> /dev/null; then
        api_key_hash=$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)
    else
        api_key_hash=$(echo -n "dev_test_api_key_12345" | shasum -a 256 | cut -d' ' -f1)
    fi

    cat > configs/security-config.yaml << EOF
base_path: "/data/ml-experiments"

auth:
  enabled: true
  api_keys:
    test_user:
      hash: "${api_key_hash}"
      admin: true
      roles: ["data_scientist", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
    min_version: "1.3"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
    ip_whitelist:
      - "127.0.0.1"
      - "::1"
      - "10.0.0.0/8"
      - "192.168.0.0/16"
      - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 5
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"
  password: "${redis_password}"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  audit_log: "logs/audit.log"
EOF

    cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF

    # Both files carry secrets; keep them owner-readable only.
    chmod 600 configs/security-config.yaml .env.dev

    print_success "Security configuration created"
}
|
|
||||||
|
|
||||||
# test_installation: smoke-check the toolchain and built artifacts and
# report a pass ratio.  Missing OPTIONAL tools are removed from the
# total rather than counted as failures.
test_installation() {
    print_info "Testing installation..."

    local passed=0
    local total=0

    # Go (required)
    total=$((total + 1))
    if command -v go &> /dev/null; then
        print_success "Go: Installed"
        passed=$((passed + 1))
    else
        print_error "Go: Not found"
    fi

    # Zig (optional)
    total=$((total + 1))
    if command -v zig &> /dev/null; then
        print_success "Zig: Installed"
        passed=$((passed + 1))
    else
        print_warning "Zig: Not found (optional)"
        total=$((total - 1))
    fi

    # Docker (optional)
    total=$((total + 1))
    if command -v docker &> /dev/null; then
        print_success "Docker: Installed"
        passed=$((passed + 1))
    else
        print_warning "Docker: Not found (optional)"
        total=$((total - 1))
    fi

    # Redis (optional): must be installed AND answer PONG to pass.
    total=$((total + 1))
    if command -v redis-cli &> /dev/null; then
        if redis-cli ping | grep -q "PONG"; then
            print_success "Redis: Running"
            passed=$((passed + 1))
        else
            print_warning "Redis: Not running"
        fi
    else
        print_warning "Redis: Not found (optional)"
        total=$((total - 1))
    fi

    # API server binary, only when a build artifact exists.
    if [[ -f "bin/api-server" ]]; then
        total=$((total + 1))
        if ./bin/api-server --help > /dev/null 2>&1; then
            print_success "API Server: Built"
            passed=$((passed + 1))
        else
            print_error "API Server: Build failed"
        fi
    fi

    if [[ $total -gt 0 ]]; then
        local success_rate=$((passed * 100 / total))
        print_info "Tests: $passed/$total passed ($success_rate%)"
    fi

    print_success "Installation testing completed"
}
|
|
||||||
|
|
||||||
# show_next_steps: print the post-setup checklist for the operator.
show_next_steps() {
    print_success "Automatic setup completed!"
    echo
    cat <<'EOF'
Next Steps:
===========

1. Load environment variables:
 source .env.dev

2. Start the API server:
 ./bin/api-server -config configs/config.yaml

3. Test the Zig CLI (if installed):
 ./cli/zig-out/bin/ml --help

4. Deploy with Docker (optional):
 make docker-run

5. Docker Compose deployment:
 docker-compose up -d

Configuration Files:
 configs/config.yaml # Main configuration
 configs/config_local.yaml # Local development
 ssl/cert.pem, ssl/key.pem # TLS certificates

Documentation:
 docs/DEPLOYMENT.md # Deployment guide

Quick Commands:
 make help # Show all commands
 make test # Run tests
 docker-compose up -d # Start services

EOF
    print_success "Ready to use ML Experiment Manager!"
}
|
|
||||||
|
|
||||||
# Main setup function
|
|
||||||
# main: full automatic setup — install any missing tools, then build,
# secure, and verify the project.  Under `set -euo pipefail` a failed
# installer aborts the run, as before.
main() {
    echo "ML Experiment Manager Automatic Setup"
    echo "====================================="
    echo ""

    print_info "Starting automatic setup..."
    echo ""

    command -v go &> /dev/null || { print_info "Go not found, installing..."; install_go; }
    command -v zig &> /dev/null || { print_info "Zig not found, installing..."; install_zig; }
    command -v docker &> /dev/null || { print_info "Docker not found, installing..."; install_docker; }
    command -v redis-cli &> /dev/null || { print_info "Redis not found, installing..."; install_redis; }

    install_dependencies
    setup_project
    setup_security
    test_installation
    show_next_steps
}
|
|
||||||
|
|
||||||
# Handle command line arguments
|
|
||||||
# CLI dispatch: the first argument selects the action ("setup" default).
case "${1:-setup}" in
    setup)
        main
        ;;
    deps)
        install_dependencies
        ;;
    test)
        test_installation
        ;;
    help | -h | --help)
        cat <<EOF
Automatic Setup Script
Usage: $0 {setup|deps|test|help}

Commands:
 setup - Run full automatic setup
 deps - Install dependencies only
 test - Test installation
 help - Show this help
EOF
        ;;
    *)
        print_error "Unknown command: $1"
        echo "Use '$0 help' for usage information"
        exit 1
        ;;
esac
|
|
||||||
|
|
@ -1,314 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Fetch ML Quick Start Script with Security
|
|
||||||
# Sets up development environment with security features and creates test user
|
|
||||||
|
|
||||||
set -euo pipefail

# ANSI color codes used by the print_* helpers; NC resets the terminal.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
|
||||||
|
|
||||||
# Colored console loggers: a bracketed level tag followed by the message.
print_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
print_error()   { echo -e "${RED}[ERROR]${NC} $1"; }
|
|
||||||
|
|
||||||
# Verify tooling before setup. Go is the only hard requirement (exit 1 if
# missing); Zig, Docker, Redis and OpenSSL merely produce warnings since the
# features they back are optional.
check_prerequisites() {
    print_info "Checking prerequisites..."

    if ! command -v go &> /dev/null; then
        print_error "Go is not installed. Please install Go 1.25 or later."
        exit 1
    fi
    print_info "Go version: $(go version | awk '{print $3}' | sed 's/go//')"

    if command -v zig &> /dev/null; then
        print_info "Zig version: $(zig version)"
    else
        print_warning "Zig is not installed. CLI features will not be available."
    fi

    if ! command -v docker &> /dev/null; then
        print_warning "Docker is not installed. Container features will not work."
    fi

    if ! command -v redis-server &> /dev/null && ! command -v redis-cli &> /dev/null; then
        print_warning "Redis is not installed. Starting local Redis..."
    fi

    if ! command -v openssl &> /dev/null; then
        print_warning "OpenSSL is not installed. TLS certificates will not be generated."
    fi

    print_success "Prerequisites checked"
}
|
|
||||||
|
|
||||||
# Create the working directory layout relative to the current directory.
setup_project() {
    print_info "Setting up Fetch ML project..."
    mkdir -p bin data logs db ssl configs
    print_success "Project directories created"
}
|
|
||||||
|
|
||||||
# Build the Go binaries via make; additionally build the Zig CLI when a
# Zig toolchain is on PATH.
build_project() {
    print_info "Building Fetch ML..."

    make build

    if command -v zig &> /dev/null; then
        make cli-build
        print_success "Zig CLI built"
    fi

    print_success "Build completed"
}
|
|
||||||
|
|
||||||
# Produce a self-signed localhost certificate pair in ssl/ for development.
# Skips quietly (with a warning) when openssl is unavailable; returns 1 on a
# generation failure.
generate_ssl_certificates() {
    print_info "Generating SSL certificates..."

    if ! command -v openssl &> /dev/null; then
        print_warning "OpenSSL not available, skipping SSL certificates"
        return
    fi

    # 1-year self-signed cert with a SAN covering localhost and 127.0.0.1.
    openssl req -x509 -newkey rsa:4096 -keyout ssl/key.pem -out ssl/cert.pem \
        -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=localhost" \
        -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null || {
        print_warning "Failed to generate SSL certificates"
        return 1
    }

    print_success "SSL certificates generated in ssl/"
    print_info "Certificates are self-signed (development only)"
}
|
|
||||||
|
|
||||||
# Start a local daemonized Redis on 6379 when redis-server exists and no
# instance is already running; otherwise warn and continue.
setup_redis() {
    print_info "Setting up Redis..."

    if ! command -v redis-server &> /dev/null; then
        print_warning "Redis not available, some features may be limited"
        return
    fi

    if pgrep -f "redis-server" > /dev/null; then
        print_info "Redis already running"
    else
        redis-server --daemonize yes --port 6379
        print_success "Redis started"
    fi
}
|
|
||||||
|
|
||||||
# Write configs/config.yaml and .env.dev with freshly generated credentials.
# Falls back to fixed development values when openssl is unavailable.
create_secure_config() {
    print_info "Creating secure development configuration..."

    local redis_password jwt_secret api_key_hash
    redis_password=$(openssl rand -base64 32 2>/dev/null || echo "dev_redis_password_123")
    jwt_secret=$(openssl rand -base64 64 2>/dev/null || echo "dev_jwt_secret_1234567890123456789012345678901234567890123456789012345678901234")
    # SHA-256 of the fixed dev API key; matches create_test_user.
    api_key_hash=$(echo -n "dev_test_api_key_12345" | sha256sum | cut -d' ' -f1)

    cat > configs/config.yaml << EOF
base_path: "/data/ml-experiments"

auth:
  enabled: true
  api_keys:
    test_user:
      hash: "${api_key_hash}"
      admin: true
      roles: ["data_scientist", "admin"]
      permissions:
        read: true
        write: true
        delete: true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "./ssl/cert.pem"
    key_file: "./ssl/key.pem"
    min_version: "1.3"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
    ip_whitelist:
      - "127.0.0.1"
      - "::1"
      - "10.0.0.0/8"
      - "192.168.0.0/16"
      - "172.16.0.0/12"
  failed_login_lockout:
    enabled: true
    max_attempts: 5
    lockout_duration: "15m"

redis:
  url: "redis://localhost:6379"
  password: "${redis_password}"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  audit_log: "logs/audit.log"
EOF

    cat > .env.dev << EOF
# Development environment variables
REDIS_PASSWORD=${redis_password}
JWT_SECRET=${jwt_secret}
GRAFANA_USER=admin
GRAFANA_PASSWORD=$(openssl rand -base64 16 2>/dev/null || echo "dev_grafana_password")
EOF

    print_success "Secure configuration created"
    print_warning "Using development certificates and passwords"
}
|
|
||||||
|
|
||||||
# Print the fixed development API key and its SHA-256 hash so the user can
# exercise the authenticated API. Dev-only credentials.
create_test_user() {
    print_info "Creating test user..."

    local api_key="dev_test_api_key_12345"
    local api_key_hash
    api_key_hash=$(echo -n "$api_key" | sha256sum | cut -d' ' -f1)

    print_success "Test user created successfully"
    echo "Username: test_user"
    echo "API Key: $api_key"
    echo "API Key Hash: $api_key_hash"
    echo "Store this key safely!"
    echo ""
    echo "Environment variables in .env.dev"
    echo "Run: source .env.dev"
}
|
|
||||||
|
|
||||||
# Smoke-test the installed artifacts: each binary answers --help (failures
# tolerated), Redis answers PING, and the dev TLS cert is not near expiry.
test_setup() {
    print_info "Testing setup..."

    local spec path label
    for spec in "bin/api-server:API server binary OK" \
                "bin/worker:Worker binary OK" \
                "cli/zig-out/bin/ml:Zig CLI binary OK"; do
        path="${spec%%:*}"
        label="${spec#*:}"
        if [[ -f "$path" ]]; then
            "./$path" --help > /dev/null 2>&1 || true
            print_success "$label"
        fi
    done

    if command -v redis-cli &> /dev/null; then
        if redis-cli ping > /dev/null 2>&1; then
            print_success "Redis connection OK"
        else
            print_warning "Redis not responding"
        fi
    fi

    if [[ -f "ssl/cert.pem" && -f "ssl/key.pem" ]]; then
        # -checkend 86400: fail if the certificate expires within 24 hours.
        if openssl x509 -in ssl/cert.pem -noout -checkend 86400 > /dev/null 2>&1; then
            print_success "SSL certificates valid"
        else
            print_warning "SSL certificates expired or invalid"
        fi
    fi
}
|
|
||||||
|
|
||||||
# Print the post-setup checklist: how to load the env, start the server,
# test the CLI and API, and where the generated files live.
show_next_steps() {
    print_success "Secure quick start completed!"
    # Quoted heredoc: no expansion, text printed verbatim.
    cat <<'EOF'

Next steps:
1. Load environment variables:
 source .env.dev

2. Start API server:
 ./bin/api-server -config configs/config.yaml

3. Test Zig CLI:
 ./cli/zig-out/bin/ml --help

4. Test with curl (HTTPS):
 curl -k -H 'X-API-Key: dev_test_api_key_12345' https://localhost:9101/health

5. Deploy with Docker:
 docker-compose up -d

Features Enabled:
 ✅ HTTPS/TLS encryption
 ✅ API key authentication
 ✅ Rate limiting
 ✅ IP whitelisting
 ✅ Security headers
 ✅ Audit logging

Configuration Files:
 configs/config.yaml # Main configuration
 .env.dev # Environment variables
 ssl/cert.pem, ssl/key.pem # TLS certificates

Documentation:
 docs/DEPLOYMENT.md # Deployment guide

EOF
    print_success "Ready to run ML experiments!"
}
|
|
||||||
|
|
||||||
# Main function
|
|
||||||
# Full quick-start pipeline: prerequisites, layout, build, certs, Redis,
# config, test user, smoke tests, and the follow-up checklist — in order.
main() {
    echo "Fetch ML Quick Start Script (with Security & Zig CLI)"
    echo "===================================================="
    echo ""

    local step
    for step in check_prerequisites setup_project build_project \
                generate_ssl_certificates setup_redis create_secure_config \
                create_test_user test_setup show_next_steps; do
        "$step"
    done
}

# Run main function
main "$@"
|
|
||||||
|
|
@ -1,124 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Shared helper functions for Fetch ML setup scripts (Ubuntu/Rocky)
|
|
||||||
set -euo pipefail

# ANSI color codes used by the log_* helpers; NC resets the terminal.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Installation layout defaults shared by the per-distro setup scripts.
FETCH_ML_USER="fetchml"
FETCH_ML_HOME="/opt/fetchml"
SERVICE_DIR="/etc/systemd/system"
LOG_DIR="/var/log/fetchml"
DATA_DIR="/var/lib/fetchml"
CONFIG_DIR="$FETCH_ML_HOME/configs"
|
|
||||||
|
|
||||||
# Colored console loggers shared by the setup scripts.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}
log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}
log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
|
|
||||||
|
|
||||||
# Download file with checksum verification
|
|
||||||
# Args: url, checksum, dest
|
|
||||||
# Download a file and verify its SHA-256 checksum.
# Args: url, expected sha256 hex digest, destination path.
# On mismatch the partial download is removed and the script exits.
secure_download() {
    local url="$1" checksum="$2" dest="$3"
    curl -fsSL "$url" -o "$dest"
    # GNU coreutils check mode expects "HASH  FILE" (two separator
    # characters); the previous single-space form is rejected as an
    # improperly formatted line, making verification fail unconditionally.
    echo "${checksum}  ${dest}" | sha256sum --check --status || {
        log_error "Checksum verification failed for $dest"
        rm -f "$dest"
        exit 1
    }
}
|
|
||||||
|
|
||||||
# Remove any temp files registered in TMP_FILES on exit. TMP_FILES is a
# whitespace-separated list, so the expansion is intentionally unquoted.
cleanup_temp() {
    [[ -z "${TMP_FILES:-}" ]] && return 0
    # shellcheck disable=SC2086  # intentional word splitting
    rm -f $TMP_FILES || true
}
trap cleanup_temp EXIT
|
|
||||||
|
|
||||||
# Create the service account if absent and grant podman group access
# (group may not exist on every distro, hence the `|| true`).
ensure_user() {
    if ! id "$FETCH_ML_USER" &>/dev/null; then
        useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
    fi
    usermod -aG podman "$FETCH_ML_USER" || true
}
|
|
||||||
|
|
||||||
# Lay out the install/log/data directories and hand ownership to the
# service account.
create_directories() {
    mkdir -p "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR" \
             "$FETCH_ML_HOME/bin" "$CONFIG_DIR"
    chown -R "$FETCH_ML_USER":"$FETCH_ML_USER" \
             "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"
}
|
|
||||||
|
|
||||||
# Write a systemd unit for one Fetch ML service.
# Args: service short name (e.g. "worker"), ExecStart command line.
setup_systemd_service() {
    local svc="$1" cmd="$2"
    cat > "$SERVICE_DIR/${svc}.service" <<EOF
[Unit]
Description=Fetch ML ${svc^} Service
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$cmd
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=fetch_ml_${svc}

[Install]
WantedBy=multi-user.target
EOF
}
|
|
||||||
|
|
||||||
# Install a daily, 14-rotation, compressed logrotate policy for the
# service logs.
setup_logrotate() {
    cat > /etc/logrotate.d/fetch_ml <<'EOF'
/var/log/fetchml/*.log {
    daily
    missingok
    rotate 14
    compress
    delaycompress
    notifempty
    create 0640 fetchml fetchml
}
EOF
}
|
|
||||||
|
|
||||||
# Raise file-descriptor limits for the service user (idempotent) and
# enable automatic security updates where the distro supports it.
hardening_steps() {
    grep -q fetchml /etc/security/limits.conf || cat >> /etc/security/limits.conf <<'EOF'
fetchml soft nofile 65536
fetchml hard nofile 65536
EOF

    # Best-effort: failures installing the auto-update tooling are ignored.
    if command -v apt-get &>/dev/null; then
        apt-get install -y unattended-upgrades >/dev/null || true
    elif command -v dnf &>/dev/null; then
        dnf install -y dnf-automatic >/dev/null || true
    fi
}
|
|
||||||
|
|
||||||
# Informational only: when SELinux is enforcing, print example commands
# for relabeling the install directory.
selinux_guidance() {
    command -v getenforce &>/dev/null || return 0

    local mode
    mode=$(getenforce)
    log_info "SELinux mode: $mode"
    if [[ "$mode" == "Enforcing" ]]; then
        log_info "Ensure systemd units and directories have proper contexts. Example:"
        echo " semanage fcontext -a -t bin_t '$FETCH_ML_HOME/bin(/.*)?'"
        echo " restorecon -Rv $FETCH_ML_HOME/bin"
    fi
}
|
|
||||||
|
|
@ -1,417 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Fetch ML Rocky Linux Setup Script
|
|
||||||
# Optimized for ML experiments on Rocky Linux 8/9
|
|
||||||
|
|
||||||
set -euo pipefail

# Resolve this script's directory so the shared helpers load regardless
# of the caller's working directory.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
|
|
||||||
|
|
||||||
# Abort unless running as root (system packages and services are touched).
check_root() {
    if (( EUID != 0 )); then
        log_error "This script must be run as root"
        exit 1
    fi
}
|
|
||||||
|
|
||||||
# Verify this is a RHEL-family host, log the Rocky release, and select the
# package manager (dnf on Rocky 9+, yum on Rocky 8) into PKG_MANAGER.
check_rocky() {
    if ! command -v dnf &> /dev/null && ! command -v yum &> /dev/null; then
        log_error "This script is designed for Rocky Linux systems"
        exit 1
    fi

    # Read the version straight from the release file (no useless cat) and
    # tolerate a missing file — the previous pipeline would have aborted
    # the whole script under `set -e` outside of the masked `local x=$()`.
    local rocky_version
    rocky_version=$(grep -oE '[0-9]+\.[0-9]+' /etc/rocky-release 2>/dev/null || echo "unknown")
    log_info "Rocky Linux version: $rocky_version"

    if command -v dnf &> /dev/null; then
        PKG_MANAGER="dnf"
    else
        PKG_MANAGER="yum"
    fi
}
|
|
||||||
|
|
||||||
# Refresh and upgrade system packages, then install the download tooling
# the rest of the script relies on.
update_system() {
    log_info "Updating system packages..."
    $PKG_MANAGER update -y
    $PKG_MANAGER upgrade -y
    $PKG_MANAGER install -y curl wget gnupg2
}
|
|
||||||
|
|
||||||
# Enable the EPEL repository plus the extra-build repo (PowerTools/CRB).
# Idempotent: returns early when EPEL is already configured.
enable_epel() {
    log_info "Enabling EPEL repository..."

    if $PKG_MANAGER repolist | grep -q "epel"; then
        log_info "EPEL already enabled"
        return
    fi

    $PKG_MANAGER install -y epel-release
    # Rocky 8 ships the repo as "powertools"; Rocky 9 renamed it to "crb".
    # Trying only the old name would abort the script under `set -e` on 9.
    $PKG_MANAGER config-manager --set-enabled powertools 2>/dev/null || \
        $PKG_MANAGER config-manager --set-enabled crb

    log_success "EPEL repository enabled"
}
|
|
||||||
|
|
||||||
# Install Go 1.25 from the official tarball (checksum-verified) unless a
# Go toolchain is already present; extends PATH for login shells and for
# the remainder of this run.
install_go() {
    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        log_info "Go already installed: $(go version | awk '{print $3}' | sed 's/go//')"
        return
    fi

    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    cd /tmp
    TMP_FILES="$tarball"   # cleaned up by the EXIT trap
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
        "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
        "$tarball"
    tar -C /usr/local -xzf "$tarball"

    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
|
||||||
|
|
||||||
# Install and configure Podman (crun runtime, netavark networking,
# systemd cgroups) and enable user namespaces for rootless containers.
install_podman() {
    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    $PKG_MANAGER install -y podman podman-compose containernetworking-plugins

    mkdir -p /etc/containers
    cat > /etc/containers/containers.conf << EOF
[containers]
user_namespace_enable = 1
runtime = "crun"

[network]
network_backend = "netavark"

[engine]
cgroup_manager = "systemd"
EOF

    # Persist the setting, then apply it immediately. The previous
    # `sysctl -p user.max_user_namespaces=15000` treated the argument as a
    # *file* to load, failed, and aborted the script under `set -e`;
    # `-w` writes the key=value pair directly.
    echo "user.max_user_namespaces=15000" >> /etc/sysctl.conf
    sysctl -w user.max_user_namespaces=15000

    log_success "Podman installed"
}
|
|
||||||
|
|
||||||
# Install Redis, switch it to systemd supervision and IPv4-loopback-only
# binding, then enable and start the service. Idempotent when Redis exists.
install_redis() {
    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    $PKG_MANAGER install -y redis

    # Rocky 8 installs /etc/redis.conf; Rocky 9 moved it to
    # /etc/redis/redis.conf. Editing only the old path would make sed fail
    # on 9 and abort the script under `set -e`.
    local conf="/etc/redis.conf"
    [[ -f "$conf" ]] || conf="/etc/redis/redis.conf"

    sed -i 's/supervised no/supervised systemd/' "$conf"
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' "$conf"

    systemctl enable redis
    systemctl start redis

    log_success "Redis installed and configured"
}
|
|
||||||
|
|
||||||
# Install NVIDIA drivers + CUDA toolkit when a GPU is present on the PCI
# bus and drivers are not yet installed; verifies container GPU access via
# podman and reminds the operator that a reboot is required.
install_nvidia_drivers() {
    log_info "Checking for NVIDIA GPU..."

    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    if ! lspci | grep -i nvidia &> /dev/null; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Add NVIDIA's CUDA repository for this RHEL major release.
    $PKG_MANAGER config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel$(rpm -E %rhel)/x86_64/cuda-rhel.repo

    $PKG_MANAGER clean all
    $PKG_MANAGER module enable -y nvidia-driver:latest-dkms
    $PKG_MANAGER install -y nvidia-driver nvidia-cuda-toolkit

    # Sanity-check GPU passthrough from a throwaway container.
    if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
        log_success "NVIDIA drivers installed and GPU access verified"
    else
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    fi

    log_warning "System reboot required for NVIDIA drivers"
    log_info "Run: reboot"
}
|
|
||||||
|
|
||||||
# Install the Python toolchain, native build dependencies for common ML
# wheels, and a baseline Python ML stack (CPU-only torch build).
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."

    $PKG_MANAGER install -y python3 python3-pip python3-devel

    $PKG_MANAGER groupinstall -y "Development Tools"
    $PKG_MANAGER install -y \
        cmake git pkgconfig \
        libjpeg-turbo-devel libpng-devel libtiff-devel \
        mesa-libGL-devel mesa-libGLU-devel \
        gtk3-devel \
        atlas-devel blas-devel lapack-devel

    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
|
||||||
|
|
||||||
# Create the fetchml service account with its directory layout; no-op when
# the user already exists.
create_user() {
    log_info "Creating fetchml user..."

    if id "$FETCH_ML_USER" &>/dev/null; then
        log_info "User $FETCH_ML_USER already exists"
        return
    fi

    useradd -m -d "$FETCH_ML_HOME" -s /bin/bash "$FETCH_ML_USER"
    usermod -aG podman "$FETCH_ML_USER"

    mkdir -p "$FETCH_ML_HOME/.config/containers" \
             "$FETCH_ML_HOME/go/bin" \
             "$LOG_DIR" \
             "$DATA_DIR"
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME" "$LOG_DIR" "$DATA_DIR"

    log_success "User $FETCH_ML_USER created"
}
|
|
||||||
|
|
||||||
# Enable firewalld and open SSH plus the service ports; skipped with a
# warning when firewalld is not installed.
setup_firewall() {
    log_info "Configuring firewall..."

    if ! command -v firewall-cmd &> /dev/null; then
        log_warning "Firewalld not available, skipping firewall configuration"
        return
    fi

    systemctl enable firewalld
    systemctl start firewalld

    firewall-cmd --permanent --add-service=ssh
    firewall-cmd --permanent --add-port=8080/tcp   # Worker API
    firewall-cmd --permanent --add-port=8081/tcp   # Data manager API
    firewall-cmd --permanent --add-port=6379/tcp   # Redis
    firewall-cmd --reload

    firewall-cmd --list-all
}
|
|
||||||
|
|
||||||
# Write and enable the worker and data-manager systemd units. Both units
# share the same template; only the name, binary, and description differ.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    local spec rest unit binary desc
    for spec in "fetch_ml_worker:worker:Fetch ML Worker Service" \
                "fetch_ml_data_manager:data_manager:Fetch ML Data Manager Service"; do
        unit="${spec%%:*}"
        rest="${spec#*:}"
        binary="${rest%%:*}"
        desc="${rest#*:}"
        cat > "$SERVICE_DIR/${unit}.service" << EOF
[Unit]
Description=$desc
After=network.target redis.service
Wants=redis.service

[Service]
Type=simple
User=$FETCH_ML_USER
Group=$FETCH_ML_USER
WorkingDirectory=$FETCH_ML_HOME
Environment=FETCH_ML_HOME=$FETCH_ML_HOME
Environment=PATH=$FETCH_ML_HOME/go/bin:/usr/local/go/bin:/usr/bin:/bin
ExecStart=$FETCH_ML_HOME/bin/$binary --config $FETCH_ML_HOME/configs/config-local.yaml
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal
SyslogIdentifier=$unit

[Install]
WantedBy=multi-user.target
EOF
    done

    systemctl daemon-reload
    systemctl enable fetch_ml_worker
    systemctl enable fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
|
||||||
|
|
||||||
# Install a daily, 30-rotation logrotate policy for $LOG_DIR; reloads the
# services after rotation so they reopen their log files.
setup_log_rotation() {
    log_info "Setting up log rotation..."

    # Unquoted EOF on purpose: $LOG_DIR and $FETCH_ML_USER expand here.
    cat > /etc/logrotate.d/fetch_ml << EOF
$LOG_DIR/*.log {
    daily
    missingok
    rotate 30
    compress
    delaycompress
    notifempty
    create 0644 $FETCH_ML_USER $FETCH_ML_USER
    postrotate
        systemctl reload fetch_ml_worker || true
        systemctl reload fetch_ml_data_manager || true
    endscript
}
EOF

    log_success "Log rotation configured"
}
|
|
||||||
|
|
||||||
# System tuning for ML workloads: higher fd limits, larger network
# buffers, reduced swapping, GPU persistence mode, and SELinux advice.
optimize_system() {
    log_info "Optimizing system for ML workloads..."

    {
        echo "* soft nofile 65536"
        echo "* hard nofile 65536"
    } >> /etc/security/limits.conf

    cat >> /etc/sysctl.conf << 'EOF'
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF
    sysctl -p

    # Keep the GPU initialized between jobs when drivers are present.
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    if [[ -f /etc/selinux/config ]]; then
        log_warning "Consider setting SELinux to permissive mode for better container compatibility"
        log_info "Edit /etc/selinux/config and set SELINUX=permissive"
    fi

    log_success "System optimized for ML workloads"
}
|
|
||||||
|
|
||||||
# Build the Fetch ML binaries from an already-cloned checkout under
# $FETCH_ML_HOME/fetch_ml and install them plus a starter config; warns and
# returns when the checkout is missing.
install_fetch_ml() {
    log_info "Installing Fetch ML..."

    cd "$FETCH_ML_HOME"
    if [[ ! -d "fetch_ml" ]]; then
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml
    export PATH=$PATH:/usr/local/go/bin
    make build

    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME"/bin/*

    mkdir -p "$FETCH_ML_HOME/configs"
    cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"

    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

    log_success "Fetch ML installed"
}
|
|
||||||
|
|
||||||
# Orchestrate the full Rocky Linux server setup in dependency order, then
# print the operator's follow-up checklist and service endpoints.
main() {
    log_info "Starting Fetch ML Rocky Linux server setup..."

    local step
    for step in check_root check_rocky \
                update_system enable_epel install_go install_podman \
                install_redis install_nvidia_drivers install_ml_tools \
                ensure_user create_directories setup_firewall \
                setup_systemd_services setup_logrotate hardening_steps \
                selinux_guidance install_fetch_ml; do
        "$step"
    done

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
    echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
    echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
    echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
    echo "5. View logs: journalctl -u fetch_ml_worker -f"
    echo
    log_info "Services will be available at:"
    echo "- Worker API: http://$(hostname -I | awk '{print $1}'):8080"
    echo "- Data Manager: http://$(hostname -I | awk '{print $1}'):8081"
}

# Run main function
main "$@"
|
|
||||||
|
|
@ -1,294 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Fetch ML Ubuntu Server Setup Script
|
|
||||||
# Optimized for ML experiments on Ubuntu 20.04/22.04
|
|
||||||
|
|
||||||
set -euo pipefail

# Resolve this script's directory so the shared helpers load regardless
# of the caller's working directory.
# shellcheck source=scripts/setup_common.sh
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
source "$SCRIPT_DIR/setup_common.sh"
|
|
||||||
|
|
||||||
# Abort unless running as root (system packages and services are touched).
check_root() {
    if (( EUID != 0 )); then
        log_error "This script must be run as root"
        exit 1
    fi
}
|
|
||||||
|
|
||||||
# Verify this is an Ubuntu/Debian host and warn on releases older than
# 20.04.
check_ubuntu() {
    if ! command -v apt-get &> /dev/null; then
        log_error "This script is designed for Ubuntu systems"
        exit 1
    fi

    local ubuntu_version
    ubuntu_version=$(lsb_release -rs)
    log_info "Ubuntu version: $ubuntu_version"

    # Compare with dpkg instead of bc: bc is not installed by default on
    # Ubuntu server and is never installed by this script, so the old
    # `echo ... | bc -l` comparison silently broke.
    if dpkg --compare-versions "$ubuntu_version" lt "20.04"; then
        log_warning "Ubuntu version < 20.04 may not support all features"
    fi
}
|
|
||||||
|
|
||||||
# Refresh and upgrade system packages, then install the repo/download
# tooling the rest of the script relies on.
update_system() {
    log_info "Updating system packages..."
    apt-get update -y
    apt-get upgrade -y
    apt-get install -y curl wget gnupg lsb-release software-properties-common
}
|
|
||||||
|
|
||||||
# Install Go 1.25 from the official tarball (checksum-verified) unless a
# Go toolchain already exists; extends PATH for login shells and for the
# remainder of this run.
install_go() {
    log_info "Installing Go 1.25..."

    if command -v go &> /dev/null; then
        log_info "Go already installed: $(go version | awk '{print $3}' | sed 's/go//')"
        return
    fi

    local tarball="/tmp/go1.25.0.linux-amd64.tar.gz"
    cd /tmp
    TMP_FILES="$tarball"   # cleaned up by the EXIT trap
    secure_download "https://go.dev/dl/go1.25.0.linux-amd64.tar.gz" \
        "b5b98c784d53115553848114fd3c74e565643b4e4c8e8db0c3bea3478fd8c345" \
        "$tarball"
    tar -C /usr/local -xzf "$tarball"

    echo 'export PATH=$PATH:/usr/local/go/bin' >> /etc/profile
    echo 'export PATH=$PATH:$HOME/go/bin' >> /etc/profile
    export PATH=$PATH:/usr/local/go/bin

    log_success "Go 1.25 installed"
}
|
|
||||||
|
|
||||||
# Install Podman and podman-compose from the Kubic OBS repository and
# configure it for rootless operation.
install_podman() {
    log_info "Installing Podman..."

    if command -v podman &> /dev/null; then
        log_info "Podman already installed"
        return
    fi

    local release
    release=$(lsb_release -rs)
    local keyring="/etc/apt/keyrings/devel_kubic_libcontainers_stable.gpg"

    # `apt-key add` is deprecated and removed on Ubuntu >= 22.04 (which this
    # script explicitly targets); store the repository key in a dedicated
    # keyring and reference it with signed-by instead.
    mkdir -p /etc/apt/keyrings
    curl -fsSL "https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}/Release.key" | gpg --dearmor -o "$keyring"
    echo "deb [signed-by=$keyring] https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${release}/ /" | tee /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list

    apt-get update -y
    apt-get install -y podman podman-compose

    # Configure Podman for rootless operation. Ensure the directory exists
    # before appending (it may be absent until the package creates it).
    mkdir -p /etc/containers
    echo "user_namespace_enable = 1" >> /etc/containers/containers.conf
    echo "runtime = \"crun\"" >> /etc/containers/containers.conf

    log_success "Podman installed"
}
|
|
||||||
|
|
||||||
# Install Redis from the distro repository, switch supervision to systemd,
# restrict it to IPv4 loopback, then enable and start the service.
install_redis() {
    log_info "Installing Redis..."

    if command -v redis-server &> /dev/null; then
        log_info "Redis already installed"
        return
    fi

    apt-get install -y redis-server

    # Production tweaks applied in-place to the packaged config.
    local conf="/etc/redis/redis.conf"
    sed -i 's/supervised no/supervised systemd/' "$conf"
    sed -i 's/bind 127.0.0.1 ::1/bind 127.0.0.1/' "$conf"

    systemctl enable redis-server
    systemctl start redis-server

    log_success "Redis installed and configured"
}
|
|
||||||
|
|
||||||
# Detect an NVIDIA GPU and, when present, install the proprietary driver
# and CUDA toolkit from NVIDIA's apt repository. No-op if drivers already
# work or no NVIDIA hardware is found.
install_nvidia_drivers() {
    log_info "Checking for NVIDIA GPU..."

    # Drivers already working: show the GPU summary and bail out early.
    if command -v nvidia-smi &> /dev/null; then
        log_info "NVIDIA drivers already installed"
        nvidia-smi
        return
    fi

    # No NVIDIA device on the PCI bus: nothing to do.
    if ! lspci | grep -i nvidia &> /dev/null; then
        log_info "No NVIDIA GPU detected, skipping driver installation"
        return
    fi

    log_info "NVIDIA GPU detected, installing drivers..."

    # Register NVIDIA's CUDA repository via their keyring package.
    # Download is checksum-verified; TMP_FILES feeds the shared cleanup trap.
    TMP_FILES="/tmp/cuda-keyring_1.1-1_all.deb"
    secure_download "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(lsb_release -rs | cut -d. -f1)/x86_64/cuda-keyring_1.1-1_all.deb" "cfa6b4109e7e3d9be060a016b7dc07e8edcd5356c0eabcc0c537a76e6c603d76" "/tmp/cuda-keyring_1.1-1_all.deb"
    dpkg -i /tmp/cuda-keyring_1.1-1_all.deb
    apt-get update -y

    apt-get install -y nvidia-driver-535 nvidia-cuda-toolkit

    # Smoke-test GPU passthrough in Podman; a reboot is often required
    # before the freshly installed kernel module is usable.
    if podman run --rm --device nvidia.com/gpu=all alpine echo "NVIDIA GPU access configured" 2>/dev/null; then
        log_success "NVIDIA drivers installed and GPU access verified"
    else
        log_warning "NVIDIA GPU access test failed, you may need to reboot"
    fi
}
|
|
||||||
|
|
||||||
# Install Python plus the native build/image/video libraries commonly
# needed to compile ML wheels, then a baseline set of Python packages
# (CPU-only PyTorch wheels).
install_ml_tools() {
    log_info "Installing ML tools and dependencies..."

    # Single apt-get invocation instead of eight sequential ones: one
    # dependency-resolution pass, same package set, and one atomic failure
    # point under `set -e`.
    apt-get install -y \
        python3 python3-pip python3-venv \
        build-essential cmake git pkg-config \
        libjpeg-dev libpng-dev libtiff-dev \
        libavcodec-dev libavformat-dev libswscale-dev \
        libgtk2.0-dev libcanberra-gtk-module \
        libxvidcore-dev libx264-dev \
        libatlas-base-dev gfortran

    # Install common ML libraries. The torch index URL pins CPU wheels;
    # GPU support is handled separately by install_nvidia_drivers.
    pip3 install --upgrade pip
    pip3 install numpy scipy scikit-learn pandas
    pip3 install jupyter matplotlib seaborn
    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

    log_success "ML tools installed"
}
|
|
||||||
|
|
||||||
# Create the service account and its directory layout, delegating to the
# ensure_user / create_directories helpers from setup_common.sh.
create_user() {
    log_info "Creating fetchml user..."

    ensure_user
    create_directories

    log_success "User $FETCH_ML_USER and directories created"
}
|
|
||||||
|
|
||||||
# Enable UFW (when installed) and open the ports the stack listens on.
setup_firewall() {
    log_info "Configuring firewall..."

    if ! command -v ufw &> /dev/null; then
        log_warning "UFW not available, skipping firewall configuration"
        return
    fi

    ufw --force enable
    ufw allow ssh
    ufw allow 8080/tcp # Worker API
    ufw allow 8081/tcp # Data manager API
    # NOTE(review): install_redis binds Redis to 127.0.0.1, so this rule
    # exposes nothing today — confirm it is still intentional.
    ufw allow 6379/tcp # Redis
    ufw status
}
|
|
||||||
|
|
||||||
# Register the worker and data-manager units (setup_systemd_service comes
# from setup_common.sh) and enable both for start at boot.
setup_systemd_services() {
    log_info "Setting up systemd services..."

    local cfg="$FETCH_ML_HOME/configs/config-local.yaml"
    setup_systemd_service "fetch_ml_worker" "$FETCH_ML_HOME/bin/worker --config $cfg"
    setup_systemd_service "fetch_ml_data_manager" "$FETCH_ML_HOME/bin/data_manager --config $cfg"

    # Pick up the new unit files, then enable both in one call.
    systemctl daemon-reload
    systemctl enable fetch_ml_worker fetch_ml_data_manager

    log_success "Systemd services configured"
}
|
|
||||||
|
|
||||||
# Thin logging wrapper around the shared logrotate helper.
setup_log_rotation() {
    log_info "Setting up log rotation..."
    setup_logrotate
    log_success "Log rotation configured"
}
|
|
||||||
|
|
||||||
# Apply generic hardening, then tune kernel parameters for ML workloads
# (larger socket buffers, less aggressive swapping/writeback) and enable
# GPU persistence mode when an NVIDIA driver is present.
optimize_system() {
    log_info "Optimizing system for ML workloads..."
    hardening_steps

    # Write the tuning to a dedicated sysctl.d drop-in instead of appending
    # to /etc/sysctl.conf: the original append was not idempotent and
    # accumulated duplicate entries on every re-run.
    local sysctl_file="/etc/sysctl.d/99-fetchml-ml.conf"
    cat > "$sysctl_file" << EOF
# ML Optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
EOF

    # Load exactly the file we just wrote.
    sysctl -p "$sysctl_file"

    # Keep the NVIDIA driver loaded between jobs to avoid init latency.
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi -pm 1 || log_warning "Could not enable GPU persistence mode"
    fi

    log_success "System optimized for ML workloads"
}
|
|
||||||
|
|
||||||
# Build Fetch ML from a pre-cloned checkout under $FETCH_ML_HOME and
# install the binaries plus a starter config, owned by $FETCH_ML_USER.
# Returns early (with instructions) when the checkout is missing.
install_fetch_ml() {
    log_info "Installing Fetch ML..."

    # Quote every $FETCH_ML_HOME / $FETCH_ML_USER expansion (shellcheck
    # SC2086): the original unquoted uses would word-split on paths
    # containing whitespace.
    cd "$FETCH_ML_HOME"

    if [[ ! -d "fetch_ml" ]]; then
        # This would be replaced with actual repository URL
        log_warning "Please clone Fetch ML repository manually to $FETCH_ML_HOME/fetch_ml"
        log_info "Example: git clone https://github.com/your-org/fetch_ml.git"
        return
    fi

    cd fetch_ml

    # Build with the Go toolchain installed by install_go.
    export PATH=$PATH:/usr/local/go/bin
    make build

    # Install binaries.
    cp bin/* "$FETCH_ML_HOME/bin/"
    chmod +x "$FETCH_ML_HOME/bin/"*

    # Seed a local config from the shipped example.
    mkdir -p "$FETCH_ML_HOME/configs"
    cp configs/config-local.yaml.example "$FETCH_ML_HOME/configs/config-local.yaml"

    # Hand ownership of the whole tree to the service user.
    chown -R "$FETCH_ML_USER:$FETCH_ML_USER" "$FETCH_ML_HOME"

    log_success "Fetch ML installed"
}
|
|
||||||
|
|
||||||
# Orchestrate the full setup: preconditions, package/runtime installs,
# service account, firewall, systemd units, log rotation, hardening, and
# finally the Fetch ML build. Ends by printing follow-up instructions.
main() {
    log_info "Starting Fetch ML Ubuntu server setup..."

    # Preconditions.
    check_root
    check_ubuntu

    # System packages and runtimes.
    update_system
    install_go
    install_podman
    install_redis
    install_nvidia_drivers
    install_ml_tools

    # Account, network, services, maintenance, hardening (ensure_user,
    # create_directories, setup_logrotate, hardening_steps come from
    # setup_common.sh and are invoked directly here).
    ensure_user
    create_directories
    setup_firewall
    setup_systemd_services
    setup_logrotate
    hardening_steps
    install_fetch_ml

    log_success "Fetch ML setup complete!"
    echo
    log_info "Next steps:"
    echo "1. Clone Fetch ML repository: git clone https://github.com/your-org/fetch_ml.git $FETCH_ML_HOME/fetch_ml"
    echo "2. Configure: $FETCH_ML_HOME/configs/config-local.yaml"
    echo "3. Start services: systemctl start fetch_ml_worker fetch_ml_data_manager"
    echo "4. Check status: systemctl status fetch_ml_worker fetch_ml_data_manager"
    echo "5. View logs: journalctl -u fetch_ml_worker -f"
    echo
    log_info "Services will be available at:"
    local host_ip
    host_ip=$(hostname -I | awk '{print $1}')
    echo "- Worker API: http://$host_ip:8080"
    echo "- Data Manager: http://$host_ip:8081"
}

# Run main function
main "$@"
|
|
||||||
|
|
@ -1,67 +0,0 @@
|
||||||
#!/bin/bash
# Test harness: builds the Go binaries and the Zig CLI, guarantees a Redis
# instance is available, then runs unit, E2E, and smoke tests.
set -e

echo "=== Test Tools Harness ==="

# Reuse a running Redis when one answers PING; otherwise launch a
# throwaway daemonized instance and arrange for it to stop on exit.
ensure_redis() {
    if redis-cli ping >/dev/null 2>&1; then
        echo "Redis is already running"
        return
    fi
    echo "Starting temporary Redis instance..."
    redis-server --daemonize yes --port 6379
    sleep 2
    if ! redis-cli ping >/dev/null 2>&1; then
        echo "Failed to start Redis"
        exit 1
    fi
    echo "Redis started successfully"
    # Set up cleanup trap
    trap 'echo "Stopping temporary Redis..."; redis-cli shutdown || true' EXIT
}

# Step 1: Build Go binaries
echo "Building Go binaries..."
for cmd in api-server worker data_manager user_manager; do
    go build -o "bin/$cmd" "./cmd/$cmd"
done

# Step 2: Build Zig CLI
echo "Building Zig CLI..."
cd cli
zig build
cd ..

# Step 3: Ensure Redis is running
ensure_redis

# Step 4: Run Go tests
echo "Running Go tests..."
go test ./...

# Step 5: Run Zig tests
echo "Running Zig CLI tests..."
cd cli
zig test
cd ..

# Step 6: Run Go E2E tests (Redis is already available)
echo "Running Go E2E tests..."
go test ./tests/e2e/...

# Step 7: Smoke test API server and CLI
echo "Running smoke test..."
# Start API server in background on different port
./bin/api-server -config configs/config.yaml -port 19101 -no-tls > /tmp/api-server.log 2>&1 &
API_PID=$!
sleep 2

# Test CLI status
./cli/zig-out/bin/ml status -server http://localhost:19101

# Clean up
kill $API_PID 2>/dev/null || true

echo "=== All tests completed successfully ==="
|
|
||||||
|
|
@ -5,7 +5,7 @@ Requires=docker.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/cleanup.sh --force
|
ExecStart=/Users/jfraeys/Documents/dev/fetch_ml/scripts/maintenance/cleanup.sh --dry-run
|
||||||
User=jfraeys
|
User=jfraeys
|
||||||
Group=staff
|
Group=staff
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ set -e
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||||
|
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||||
|
|
||||||
# Colors for output
|
# Colors for output
|
||||||
RED='\033[0;31m'
|
RED='\033[0;31m'
|
||||||
|
|
@ -43,22 +44,34 @@ cleanup_benchmark_artifacts() {
|
||||||
|
|
||||||
case "${1:-keep-10}" in
|
case "${1:-keep-10}" in
|
||||||
"all")
|
"all")
|
||||||
print_status "Removing ALL benchmark artifacts..."
|
print_status "Archiving ALL benchmark artifacts..."
|
||||||
rm -rf "$LOCAL_ARTIFACTS_DIR"
|
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
print_success "Removed all artifacts (was $size_before)"
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
|
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
|
||||||
|
print_success "Archived all artifacts (was $size_before)"
|
||||||
;;
|
;;
|
||||||
"keep-5")
|
"keep-5")
|
||||||
print_status "Keeping last 5 runs, removing older ones..."
|
print_status "Keeping last 5 runs, archiving older ones..."
|
||||||
|
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
cd "$LOCAL_ARTIFACTS_DIR"
|
cd "$LOCAL_ARTIFACTS_DIR"
|
||||||
ls -1t run_* 2>/dev/null | tail -n +6 | xargs rm -rf 2>/dev/null || true
|
ls -1t run_* 2>/dev/null | tail -n +6 | while read -r run; do
|
||||||
|
[ -n "$run" ] || continue
|
||||||
|
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||||
|
done
|
||||||
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
||||||
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
||||||
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
||||||
;;
|
;;
|
||||||
"keep-10")
|
"keep-10")
|
||||||
print_status "Keeping last 10 runs, removing older ones..."
|
print_status "Keeping last 10 runs, archiving older ones..."
|
||||||
|
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
cd "$LOCAL_ARTIFACTS_DIR"
|
cd "$LOCAL_ARTIFACTS_DIR"
|
||||||
ls -1t run_* 2>/dev/null | tail -n +11 | xargs rm -rf 2>/dev/null || true
|
ls -1t run_* 2>/dev/null | tail -n +11 | while read -r run; do
|
||||||
|
[ -n "$run" ] || continue
|
||||||
|
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||||
|
done
|
||||||
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
local count_after=$(ls -1d run_* 2>/dev/null | wc -l)
|
||||||
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
local size_after=$(du -sh . 2>/dev/null | cut -f1 || echo "0B")
|
||||||
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
print_success "Cleaned old runs: $count_before → $count_after runs ($size_before → $size_after)"
|
||||||
|
|
@ -80,12 +93,18 @@ cleanup_temp_files() {
|
||||||
# Clean temp directories
|
# Clean temp directories
|
||||||
local temp_cleaned=0
|
local temp_cleaned=0
|
||||||
|
|
||||||
|
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
|
local tmp_archive_dir="$LOCAL_ARTIFACTS_DIR/tmp-archive/$stamp"
|
||||||
|
mkdir -p "$tmp_archive_dir"
|
||||||
|
|
||||||
# /tmp cleanup
|
# /tmp cleanup
|
||||||
if [ -d "/tmp" ]; then
|
if [ -d "/tmp" ]; then
|
||||||
local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
local tmp_files=$(find /tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||||
if [ "$tmp_files" -gt 0 ]; then
|
if [ "$tmp_files" -gt 0 ]; then
|
||||||
find /tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
find /tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
print_success "Cleaned $tmp_files temporary files from /tmp"
|
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
print_success "Archived $tmp_files temporary files from /tmp"
|
||||||
temp_cleaned=$((temp_cleaned + tmp_files))
|
temp_cleaned=$((temp_cleaned + tmp_files))
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
@ -94,8 +113,10 @@ cleanup_temp_files() {
|
||||||
if [ -d "/var/tmp" ]; then
|
if [ -d "/var/tmp" ]; then
|
||||||
local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
local vartmp_files=$(find /var/tmp -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||||
if [ "$vartmp_files" -gt 0 ]; then
|
if [ "$vartmp_files" -gt 0 ]; then
|
||||||
find /var/tmp -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
find /var/tmp -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
print_success "Cleaned $vartmp_files temporary files from /var/tmp"
|
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
print_success "Archived $vartmp_files temporary files from /var/tmp"
|
||||||
temp_cleaned=$((temp_cleaned + vartmp_files))
|
temp_cleaned=$((temp_cleaned + vartmp_files))
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
@ -104,8 +125,10 @@ cleanup_temp_files() {
|
||||||
if [ -d "$HOME/tmp" ]; then
|
if [ -d "$HOME/tmp" ]; then
|
||||||
local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
local user_tmp_files=$(find "$HOME/tmp" -name "benchmark_*" -type f 2>/dev/null | wc -l)
|
||||||
if [ "$user_tmp_files" -gt 0 ]; then
|
if [ "$user_tmp_files" -gt 0 ]; then
|
||||||
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -delete 2>/dev/null || true
|
find "$HOME/tmp" -name "benchmark_*" -type f -mmin +60 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
print_success "Cleaned $user_tmp_files temporary files from ~/tmp"
|
mv "$f" "$tmp_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
print_success "Archived $user_tmp_files temporary files from ~/tmp"
|
||||||
temp_cleaned=$((temp_cleaned + user_tmp_files))
|
temp_cleaned=$((temp_cleaned + user_tmp_files))
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
@ -177,9 +200,16 @@ cleanup_logs() {
|
||||||
for log_dir in "${log_dirs[@]}"; do
|
for log_dir in "${log_dirs[@]}"; do
|
||||||
if [ -d "$log_dir" ]; then
|
if [ -d "$log_dir" ]; then
|
||||||
local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
local log_size_before=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
||||||
# Remove log files older than 7 days
|
local stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
find "$log_dir" -name "*.log" -type f -mtime +7 -delete 2>/dev/null || true
|
local log_archive_dir="$log_dir/archive/$stamp"
|
||||||
find "$log_dir" -name "*.log.*" -type f -mtime +7 -delete 2>/dev/null || true
|
mkdir -p "$log_archive_dir"
|
||||||
|
# Move log files older than 7 days to archive
|
||||||
|
find "$log_dir" -name "*.log" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
|
mv "$f" "$log_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
find "$log_dir" -name "*.log.*" -type f -mtime +7 -print0 2>/dev/null | while IFS= read -r -d '' f; do
|
||||||
|
mv "$f" "$log_archive_dir/" 2>/dev/null || true
|
||||||
|
done
|
||||||
local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
local log_size_after=$(du -sh "$log_dir" 2>/dev/null | cut -f1 || echo "0B")
|
||||||
if [ "$log_size_before" != "$log_size_after" ]; then
|
if [ "$log_size_before" != "$log_size_after" ]; then
|
||||||
print_success "Cleaned old logs in $log_dir: $log_size_before → $log_size_after"
|
print_success "Cleaned old logs in $log_dir: $log_size_before → $log_size_after"
|
||||||
|
|
|
||||||
|
|
@ -144,12 +144,12 @@ else
|
||||||
log_info "No running containers found"
|
log_info "No running containers found"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Remove containers
|
# Remove containers
|
||||||
log_info "Removing containers..."
|
log_info "Removing containers..."
|
||||||
containers=$(docker ps -aq --filter "name=ml-")
|
containers=$(docker ps -aq --filter "name=ml-")
|
||||||
if [ -n "$containers" ]; then
|
if [ -n "$containers" ]; then
|
||||||
if [ "$DRY_RUN" = false ]; then
|
if [ "$DRY_RUN" = false ]; then
|
||||||
echo "$containers" | xargs docker rm -f
|
echo "$containers" | xargs docker rm
|
||||||
log_success "Containers removed"
|
log_success "Containers removed"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
|
@ -168,9 +168,9 @@ else
|
||||||
log_info "No networks found"
|
log_info "No networks found"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Remove volumes (with caution)
|
# Remove volumes (with caution)
|
||||||
log_warning "Removing volumes (this will delete data)..."
|
log_warning "Skipping volumes by default (use --all to remove them)"
|
||||||
if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
|
if [ "$ALL" = true ]; then
|
||||||
volumes=$(docker volume ls -q --filter "name=ml-")
|
volumes=$(docker volume ls -q --filter "name=ml-")
|
||||||
if [ -n "$volumes" ]; then
|
if [ -n "$volumes" ]; then
|
||||||
if [ "$DRY_RUN" = false ]; then
|
if [ "$DRY_RUN" = false ]; then
|
||||||
|
|
@ -181,16 +181,16 @@ if [ "$FORCE" = true ] || [ "$ALL" = true ]; then
|
||||||
log_info "No volumes found"
|
log_info "No volumes found"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
log_info "Skipping volumes (use --force or --all to remove them)"
|
log_info "Skipping volumes"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Remove images if requested
|
# Remove images if requested
|
||||||
if [ "$ALL" = true ]; then
|
if [ "$ALL" = true ]; then
|
||||||
log_info "Removing images..."
|
log_info "Removing images..."
|
||||||
images=$(docker images -q --filter "reference=fetch_ml-*")
|
images=$(docker images -q --filter "reference=fetch_ml-*")
|
||||||
if [ -n "$images" ]; then
|
if [ -n "$images" ]; then
|
||||||
if [ "$DRY_RUN" = false ]; then
|
if [ "$DRY_RUN" = false ]; then
|
||||||
echo "$images" | xargs docker rmi -f
|
echo "$images" | xargs docker rmi
|
||||||
log_success "Images removed"
|
log_success "Images removed"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
|
@ -200,11 +200,15 @@ else
|
||||||
log_info "Skipping images (use --all to remove them)"
|
log_info "Skipping images (use --all to remove them)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# General Docker cleanup
|
# General Docker cleanup
|
||||||
log_info "Running general Docker cleanup..."
|
if [ "$ALL" = true ]; then
|
||||||
if [ "$DRY_RUN" = false ]; then
|
log_info "Running general Docker cleanup (docker system prune)..."
|
||||||
docker system prune -f
|
if [ "$DRY_RUN" = false ]; then
|
||||||
log_success "General cleanup completed"
|
docker system prune -f
|
||||||
|
log_success "General cleanup completed"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log_info "Skipping docker system prune (use --all to enable)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Show final state
|
# Show final state
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ set -e
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||||
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
|
||||||
|
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
|
||||||
|
|
||||||
# Create artifacts directory if it doesn't exist
|
# Create artifacts directory if it doesn't exist
|
||||||
mkdir -p "$LOCAL_ARTIFACTS_DIR"
|
mkdir -p "$LOCAL_ARTIFACTS_DIR"
|
||||||
|
|
@ -41,17 +42,21 @@ case "${1:-help}" in
|
||||||
echo "=== Cleaning Artifacts ==="
|
echo "=== Cleaning Artifacts ==="
|
||||||
case "${2:-all}" in
|
case "${2:-all}" in
|
||||||
"all")
|
"all")
|
||||||
echo "Removing all artifacts..."
|
echo "Archiving all artifacts..."
|
||||||
rm -rf "$LOCAL_ARTIFACTS_DIR"
|
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
echo "All artifacts removed"
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
|
mv "$LOCAL_ARTIFACTS_DIR"/run_* "$ARCHIVE_DIR/$stamp"/ 2>/dev/null || true
|
||||||
|
echo "All artifacts archived"
|
||||||
;;
|
;;
|
||||||
"old")
|
"old")
|
||||||
keep_count="${3:-10}"
|
keep_count="${3:-10}"
|
||||||
echo "Keeping last $keep_count runs, removing older ones..."
|
echo "Keeping last $keep_count runs, archiving older ones..."
|
||||||
|
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
cd "$LOCAL_ARTIFACTS_DIR"
|
cd "$LOCAL_ARTIFACTS_DIR"
|
||||||
ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do
|
ls -1t run_* 2>/dev/null | tail -n +$((keep_count + 1)) | while read -r run; do
|
||||||
echo "Removing: $run"
|
echo "Archiving: $run"
|
||||||
rm -rf "$run"
|
mv "$run" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||||
done
|
done
|
||||||
;;
|
;;
|
||||||
"run")
|
"run")
|
||||||
|
|
@ -64,8 +69,10 @@ case "${1:-help}" in
|
||||||
fi
|
fi
|
||||||
run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id"
|
run_dir="$LOCAL_ARTIFACTS_DIR/run_$run_id"
|
||||||
if [ -d "$run_dir" ]; then
|
if [ -d "$run_dir" ]; then
|
||||||
echo "Removing run: $run_id"
|
echo "Archiving run: $run_id"
|
||||||
rm -rf "$run_dir"
|
stamp=$(date -u +%Y%m%d-%H%M%S)
|
||||||
|
mkdir -p "$ARCHIVE_DIR/$stamp"
|
||||||
|
mv "$run_dir" "$ARCHIVE_DIR/$stamp/" 2>/dev/null || true
|
||||||
else
|
else
|
||||||
echo "Run not found: $run_id"
|
echo "Run not found: $run_id"
|
||||||
fi
|
fi
|
||||||
|
|
|
||||||
|
|
@ -1,169 +0,0 @@
|
||||||
#!/bin/bash

# Secure Homelab Setup Script for Fetch ML
# Generates self-signed TLS certificates, hashed API keys, a hardened
# server config, and local secret files (kept out of version control).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
CONFIG_DIR="$PROJECT_ROOT/configs/environments"
SSL_DIR="$PROJECT_ROOT/ssl"

# 256-bit random key, hex-encoded.
generate_api_key() {
    openssl rand -hex 32
}

# SHA-256 digest of a key; only the digest is stored in the config.
hash_key() {
    echo -n "$1" | sha256sum | cut -d' ' -f1
}

echo "🔒 Setting up secure homelab configuration..."

mkdir -p "$SSL_DIR"

# --- TLS certificates ---------------------------------------------------
echo "📜 Generating TLS certificates..."
if [[ -f "$SSL_DIR/cert.pem" && -f "$SSL_DIR/key.pem" ]]; then
    echo "ℹ️ TLS certificates already exist, skipping generation"
else
    openssl req -x509 -newkey rsa:4096 -keyout "$SSL_DIR/key.pem" -out "$SSL_DIR/cert.pem" -days 365 -nodes \
        -subj "/C=US/ST=Homelab/L=Local/O=FetchML/OU=Homelab/CN=localhost" \
        -addext "subjectAltName=DNS:localhost,DNS:$(hostname),IP:127.0.0.1"
    # Private key readable by owner only; certificate is public.
    chmod 600 "$SSL_DIR/key.pem"
    chmod 644 "$SSL_DIR/cert.pem"
    echo "✅ TLS certificates generated in $SSL_DIR/"
fi

# --- API keys -------------------------------------------------------------
echo "🔑 Generating secure API keys..."
ADMIN_KEY=$(generate_api_key)
USER_KEY=$(generate_api_key)
ADMIN_HASH=$(hash_key "$ADMIN_KEY")
USER_HASH=$(hash_key "$USER_KEY")

# --- Server configuration (stores hashes, never raw keys) ----------------
echo "⚙️ Creating secure configuration..."
cat > "$CONFIG_DIR/config-homelab-secure.yaml" << EOF
# Secure Homelab Configuration
# IMPORTANT: Keep your API keys safe and never share them!

redis:
  url: "redis://localhost:6379"
  max_connections: 10

auth:
  enabled: true
  api_keys:
    homelab_admin:
      hash: $ADMIN_HASH
      admin: true
      roles:
        - admin
      permissions:
        '*': true
    homelab_user:
      hash: $USER_HASH
      admin: false
      roles:
        - researcher
      permissions:
        'experiments': true
        'datasets': true
        'jupyter': true

server:
  address: ":9101"
  tls:
    enabled: true
    cert_file: "$SSL_DIR/cert.pem"
    key_file: "$SSL_DIR/key.pem"

security:
  rate_limit:
    enabled: true
    requests_per_minute: 60
    burst_size: 10
  ip_whitelist:
    - "127.0.0.1"
    - "::1"
    - "localhost"
    - "192.168.1.0/24" # Adjust to your network
    - "10.0.0.0/8"

logging:
  level: "info"
  file: "logs/fetch_ml.log"
  console: true

resources:
  cpu_limit: "2"
  memory_limit: "4Gi"
  gpu_limit: 0
  disk_limit: "10Gi"

# Prometheus metrics
metrics:
  enabled: true
  listen_addr: ":9100"
  tls:
    enabled: false
EOF

# --- Secret files (mode 600, never committed) -----------------------------
echo "🔐 Saving API keys..."
cat > "$PROJECT_ROOT/.api-keys" << EOF
# Fetch ML Homelab API Keys
# IMPORTANT: Keep this file secure and never commit to version control!

ADMIN_API_KEY: $ADMIN_KEY
USER_API_KEY: $USER_KEY

# Usage examples:
# curl -H "X-API-Key: $ADMIN_KEY" https://localhost:9101/health
# curl -H "X-API-Key: $USER_KEY" https://localhost:9101/api/jupyter/services
EOF
chmod 600 "$PROJECT_ROOT/.api-keys"

# JWT secret for the server, delivered via environment file.
JWT_SECRET=$(generate_api_key)
cat > "$PROJECT_ROOT/.env.secure" << EOF
# Secure environment variables for Fetch ML
# IMPORTANT: Keep this file secure and never commit to version control!

JWT_SECRET=$JWT_SECRET

# Source this file before running the server:
# source .env.secure
EOF
chmod 600 "$PROJECT_ROOT/.env.secure"

# --- Git hygiene -----------------------------------------------------------
echo "📝 Updating .gitignore..."
if ! grep -q ".api-keys" "$PROJECT_ROOT/.gitignore"; then
    echo -e "\n# Security files\n.api-keys\n.env.secure\nssl/\n*.pem\n*.key" >> "$PROJECT_ROOT/.gitignore"
fi

echo ""
echo "🎉 Secure homelab setup complete!"
echo ""
echo "📋 Next steps:"
echo "1. Review and adjust the IP whitelist in config-homelab-secure.yaml"
echo "2. Start the server with: ./api-server -config configs/environments/config-homelab-secure.yaml"
echo "3. Source the environment: source .env.secure"
echo "4. Your API keys are saved in .api-keys"
echo ""
echo "🔐 API Keys:"
echo " Admin: $ADMIN_KEY"
echo " User: $USER_KEY"
echo ""
echo "⚠️ IMPORTANT:"
echo " - Never share your API keys"
echo " - Never commit .api-keys or .env.secure to version control"
echo " - Backup your SSL certificates and API keys securely"
echo " - Consider using a password manager for storing keys"
311
scripts/setup.sh
311
scripts/setup.sh
|
|
@ -1,311 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# setup.sh: One-shot homelab setup (security + core services)
|
|
||||||
# Keeps essential security (Fail2Ban, monitoring) while simplifying complexity
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
readonly RED='\033[0;31m'
|
|
||||||
readonly GREEN='\033[0;32m'
|
|
||||||
readonly YELLOW='\033[1;33m'
|
|
||||||
readonly BLUE='\033[0;34m'
|
|
||||||
readonly NC='\033[0m'
|
|
||||||
|
|
||||||
print_info() {
|
|
||||||
echo -e "${BLUE}[INFO]${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
print_success() {
|
|
||||||
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
print_warning() {
|
|
||||||
echo -e "${YELLOW}[WARNING]${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
print_error() {
|
|
||||||
echo -e "${RED}[ERROR]${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Simple dependency check
|
|
||||||
check_deps() {
|
|
||||||
print_info "Checking dependencies..."
|
|
||||||
|
|
||||||
local missing=()
|
|
||||||
|
|
||||||
if ! command -v go &> /dev/null; then
|
|
||||||
missing+=("go")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! command -v zig &> /dev/null; then
|
|
||||||
missing+=("zig")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! command -v redis-server &> /dev/null; then
|
|
||||||
missing+=("redis-server")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! command -v docker &> /dev/null; then
|
|
||||||
missing+=("docker")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ ${#missing[@]} -gt 0 ]]; then
|
|
||||||
print_error "Missing dependencies: ${missing[*]}"
|
|
||||||
echo ""
|
|
||||||
echo "Install with:"
|
|
||||||
echo " macOS: brew install ${missing[*]}"
|
|
||||||
echo " Ubuntu: sudo apt-get install ${missing[*]}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
print_success "Dependencies OK"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Simple setup
|
|
||||||
setup_project() {
|
|
||||||
print_info "Setting up project..."
|
|
||||||
|
|
||||||
# Create essential directories
|
|
||||||
mkdir -p ssl logs configs data monitoring
|
|
||||||
|
|
||||||
# Generate simple SSL cert
|
|
||||||
if [[ ! -f "ssl/cert.pem" ]]; then
|
|
||||||
openssl req -x509 -newkey rsa:2048 -keyout ssl/key.pem -out ssl/cert.pem \
|
|
||||||
-days 365 -nodes -subj "/C=US/ST=State/L=City/O=Homelab/CN=localhost" \
|
|
||||||
-addext "subjectAltName=DNS:localhost,IP:127.0.0.1" 2>/dev/null
|
|
||||||
print_success "SSL certificates generated"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create balanced config
|
|
||||||
cat > configs/config.yaml << 'EOF'
|
|
||||||
base_path: "./data/experiments"
|
|
||||||
|
|
||||||
auth:
|
|
||||||
enabled: true
|
|
||||||
api_keys:
|
|
||||||
homelab_user:
|
|
||||||
hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password"
|
|
||||||
admin: true
|
|
||||||
roles: ["user", "admin"]
|
|
||||||
permissions:
|
|
||||||
read: true
|
|
||||||
write: true
|
|
||||||
delete: true
|
|
||||||
|
|
||||||
server:
|
|
||||||
address: ":9101"
|
|
||||||
tls:
|
|
||||||
enabled: true
|
|
||||||
cert_file: "./ssl/cert.pem"
|
|
||||||
key_file: "./ssl/key.pem"
|
|
||||||
|
|
||||||
security:
|
|
||||||
rate_limit:
|
|
||||||
enabled: true
|
|
||||||
requests_per_minute: 30
|
|
||||||
burst_size: 5
|
|
||||||
ip_whitelist:
|
|
||||||
- "127.0.0.1"
|
|
||||||
- "::1"
|
|
||||||
- "192.168.0.0/16"
|
|
||||||
- "10.0.0.0/8"
|
|
||||||
- "172.16.0.0/12"
|
|
||||||
failed_login_lockout:
|
|
||||||
enabled: true
|
|
||||||
max_attempts: 3
|
|
||||||
lockout_duration: "15m"
|
|
||||||
|
|
||||||
redis:
|
|
||||||
url: "redis://localhost:6379"
|
|
||||||
|
|
||||||
logging:
|
|
||||||
level: "info"
|
|
||||||
file: "./logs/app.log"
|
|
||||||
audit_log: "./logs/audit.log"
|
|
||||||
access_log: "./logs/access.log"
|
|
||||||
|
|
||||||
monitoring:
|
|
||||||
enabled: true
|
|
||||||
metrics_port: 9090
|
|
||||||
health_check_interval: "30s"
|
|
||||||
EOF
|
|
||||||
|
|
||||||
print_success "Configuration created"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Simple build
|
|
||||||
build_project() {
|
|
||||||
print_info "Building project..."
|
|
||||||
|
|
||||||
# Build Go apps
|
|
||||||
go build -o bin/api-server ./cmd/api-server
|
|
||||||
go build -o bin/worker ./cmd/worker
|
|
||||||
go build -o bin/tui ./cmd/tui
|
|
||||||
|
|
||||||
# Build Zig CLI
|
|
||||||
cd cli && zig build && cd ..
|
|
||||||
|
|
||||||
print_success "Build completed"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Setup Fail2Ban
|
|
||||||
setup_fail2ban() {
|
|
||||||
print_info "Setting up Fail2Ban..."
|
|
||||||
|
|
||||||
if ! command -v fail2ban-server &> /dev/null; then
|
|
||||||
print_warning "Fail2Ban not installed, skipping..."
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Create Fail2Ban configuration
|
|
||||||
sudo mkdir -p /etc/fail2ban/jail.d 2>/dev/null || true
|
|
||||||
|
|
||||||
cat > /tmp/ml-experiments-jail.conf << 'EOF'
|
|
||||||
[DEFAULT]
|
|
||||||
bantime = 3600
|
|
||||||
findtime = 600
|
|
||||||
maxretry = 3
|
|
||||||
backend = systemd
|
|
||||||
|
|
||||||
[sshd]
|
|
||||||
enabled = true
|
|
||||||
port = ssh
|
|
||||||
logpath = /var/log/auth.log
|
|
||||||
maxretry = 3
|
|
||||||
|
|
||||||
[ml-experiments-api]
|
|
||||||
enabled = true
|
|
||||||
port = 9101
|
|
||||||
filter = ml-experiments-api
|
|
||||||
logpath = ./logs/audit.log
|
|
||||||
maxretry = 5
|
|
||||||
bantime = 7200
|
|
||||||
|
|
||||||
[ml-experiments-auth]
|
|
||||||
enabled = true
|
|
||||||
filter = ml-experiments-auth
|
|
||||||
logpath = ./logs/audit.log
|
|
||||||
maxretry = 3
|
|
||||||
bantime = 3600
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Create filter definitions
|
|
||||||
cat > /tmp/ml-experiments-api.conf << 'EOF'
|
|
||||||
[Definition]
|
|
||||||
failregex = ^.*<HOST>.*"status":40[13].*$
|
|
||||||
ignoreregex =
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat > /tmp/ml-experiments-auth.conf << 'EOF'
|
|
||||||
[Definition]
|
|
||||||
failregex = ^.*"event":"failed_login".*"client_ip":"<HOST>".*$
|
|
||||||
ignoreregex =
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Try to install configurations
|
|
||||||
if sudo cp /tmp/ml-experiments-jail.conf /etc/fail2ban/jail.d/ 2>/dev/null; then
|
|
||||||
sudo cp /tmp/ml-experiments-*.conf /etc/fail2ban/filter.d/ 2>/dev/null || true
|
|
||||||
sudo systemctl restart fail2ban 2>/dev/null || true
|
|
||||||
print_success "Fail2Ban configured"
|
|
||||||
else
|
|
||||||
print_warning "Could not configure Fail2Ban (requires sudo)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
rm -f /tmp/ml-experiments-*.conf
|
|
||||||
}
|
|
||||||
|
|
||||||
# Setup Redis
|
|
||||||
setup_redis() {
|
|
||||||
print_info "Setting up Redis..."
|
|
||||||
|
|
||||||
if ! pgrep -f "redis-server" > /dev/null; then
|
|
||||||
redis-server --daemonize yes --port 6379
|
|
||||||
print_success "Redis started"
|
|
||||||
else
|
|
||||||
print_info "Redis already running"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create simple management script
|
|
||||||
create_manage_script() {
|
|
||||||
cat > manage.sh << 'EOF'
|
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Simple management script
|
|
||||||
|
|
||||||
case "${1:-status}" in
|
|
||||||
"start")
|
|
||||||
echo "Starting services..."
|
|
||||||
redis-server --daemonize yes --port 6379 2>/dev/null || true
|
|
||||||
./bin/api-server -config configs/config.yaml &
|
|
||||||
echo "Services started"
|
|
||||||
;;
|
|
||||||
"stop")
|
|
||||||
echo "Stopping services..."
|
|
||||||
pkill -f "api-server" || true
|
|
||||||
redis-cli shutdown 2>/dev/null || true
|
|
||||||
echo "Services stopped"
|
|
||||||
;;
|
|
||||||
"status")
|
|
||||||
echo "=== Status ==="
|
|
||||||
if pgrep -f "redis-server" > /dev/null; then
|
|
||||||
echo "✅ Redis: Running"
|
|
||||||
else
|
|
||||||
echo "❌ Redis: Stopped"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if pgrep -f "api-server" > /dev/null; then
|
|
||||||
echo "✅ API Server: Running"
|
|
||||||
else
|
|
||||||
echo "❌ API Server: Stopped"
|
|
||||||
fi
|
|
||||||
;;
|
|
||||||
"logs")
|
|
||||||
echo "=== Recent Logs ==="
|
|
||||||
tail -20 logs/app.log 2>/dev/null || echo "No logs yet"
|
|
||||||
;;
|
|
||||||
"test")
|
|
||||||
echo "=== Testing ==="
|
|
||||||
curl -k -s https://localhost:9101/health || echo "API server not responding"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Usage: $0 {start|stop|status|logs|test}"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
EOF
|
|
||||||
|
|
||||||
chmod +x manage.sh
|
|
||||||
print_success "Management script created"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Show next steps
|
|
||||||
show_next_steps() {
|
|
||||||
print_success "Setup completed!"
|
|
||||||
echo ""
|
|
||||||
echo "🎉 Setup complete!"
|
|
||||||
echo ""
|
|
||||||
echo "Next steps:"
|
|
||||||
echo " 1. Start services: ./tools/manage.sh start"
|
|
||||||
echo " 2. Check status: ./tools/manage.sh status"
|
|
||||||
echo " 3. Test API: curl -k -H 'X-API-Key: password' https://localhost:9101/health"
|
|
||||||
echo ""
|
|
||||||
echo "Configuration: configs/config.yaml"
|
|
||||||
echo "Logs: logs/app.log and logs/audit.log"
|
|
||||||
echo ""
|
|
||||||
print_success "Ready for homelab use!"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Main setup
|
|
||||||
main() {
|
|
||||||
echo "ML Experiment Manager - Homelab Setup"
|
|
||||||
echo "====================================="
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
check_deps
|
|
||||||
setup_project
|
|
||||||
build_project
|
|
||||||
setup_redis
|
|
||||||
create_manage_script
|
|
||||||
show_next_steps
|
|
||||||
}
|
|
||||||
|
|
||||||
main "$@"
|
|
||||||
62
scripts/setup_monitoring.py
Normal file
62
scripts/setup_monitoring.py
Normal file
|
|
@ -0,0 +1,62 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Create monitoring directory structure
|
||||||
|
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
||||||
|
monitoring_dir = os.path.join(repo_root, 'monitoring')
|
||||||
|
grafana_dir = os.path.join(monitoring_dir, 'grafana')
|
||||||
|
|
||||||
|
datasources_dir = os.path.join(grafana_dir, 'provisioning', 'datasources')
|
||||||
|
providers_dir = os.path.join(grafana_dir, 'provisioning', 'dashboards')
|
||||||
|
|
||||||
|
os.makedirs(datasources_dir, exist_ok=True)
|
||||||
|
os.makedirs(providers_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Essential datasource configurations
|
||||||
|
datasources = {
|
||||||
|
'prometheus.yml': """apiVersion: 1
|
||||||
|
datasources:
|
||||||
|
- name: Prometheus
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: http://prometheus:9090
|
||||||
|
isDefault: true
|
||||||
|
editable: true
|
||||||
|
jsonData:
|
||||||
|
timeInterval: "5s"
|
||||||
|
""",
|
||||||
|
'loki.yml': """apiVersion: 1
|
||||||
|
datasources:
|
||||||
|
- name: Loki
|
||||||
|
type: loki
|
||||||
|
access: proxy
|
||||||
|
url: http://loki:3100
|
||||||
|
editable: true
|
||||||
|
jsonData:
|
||||||
|
maxLines: 1000
|
||||||
|
""",
|
||||||
|
'dashboards.yml': """apiVersion: 1
|
||||||
|
providers:
|
||||||
|
- name: 'default'
|
||||||
|
orgId: 1
|
||||||
|
folder: ''
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
updateIntervalSeconds: 10
|
||||||
|
allowUiUpdates: true
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
# Write configuration files
|
||||||
|
for filename, content in datasources.items():
|
||||||
|
if filename == 'dashboards.yml':
|
||||||
|
path = os.path.join(providers_dir, filename)
|
||||||
|
else:
|
||||||
|
path = os.path.join(datasources_dir, filename)
|
||||||
|
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
f.write(content)
|
||||||
|
|
||||||
|
print("Monitoring setup completed!")
|
||||||
111
scripts/smoke-test.sh
Normal file
111
scripts/smoke-test.sh
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
set -euo pipefail;
|
||||||
|
|
||||||
|
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||||
|
export FETCHML_REPO_ROOT="$repo_root"
|
||||||
|
|
||||||
|
env="${1:-dev}";
|
||||||
|
if [ "$env" != "dev" ] && [ "$env" != "prod" ]; then
|
||||||
|
echo "usage: $0 [dev|prod]" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
probe_https_health_openssl() {
|
||||||
|
host="$1"
|
||||||
|
port="$2"
|
||||||
|
path="$3"
|
||||||
|
|
||||||
|
req="GET ${path} HTTP/1.1\r\nHost: ${host}\r\nConnection: close\r\n\r\n"
|
||||||
|
resp=$(printf "%b" "$req" | openssl s_client -connect "127.0.0.1:${port}" -servername "${host}" -tls1_2 -quiet 2>/dev/null || true)
|
||||||
|
printf "%s" "$resp" | tr -d '\r' | head -n 1 | grep -Eq '^HTTP/1\.[01] 200'
|
||||||
|
}
|
||||||
|
|
||||||
|
compose_cmd="docker-compose";
|
||||||
|
if ! command -v docker-compose >/dev/null 2>&1; then
|
||||||
|
compose_cmd="docker compose";
|
||||||
|
fi
|
||||||
|
|
||||||
|
compose_files=()
|
||||||
|
compose_project_args=("--project-directory" "$repo_root")
|
||||||
|
api_base=""
|
||||||
|
prometheus_base=""
|
||||||
|
stack_name=""
|
||||||
|
|
||||||
|
if [ "$env" = "dev" ]; then
|
||||||
|
mkdir -p \
|
||||||
|
"$repo_root/data/dev/redis" \
|
||||||
|
"$repo_root/data/dev/minio" \
|
||||||
|
"$repo_root/data/dev/prometheus" \
|
||||||
|
"$repo_root/data/dev/grafana" \
|
||||||
|
"$repo_root/data/dev/loki" \
|
||||||
|
"$repo_root/data/dev/logs" \
|
||||||
|
"$repo_root/data/dev/experiments" \
|
||||||
|
"$repo_root/data/dev/active" \
|
||||||
|
"$repo_root/data/dev/workspaces"
|
||||||
|
|
||||||
|
stack_name="dev"
|
||||||
|
compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml")
|
||||||
|
api_base="https://localhost:9101"
|
||||||
|
if ! curl -skf "$api_base/health" >/dev/null 2>&1; then
|
||||||
|
api_base="http://localhost:9101"
|
||||||
|
fi
|
||||||
|
prometheus_base="http://localhost:9090"
|
||||||
|
else
|
||||||
|
mkdir -p \
|
||||||
|
"$repo_root/data/prod-smoke/caddy/data" \
|
||||||
|
"$repo_root/data/prod-smoke/caddy/config" \
|
||||||
|
"$repo_root/data/prod-smoke/redis" \
|
||||||
|
"$repo_root/data/prod-smoke/logs" \
|
||||||
|
"$repo_root/data/prod-smoke/experiments" \
|
||||||
|
"$repo_root/data/prod-smoke/active"
|
||||||
|
|
||||||
|
stack_name="prod"
|
||||||
|
compose_files=("-f" "$repo_root/deployments/docker-compose.prod.smoke.yml")
|
||||||
|
api_base="https://localhost:8443"
|
||||||
|
export FETCHML_DOMAIN=localhost
|
||||||
|
export CADDY_EMAIL=smoke@example.invalid
|
||||||
|
fi
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
status=$?;
|
||||||
|
if [ "$status" -ne 0 ]; then
|
||||||
|
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" logs --no-color || true;
|
||||||
|
fi
|
||||||
|
if [ "${KEEP_STACK:-0}" != "1" ]; then
|
||||||
|
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" down -v >/dev/null 2>&1 || true;
|
||||||
|
fi
|
||||||
|
exit "$status";
|
||||||
|
}
|
||||||
|
|
||||||
|
trap cleanup EXIT;
|
||||||
|
echo "Starting $stack_name stack for smoke test...";
|
||||||
|
|
||||||
|
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null;
|
||||||
|
echo "Waiting for API to become healthy...";
|
||||||
|
|
||||||
|
deadline=$(($(date +%s) + 90));
|
||||||
|
while true; do
|
||||||
|
if [ "$env" = "dev" ]; then
|
||||||
|
if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi;
|
||||||
|
else
|
||||||
|
if probe_https_health_openssl "localhost" "8443" "/health"; then break; fi;
|
||||||
|
fi
|
||||||
|
if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for $api_base/health"; exit 1; fi;
|
||||||
|
sleep 2;
|
||||||
|
done;
|
||||||
|
|
||||||
|
if [ "$env" = "dev" ]; then
|
||||||
|
echo "Checking metrics endpoint...";
|
||||||
|
curl -skf "$api_base/metrics" >/dev/null;
|
||||||
|
|
||||||
|
echo "Waiting for Prometheus target api-server to be up...";
|
||||||
|
deadline=$(($(date +%s) + 90));
|
||||||
|
query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D";
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
resp=$(curl -sf "$query_url" || true);
|
||||||
|
resp_compact=$(printf "%s" "$resp" | tr -d '\n' | tr -d '\r');
|
||||||
|
if echo "$resp_compact" | grep -Fq '"instance":"api-server:9101"' && echo "$resp_compact" | grep -Fq ',"1"]'; then break; fi;
|
||||||
|
if [ $(date +%s) -ge $deadline ]; then echo "Timed out waiting for Prometheus api-server target to be up"; echo "$resp"; exit 1; fi;
|
||||||
|
sleep 2;
|
||||||
|
done;
|
||||||
|
fi
|
||||||
|
|
@ -1,80 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Homelab Secure Test Environment Script
|
|
||||||
set -e
|
|
||||||
|
|
||||||
echo "Starting Homelab Secure Production Environment..."
|
|
||||||
|
|
||||||
# Clean up any existing containers
|
|
||||||
echo "Cleaning up existing containers..."
|
|
||||||
docker-compose -f deployments/docker-compose.homelab-secure.yml down -v
|
|
||||||
|
|
||||||
# Create necessary directories with proper permissions
|
|
||||||
echo "Creating directories..."
|
|
||||||
mkdir -p data logs
|
|
||||||
chmod 750 data logs
|
|
||||||
|
|
||||||
# Build and start services
|
|
||||||
echo "Building and starting services..."
|
|
||||||
docker-compose -f deployments/docker-compose.homelab-secure.yml up --build -d
|
|
||||||
|
|
||||||
# Wait for services to be healthy
|
|
||||||
echo "Waiting for services to be healthy..."
|
|
||||||
sleep 20
|
|
||||||
|
|
||||||
# Check service health
|
|
||||||
echo "Checking service health..."
|
|
||||||
docker-compose -f deployments/docker-compose.homelab-secure.yml ps
|
|
||||||
|
|
||||||
# Test API server with TLS
|
|
||||||
echo "Testing API server..."
|
|
||||||
curl -k -s https://localhost:9104/health || echo "API health check failed"
|
|
||||||
|
|
||||||
# Test Redis with authentication
|
|
||||||
echo "Testing Redis with authentication..."
|
|
||||||
docker exec ml-homelab-redis redis-cli -a "HomelabRedis2024!" ping || echo "Redis health check failed"
|
|
||||||
|
|
||||||
# Test SSH connectivity with security
|
|
||||||
echo "Testing SSH connectivity..."
|
|
||||||
docker exec -u worker ml-homelab-worker ssh -o StrictHostKeyChecking=no -o Port=2222 worker@localhost "echo 'SSH OK'" || echo "SSH test failed"
|
|
||||||
|
|
||||||
# Test fail2ban status
|
|
||||||
echo "Testing fail2ban..."
|
|
||||||
docker exec ml-homelab-api fail2ban-client status sshd || echo "fail2ban check failed"
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Homelab secure production environment is ready!"
|
|
||||||
echo ""
|
|
||||||
echo "Services:"
|
|
||||||
echo " - API Server: https://localhost:9104"
|
|
||||||
echo " - SSH: localhost:2223 (worker user)"
|
|
||||||
echo " - Redis: localhost:6379 (with password)"
|
|
||||||
echo " - Metrics: http://localhost:9101"
|
|
||||||
echo ""
|
|
||||||
echo "Security Features:"
|
|
||||||
echo " ✓ Strong TLS 1.3 with modern ciphers"
|
|
||||||
echo " ✓ SSH with fail2ban protection"
|
|
||||||
echo " ✓ Redis with password authentication"
|
|
||||||
echo " ✓ SQLite database with encryption"
|
|
||||||
echo " ✓ Container security hardening"
|
|
||||||
echo " ✓ Rate limiting and CORS protection"
|
|
||||||
echo " ✓ Security headers and CSRF protection"
|
|
||||||
echo " ✓ Podman sandboxed job execution"
|
|
||||||
echo " ✓ Audit logging and monitoring"
|
|
||||||
echo ""
|
|
||||||
echo "Credentials:"
|
|
||||||
echo " - API User: homelab_user / password"
|
|
||||||
echo " - SSH User: worker / HomelabWorker2024!"
|
|
||||||
echo " - Redis Password: HomelabRedis2024!"
|
|
||||||
echo ""
|
|
||||||
echo "To test with CLI:"
|
|
||||||
echo " ./cli/zig-out/bin/ml queue homelab-secure-test"
|
|
||||||
echo " ./cli/zig-out/bin/ml status"
|
|
||||||
echo ""
|
|
||||||
echo "To view logs:"
|
|
||||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f api-server"
|
|
||||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml logs -f worker"
|
|
||||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"
|
|
||||||
echo ""
|
|
||||||
echo "To stop:"
|
|
||||||
echo " docker-compose -f deployments/docker-compose.homelab-secure.yml down"
|
|
||||||
64
scripts/track_performance.sh
Executable file
64
scripts/track_performance.sh
Executable file
|
|
@ -0,0 +1,64 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Simple performance tracking script
|
||||||
|
|
||||||
|
RESULTS_DIR="test_results/performance"
|
||||||
|
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
|
||||||
|
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
|
||||||
|
|
||||||
|
mkdir -p "$RESULTS_DIR"
|
||||||
|
|
||||||
|
echo "Running load test performance tracking..."
|
||||||
|
echo "Timestamp: $TIMESTAMP"
|
||||||
|
|
||||||
|
# Run tests and capture results
|
||||||
|
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RESULTS_DIR/raw_$TIMESTAMP.log"
|
||||||
|
|
||||||
|
# Extract key metrics
|
||||||
|
{
|
||||||
|
echo "{"
|
||||||
|
echo " \"timestamp\": \"$TIMESTAMP\","
|
||||||
|
echo " \"tests\": ["
|
||||||
|
|
||||||
|
# Parse light load
|
||||||
|
LIGHT_RPS=$(grep -A1 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
|
||||||
|
LIGHT_ERROR=$(grep -A2 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
|
||||||
|
LIGHT_P99=$(grep -A4 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
|
||||||
|
|
||||||
|
echo " {"
|
||||||
|
echo " \"name\": \"LightLoad\","
|
||||||
|
echo " \"throughput_rps\": $LIGHT_RPS,"
|
||||||
|
echo " \"error_rate_percent\": $LIGHT_ERROR,"
|
||||||
|
echo " \"p99_latency_ms\": \"$LIGHT_P99\""
|
||||||
|
echo " },"
|
||||||
|
|
||||||
|
# Parse medium load
|
||||||
|
MEDIUM_RPS=$(grep -A1 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
|
||||||
|
MEDIUM_ERROR=$(grep -A2 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
|
||||||
|
MEDIUM_P99=$(grep -A4 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
|
||||||
|
|
||||||
|
echo " {"
|
||||||
|
echo " \"name\": \"MediumLoad\","
|
||||||
|
echo " \"throughput_rps\": $MEDIUM_RPS,"
|
||||||
|
echo " \"error_rate_percent\": $MEDIUM_ERROR,"
|
||||||
|
echo " \"p99_latency_ms\": \"$MEDIUM_P99\""
|
||||||
|
echo " }"
|
||||||
|
echo " ]"
|
||||||
|
echo "}"
|
||||||
|
} > "$RESULTS_FILE"
|
||||||
|
|
||||||
|
echo "Results saved to: $RESULTS_FILE"
|
||||||
|
echo "Raw logs: $RESULTS_DIR/raw_$TIMESTAMP.log"
|
||||||
|
|
||||||
|
# Show comparison with previous run if exists
|
||||||
|
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json | sed -n '2p')
|
||||||
|
if [ -n "$PREV_FILE" ]; then
|
||||||
|
echo ""
|
||||||
|
echo "=== Comparison with previous run ==="
|
||||||
|
echo "Previous: $(basename $PREV_FILE)"
|
||||||
|
echo "Current: $(basename $RESULTS_FILE)"
|
||||||
|
echo ""
|
||||||
|
echo "Light Load Throughput:"
|
||||||
|
echo " Previous: $(jq -r '.tests[0].throughput_rps' "$PREV_FILE") RPS"
|
||||||
|
echo " Current: $(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") RPS"
|
||||||
|
echo " Change: $(echo "$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") - $(jq -r '.tests[0].throughput_rps' "$PREV_FILE")" | bc -l) RPS"
|
||||||
|
fi
|
||||||
|
|
@ -1,204 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
# Production Configuration Validator
|
|
||||||
# Verifies all paths and configs are consistent for experiment lifecycle
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
BOLD='\033[1m'
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
RED='\033[0;31m'
|
|
||||||
YELLOW='\033[1;33m'
|
|
||||||
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
echo -e "${BOLD}=== FetchML Production Configuration Validator ===${NC}\n"
|
|
||||||
|
|
||||||
# Configuration file paths
|
|
||||||
API_CONFIG="${1:-configs/config-prod.yaml}"
|
|
||||||
WORKER_CONFIG="${2:-configs/worker-prod.toml}"
|
|
||||||
|
|
||||||
errors=0
|
|
||||||
warnings=0
|
|
||||||
|
|
||||||
check_pass() {
|
|
||||||
echo -e "${GREEN}✓${NC} $1"
|
|
||||||
}
|
|
||||||
|
|
||||||
check_fail() {
|
|
||||||
echo -e "${RED}✗${NC} $1"
|
|
||||||
((errors++))
|
|
||||||
}
|
|
||||||
|
|
||||||
check_warn() {
|
|
||||||
echo -e "${YELLOW}⚠${NC} $1"
|
|
||||||
((warnings++))
|
|
||||||
}
|
|
||||||
|
|
||||||
# 1. Check API server config exists
|
|
||||||
echo -e "${BOLD}Checking API Server Configuration${NC}"
|
|
||||||
if [ ! -f "$API_CONFIG" ]; then
|
|
||||||
check_fail "API config not found: $API_CONFIG"
|
|
||||||
else
|
|
||||||
check_pass "API config found: $API_CONFIG"
|
|
||||||
|
|
||||||
# Extract base_path from API config
|
|
||||||
API_BASE_PATH=$(grep 'base_path:' "$API_CONFIG" | head -1 | awk '{print $2}' | tr -d '"')
|
|
||||||
echo " Base path: $API_BASE_PATH"
|
|
||||||
|
|
||||||
# Check if path is absolute
|
|
||||||
if [[ "$API_BASE_PATH" != /* ]]; then
|
|
||||||
check_fail "base_path must be absolute: $API_BASE_PATH"
|
|
||||||
else
|
|
||||||
check_pass "base_path is absolute"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check Redis config
|
|
||||||
if grep -q 'redis:' "$API_CONFIG"; then
|
|
||||||
check_pass "Redis configuration present"
|
|
||||||
else
|
|
||||||
check_fail "Redis configuration missing"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check auth enabled
|
|
||||||
if grep -q 'enabled: true' "$API_CONFIG"; then
|
|
||||||
check_pass "Authentication enabled"
|
|
||||||
else
|
|
||||||
check_warn "Authentication disabled (not recommended for production)"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# 2. Check Worker config (if provided)
|
|
||||||
if [ -f "$WORKER_CONFIG" ]; then
|
|
||||||
echo -e "${BOLD}Checking Worker Configuration${NC}"
|
|
||||||
check_pass "Worker config found: $WORKER_CONFIG"
|
|
||||||
|
|
||||||
# Extract base_path from worker config
|
|
||||||
WORKER_BASE_PATH=$(grep 'base_path' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
|
||||||
echo " Base path: $WORKER_BASE_PATH"
|
|
||||||
|
|
||||||
# Compare paths
|
|
||||||
if [ "$API_BASE_PATH" = "$WORKER_BASE_PATH" ]; then
|
|
||||||
check_pass "API and Worker base_path match"
|
|
||||||
else
|
|
||||||
check_fail "base_path mismatch! API: $API_BASE_PATH, Worker: $WORKER_BASE_PATH"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check podman_image configured
|
|
||||||
if grep -q 'podman_image' "$WORKER_CONFIG"; then
|
|
||||||
PODMAN_IMAGE=$(grep 'podman_image' "$WORKER_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
|
||||||
check_pass "Podman image configured: $PODMAN_IMAGE"
|
|
||||||
else
|
|
||||||
check_fail "podman_image not configured"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
check_warn "Worker config not found: $WORKER_CONFIG (optional for API server only)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# 3. Check directory structure (if base_path exists)
|
|
||||||
if [ -n "$API_BASE_PATH" ] && [ -d "$API_BASE_PATH" ]; then
|
|
||||||
echo -e "${BOLD}Checking Directory Structure${NC}"
|
|
||||||
check_pass "Base directory exists: $API_BASE_PATH"
|
|
||||||
|
|
||||||
# Check subdirectories
|
|
||||||
for dir in experiments pending running finished failed; do
|
|
||||||
if [ -d "$API_BASE_PATH/$dir" ]; then
|
|
||||||
check_pass "$dir/ directory exists"
|
|
||||||
else
|
|
||||||
check_warn "$dir/ directory missing (will be created automatically)"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# Check permissions
|
|
||||||
if [ -w "$API_BASE_PATH" ]; then
|
|
||||||
check_pass "Base directory is writable"
|
|
||||||
else
|
|
||||||
check_fail "Base directory is not writable (check permissions)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
elif [ -n "$API_BASE_PATH" ]; then
|
|
||||||
check_warn "Base directory does not exist: $API_BASE_PATH (will need to be created)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# 4. Check Redis connectivity (if server is running)
|
|
||||||
echo -e "${BOLD}Checking Redis Connectivity${NC}"
|
|
||||||
if command -v redis-cli &> /dev/null; then
|
|
||||||
if redis-cli ping &> /dev/null; then
|
|
||||||
check_pass "Redis server is running and accessible"
|
|
||||||
|
|
||||||
# Check queue
|
|
||||||
QUEUE_SIZE=$(redis-cli llen fetchml:tasks:queue 2>/dev/null || echo "0")
|
|
||||||
echo " Queue size: $QUEUE_SIZE tasks"
|
|
||||||
else
|
|
||||||
check_warn "Redis server not accessible (start with: redis-server)"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
check_warn "redis-cli not installed (cannot verify Redis connectivity)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# 5. Check Podman (if worker config exists)
|
|
||||||
if [ -f "$WORKER_CONFIG" ]; then
|
|
||||||
echo -e "${BOLD}Checking Podman${NC}"
|
|
||||||
if command -v podman &> /dev/null; then
|
|
||||||
check_pass "Podman is installed"
|
|
||||||
|
|
||||||
# Check if image exists
|
|
||||||
if [ -n "$PODMAN_IMAGE" ]; then
|
|
||||||
if podman image exists "$PODMAN_IMAGE" 2>/dev/null; then
|
|
||||||
check_pass "Podman image exists: $PODMAN_IMAGE"
|
|
||||||
else
|
|
||||||
check_warn "Podman image not found: $PODMAN_IMAGE (needs to be built)"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check GPU access (if configured)
|
|
||||||
if grep -q 'gpu_access.*true' "$WORKER_CONFIG" 2>/dev/null; then
|
|
||||||
if podman run --rm --device nvidia.com/gpu=all nvidia/cuda:11.8.0-base nvidia-smi &>/dev/null; then
|
|
||||||
check_pass "GPU access working"
|
|
||||||
else
|
|
||||||
check_warn "GPU access configured but not working (check nvidia-container-toolkit)"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
check_fail "Podman not installed (required for worker)"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# 6. Check CLI config consistency
|
|
||||||
echo -e "${BOLD}Checking CLI Configuration${NC}"
|
|
||||||
CLI_CONFIG="$HOME/.ml/config.toml"
|
|
||||||
if [ -f "$CLI_CONFIG" ]; then
|
|
||||||
check_pass "CLI config found: $CLI_CONFIG"
|
|
||||||
|
|
||||||
CLI_BASE=$(grep 'worker_base' "$CLI_CONFIG" | awk -F '=' '{print $2}' | tr -d ' "')
|
|
||||||
if [ "$CLI_BASE" = "$API_BASE_PATH" ]; then
|
|
||||||
check_pass "CLI worker_base matches server base_path"
|
|
||||||
else
|
|
||||||
check_warn "CLI worker_base ($CLI_BASE) differs from server ($API_BASE_PATH)"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
check_warn "CLI config not found (run: ml init)"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Summary
|
|
||||||
echo -e "${BOLD}=== Summary ===${NC}"
|
|
||||||
if [ $errors -eq 0 ] && [ $warnings -eq 0 ]; then
|
|
||||||
echo -e "${GREEN}All checks passed! Configuration is ready for production.${NC}"
|
|
||||||
exit 0
|
|
||||||
elif [ $errors -eq 0 ]; then
|
|
||||||
echo -e "${YELLOW}Configuration has $warnings warning(s). Review before deployment.${NC}"
|
|
||||||
exit 0
|
|
||||||
else
|
|
||||||
echo -e "${RED}Configuration has $errors error(s) and $warnings warning(s). Fix before deployment.${NC}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
148
scripts/verify_release.sh
Normal file
148
scripts/verify_release.sh
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
cat <<'EOF'
|
||||||
|
Usage:
|
||||||
|
scripts/verify_release.sh --dir <release_dir> [--repo <org>/<repo>]
|
||||||
|
|
||||||
|
What it does:
|
||||||
|
- Verifies checksums.txt signature (keyless cosign) if cosign + checksums.txt.sig/.cert are present
|
||||||
|
- Verifies *.tar.gz files against checksums.txt
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- --repo enables strict Sigstore identity checking against the release workflow.
|
||||||
|
- Without cosign, the script still verifies SHA256 hashes.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
scripts/verify_release.sh --dir ./release --repo jfraeys/fetch_ml
|
||||||
|
scripts/verify_release.sh --dir .
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
release_dir=""
|
||||||
|
repo=""
|
||||||
|
|
||||||
|
while [[ $# -gt 0 ]]; do
|
||||||
|
case "$1" in
|
||||||
|
--dir)
|
||||||
|
release_dir="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--repo)
|
||||||
|
repo="${2:-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-h|--help)
|
||||||
|
usage
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown argument: $1" >&2
|
||||||
|
usage >&2
|
||||||
|
exit 2
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ -z "$release_dir" ]]; then
|
||||||
|
echo "missing --dir" >&2
|
||||||
|
usage >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -d "$release_dir" ]]; then
|
||||||
|
echo "directory not found: $release_dir" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd "$release_dir"
|
||||||
|
|
||||||
|
if [[ ! -f checksums.txt ]]; then
|
||||||
|
echo "missing checksums.txt in $release_dir" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
has_cosign=false
|
||||||
|
if command -v cosign >/dev/null 2>&1; then
|
||||||
|
has_cosign=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
verify_sigstore() {
|
||||||
|
if [[ ! -f checksums.txt.sig ]] || [[ ! -f checksums.txt.cert ]]; then
|
||||||
|
echo "[verify] cosign available, but checksums.txt.sig/.cert not found; skipping signature verification" >&2
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -z "$repo" ]]; then
|
||||||
|
echo "[verify] verifying signature (no repo identity pin; pass --repo to pin identity)" >&2
|
||||||
|
COSIGN_YES=true cosign verify-blob \
|
||||||
|
--certificate checksums.txt.cert \
|
||||||
|
--signature checksums.txt.sig \
|
||||||
|
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
|
||||||
|
checksums.txt >/dev/null
|
||||||
|
echo "[ok] checksums.txt signature verified (un-pinned identity)"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
local identity
|
||||||
|
identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$"
|
||||||
|
|
||||||
|
COSIGN_YES=true cosign verify-blob \
|
||||||
|
--certificate checksums.txt.cert \
|
||||||
|
--signature checksums.txt.sig \
|
||||||
|
--certificate-identity-regexp "$identity" \
|
||||||
|
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
|
||||||
|
checksums.txt >/dev/null
|
||||||
|
|
||||||
|
echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)"
|
||||||
|
}
|
||||||
|
|
||||||
|
verify_hashes() {
|
||||||
|
local failures=0
|
||||||
|
|
||||||
|
local has_sha256sum=false
|
||||||
|
if command -v sha256sum >/dev/null 2>&1; then
|
||||||
|
has_sha256sum=true
|
||||||
|
fi
|
||||||
|
|
||||||
|
while IFS= read -r expected file; do
|
||||||
|
[[ -z "${expected}" ]] && continue
|
||||||
|
[[ -z "${file}" ]] && continue
|
||||||
|
|
||||||
|
if [[ ! -f "$file" ]]; then
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
local actual
|
||||||
|
if [[ "$has_sha256sum" == true ]]; then
|
||||||
|
actual="$(sha256sum "$file" | awk '{print $1}')"
|
||||||
|
else
|
||||||
|
actual="$(shasum -a 256 "$file" | awk '{print $1}')"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$actual" != "$expected" ]]; then
|
||||||
|
echo "[fail] $file" >&2
|
||||||
|
echo " expected: $expected" >&2
|
||||||
|
echo " actual: $actual" >&2
|
||||||
|
failures=$((failures+1))
|
||||||
|
fi
|
||||||
|
done < <(awk '{print $1, $2}' checksums.txt)
|
||||||
|
|
||||||
|
if [[ $failures -gt 0 ]]; then
|
||||||
|
echo "[fail] checksum verification failed ($failures file(s))" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[ok] all available artifacts match checksums.txt"
|
||||||
|
}
|
||||||
|
|
||||||
|
if [[ "$has_cosign" == true ]]; then
|
||||||
|
verify_sigstore
|
||||||
|
else
|
||||||
|
echo "[verify] cosign not installed; skipping signature verification" >&2
|
||||||
|
fi
|
||||||
|
|
||||||
|
verify_hashes
|
||||||
|
|
||||||
|
echo "[ok] release verification complete"
|
||||||
|
|
@ -5,6 +5,10 @@
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
make_target_exists() {
|
||||||
|
make -n "$1" >/dev/null 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
# Colors
|
# Colors
|
||||||
RED='\033[0;31m'
|
RED='\033[0;31m'
|
||||||
GREEN='\033[0;32m'
|
GREEN='\033[0;32m'
|
||||||
|
|
@ -45,7 +49,7 @@ show_status() {
|
||||||
|
|
||||||
# Check Go apps
|
# Check Go apps
|
||||||
print_app "Go Applications:"
|
print_app "Go Applications:"
|
||||||
local go_apps=("api-server" "worker" "tui" "data_manager" "user_manager")
|
local go_apps=("api-server" "worker" "tui")
|
||||||
for app in "${go_apps[@]}"; do
|
for app in "${go_apps[@]}"; do
|
||||||
if [[ -f "bin/$app" ]]; then
|
if [[ -f "bin/$app" ]]; then
|
||||||
echo " ✅ $app: Built"
|
echo " ✅ $app: Built"
|
||||||
|
|
@ -85,7 +89,7 @@ show_status() {
|
||||||
|
|
||||||
# Check configuration
|
# Check configuration
|
||||||
print_app "Configuration:"
|
print_app "Configuration:"
|
||||||
if [[ -f "configs/config-local.yaml" ]]; then
|
if [[ -f "configs/api/dev.yaml" ]]; then
|
||||||
echo " ✅ Security config: Found"
|
echo " ✅ Security config: Found"
|
||||||
else
|
else
|
||||||
echo " ⚠️ Security config: Not found"
|
echo " ⚠️ Security config: Not found"
|
||||||
|
|
@ -110,14 +114,14 @@ build_all() {
|
||||||
echo "============================="
|
echo "============================="
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
print_info "Building Go applications..."
|
|
||||||
make build
|
|
||||||
|
|
||||||
if command -v zig &> /dev/null; then
|
if command -v zig &> /dev/null; then
|
||||||
print_info "Building Zig CLI..."
|
print_info "Building all components (Go + Zig CLI)..."
|
||||||
make cli-build
|
make build
|
||||||
else
|
else
|
||||||
print_warning "Zig not found, skipping CLI build"
|
print_warning "Zig not found, building Go components only"
|
||||||
|
go build -o bin/api-server cmd/api-server/main.go
|
||||||
|
go build -o bin/worker cmd/worker/worker_server.go
|
||||||
|
go build -o bin/tui ./cmd/tui
|
||||||
fi
|
fi
|
||||||
|
|
||||||
print_success "Build completed!"
|
print_success "Build completed!"
|
||||||
|
|
@ -128,11 +132,13 @@ test_all() {
|
||||||
echo "===================="
|
echo "===================="
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
print_info "Running main test suite..."
|
if make_target_exists test-full; then
|
||||||
make test
|
print_info "Running full test suite..."
|
||||||
|
make test-full
|
||||||
print_info "Running comprehensive tests..."
|
else
|
||||||
make test-all
|
print_info "Running test suite..."
|
||||||
|
make test
|
||||||
|
fi
|
||||||
|
|
||||||
print_success "All tests completed!"
|
print_success "All tests completed!"
|
||||||
}
|
}
|
||||||
|
|
@ -156,8 +162,8 @@ start_services() {
|
||||||
# Start API server if built
|
# Start API server if built
|
||||||
if [[ -f "bin/api-server" ]]; then
|
if [[ -f "bin/api-server" ]]; then
|
||||||
print_info "Starting API server..."
|
print_info "Starting API server..."
|
||||||
if [[ -f "configs/config-local.yaml" ]]; then
|
if [[ -f "configs/api/dev.yaml" ]]; then
|
||||||
./bin/api-server --config configs/config-local.yaml &
|
./bin/api-server --config configs/api/dev.yaml &
|
||||||
else
|
else
|
||||||
print_warning "No config found, using defaults"
|
print_warning "No config found, using defaults"
|
||||||
./bin/api-server &
|
./bin/api-server &
|
||||||
|
|
@ -187,13 +193,25 @@ check_health() {
|
||||||
print_info "Port 9101 is open, checking API health endpoint..."
|
print_info "Port 9101 is open, checking API health endpoint..."
|
||||||
|
|
||||||
# Try the health endpoint
|
# Try the health endpoint
|
||||||
response=$(curl -k -s --max-time 3 -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health 2>/dev/null)
|
local api_key_header=""
|
||||||
|
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
|
||||||
|
api_key_header="-H X-API-Key: ${FETCH_ML_API_KEY}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
response=$(curl -s --max-time 3 ${api_key_header} http://localhost:9101/health 2>/dev/null || true)
|
||||||
|
if [[ -z "$response" ]]; then
|
||||||
|
response=$(curl -k -s --max-time 3 ${api_key_header} https://localhost:9101/health 2>/dev/null || true)
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ "$response" == "OK" ]]; then
|
if [[ "$response" == "OK" ]]; then
|
||||||
print_success "API is healthy: $response"
|
print_success "API is healthy: $response"
|
||||||
elif [[ "$response" == *"IP not whitelisted"* ]]; then
|
elif [[ "$response" == *"IP not whitelisted"* ]]; then
|
||||||
print_warning "API running but IP not whitelisted (expected behavior)"
|
print_warning "API running but IP not whitelisted (expected behavior)"
|
||||||
print_info "Try: curl -k -H 'X-API-Key: password' -H 'X-Forwarded-For: 127.0.0.1' https://localhost:9101/health"
|
if [[ -n "${FETCH_ML_API_KEY:-}" ]]; then
|
||||||
|
print_info "Try: curl -k -H 'X-API-Key: $FETCH_ML_API_KEY' https://localhost:9101/health"
|
||||||
|
else
|
||||||
|
print_info "Try: curl -k https://localhost:9101/health"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
print_error "Unexpected response: $response"
|
print_error "Unexpected response: $response"
|
||||||
fi
|
fi
|
||||||
|
|
@ -229,19 +247,36 @@ run_security() {
|
||||||
case "${1:-check}" in
|
case "${1:-check}" in
|
||||||
"check")
|
"check")
|
||||||
print_info "Running security checks..."
|
print_info "Running security checks..."
|
||||||
make security-check
|
if make_target_exists security-check; then
|
||||||
|
make security-check
|
||||||
|
else
|
||||||
|
print_warning "No 'security-check' Make target found"
|
||||||
|
print_info "Try: make ci-local"
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
"monitor")
|
"monitor")
|
||||||
print_info "Starting security monitoring..."
|
print_info "Starting security monitoring..."
|
||||||
make security-monitor
|
if make_target_exists security-monitor; then
|
||||||
|
make security-monitor
|
||||||
|
else
|
||||||
|
print_warning "No 'security-monitor' Make target found"
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
"deploy")
|
"deploy")
|
||||||
print_info "Deploying with security..."
|
print_info "Deploying with security..."
|
||||||
make security-deploy
|
if make_target_exists security-deploy; then
|
||||||
|
make security-deploy
|
||||||
|
else
|
||||||
|
print_warning "No 'security-deploy' Make target found"
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
"audit")
|
"audit")
|
||||||
print_info "Running security audit..."
|
print_info "Running security audit..."
|
||||||
make security-audit
|
if make_target_exists security-audit; then
|
||||||
|
make security-audit
|
||||||
|
else
|
||||||
|
print_warning "No 'security-audit' Make target found"
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Usage: $0 security {check|monitor|deploy|audit}"
|
echo "Usage: $0 security {check|monitor|deploy|audit}"
|
||||||
|
|
@ -258,15 +293,22 @@ run_development() {
|
||||||
case "${1:-setup}" in
|
case "${1:-setup}" in
|
||||||
"setup")
|
"setup")
|
||||||
print_info "Setting up development environment..."
|
print_info "Setting up development environment..."
|
||||||
./scripts/auto_setup.sh
|
print_warning "Legacy setup scripts were removed; using Makefile/deployments instead"
|
||||||
|
print_info "Try: make dev"
|
||||||
|
print_info "Or: ./deployments/deploy.sh dev up"
|
||||||
;;
|
;;
|
||||||
"quick")
|
"quick")
|
||||||
print_info "Running quick start..."
|
print_info "Running quick start..."
|
||||||
./scripts/quick_start.sh
|
print_warning "Legacy quick start script was removed; using deployments instead"
|
||||||
|
print_info "Try: ./deployments/deploy.sh dev up"
|
||||||
;;
|
;;
|
||||||
"deps")
|
"deps")
|
||||||
print_info "Installing dependencies..."
|
print_info "Installing dependencies..."
|
||||||
make install-deps
|
if make_target_exists install-deps; then
|
||||||
|
make install-deps
|
||||||
|
else
|
||||||
|
print_warning "No 'install-deps' Make target found"
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Usage: $0 dev {setup|quick|deps}"
|
echo "Usage: $0 dev {setup|quick|deps}"
|
||||||
|
|
@ -309,7 +351,7 @@ cleanup() {
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
print_info "Cleaning project artifacts..."
|
print_info "Cleaning project artifacts..."
|
||||||
make clean-all
|
make clean
|
||||||
|
|
||||||
print_info "Stopping services..."
|
print_info "Stopping services..."
|
||||||
stop_services
|
stop_services
|
||||||
|
|
@ -330,7 +372,7 @@ show_help() {
|
||||||
echo " start - Start all services"
|
echo " start - Start all services"
|
||||||
echo " stop - Stop all services"
|
echo " stop - Stop all services"
|
||||||
echo " health - Check API health endpoint"
|
echo " health - Check API health endpoint"
|
||||||
echo " security - Security management (check|monitor|deploy|audit)"
|
echo " security - Security management (check|monitor|deploy|audit)"
|
||||||
echo " dev - Development environment (setup|quick|deps)"
|
echo " dev - Development environment (setup|quick|deps)"
|
||||||
echo " logs - Show application logs"
|
echo " logs - Show application logs"
|
||||||
echo " cleanup - Clean project artifacts and stop services"
|
echo " cleanup - Clean project artifacts and stop services"
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,10 @@ type Improvement struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewPerformanceRegressionDetector creates a new detector instance
|
// NewPerformanceRegressionDetector creates a new detector instance
|
||||||
func NewPerformanceRegressionDetector(baselineFile string, threshold float64) *PerformanceRegressionDetector {
|
func NewPerformanceRegressionDetector(
|
||||||
|
baselineFile string,
|
||||||
|
threshold float64,
|
||||||
|
) *PerformanceRegressionDetector {
|
||||||
return &PerformanceRegressionDetector{
|
return &PerformanceRegressionDetector{
|
||||||
BaselineFile: baselineFile,
|
BaselineFile: baselineFile,
|
||||||
Threshold: threshold,
|
Threshold: threshold,
|
||||||
|
|
@ -74,7 +77,9 @@ func (prd *PerformanceRegressionDetector) LoadBaseline() ([]BenchmarkResult, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// AnalyzeResults analyzes current results against baseline
|
// AnalyzeResults analyzes current results against baseline
|
||||||
func (prd *PerformanceRegressionDetector) AnalyzeResults(current []BenchmarkResult) (*RegressionReport, error) {
|
func (prd *PerformanceRegressionDetector) AnalyzeResults(
|
||||||
|
current []BenchmarkResult,
|
||||||
|
) (*RegressionReport, error) {
|
||||||
baseline, err := prd.LoadBaseline()
|
baseline, err := prd.LoadBaseline()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to load baseline: %w", err)
|
return nil, fmt.Errorf("failed to load baseline: %w", err)
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue