fetch_ml/configs/api/homelab-secure.yaml
Jeremie Fraeys b3a0c78903
config: add Plugin GPU Quota, plugins, and audit logging to configs
- Add Plugin GPU Quota config section to scheduler.yaml.example

- Add audit logging config to homelab-secure.yaml (HIPAA-compliant)

- Add Jupyter and vLLM plugin configs to all worker configs:

  - Security settings (passwords, trusted channels, blocked packages)

  - Resource limits (GPU, memory, CPU)

  - Model cache paths and quantization options for vLLM

- Disable plugins in HIPAA deployment mode for compliance

- Update deployments README with plugin services and GPU quotas
2026-02-26 14:34:42 -05:00

90 lines
2 KiB
YAML

base_path: "/data/experiments"
data_dir: "/data/active"
auth:
enabled: true
api_keys:
homelab_admin:
hash: "CHANGE_ME_SHA256_HOMELAB_ADMIN_KEY"
admin: true
roles:
- admin
permissions:
"*": true
homelab_user:
hash: "CHANGE_ME_SHA256_HOMELAB_USER_KEY"
admin: false
roles:
- researcher
permissions:
experiments: true
datasets: true
jupyter: true
server:
address: ":9101"
tls:
enabled: false
cert_file: "/app/ssl/cert.pem"
key_file: "/app/ssl/key.pem"
security:
production_mode: true
allowed_origins:
- "https://ml-experiments.example.com"
rate_limit:
enabled: true
requests_per_minute: 60
burst_size: 10
ip_whitelist:
- "127.0.0.1"
- "192.168.0.0/16"
monitoring:
prometheus:
enabled: true
port: 9101
path: "/metrics"
health_checks:
enabled: true
interval: "30s"
redis:
url: "redis://:CHANGE_ME_REDIS_PASSWORD@redis:6379"
password: ""
db: 0
database:
type: "sqlite"
connection: "/data/experiments/fetch_ml.sqlite"
logging:
level: "info"
file: "/logs/fetch_ml.log"
# Audit logging (HIPAA-compliant with tamper-evident chain hashing)
audit:
enabled: true
file: "/var/log/fetch_ml/audit.log" # Separate file for audit events
chain_hashing: true # Enable tamper-evident logging
retention_days: 2555 # 7 years for HIPAA compliance
log_ip_address: true # Include source IP in audit events
log_user_agent: true # Include user agent in audit events
# Sensitive events to always log
events:
- "authentication_success"
- "authentication_failure"
- "file_access"
- "file_write"
- "file_delete"
- "job_queued"
- "job_started"
- "job_completed"
- "experiment_created"
- "experiment_deleted"
resources:
max_workers: 1
desired_rps_per_worker: 2
podman_cpus: "2"
podman_memory: "4Gi"