config: add Plugin GPU Quota, plugins, and audit logging to configs
- Add Plugin GPU Quota config section to scheduler.yaml.example
- Add audit logging config to homelab-secure.yaml (HIPAA-compliant)
- Add Jupyter and vLLM plugin configs to all worker configs:
  - Security settings (passwords, trusted channels, blocked packages)
  - Resource limits (GPU, memory, CPU)
  - Model cache paths and quantization options for vLLM
- Disable plugins in HIPAA deployment mode for compliance
- Update deployments README with plugin services and GPU quotas
This commit is contained in:
parent
90ea18555c
commit
b3a0c78903
12 changed files with 359 additions and 2 deletions
|
|
@ -62,7 +62,26 @@ database:
|
|||
logging:
|
||||
level: "info"
|
||||
file: "/logs/fetch_ml.log"
|
||||
audit_log: ""
|
||||
# Audit logging (HIPAA-compliant with tamper-evident chain hashing)
|
||||
audit:
|
||||
enabled: true
|
||||
file: "/var/log/fetch_ml/audit.log" # Separate file for audit events
|
||||
chain_hashing: true # Enable tamper-evident logging
|
||||
retention_days: 2555 # 7 years for HIPAA compliance
|
||||
log_ip_address: true # Include source IP in audit events
|
||||
log_user_agent: true # Include user agent in audit events
|
||||
# Sensitive events to always log
|
||||
events:
|
||||
- "authentication_success"
|
||||
- "authentication_failure"
|
||||
- "file_access"
|
||||
- "file_write"
|
||||
- "file_delete"
|
||||
- "job_queued"
|
||||
- "job_started"
|
||||
- "job_completed"
|
||||
- "experiment_created"
|
||||
- "experiment_deleted"
|
||||
|
||||
resources:
|
||||
max_workers: 1
|
||||
|
|
|
|||
|
|
@ -30,3 +30,30 @@ scheduler:
|
|||
token: "wkr_PLACEHOLDER_GENERATE_WITH_OPENSSL_RAND_HEX_32"
|
||||
- id: "worker-02"
|
||||
token: "wkr_PLACEHOLDER_GENERATE_WITH_OPENSSL_RAND_HEX_32"
|
||||
|
||||
# Plugin GPU Quota Configuration
|
||||
# Controls GPU allocation for plugin-based services (Jupyter, vLLM, etc.)
|
||||
plugin_quota:
|
||||
enabled: false # Enable quota enforcement (default: false)
|
||||
total_gpus: 16 # Global GPU limit across all plugins (0 = unlimited)
|
||||
per_user_gpus: 4 # Default per-user GPU limit (0 = unlimited)
|
||||
per_user_services: 2 # Default per-user service count limit (0 = unlimited)
|
||||
|
||||
# Plugin-specific limits (optional)
|
||||
per_plugin_limits:
|
||||
vllm:
|
||||
max_gpus: 8 # Max GPUs for vLLM across all users
|
||||
max_services: 4 # Max vLLM service instances
|
||||
jupyter:
|
||||
max_gpus: 4 # Max GPUs for Jupyter across all users
|
||||
max_services: 10 # Max Jupyter service instances
|
||||
|
||||
# Per-user overrides (optional)
|
||||
user_overrides:
|
||||
admin:
|
||||
max_gpus: 8 # Admin gets more GPUs
|
||||
max_services: 5 # Admin can run more services
|
||||
allowed_plugins: ["jupyter", "vllm"] # Restrict which plugins user can use
|
||||
researcher1:
|
||||
max_gpus: 2 # Limited GPU access
|
||||
max_services: 1 # Single service limit
|
||||
|
|
|
|||
|
|
@ -48,6 +48,39 @@ queue:
|
|||
native:
|
||||
data_dir: "data/dev/queue"
|
||||
|
||||
# Plugin Configuration (for local development)
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
mode: "lab"
|
||||
# Security settings
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
blocked_packages: [] # Less restrictive for local dev
|
||||
require_password: false # No password for local dev
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "4Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
# Model cache location
|
||||
model_cache: "data/dev/models"
|
||||
# Supported quantization methods: awq, gptq, fp8, squeezellm
|
||||
default_quantization: "" # No quantization for dev (better quality)
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_model_len: 2048
|
||||
tensor_parallel_size: 1
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
|
|
|
|||
|
|
@ -50,7 +50,40 @@ resources:
|
|||
metrics:
|
||||
enabled: true
|
||||
listen_addr: ":9100"
|
||||
metrics_flush_interval: "500ms"
|
||||
# NOTE(review): duplicate 'metrics_flush_interval' key (already set two lines above) — duplicate keys are invalid YAML and most parsers silently keep only the last value; remove this line
|
||||
|
||||
# Plugin Configuration
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
mode: "lab"
|
||||
# Security settings
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
blocked_packages: [] # Dev environment - less restrictive
|
||||
require_password: false # No password for dev
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "4Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
# Model cache location
|
||||
model_cache: "/models"
|
||||
# Supported quantization methods: awq, gptq, fp8, squeezellm
|
||||
default_quantization: "" # No quantization for dev
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_model_len: 2048
|
||||
tensor_parallel_size: 1
|
||||
|
||||
task_lease_duration: "30m"
|
||||
heartbeat_interval: "1m"
|
||||
|
|
|
|||
|
|
@ -48,3 +48,42 @@ task_lease_duration: "30m"
|
|||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
|
||||
# Plugin Configuration
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
# Security settings
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
- "pytorch"
|
||||
blocked_packages:
|
||||
- "requests"
|
||||
- "urllib3"
|
||||
- "httpx"
|
||||
require_password: true
|
||||
# Resource limits (enforced by scheduler quota system)
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "8Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
# Model cache location
|
||||
model_cache: "/models"
|
||||
# Supported quantization methods: awq, gptq, fp8, squeezellm
|
||||
default_quantization: "" # empty = no quantization
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 4
|
||||
max_model_len: 4096
|
||||
# Environment variables passed to container
|
||||
env:
|
||||
- "HF_HOME=/models"
|
||||
- "VLLM_WORKER_MULTIPROC_METHOD=spawn"
|
||||
|
|
|
|||
|
|
@ -48,6 +48,46 @@ queue:
|
|||
backend: "redis"
|
||||
redis_url: "redis://localhost:6379/0"
|
||||
|
||||
# Plugin Configuration
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
mode: "lab" # "lab" or "notebook"
|
||||
# Security settings
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
- "pytorch"
|
||||
- "nvidia"
|
||||
blocked_packages:
|
||||
- "requests"
|
||||
- "urllib3"
|
||||
- "httpx"
|
||||
- "socket"
|
||||
- "subprocess"
|
||||
require_password: true
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "16Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
# Model cache location (should be on fast storage)
|
||||
model_cache: "/var/lib/fetchml/models"
|
||||
# Supported quantization methods: awq, gptq, fp8, squeezellm
|
||||
default_quantization: ""
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 2
|
||||
max_model_len: 4096
|
||||
tensor_parallel_size: 1
|
||||
|
||||
# Snapshot store (optional)
|
||||
snapshot_store:
|
||||
enabled: false
|
||||
|
|
|
|||
|
|
@ -45,3 +45,42 @@ task_lease_duration: "30m"
|
|||
heartbeat_interval: "1m"
|
||||
max_retries: 3
|
||||
graceful_timeout: "5m"
|
||||
|
||||
# Plugin Configuration
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
mode: "lab"
|
||||
# Security settings (strict for secure config)
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
blocked_packages:
|
||||
- "requests"
|
||||
- "urllib3"
|
||||
- "httpx"
|
||||
- "socket"
|
||||
- "subprocess"
|
||||
- "os.system"
|
||||
require_password: true
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "8Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
# Model cache location
|
||||
model_cache: "/models"
|
||||
# Supported quantization methods: awq, gptq, fp8, squeezellm
|
||||
default_quantization: ""
|
||||
# Resource limits
|
||||
max_gpu_per_instance: 1
|
||||
max_model_len: 4096
|
||||
tensor_parallel_size: 1
|
||||
|
|
|
|||
|
|
@ -45,3 +45,34 @@ podman_memory = "16g"
|
|||
[metrics]
|
||||
enabled = true
|
||||
listen_addr = ":9100"
|
||||
|
||||
# Plugin Configuration
|
||||
[plugins]
|
||||
|
||||
[plugins.jupyter]
|
||||
enabled = true
|
||||
image = "quay.io/jupyter/base-notebook:latest"
|
||||
default_port = 8888
|
||||
mode = "lab"
|
||||
max_gpu_per_instance = 1
|
||||
max_memory_per_instance = "8Gi"
|
||||
|
||||
[plugins.jupyter.security]
|
||||
require_password = true
|
||||
trusted_channels = ["conda-forge", "defaults", "pytorch"]
|
||||
blocked_packages = ["requests", "urllib3", "httpx"]
|
||||
|
||||
[plugins.vllm]
|
||||
enabled = true
|
||||
image = "vllm/vllm-openai:latest"
|
||||
default_port = 8000
|
||||
model_cache = "/models"
|
||||
default_quantization = "" # Options: awq, gptq, fp8, squeezellm
|
||||
max_gpu_per_instance = 2
|
||||
max_model_len = 4096
|
||||
tensor_parallel_size = 1
|
||||
|
||||
# Environment variables for vLLM
|
||||
[plugins.vllm.env]
|
||||
HF_HOME = "/models"
|
||||
VLLM_WORKER_MULTIPROC_METHOD = "spawn"
|
||||
|
|
|
|||
|
|
@ -110,6 +110,36 @@ TLS_KEY_PATH=/app/ssl/key.pem
|
|||
| Prometheus | 9090 | - | - |
|
||||
| Grafana | 3000 | - | - |
|
||||
| Loki | 3100 | - | - |
|
||||
| JupyterLab | 8888* | 8888* | - |
|
||||
| vLLM | 8000* | 8000* | - |
|
||||
|
||||
*Plugin service ports are dynamically allocated from the 8000-9000 range by the scheduler.
|
||||
|
||||
## Plugin Services
|
||||
|
||||
The deployment configurations include support for interactive ML services:
|
||||
|
||||
### Jupyter Notebook/Lab
|
||||
- **Image**: `quay.io/jupyter/base-notebook:latest`
|
||||
- **Security**: Trusted channels (conda-forge, defaults), blocked packages (http clients)
|
||||
- **Resources**: Configurable GPU/memory limits
|
||||
- **Access**: Via scheduler-assigned port (8000-9000 range)
|
||||
|
||||
### vLLM Inference
|
||||
- **Image**: `vllm/vllm-openai:latest`
|
||||
- **Features**: OpenAI-compatible API, quantization support (AWQ, GPTQ, FP8)
|
||||
- **Model Cache**: Configurable path for model storage
|
||||
- **Resources**: Multi-GPU tensor parallelism support
|
||||
|
||||
## Scheduler GPU Quotas
|
||||
|
||||
The scheduler supports GPU quota management for plugin services:
|
||||
- **Global Limit**: Total GPUs across all plugins
|
||||
- **Per-User Limits**: GPU and service count per user
|
||||
- **Per-Plugin Limits**: vLLM and Jupyter-specific limits
|
||||
- **User Overrides**: Special permissions for admins/researchers
|
||||
|
||||
See `configs/scheduler/scheduler.yaml.example` for quota configuration.
|
||||
|
||||
## Monitoring
|
||||
|
||||
|
|
@ -122,3 +152,4 @@ TLS_KEY_PATH=/app/ssl/key.pem
|
|||
- If you need HTTPS externally, terminate TLS at a reverse proxy.
|
||||
- API keys should be managed via environment variables
|
||||
- Database credentials should use secrets management in production
|
||||
- **HIPAA deployments**: Plugins are disabled by default for compliance
|
||||
|
|
|
|||
|
|
@ -29,3 +29,30 @@ max_artifact_total_bytes: 1073741824 # 1GB
|
|||
|
||||
# Provenance (disabled in dev for speed)
|
||||
provenance_best_effort: false
|
||||
|
||||
# Plugin Configuration (development mode)
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
mode: "lab"
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
blocked_packages: [] # No restrictions in dev
|
||||
require_password: false # No password for dev
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "4Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
model_cache: "/tmp/models" # Temp location for dev
|
||||
default_quantization: "" # No quantization for dev
|
||||
max_gpu_per_instance: 1
|
||||
max_model_len: 2048
|
||||
|
|
|
|||
|
|
@ -51,3 +51,12 @@ ssh_key: ${SSH_KEY_PATH}
|
|||
|
||||
# Config hash computation enabled (required for audit)
|
||||
# This is automatically computed by Validate()
|
||||
|
||||
# Plugin Configuration (DISABLED for HIPAA compliance)
|
||||
# Jupyter and vLLM services are disabled in HIPAA mode to ensure
|
||||
# no unauthorized network access or data processing
|
||||
plugins:
|
||||
jupyter:
|
||||
enabled: false # Disabled: HIPAA requires strict network isolation
|
||||
vllm:
|
||||
enabled: false # Disabled: External model downloads violate PHI controls
|
||||
|
|
|
|||
|
|
@ -33,3 +33,32 @@ max_artifact_total_bytes: 536870912 # 512MB
|
|||
|
||||
# Provenance (enabled)
|
||||
provenance_best_effort: true
|
||||
|
||||
# Plugin Configuration
|
||||
plugins:
|
||||
# Jupyter Notebook/Lab Service
|
||||
jupyter:
|
||||
enabled: true
|
||||
image: "quay.io/jupyter/base-notebook:latest"
|
||||
default_port: 8888
|
||||
mode: "lab"
|
||||
security:
|
||||
trusted_channels:
|
||||
- "conda-forge"
|
||||
- "defaults"
|
||||
blocked_packages:
|
||||
- "requests"
|
||||
- "urllib3"
|
||||
require_password: true
|
||||
max_gpu_per_instance: 1
|
||||
max_memory_per_instance: "8Gi"
|
||||
|
||||
# vLLM Inference Service
|
||||
vllm:
|
||||
enabled: true
|
||||
image: "vllm/vllm-openai:latest"
|
||||
default_port: 8000
|
||||
model_cache: "/models"
|
||||
default_quantization: ""
|
||||
max_gpu_per_instance: 1
|
||||
max_model_len: 4096
|
||||
|
|
|
|||
Loading…
Reference in a new issue