diff --git a/configs/README.md b/configs/README.md index 31e5042..47277e8 100644 --- a/configs/README.md +++ b/configs/README.md @@ -2,59 +2,238 @@ ## Quick Start -### Standalone Mode (Existing Behavior) +### Docker Compose (Recommended) + ```bash -# Single worker, direct queue access -go run ./cmd/worker -config configs/worker/standalone/worker.yaml +# Development with 2 workers +cd deployments +CONFIG_DIR=../configs docker-compose -f docker-compose.dev.yml up -d + +# Scale to 4 workers +docker-compose -f docker-compose.dev.yml up -d --scale worker=4 + +# Production with scheduler +CONFIG_DIR=../configs docker-compose -f docker-compose.prod.yml up -d ``` -### Distributed Mode -```bash -# Terminal 1: Start scheduler -go run ./cmd/scheduler -config configs/scheduler/scheduler.yaml +### Key Environment Variables -# Terminal 2: Start worker -go run ./cmd/worker -config configs/worker/distributed/worker.yaml +| Variable | Description | Default | +|----------|-------------|---------| +| `CONFIG_DIR` | Path to config directory | `../configs` | +| `DATA_DIR` | Path to data directory | `./data/` | +| `LOG_LEVEL` | Logging level | `info` | +| `REDIS_URL` | Redis connection URL | `redis://redis:6379` | + +## Architecture Overview + +``` +┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐ +│ API Server │────▶│ Redis │◀────│ Scheduler │ +│ (with builtin │ │ Queue │ │ (in api-server)│ +│ scheduler) │ └─────────────┘ └─────────────────┘ +└─────────────────┘ │ │ + │ │ │ + │ ┌────────┴────────┐ │ + │ ▼ ▼ │ + │ ┌─────────┐ ┌─────────┐ │ + └────▶│ Worker 1│ │ Worker 2│ │ + │ (Podman)│ │ (Podman)│ │ + └─────────┘ └─────────┘ │ + │ │ │ + └──────────────────┴──────────┘ + Heartbeats ``` -### Single-Node Mode (Zero Config) -```bash -# Both scheduler and worker in one process -go run ./cmd/fetch_ml -config configs/multi-node/single-node.yaml -``` +The scheduler is built into the API server and manages multiple workers dynamically. 
-## Config Structure +## Configuration Structure ``` configs/ -├── scheduler/ -│ └── scheduler.yaml # Central scheduler configuration +├── api/ +│ ├── dev.yaml # Development API config +│ ├── multi-user.yaml # Production multi-worker +│ └── homelab-secure.yaml # Homelab secure config ├── worker/ -│ ├── standalone/ -│ │ └── worker.yaml # Direct queue access (Redis/SQLite) -│ └── distributed/ -│ └── worker.yaml # WebSocket to scheduler -└── multi-node/ - └── single-node.yaml # Combined scheduler+worker +│ ├── docker-dev.yaml # Development worker +│ ├── docker-prod.yaml # Production worker +│ ├── docker-staging.yaml # Staging worker +│ ├── docker-standard.yaml # Standard compliance +│ └── homelab-secure.yaml # Homelab secure worker +└── schema/ + └── *.yaml # Validation schemas ``` -## Key Configuration Modes +## Scheduler Configuration -| Mode | Use Case | Backend | -|------|----------|---------| -| `standalone` | Single machine, existing behavior | Redis/SQLite/Filesystem | -| `distributed` | Multiple workers, central scheduler | WebSocket to scheduler | -| `both` | Quick testing, single process | In-process scheduler | - -## Worker Mode Selection - -Set `worker.mode` to switch between implementations: +The scheduler is configured in the API server config: ```yaml -worker: - mode: "standalone" # Uses Redis/SQLite queue.Backend - # OR - mode: "distributed" # Uses SchedulerBackend over WebSocket +# configs/api/multi-user.yaml +resources: + max_workers: 4 # Max concurrent workers + desired_rps_per_worker: 3 # Target requests/sec per worker + +scheduler: + enabled: true + strategy: "round-robin" # round-robin, least-loaded, priority + max_concurrent_jobs: 16 # Max jobs across all workers + queue: + type: "redis" + redis_addr: "redis:6379" + worker_discovery: + mode: "dynamic" # dynamic or static + heartbeat_timeout: "30s" + health_check_interval: "10s" ``` -The worker code is unchanged — only the backend implementation changes. 
+### Scheduling Strategies + +| Strategy | Description | Use Case | +|----------|-------------|----------| +| `round-robin` | Distribute evenly across workers | Balanced load | +| `least-loaded` | Send to worker with fewest jobs | Variable job sizes | +| `priority` | Respect job priorities first | Mixed priority workloads | + +## Worker Configuration + +Workers connect to the scheduler via Redis queue: + +```yaml +# configs/worker/docker-prod.yaml +backend: + type: "redis" + redis: + addr: "redis:6379" + password: "" # Set via REDIS_PASSWORD env var + db: 0 + +worker: + id: "${FETCHML_WORKER_ID}" # Unique worker ID + mode: "distributed" # Uses scheduler via Redis + heartbeat_interval: "10s" + max_concurrent_jobs: 4 # Jobs this worker can run + +sandbox: + type: "podman" + podman: + socket: "/run/podman/podman.sock" + cpus: "2" + memory: "4Gi" +``` + +## Scaling Workers + +### Docker Compose (Recommended) + +```bash +# Development - 2 workers by default +docker-compose -f deployments/docker-compose.dev.yml up -d + +# Scale to 4 workers +docker-compose -f deployments/docker-compose.dev.yml up -d --scale worker=4 + +# Scale down to 1 worker +docker-compose -f deployments/docker-compose.dev.yml up -d --scale worker=1 +``` + +### Kubernetes / Manual Deployment + +```bash +# Each worker needs unique ID +export FETCHML_WORKER_ID="worker-$(hostname)-$(date +%s)" +./worker -config configs/worker/docker-prod.yaml +``` + +## Environment-Specific Setups + +### Development (docker-compose.dev.yml) + +- 2 workers by default +- Redis for queue +- Local MinIO for storage +- Caddy reverse proxy + +```bash +make dev-up # Start with 2 workers +make dev-up SCALE=4 # Start with 4 workers +``` + +### Production (docker-compose.prod.yml) + +- 4 workers configured +- Redis cluster recommended +- External MinIO/S3 +- Health checks enabled + +```bash +CONFIG_DIR=./configs DATA_DIR=/var/lib/fetchml \ + docker-compose -f deployments/docker-compose.prod.yml up -d +``` + +### Staging 
(docker-compose.staging.yml) + +- 2 workers +- Audit logging enabled +- Same as prod but smaller scale + +## Monitoring + +### Check Worker Status + +```bash +# Via API +curl http://localhost:9101/api/v1/workers + +# Via Redis +redis-cli LRANGE fetchml:workers 0 -1 +redis-cli HGETALL fetchml:worker:status +``` + +### View Logs + +```bash +# All workers +docker-compose -f deployments/docker-compose.dev.yml logs worker + +# Specific worker (by container name) +docker logs ml-experiments-worker-1 +docker logs ml-experiments-worker-2 +``` + +## Troubleshooting + +### Workers Not Registering + +1. Check Redis connection: `redis-cli ping` +2. Verify worker config has `mode: distributed` +3. Check API server scheduler is enabled +4. Review worker logs: `docker logs <container-name>` + +### Jobs Stuck in Queue + +1. Check worker capacity: `max_concurrent_jobs` not exceeded +2. Verify workers are healthy: `docker ps` +3. Check Redis queue length: `redis-cli LLEN fetchml:queue:pending` + +### Worker ID Collisions + +Ensure `FETCHML_WORKER_ID` is unique per worker instance: +```yaml +environment: + - FETCHML_WORKER_ID=${HOSTNAME}-${COMPOSE_PROJECT_NAME}-${RANDOM} +``` + +## Security Notes + +- Workers run in privileged mode for Podman containers +- Redis should be firewalled (not exposed publicly in prod) +- Worker-to-scheduler communication is via Redis only +- No direct API-to-worker connections required + +## See Also + +- `deployments/README.md` - Deployment environments +- `docs/src/deployment.md` - Full deployment guide +- `docs/src/cicd.md` - CI/CD workflows diff --git a/configs/api/dev-local.yaml b/configs/api/dev-local.yaml deleted file mode 100644 index bab91e4..0000000 --- a/configs/api/dev-local.yaml +++ /dev/null @@ -1,61 +0,0 @@ -base_path: "data/dev/experiments" - -data_dir: "data/dev/active" - -auth: - enabled: false - -server: - address: "0.0.0.0:9101" - tls: - enabled: false - cert_file: "" - key_file: "" - -security: - production_mode: false - allowed_origins: - - 
"http://localhost:3000" - api_key_rotation_days: 90 - audit_logging: - enabled: true - log_path: "data/dev/logs/fetchml-audit.log" - rate_limit: - enabled: false - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: [] - -monitoring: - prometheus: - enabled: true - port: 9101 - path: "/metrics" - health_checks: - enabled: true - interval: "30s" - -redis: - addr: "localhost:6379" - password: "" - db: 0 - -database: - type: "sqlite" - connection: "data/dev/db/fetchml.sqlite" - -logging: - level: "info" - file: "data/dev/logs/fetchml.log" - audit_log: "data/dev/logs/fetchml-audit.log" - -resources: - max_workers: 1 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "4Gi" - -queue: - type: "native" - native: - data_dir: "data/dev/queue" diff --git a/configs/api/dev.yaml b/configs/api/dev.yaml index 0f5a978..22b44b6 100644 --- a/configs/api/dev.yaml +++ b/configs/api/dev.yaml @@ -50,7 +50,20 @@ logging: audit_log: "./data/dev/logs/fetchml-audit.log" resources: - max_workers: 1 + max_workers: 4 desired_rps_per_worker: 2 podman_cpus: "2" podman_memory: "4Gi" + +# Scheduler configuration for multi-worker support +scheduler: + enabled: true + strategy: "round-robin" # round-robin, least-loaded, or priority + max_concurrent_jobs: 8 + queue: + type: "redis" # redis, sqlite, or filesystem + redis_addr: "redis:6379" + worker_discovery: + mode: "dynamic" # dynamic or static + heartbeat_timeout: "30s" + health_check_interval: "10s" diff --git a/configs/api/homelab-secure.yaml b/configs/api/homelab-secure.yaml index 269ed05..86021fc 100644 --- a/configs/api/homelab-secure.yaml +++ b/configs/api/homelab-secure.yaml @@ -84,7 +84,20 @@ logging: - "experiment_deleted" resources: - max_workers: 1 + max_workers: 2 desired_rps_per_worker: 2 podman_cpus: "2" podman_memory: "4Gi" + +# Scheduler configuration for multi-worker support +scheduler: + enabled: true + strategy: "round-robin" + max_concurrent_jobs: 8 + queue: + type: "redis" + redis_addr: "redis:6379" + 
worker_discovery: + mode: "dynamic" + heartbeat_timeout: "30s" + health_check_interval: "10s" diff --git a/configs/api/multi-user.yaml b/configs/api/multi-user.yaml index 74d0bbc..40bbd35 100644 --- a/configs/api/multi-user.yaml +++ b/configs/api/multi-user.yaml @@ -68,7 +68,20 @@ logging: audit_log: "" resources: - max_workers: 3 + max_workers: 4 desired_rps_per_worker: 3 podman_cpus: "2" podman_memory: "4Gi" + +# Scheduler configuration for multi-worker support +scheduler: + enabled: true + strategy: "round-robin" + max_concurrent_jobs: 16 + queue: + type: "redis" + redis_addr: "redis:6379" + worker_discovery: + mode: "dynamic" + heartbeat_timeout: "30s" + health_check_interval: "10s" diff --git a/configs/api/prod.yaml b/configs/api/prod.yaml deleted file mode 100644 index 914ae8d..0000000 --- a/configs/api/prod.yaml +++ /dev/null @@ -1,59 +0,0 @@ -base_path: "/app/data/prod/experiments" - -data_dir: "/app/data/prod/active" - -auth: - enabled: true - api_keys: - admin: - hash: "replace-with-sha256-of-your-api-key" - admin: true - roles: - - admin - permissions: - "*": true - -server: - address: ":9101" - tls: - enabled: true - cert_file: "/app/ssl/cert.pem" - key_file: "/app/ssl/key.pem" - -security: - production_mode: false - allowed_origins: [] - rate_limit: - enabled: true - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: [] - -monitoring: - prometheus: - enabled: true - port: 9101 - path: "/metrics" - health_checks: - enabled: true - interval: "30s" - -redis: - addr: "redis:6379" - password: "" - db: 0 - -database: - type: "sqlite" - connection: "/app/data/prod/fetch_ml.sqlite" - -logging: - level: "info" - file: "/app/data/prod/logs/fetch_ml.log" - audit_log: "/app/data/prod/logs/audit.log" - -resources: - max_workers: 2 - desired_rps_per_worker: 5 - podman_cpus: "2" - podman_memory: "4Gi" diff --git a/configs/examples/config-postgres.yaml b/configs/examples/config-postgres.yaml deleted file mode 100644 index 6d11b7f..0000000 --- 
a/configs/examples/config-postgres.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Fetch ML Configuration Example for PostgreSQL -# This example shows how to configure Fetch ML to use PostgreSQL as the database - -base_path: "./data/experiments" - -auth: - enabled: true - api_keys: - admin: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd5f8b6c8b0b4f0b8e3" # "password" - admin: true - roles: ["admin"] - permissions: - "*": true - -server: - address: ":9101" - tls: - enabled: false - -database: - type: "postgres" - host: "localhost" - port: 5432 - username: "fetchml" - password: "your_password_here" - database: "fetchml" - # Alternatively, you can use a full connection string: - # connection: "postgres://fetchml:your_password_here@localhost:5432/fetchml?sslmode=disable" - -redis: - addr: "localhost:6379" - password: "" - db: 0 - -logging: - level: "info" - file: "" - audit_log: "" - -security: - production_mode: false - rate_limit: - enabled: false - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: [] - -monitoring: - prometheus: - enabled: true - port: 9101 - path: "/metrics" - health_checks: - enabled: true - interval: "30s" - -resources: - max_workers: 1 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "4Gi" diff --git a/configs/examples/config.yaml.example b/configs/examples/config.yaml.example deleted file mode 100644 index 6928efd..0000000 --- a/configs/examples/config.yaml.example +++ /dev/null @@ -1,57 +0,0 @@ -# Fetch ML Configuration Example -# Copy this file to config.yaml and customize for your environment - -base_path: "./data/experiments" - -auth: - enabled: true - api_keys: - # Example API key (replace with real hashed keys) - admin: - hash: "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" # "password" - admin: true - roles: ["admin"] - permissions: - "*": true - -server: - address: ":9101" - tls: - enabled: false - -database: - type: "sqlite" - connection: "data/fetch_ml.db" - -redis: - addr: "localhost:6379" - 
password: "" - db: 0 - -logging: - level: "info" - file: "logs/fetch_ml.log" - audit_log: "logs/audit.log" - -security: - rate_limit: - enabled: false - requests_per_minute: 60 - burst_size: 10 - ip_whitelist: [] - production_mode: false - -monitoring: - prometheus: - enabled: true - port: 9101 - path: "/metrics" - health_checks: - enabled: true - interval: "30s" - -resources: - max_workers: 1 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "4Gi" diff --git a/configs/examples/prod.yaml.example b/configs/examples/prod.yaml.example new file mode 100644 index 0000000..e30831e --- /dev/null +++ b/configs/examples/prod.yaml.example @@ -0,0 +1,236 @@ +# fetch_ml Production Setup Example +# +# This configuration provides a production-ready multi-worker setup +# with the scheduler managing job distribution across workers. +# +# Usage: +# 1. Copy this file to your working directory +# 2. Adjust values for your environment +# 3. Deploy with: docker-compose -f docker-compose.prod.yml up -d + +# ============================================================================= +# API SERVER CONFIGURATION +# ============================================================================= +# File: configs/api/multi-user.yaml + +resources: + max_workers: 4 # Maximum concurrent workers + desired_rps_per_worker: 3 # Target requests per second per worker + podman_cpus: "2" # CPU limit per Podman container + podman_memory: "4Gi" # Memory limit per Podman container + +scheduler: + enabled: true # Enable the job scheduler + strategy: "round-robin" # Job distribution strategy + # Options: round-robin, least-loaded, priority + max_concurrent_jobs: 16 # Max jobs running across all workers + + queue: + type: "redis" # Queue backend + redis_addr: "redis:6379" # Redis connection address + # For Redis Cluster: + # redis_cluster_addrs: "redis-node-1:6379,redis-node-2:6379,redis-node-3:6379" + + worker_discovery: + mode: "dynamic" # Dynamic worker discovery + heartbeat_timeout: "30s" # 
Worker considered dead after this timeout + health_check_interval: "10s" # Health check frequency + +# ============================================================================= +# WORKER CONFIGURATION +# ============================================================================= +# File: configs/worker/docker-prod.yaml + +backend: + type: "redis" # Must match scheduler queue type + redis: + addr: "redis:6379" # Redis address + password: "" # Set via REDIS_PASSWORD env var + db: 0 + +worker: + id: "${FETCHML_WORKER_ID}" # Unique worker ID - MUST be unique per worker! + mode: "distributed" # Connect to scheduler via Redis + heartbeat_interval: "10s" # How often to send heartbeats + max_concurrent_jobs: 4 # Jobs this worker can run simultaneously + + # Auto-scaling settings (optional) + autoscale: + enabled: false # Enable worker auto-scaling + min_workers: 2 # Minimum workers to maintain + max_workers: 8 # Maximum workers to scale to + target_queue_depth: 10 # Scale up when queue exceeds this + +sandbox: + type: "podman" + podman: + socket: "/run/podman/podman.sock" + cpus: "2" + memory: "4Gi" + # Security options + privileged: true # Required for Podman-in-Docker + seccomp_profile: "" # Optional: custom seccomp profile + +# ============================================================================= +# DEPLOYMENT EXAMPLES +# ============================================================================= + +# Example 1: Docker Compose (Recommended for single node) +# -------------------------------------------------------- +# Save as docker-compose.yml and run: +# docker-compose up -d +# +# services: +# api-server: +# image: fetchml-api:latest +# ports: +# - "9101:9101" +# volumes: +# - ./configs/api/multi-user.yaml:/app/configs/api/prod.yaml:ro +# depends_on: +# - redis +# +# worker: +# image: fetchml-worker:latest +# volumes: +# - ./configs/worker/docker-prod.yaml:/app/configs/worker.yaml:ro +# depends_on: +# - api-server +# - redis +# deploy: +# replicas: 4 # 
Start with 4 workers +# privileged: true +# +# redis: +# image: redis:7-alpine +# volumes: +# - redis_data:/data +# +# volumes: +# redis_data: + +# Example 2: Kubernetes Deployment +# -------------------------------- +# Use the Helm chart or create deployments manually: +# +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: fetchml-api +# spec: +# replicas: 1 +# template: +# spec: +# containers: +# - name: api +# image: fetchml-api:latest +# env: +# - name: FETCHML_CONFIG_PATH +# value: /app/configs/prod.yaml +# --- +# apiVersion: apps/v1 +# kind: Deployment +# metadata: +# name: fetchml-worker +# spec: +# replicas: 4 +# template: +# spec: +# containers: +# - name: worker +# image: fetchml-worker:latest +# env: +# - name: FETCHML_WORKER_ID +# valueFrom: +# fieldRef: +# fieldPath: metadata.name +# - name: FETCHML_CONFIG_PATH +# value: /app/configs/worker.yaml +# securityContext: +# privileged: true + +# Example 3: Systemd Service (Bare Metal) +# ---------------------------------------- +# /etc/systemd/system/fetchml-api.service: +# +# [Unit] +# Description=FetchML API Server +# After=network.target redis.service +# +# [Service] +# ExecStart=/usr/local/bin/fetchml-api -config /etc/fetchml/api.yaml +# Restart=always +# User=fetchml +# Group=fetchml +# +# [Install] +# WantedBy=multi-user.target +# +# /etc/systemd/system/fetchml-worker@.service (template): +# +# [Unit] +# Description=FetchML Worker %i +# After=fetchml-api.service +# +# [Service] +# ExecStart=/usr/local/bin/fetchml-worker -config /etc/fetchml/worker.yaml +# Environment="FETCHML_WORKER_ID=worker-%i" +# Restart=always +# +# [Install] +# WantedBy=multi-user.target +# +# Enable 4 workers: +# systemctl enable fetchml-worker@{1..4} + +# ============================================================================= +# MONITORING & MAINTENANCE +# ============================================================================= + +# Check worker status +# $ curl http://localhost:9101/api/v1/workers + +# 
View Redis queue depth +# $ redis-cli LLEN fetchml:queue:pending + +# View worker heartbeats +# $ redis-cli HGETALL fetchml:worker:heartbeats + +# Restart all workers +# $ docker-compose restart worker + +# Scale workers dynamically +# $ docker-compose up -d --scale worker=8 + +# ============================================================================= +# TROUBLESHOOTING +# ============================================================================= + +# Workers not appearing in API: +# 1. Check Redis connection: redis-cli ping +# 2. Verify worker logs: docker logs <container-name> +# 3. Check FETCHML_WORKER_ID is unique per worker +# 4. Verify worker mode is "distributed" not "standalone" + +# Jobs stuck in queue: +# 1. Check worker capacity not exceeded +# 2. Verify workers are healthy: docker ps +# 3. Check Redis queue: redis-cli LRANGE fetchml:queue:pending 0 10 +# 4. Review worker logs for errors + +# High job latency: +# 1. Consider increasing max_concurrent_jobs per worker +# 2. Scale to more workers: docker-compose up -d --scale worker=8 +# 3. Check if jobs are CPU-bound (increase podman_cpus) +# 4. Consider using "least-loaded" strategy instead of "round-robin" + +# ============================================================================= +# SECURITY BEST PRACTICES +# ============================================================================= + +# 1. Firewall Redis - never expose to public internet +# 2. Use Redis AUTH password in production +# 3. Run workers with minimal privileges (use custom seccomp profile) +# 4. Enable TLS for API server in production +# 5. Use separate networks for API/Worker/Redis communication +# 6. 
Regular security updates for Podman and container images diff --git a/configs/scheduler/scheduler.yaml.example b/configs/scheduler/scheduler.yaml.example deleted file mode 100644 index b7732db..0000000 --- a/configs/scheduler/scheduler.yaml.example +++ /dev/null @@ -1,59 +0,0 @@ -# Scheduler Configuration Example -# Copy this to scheduler.yaml and replace placeholders with real values -# DO NOT commit the actual scheduler.yaml with real tokens - -scheduler: - bind_addr: "0.0.0.0:7777" - - # Auto-generate self-signed certs if files don't exist - auto_generate_certs: true - cert_file: "/etc/fetch_ml/scheduler.crt" - key_file: "/etc/fetch_ml/scheduler.key" - - state_dir: "/var/lib/fetch_ml" - - default_batch_slots: 3 - default_service_slots: 1 - - starvation_threshold_mins: 5 - priority_aging_rate: 0.1 - - gang_alloc_timeout_secs: 60 - acceptance_timeout_secs: 30 - - metrics_addr: "0.0.0.0:9090" - - # Generate tokens using: openssl rand -hex 32 - # Example: wkr_abc123... (64 hex chars after wkr_) - worker_tokens: - - id: "worker-01" - token: "wkr_PLACEHOLDER_GENERATE_WITH_OPENSSL_RAND_HEX_32" - - id: "worker-02" - token: "wkr_PLACEHOLDER_GENERATE_WITH_OPENSSL_RAND_HEX_32" - - # Plugin GPU Quota Configuration - # Controls GPU allocation for plugin-based services (Jupyter, vLLM, etc.) 
- plugin_quota: - enabled: false # Enable quota enforcement (default: false) - total_gpus: 16 # Global GPU limit across all plugins (0 = unlimited) - per_user_gpus: 4 # Default per-user GPU limit (0 = unlimited) - per_user_services: 2 # Default per-user service count limit (0 = unlimited) - - # Plugin-specific limits (optional) - per_plugin_limits: - vllm: - max_gpus: 8 # Max GPUs for vLLM across all users - max_services: 4 # Max vLLM service instances - jupyter: - max_gpus: 4 # Max GPUs for Jupyter across all users - max_services: 10 # Max Jupyter service instances - - # Per-user overrides (optional) - user_overrides: - admin: - max_gpus: 8 # Admin gets more GPUs - max_services: 5 # Admin can run more services - allowed_plugins: ["jupyter", "vllm"] # Restrict which plugins user can use - researcher1: - max_gpus: 2 # Limited GPU access - max_services: 1 # Single service limit diff --git a/configs/schema/worker_config_schema.yaml b/configs/schema/worker_config_schema.yaml index 66df194..7d5ec52 100644 --- a/configs/schema/worker_config_schema.yaml +++ b/configs/schema/worker_config_schema.yaml @@ -38,7 +38,7 @@ required: - podman_image - container_workspace - container_results - - train_script + - entrypoint properties: host: type: string @@ -57,9 +57,9 @@ properties: base_path: type: string description: Base path for worker operations - train_script: + entrypoint: type: string - description: Path to training script + description: Entrypoint script or command (e.g., train.py, run.sh, /bin/bash -c "echo hello") - supports Python scripts, shell scripts, or direct commands redis_url: type: string description: Legacy Redis URL (if set, redis_addr/password/db are derived) diff --git a/configs/worker/dev-local.yaml b/configs/worker/dev-local.yaml deleted file mode 100644 index 187e53d..0000000 --- a/configs/worker/dev-local.yaml +++ /dev/null @@ -1,87 +0,0 @@ -worker_id: "local-worker" -base_path: "data/dev/experiments" -train_script: "train.py" - -redis_url: 
"redis://localhost:6379/0" - -local_mode: true - -prewarm_enabled: false - -max_workers: 2 -poll_interval_seconds: 2 - -auto_fetch_data: false - -data_manager_path: "./data_manager" -dataset_cache_ttl: "30m" - -data_dir: "data/dev/active" - -snapshot_store: - enabled: false - -podman_image: "python:3.9-slim" -container_workspace: "/workspace" -container_results: "/results" -gpu_devices: [] -gpu_vendor: "apple" -gpu_visible_devices: [] - -# Apple M-series GPU configuration -apple_gpu: - enabled: true - metal_device: "/dev/metal" - mps_runtime: "/dev/mps" - -resources: - max_workers: 2 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "4Gi" - -metrics: - enabled: false - -queue: - type: "native" - native: - data_dir: "data/dev/queue" - -# Plugin Configuration (for local development) -plugins: - # Jupyter Notebook/Lab Service - jupyter: - enabled: true - image: "quay.io/jupyter/base-notebook:latest" - default_port: 8888 - mode: "lab" - # Security settings - security: - trusted_channels: - - "conda-forge" - - "defaults" - blocked_packages: [] # Less restrictive for local dev - require_password: false # No password for local dev - # Resource limits - max_gpu_per_instance: 1 - max_memory_per_instance: "4Gi" - - # vLLM Inference Service - vllm: - enabled: true - image: "vllm/vllm-openai:latest" - default_port: 8000 - # Model cache location - model_cache: "data/dev/models" - # Supported quantization methods: awq, gptq, fp8, squeezellm - default_quantization: "" # No quantization for dev (better quality) - # Resource limits - max_gpu_per_instance: 1 - max_model_len: 2048 - tensor_parallel_size: 1 - -task_lease_duration: "30m" -heartbeat_interval: "1m" -max_retries: 3 -graceful_timeout: "5m" diff --git a/configs/worker/distributed/worker.yaml.example b/configs/worker/distributed/worker.yaml.example deleted file mode 100644 index 8465519..0000000 --- a/configs/worker/distributed/worker.yaml.example +++ /dev/null @@ -1,33 +0,0 @@ -# Distributed Worker 
Configuration Example -# Copy this to worker.yaml and replace placeholders with real values -# DO NOT commit the actual worker.yaml with real tokens - -node: - role: "worker" - id: "" # Auto-generated UUID if empty - -worker: - mode: "distributed" - max_workers: 3 - -scheduler: - address: "192.168.1.10:7777" - cert: "/etc/fetch_ml/scheduler.crt" - # Copy token from scheduler config for this worker - token: "wkr_COPY_FROM_SCHEDULER_CONFIG" - -slots: - service_slots: 1 - ports: - service_range_start: 8000 - service_range_end: 8099 - -gpu: - vendor: "auto" - -prewarm: - enabled: true - -log: - level: "info" - format: "json" diff --git a/configs/worker/docker-dev.yaml b/configs/worker/docker-dev.yaml index b0cd77e..a3a6f1d 100644 --- a/configs/worker/docker-dev.yaml +++ b/configs/worker/docker-dev.yaml @@ -1,58 +1,39 @@ -worker_id: "docker-worker" -base_path: "/data/experiments" -train_script: "train.py" +# Development mode worker configuration +# Relaxed validation for fast iteration +host: localhost +port: 22 +user: dev-user +base_path: /tmp/fetchml_dev +entrypoint: train.py -redis_url: "redis://redis:6379/0" +# Redis configuration +redis_url: redis://redis:6379 -local_mode: true +# Development mode - relaxed security +compliance_mode: dev +max_workers: 4 -prewarm_enabled: true +# Worker mode - must be "distributed" to use scheduler +mode: distributed -max_workers: 1 -poll_interval_seconds: 2 +# Sandbox settings (relaxed for development) +sandbox: + network_mode: bridge + seccomp_profile: "" + no_new_privileges: false + allowed_secrets: [] # All secrets allowed in dev -auto_fetch_data: false +# GPU configuration +gpu_vendor: none -data_manager_path: "./data_manager" -dataset_cache_ttl: "30m" +# Artifact handling (relaxed limits) +max_artifact_files: 10000 +max_artifact_total_bytes: 1073741824 # 1GB -data_dir: "/data/active" +# Provenance (disabled in dev for speed) +provenance_best_effort: false -snapshot_store: - enabled: true - endpoint: "minio:9000" - secure: 
false - bucket: "fetchml-snapshots" - prefix: "snapshots" - timeout: "2m" - max_retries: 3 - -podman_image: "python:3.9-slim" -container_workspace: "/workspace" -container_results: "/results" -gpu_devices: - - "/dev/dri" -gpu_vendor: "apple" -gpu_visible_devices: [] - -# Apple M-series GPU configuration -apple_gpu: - enabled: true - metal_device: "/dev/metal" - mps_runtime: "/dev/mps" - -resources: - max_workers: 1 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "4Gi" - -metrics: - enabled: true - listen_addr: ":9100" - metrics_flush_interval: "500ms" - -# Plugin Configuration +# Plugin Configuration (development mode) plugins: # Jupyter Notebook/Lab Service jupyter: @@ -60,14 +41,12 @@ plugins: image: "quay.io/jupyter/base-notebook:latest" default_port: 8888 mode: "lab" - # Security settings security: trusted_channels: - "conda-forge" - "defaults" - blocked_packages: [] # Dev environment - less restrictive + blocked_packages: [] # No restrictions in dev require_password: false # No password for dev - # Resource limits max_gpu_per_instance: 1 max_memory_per_instance: "4Gi" @@ -76,16 +55,7 @@ plugins: enabled: true image: "vllm/vllm-openai:latest" default_port: 8000 - # Model cache location - model_cache: "/models" - # Supported quantization methods: awq, gptq, fp8, squeezellm + model_cache: "/tmp/models" # Temp location for dev default_quantization: "" # No quantization for dev - # Resource limits max_gpu_per_instance: 1 max_model_len: 2048 - tensor_parallel_size: 1 - -task_lease_duration: "30m" -heartbeat_interval: "1m" -max_retries: 3 -graceful_timeout: "5m" diff --git a/configs/worker/docker-prod.yaml b/configs/worker/docker-prod.yaml index 371b218..f976a27 100644 --- a/configs/worker/docker-prod.yaml +++ b/configs/worker/docker-prod.yaml @@ -1,6 +1,6 @@ worker_id: "docker-worker" base_path: "/tmp/fetchml-jobs" -train_script: "train.py" +entrypoint: "train.py" redis_url: "redis://redis:6379/0" diff --git a/configs/worker/docker.yaml 
b/configs/worker/docker.yaml deleted file mode 100644 index 15cb93f..0000000 --- a/configs/worker/docker.yaml +++ /dev/null @@ -1,43 +0,0 @@ -worker_id: "docker-worker" -base_path: "/tmp/fetchml-jobs" -train_script: "train.py" - -redis_addr: "redis:6379" -redis_password: "" -redis_db: 0 - -local_mode: true - -max_workers: 1 -poll_interval_seconds: 5 - -auto_fetch_data: false - -data_manager_path: "./data_manager" -dataset_cache_ttl: "30m" - -snapshot_store: - enabled: false - -podman_image: "python:3.9-slim" -container_workspace: "/workspace" -container_results: "/results" -gpu_devices: [] -gpu_vendor: "none" -gpu_visible_devices: [] - -resources: - max_workers: 1 - desired_rps_per_worker: 2 - podman_cpus: "2" - podman_memory: "4Gi" - -metrics: - enabled: true - listen_addr: ":9100" -metrics_flush_interval: "500ms" - -task_lease_duration: "30m" -heartbeat_interval: "1m" -max_retries: 3 -graceful_timeout: "5m" diff --git a/configs/worker/homelab-sandbox.yaml b/configs/worker/homelab-sandbox.yaml deleted file mode 100644 index 11d5d09..0000000 --- a/configs/worker/homelab-sandbox.yaml +++ /dev/null @@ -1,93 +0,0 @@ -# Worker configuration with sandboxing enabled -# This configuration provides strict isolation for sensitive workloads - -host: "127.0.0.1" -user: "worker" -base_path: "/var/lib/fetchml/experiments" -max_workers: 4 - -# Sandboxing configuration -sandbox: - # Network isolation: "none" (no network), "slirp4netns" (user-mode networking), - # "bridge" (bridge networking), or "" (default) - network_mode: "none" - - # Mount root filesystem as read-only - read_only_root: true - - # Enable secret injection for API keys - allow_secrets: true - - # Allowed secrets (explicit allowlist for security) - allowed_secrets: - - HF_TOKEN # Hugging Face API token - - WANDB_API_KEY # Weights & Biases API key - - OPENAI_API_KEY # OpenAI API key - - ANTHROPIC_API_KEY # Anthropic API key - - AWS_ACCESS_KEY_ID # AWS credentials - - AWS_SECRET_ACCESS_KEY - - # Seccomp profile for 
syscall filtering - seccomp_profile: "ml-research.json" - - # Maximum runtime before automatic termination (hours) - max_runtime_hours: 48 - -# Resource limits -resources: - max_memory_gb: 64 - max_cpu_cores: 16 - max_gpu_devices: 4 - -# Podman configuration -podman_image: "fetchml-ml:latest" -gpu_vendor: "nvidia" - -# Queue backend -queue: - backend: "redis" - redis_url: "redis://localhost:6379/0" - -# Plugin Configuration -plugins: - # Jupyter Notebook/Lab Service - jupyter: - enabled: true - image: "quay.io/jupyter/base-notebook:latest" - default_port: 8888 - mode: "lab" # "lab" or "notebook" - # Security settings - security: - trusted_channels: - - "conda-forge" - - "defaults" - - "pytorch" - - "nvidia" - blocked_packages: - - "requests" - - "urllib3" - - "httpx" - - "socket" - - "subprocess" - require_password: true - # Resource limits - max_gpu_per_instance: 1 - max_memory_per_instance: "16Gi" - - # vLLM Inference Service - vllm: - enabled: true - image: "vllm/vllm-openai:latest" - default_port: 8000 - # Model cache location (should be on fast storage) - model_cache: "/var/lib/fetchml/models" - # Supported quantization methods: awq, gptq, fp8, squeezellm - default_quantization: "" - # Resource limits - max_gpu_per_instance: 2 - max_model_len: 4096 - tensor_parallel_size: 1 - -# Snapshot store (optional) -snapshot_store: - enabled: false diff --git a/configs/worker/homelab-secure.yaml b/configs/worker/homelab-secure.yaml index 009cd36..30257d4 100644 --- a/configs/worker/homelab-secure.yaml +++ b/configs/worker/homelab-secure.yaml @@ -1,6 +1,6 @@ worker_id: "homelab-worker" base_path: "/tmp/fetchml-jobs" -train_script: "train.py" +entrypoint: "train.py" redis_url: "redis://:${REDIS_PASSWORD}@redis:6379/0" diff --git a/configs/worker/standalone/worker.yaml.example b/configs/worker/standalone/worker.yaml.example deleted file mode 100644 index c9e5ee8..0000000 --- a/configs/worker/standalone/worker.yaml.example +++ /dev/null @@ -1,32 +0,0 @@ -# Standalone Worker 
Configuration Example -# Copy this to worker.yaml and adjust for your environment - -node: - role: "worker" - id: "" - -worker: - mode: "standalone" - max_workers: 3 - -queue: - backend: "redis" - redis_addr: "localhost:6379" - redis_password: "" # Set if Redis requires auth - redis_db: 0 - -slots: - service_slots: 1 - ports: - service_range_start: 8000 - service_range_end: 8099 - -gpu: - vendor: "auto" - -prewarm: - enabled: true - -log: - level: "info" - format: "json"