fetch_ml/configs/schema/worker_config_schema.yaml
Jeremie Fraeys 8a7e7695f4
config: consolidate and cleanup configuration files
- Remove redundant config examples (distributed/, standalone/, examples/)
- Delete dev-local.yaml variants (use dev.yaml with env vars)
- Delete prod.yaml (use multi-user.yaml or homelab-secure.yaml)
- Clean up worker configs: remove docker.yaml, homelab-sandbox.yaml
- Update remaining configs with current best practices
- Simplify config schema and documentation
2026-03-04 13:22:52 -05:00

272 lines
7.1 KiB
YAML

# JSON Schema (draft-07) for the Fetch ML worker configuration.
# The root must be a mapping; top-level keys not declared under `properties`
# are rejected.
$schema: 'http://json-schema.org/draft-07/schema#'
title: 'Fetch ML Worker Configuration'
type: object
additionalProperties: false
# Cross-field constraints. Every entry of `allOf` must hold.
allOf:
  # Forbid both index and UUID at once (allow zero or one).
  - not:
      required: [gpu_visible_devices, gpu_visible_device_ids]

  # sqlite backend: queue.sqlite_path is mandatory.
  # `required: [backend]` inside the condition matters: `properties` alone
  # matches vacuously when `backend` is omitted, which would wrongly demand
  # sqlite_path from a config that relies on the default (redis) backend.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              const: sqlite
    then:
      properties:
        queue:
          required: [sqlite_path]

  # filesystem backend: queue.filesystem_path is mandatory.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              const: filesystem
    then:
      properties:
        queue:
          required: [filesystem_path]

  # Every other case (backend is redis, backend is omitted, or there is no
  # queue section at all) uses redis, so a redis address or URL is needed.
  # Kept as its own condition so that a sqlite config is not forced to
  # provide redis settings.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              enum: [sqlite, filesystem]
    else:
      anyOf:
        - required: [redis_addr]
        - required: [redis_url]
# Keys that every worker configuration must define.
required:
  - base_path
  - worker_id
  - podman_image
  - container_workspace
  - container_results
  - entrypoint
# Declared configuration keys. Because the root sets
# `additionalProperties: false`, anything not listed here is rejected.
properties:
  # ---- SSH connection to a remote worker ----
  host:
    type: string
    description: SSH host for remote worker
  user:
    type: string
    description: SSH user for remote worker
  ssh_key:
    type: string
    description: Path to SSH private key
  port:
    type: integer
    minimum: 1
    maximum: 65535
    description: SSH port

  # ---- Worker basics ----
  base_path:
    type: string
    description: Base path for worker operations
  entrypoint:
    type: string
    description: >-
      Entrypoint script or command (e.g., train.py, run.sh,
      /bin/bash -c "echo hello") - supports Python scripts, shell scripts,
      or direct commands

  # ---- Redis connection ----
  redis_url:
    type: string
    description: Legacy Redis URL (if set, redis_addr/password/db are derived)
  redis_addr:
    type: string
    description: Redis server address
  redis_password:
    type: string
    description: Redis password
  redis_db:
    type: integer
    minimum: 0
    default: 0
    description: Redis database number

  # ---- Queue backend selection ----
  queue:
    type: object
    description: Queue backend configuration (optional; defaults to redis)
    additionalProperties: false
    properties:
      backend:
        type: string
        enum: [redis, sqlite, filesystem]
        default: redis
      sqlite_path:
        type: string
        description: Path to queue.db (sqlite backend only)
      filesystem_path:
        type: string
        description: Base directory for filesystem queue state
      fallback_to_filesystem:
        type: boolean
        default: false
        description: >-
          If true, fall back to filesystem queue when primary backend is
          unavailable

  known_hosts:
    type: string
    description: Path to SSH known hosts file
  worker_id:
    type: string
    minLength: 1
    description: Unique worker identifier
  max_workers:
    type: integer
    minimum: 1
    description: Maximum number of concurrent workers
  poll_interval_seconds:
    type: integer
    minimum: 1
    description: Polling interval in seconds
  local_mode:
    type: boolean
    default: false
    description: Run in local mode without SSH

  # ---- Resource limits ----
  resources:
    type: object
    description: Resource configuration
    additionalProperties: false
    properties:
      max_workers:
        type: integer
        minimum: 1
      desired_rps_per_worker:
        type: integer
        minimum: 1
      requests_per_sec:
        type: integer
        minimum: 1
      podman_cpus:
        type: string
      podman_memory:
        type: string
      request_burst:
        type: integer
        minimum: 1

  # Free-form mapping: any keys are accepted under `auth`.
  auth:
    type: object
    description: Authentication configuration
    additionalProperties: true

  # ---- Metrics ----
  metrics:
    type: object
    description: Metrics configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      listen_addr:
        type: string
        default: ':9100'
  # NOTE(review): treated as a top-level key (not a member of `metrics`);
  # confirm against the worker config loader.
  metrics_flush_interval:
    type: string
    description: Duration string (e.g., "500ms")
    default: '500ms'

  # ---- Data management ----
  data_manager_path:
    type: string
    description: Path to data manager
    default: './data_manager'
  auto_fetch_data:
    type: boolean
    default: false
    description: Automatically fetch data
  data_dir:
    type: string
    description: Data directory
  dataset_cache_ttl:
    type: string
    description: Dataset cache TTL duration
    default: '30m'

  # ---- Snapshot store ----
  snapshot_store:
    type: object
    description: >-
      Optional S3-compatible snapshot store configuration
      (worker pulls snapshots by snapshot_id)
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      endpoint:
        type: string
        description: >-
          S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
      secure:
        type: boolean
        default: true
      region:
        type: string
      bucket:
        type: string
      prefix:
        type: string
        description: Object key prefix where snapshots are stored
      access_key:
        type: string
        description: >-
          Optional static access key (otherwise uses env credentials)
      secret_key:
        type: string
        description: >-
          Optional static secret key (otherwise uses env credentials)
      session_token:
        type: string
        description: Optional session token for temporary credentials
      timeout:
        type: string
        description: Duration string (e.g., "10m")
        default: '10m'
      max_retries:
        type: integer
        minimum: 0
        default: 3
  # NOTE(review): treated as a top-level key (not a member of
  # `snapshot_store`); confirm against the worker config loader.
  prewarm_enabled:
    type: boolean
    description: >-
      Enable best-effort prewarming of next task artifacts
      (snapshot/datasets/env image). Default off.
    default: false

  # ---- Container runtime ----
  podman_image:
    type: string
    minLength: 1
    description: Podman image to use
  container_workspace:
    type: string
    description: Container workspace path
  container_results:
    type: string
    description: Container results path

  # ---- GPU exposure ----
  gpu_devices:
    type: array
    description: >-
      GPU device paths to expose to the container (e.g. ["/dev/dri"]).
    items:
      type: string
  gpu_vendor:
    type: string
    enum: [nvidia, amd, apple, none]
    description: >-
      GPU vendor/runtime selection for env var injection
      (nvidia|amd|apple|none).
    default: 'none'
  gpu_visible_devices:
    type: array
    description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
    items:
      type: integer
  gpu_visible_device_ids:
    type: array
    description: >-
      NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES
      (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
    items:
      type: string
  apple_gpu:
    type: object
    description: Apple M-series GPU configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      metal_device:
        type: string
        description: Path to Metal device node (e.g. /dev/metal)
      mps_runtime:
        type: string
        description: Path to MPS runtime device node (e.g. /dev/mps)

  # ---- Task lifecycle ----
  task_lease_duration:
    type: string
    description: Task lease duration
    default: '30m'
  heartbeat_interval:
    type: string
    description: Heartbeat interval
    default: '1m'
  max_retries:
    type: integer
    minimum: 0
    default: 3
    description: Maximum retry attempts
  graceful_timeout:
    type: string
    description: Graceful shutdown timeout
    default: '5m'