# - Remove redundant config examples (distributed/, standalone/, examples/)
# - Delete dev-local.yaml variants (use dev.yaml with env vars)
# - Delete prod.yaml (use multi-user.yaml or homelab-secure.yaml)
# - Clean up worker configs: remove docker.yaml, homelab-sandbox.yaml
# - Update remaining configs with current best practices
# - Simplify config schema and documentation
#
# 272 lines | 7.1 KiB | YAML
# JSON Schema (draft-07) for the Fetch ML worker configuration file.
$schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML Worker Configuration"
# The configuration document itself must be a mapping.
type: object
# Top-level keys that are not declared under `properties` are rejected.
additionalProperties: false
# Cross-field constraints. Every entry below must hold.
allOf:
  # Forbid setting both the GPU index list and the GPU UUID list at once
  # (zero or one of them is allowed).
  - not:
      required: [gpu_visible_devices, gpu_visible_device_ids]

  # sqlite backend: queue.sqlite_path is mandatory.
  # `required: [backend]` inside the condition matters: without it, `properties`
  # matches vacuously when `backend` is absent and this rule would fire for any
  # `queue` object that simply omits `backend`.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              const: sqlite
    then:
      properties:
        queue:
          required: [sqlite_path]

  # filesystem backend: queue.filesystem_path is mandatory.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              const: filesystem
    then:
      properties:
        queue:
          required: [filesystem_path]

  # Any other case (backend explicitly `redis`, or `queue`/`backend` omitted,
  # which defaults to redis): a Redis address or legacy URL must be supplied.
  # Kept as its own rule so that the sqlite backend is not forced to provide
  # Redis settings.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              enum: [sqlite, filesystem]
    else:
      anyOf:
        - required: [redis_addr]
        - required: [redis_url]
# Keys that every worker configuration must provide.
required:
  - base_path
  - worker_id
  - podman_image
  - container_workspace
  - container_results
  - entrypoint
# Accepted top-level configuration keys.
# NOTE(review): nesting was reconstructed from a copy whose indentation had been
# stripped. Parent/child placement follows key order and the cross-field rules
# that reference queue.sqlite_path / queue.filesystem_path; the placements
# flagged with NOTE(review) below should be confirmed against the config loader.
properties:
  # --- SSH access to a remote worker ---
  host:
    type: string
    description: SSH host for remote worker
  user:
    type: string
    description: SSH user for remote worker
  ssh_key:
    type: string
    description: Path to SSH private key
  port:
    type: integer
    minimum: 1
    maximum: 65535
    description: SSH port

  # --- Worker base path and job entrypoint (both required) ---
  base_path:
    type: string
    description: Base path for worker operations
  entrypoint:
    type: string
    description: Entrypoint script or command (e.g., train.py, run.sh, /bin/bash -c "echo hello") - supports Python scripts, shell scripts, or direct commands

  # --- Redis connection ---
  redis_url:
    type: string
    description: Legacy Redis URL (if set, redis_addr/password/db are derived)
  redis_addr:
    type: string
    description: Redis server address
  redis_password:
    type: string
    description: Redis password
  redis_db:
    type: integer
    minimum: 0
    default: 0
    description: Redis database number

  # --- Queue backend selection ---
  queue:
    type: object
    description: Queue backend configuration (optional; defaults to redis)
    additionalProperties: false
    properties:
      backend:
        type: string
        enum: [redis, sqlite, filesystem]
        default: redis
      sqlite_path:
        type: string
        description: Path to queue.db (sqlite backend only)
      filesystem_path:
        type: string
        description: Base directory for filesystem queue state
      # NOTE(review): assumed to be a member of `queue` - confirm.
      fallback_to_filesystem:
        type: boolean
        default: false
        description: If true, fall back to filesystem queue when primary backend is unavailable

  known_hosts:
    type: string
    description: Path to SSH known hosts file

  # --- Worker identity, concurrency and polling ---
  worker_id:
    type: string
    minLength: 1
    description: Unique worker identifier
  max_workers:
    type: integer
    minimum: 1
    description: Maximum number of concurrent workers
  poll_interval_seconds:
    type: integer
    minimum: 1
    description: Polling interval in seconds
  local_mode:
    type: boolean
    default: false
    description: Run in local mode without SSH

  # --- Resource limits ---
  resources:
    type: object
    description: Resource configuration
    additionalProperties: false
    properties:
      max_workers:
        type: integer
        minimum: 1
      desired_rps_per_worker:
        type: integer
        minimum: 1
      requests_per_sec:
        type: integer
        minimum: 1
      podman_cpus:
        type: string
      podman_memory:
        type: string
      request_burst:
        type: integer
        minimum: 1

  # --- Authentication (free-form: any nested keys are accepted) ---
  auth:
    type: object
    description: Authentication configuration
    additionalProperties: true

  # --- Metrics ---
  metrics:
    type: object
    description: Metrics configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      listen_addr:
        type: string
        default: ":9100"
  # NOTE(review): assumed to be a top-level key rather than a member of
  # `metrics` - confirm.
  metrics_flush_interval:
    type: string
    description: Duration string (e.g., "500ms")
    default: "500ms"

  # --- Dataset handling ---
  data_manager_path:
    type: string
    description: Path to data manager
    default: "./data_manager"
  auto_fetch_data:
    type: boolean
    default: false
    description: Automatically fetch data
  data_dir:
    type: string
    description: Data directory
  dataset_cache_ttl:
    type: string
    description: Dataset cache TTL duration
    default: "30m"

  # --- Snapshot store ---
  snapshot_store:
    type: object
    description: Optional S3-compatible snapshot store configuration (worker pulls snapshots by snapshot_id)
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      endpoint:
        type: string
        description: S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
      secure:
        type: boolean
        default: true
      region:
        type: string
      bucket:
        type: string
      prefix:
        type: string
        description: Object key prefix where snapshots are stored
      access_key:
        type: string
        description: Optional static access key (otherwise uses env credentials)
      secret_key:
        type: string
        description: Optional static secret key (otherwise uses env credentials)
      session_token:
        type: string
        description: Optional session token for temporary credentials
      # NOTE(review): `timeout` and `max_retries` are assumed to belong to
      # `snapshot_store` (a separate top-level `max_retries` is declared
      # further down) - confirm.
      timeout:
        type: string
        description: Duration string (e.g., "10m")
        default: "10m"
      max_retries:
        type: integer
        minimum: 0
        default: 3

  # NOTE(review): assumed to be a top-level key - confirm.
  prewarm_enabled:
    type: boolean
    description: Enable best-effort prewarming of next task artifacts (snapshot/datasets/env image). Default off.
    default: false

  # --- Container settings (all three are required) ---
  podman_image:
    type: string
    minLength: 1
    description: Podman image to use
  container_workspace:
    type: string
    description: Container workspace path
  container_results:
    type: string
    description: Container results path

  # --- GPU exposure ---
  gpu_devices:
    type: array
    description: GPU device paths to expose to the container (e.g. ["/dev/dri"]).
    items:
      type: string
  gpu_vendor:
    type: string
    enum: [nvidia, amd, apple, none]
    description: GPU vendor/runtime selection for env var injection (nvidia|amd|apple|none).
    default: "none"
  # gpu_visible_devices and gpu_visible_device_ids may not both be set; the
  # schema's `allOf` section rejects configurations that contain both.
  gpu_visible_devices:
    type: array
    description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
    items:
      type: integer
  gpu_visible_device_ids:
    type: array
    description: NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]). Mutually exclusive with gpu_visible_devices.
    items:
      type: string
  apple_gpu:
    type: object
    description: Apple M-series GPU configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      metal_device:
        type: string
        description: Path to Metal device node (e.g. /dev/metal)
      mps_runtime:
        type: string
        description: Path to MPS runtime device node (e.g. /dev/mps)

  # --- Task lease, heartbeat, retry and shutdown settings ---
  task_lease_duration:
    type: string
    description: Task lease duration
    default: "30m"
  heartbeat_interval:
    type: string
    description: Heartbeat interval
    default: "1m"
  max_retries:
    type: integer
    minimum: 0
    default: 3
    description: Maximum retry attempts
  graceful_timeout:
    type: string
    description: Graceful shutdown timeout
    default: "5m"