---
# JSON Schema (draft-07) describing the Fetch ML worker configuration file.
#
# Layout:
#   * `allOf`      - cross-field rules (GPU selector exclusivity, per-backend
#                    queue requirements).
#   * `required`   - keys every worker config must provide.
#   * `properties` - per-key types, bounds, defaults and descriptions.
#
# Unknown top-level keys are rejected (`additionalProperties: false`).
$schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML Worker Configuration"
type: object
additionalProperties: false

allOf:
  # forbid both index and UUID at once (allow zero or one)
  - not:
      required: [gpu_visible_devices, gpu_visible_device_ids]

  # sqlite backend: queue.sqlite_path is mandatory.
  # `required: [backend]` inside the `if` matters: `properties` passes
  # vacuously for a missing key, so without it a `queue` object that omits
  # `backend` (i.e. the redis default) would be treated as sqlite.
  - if:
      properties:
        queue:
          properties:
            backend:
              const: sqlite
          required: [backend]
      required: [queue]
    then:
      properties:
        queue:
          required: [sqlite_path]

  # filesystem backend: queue.filesystem_path is mandatory.
  # Same `required: [backend]` guard as the sqlite rule above.
  - if:
      properties:
        queue:
          properties:
            backend:
              const: filesystem
          required: [backend]
      required: [queue]
    then:
      properties:
        queue:
          required: [filesystem_path]

  # redis backend: one of redis_addr / redis_url is mandatory.
  # Here the vacuous match is intentional: the `if` holds when `queue` is
  # absent, when `queue.backend` is absent (default is redis), or when
  # `queue.backend` is explicitly redis. It does not hold for sqlite or
  # filesystem, so those backends are not forced to carry Redis settings.
  - if:
      properties:
        queue:
          properties:
            backend:
              const: redis
    then:
      anyOf:
        - required: [redis_addr]
        - required: [redis_url]

required:
  - base_path
  - worker_id
  - podman_image
  - container_workspace
  - container_results
  - entrypoint

properties:
  # --- SSH connection to a remote worker ---------------------------------
  host:
    type: string
    description: SSH host for remote worker
  user:
    type: string
    description: SSH user for remote worker
  ssh_key:
    type: string
    description: Path to SSH private key
  port:
    type: integer
    minimum: 1
    maximum: 65535
    description: SSH port

  # --- Worker basics -------------------------------------------------------
  base_path:
    type: string
    description: Base path for worker operations
  entrypoint:
    type: string
    description: >-
      Entrypoint script or command (e.g., train.py, run.sh,
      /bin/bash -c "echo hello") - supports Python scripts, shell scripts,
      or direct commands

  # --- Redis connection ----------------------------------------------------
  redis_url:
    type: string
    description: Legacy Redis URL (if set, redis_addr/password/db are derived)
  redis_addr:
    type: string
    description: Redis server address
  redis_password:
    type: string
    description: Redis password
  redis_db:
    type: integer
    minimum: 0
    default: 0
    description: Redis database number

  # --- Queue backend -------------------------------------------------------
  # Per-backend required keys are enforced by the `allOf` rules above.
  queue:
    type: object
    description: Queue backend configuration (optional; defaults to redis)
    additionalProperties: false
    properties:
      backend:
        type: string
        enum: [redis, sqlite, filesystem]
        default: redis
      sqlite_path:
        type: string
        description: Path to queue.db (sqlite backend only)
      filesystem_path:
        type: string
        description: Base directory for filesystem queue state
      fallback_to_filesystem:
        type: boolean
        default: false
        description: >-
          If true, fall back to filesystem queue when primary backend is
          unavailable

  known_hosts:
    type: string
    description: Path to SSH known hosts file
  worker_id:
    type: string
    minLength: 1
    description: Unique worker identifier
  max_workers:
    type: integer
    minimum: 1
    description: Maximum number of concurrent workers
  poll_interval_seconds:
    type: integer
    minimum: 1
    description: Polling interval in seconds
  local_mode:
    type: boolean
    default: false
    description: Run in local mode without SSH

  # --- Resource limits -----------------------------------------------------
  resources:
    type: object
    description: Resource configuration
    additionalProperties: false
    properties:
      max_workers:
        type: integer
        minimum: 1
      desired_rps_per_worker:
        type: integer
        minimum: 1
      requests_per_sec:
        type: integer
        minimum: 1
      podman_cpus:
        type: string
      podman_memory:
        type: string
      request_burst:
        type: integer
        minimum: 1

  # Free-form object: any keys are accepted here.
  auth:
    type: object
    description: Authentication configuration
    additionalProperties: true

  # --- Metrics -------------------------------------------------------------
  metrics:
    type: object
    description: Metrics configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      listen_addr:
        type: string
        default: ":9100"
  metrics_flush_interval:
    type: string
    description: 'Duration string (e.g., "500ms")'
    default: "500ms"

  # --- Data management -----------------------------------------------------
  data_manager_path:
    type: string
    description: Path to data manager
    default: "./data_manager"
  auto_fetch_data:
    type: boolean
    default: false
    description: Automatically fetch data
  data_dir:
    type: string
    description: Data directory
  dataset_cache_ttl:
    type: string
    description: Dataset cache TTL duration
    default: "30m"

  # --- Snapshot store ------------------------------------------------------
  snapshot_store:
    type: object
    description: >-
      Optional S3-compatible snapshot store configuration (worker pulls
      snapshots by snapshot_id)
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      endpoint:
        type: string
        description: >-
          S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
      secure:
        type: boolean
        default: true
      region:
        type: string
      bucket:
        type: string
      prefix:
        type: string
        description: Object key prefix where snapshots are stored
      access_key:
        type: string
        description: Optional static access key (otherwise uses env credentials)
      secret_key:
        type: string
        description: Optional static secret key (otherwise uses env credentials)
      session_token:
        type: string
        description: Optional session token for temporary credentials
      timeout:
        type: string
        description: 'Duration string (e.g., "10m")'
        default: "10m"
      max_retries:
        type: integer
        minimum: 0
        default: 3

  prewarm_enabled:
    type: boolean
    description: >-
      Enable best-effort prewarming of next task artifacts
      (snapshot/datasets/env image). Default off.
    default: false

  # --- Container runtime ---------------------------------------------------
  podman_image:
    type: string
    minLength: 1
    description: Podman image to use
  container_workspace:
    type: string
    description: Container workspace path
  container_results:
    type: string
    description: Container results path

  # --- GPU selection -------------------------------------------------------
  # gpu_visible_devices and gpu_visible_device_ids may not both be set; see
  # the first `allOf` rule.
  gpu_devices:
    type: array
    description: >-
      GPU device paths to expose to the container (e.g. ["/dev/dri"]).
    items:
      type: string
  gpu_vendor:
    type: string
    enum: [nvidia, amd, apple, none]
    description: >-
      GPU vendor/runtime selection for env var injection
      (nvidia|amd|apple|none).
    default: "none"
  gpu_visible_devices:
    type: array
    description: >-
      GPU indices to expose via vendor-specific env (e.g. [0,1]).
    items:
      type: integer
  gpu_visible_device_ids:
    type: array
    description: >-
      NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]).
      Mutually exclusive with gpu_visible_devices.
    items:
      type: string
  apple_gpu:
    type: object
    description: Apple M-series GPU configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      metal_device:
        type: string
        description: Path to Metal device node (e.g. /dev/metal)
      mps_runtime:
        type: string
        description: Path to MPS runtime device node (e.g. /dev/mps)

  # --- Task lifecycle ------------------------------------------------------
  task_lease_duration:
    type: string
    description: Task lease duration
    default: "30m"
  heartbeat_interval:
    type: string
    description: Heartbeat interval
    default: "1m"
  max_retries:
    type: integer
    minimum: 0
    default: 3
    description: Maximum retry attempts
  graceful_timeout:
    type: string
    description: Graceful shutdown timeout
    default: "5m"