---
# JSON Schema (draft-07) for the Fetch ML worker configuration.
# The root must be an object; keys not declared under `properties` are rejected.
$schema: 'http://json-schema.org/draft-07/schema#'
title: 'Fetch ML Worker Configuration'
type: object
additionalProperties: false
# Cross-field rules that cannot be expressed on a single property.
allOf:
  # GPU selection: gpu_visible_devices (indices) and gpu_visible_device_ids
  # (UUIDs) are mutually exclusive; zero or one of them may be present.
  - not:
      required: [gpu_visible_devices, gpu_visible_device_ids]
  # sqlite backend: queue.sqlite_path is mandatory.
  # `required: [backend]` inside the `if` is deliberate: a `properties`
  # test passes vacuously when the key is absent, so without it a queue
  # object that omits `backend` would be treated as if it were sqlite.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              const: sqlite
    then:
      properties:
        queue:
          required: [sqlite_path]
  # filesystem backend: queue.filesystem_path is mandatory.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              const: filesystem
    then:
      properties:
        queue:
          required: [filesystem_path]
  # redis backend: a Redis address is mandatory unless the backend is
  # explicitly sqlite or filesystem. This covers an absent `queue`, a `queue`
  # without `backend` (the declared default is redis) and `backend: redis`.
  # Kept as its own conditional so that a sqlite configuration is not forced
  # to provide Redis settings.
  - if:
      required: [queue]
      properties:
        queue:
          required: [backend]
          properties:
            backend:
              enum: [sqlite, filesystem]
    else:
      anyOf:
        - required: [redis_addr]
        - required: [redis_url]
# Keys that every worker configuration must provide at the root level.
required:
  - base_path
  - worker_id
  - podman_image
  - container_workspace
  - container_results
  - entrypoint
properties:
  # SSH connection settings for a remote worker.
  host:
    type: string
    description: SSH host for remote worker
  user:
    type: string
    description: SSH user for remote worker
  ssh_key:
    type: string
    description: Path to SSH private key
  # Restricted to the valid TCP port range.
  port:
    type: integer
    description: SSH port
    minimum: 1
    maximum: 65535
  # Both keys are listed in the root `required` array. An empty string is
  # rejected here, in line with the other required string keys (worker_id and
  # podman_image), which also declare minLength: 1.
  base_path:
    type: string
    description: Base path for worker operations
    minLength: 1
  entrypoint:
    type: string
    description: >-
      Entrypoint script or command (e.g., train.py, run.sh,
      /bin/bash -c "echo hello") - supports Python scripts, shell scripts,
      or direct commands
    minLength: 1
  # Redis connection settings.
  redis_url:
    type: string
    description: Legacy Redis URL (if set, redis_addr/password/db are derived)
  redis_addr:
    type: string
    description: Redis server address
  redis_password:
    type: string
    description: Redis password
  # Non-negative database index.
  redis_db:
    type: integer
    description: Redis database number
    minimum: 0
    default: 0
  # Queue backend selection plus per-backend storage locations.
  # Unknown keys inside `queue` are rejected.
  queue:
    type: object
    description: Queue backend configuration (optional; defaults to redis)
    additionalProperties: false
    properties:
      backend:
        type: string
        enum:
          - redis
          - sqlite
          - filesystem
        default: redis
      sqlite_path:
        type: string
        description: Path to queue.db (sqlite backend only)
      filesystem_path:
        type: string
        description: Base directory for filesystem queue state
      fallback_to_filesystem:
        type: boolean
        description: >-
          If true, fall back to filesystem queue when primary backend is
          unavailable
        default: false
  # SSH known-hosts file, worker identity, concurrency, polling and the
  # local-mode switch.
  known_hosts:
    type: string
    description: Path to SSH known hosts file
  # Must be a non-empty string.
  worker_id:
    type: string
    description: Unique worker identifier
    minLength: 1
  max_workers:
    type: integer
    description: Maximum number of concurrent workers
    minimum: 1
  poll_interval_seconds:
    type: integer
    description: Polling interval in seconds
    minimum: 1
  local_mode:
    type: boolean
    description: Run in local mode without SSH
    default: false
  # Resource settings; unknown keys inside `resources` are rejected.
  resources:
    type: object
    description: Resource configuration
    additionalProperties: false
    properties:
      # Integer settings: each must be at least 1 when present.
      max_workers:
        type: integer
        minimum: 1
      desired_rps_per_worker:
        type: integer
        minimum: 1
      requests_per_sec:
        type: integer
        minimum: 1
      request_burst:
        type: integer
        minimum: 1
      # String settings: the schema places no format constraint on them.
      podman_cpus:
        type: string
      podman_memory:
        type: string
  # Free-form object: any keys are accepted (additionalProperties: true).
  auth:
    type: object
    description: Authentication configuration
    additionalProperties: true
  # Metrics settings; unknown keys inside `metrics` are rejected.
  metrics:
    type: object
    description: Metrics configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      listen_addr:
        type: string
        default: ':9100'
  # Validated only as a string; the duration format is not checked here.
  metrics_flush_interval:
    type: string
    description: Duration string (e.g., "500ms")
    default: '500ms'
  # Data manager location, dataset fetching and dataset cache settings.
  data_manager_path:
    type: string
    description: Path to data manager
    default: './data_manager'
  auto_fetch_data:
    type: boolean
    description: Automatically fetch data
    default: false
  data_dir:
    type: string
    description: Data directory
  # Validated only as a string; the duration format is not checked here.
  dataset_cache_ttl:
    type: string
    description: Dataset cache TTL duration
    default: '30m'
  # Snapshot store settings; unknown keys inside `snapshot_store` are rejected.
  snapshot_store:
    type: object
    description: >-
      Optional S3-compatible snapshot store configuration
      (worker pulls snapshots by snapshot_id)
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      endpoint:
        type: string
        description: >-
          S3-compatible endpoint (e.g. "s3.amazonaws.com" or "minio:9000")
      secure:
        type: boolean
        default: true
      region:
        type: string
      bucket:
        type: string
      prefix:
        type: string
        description: Object key prefix where snapshots are stored
      # Static credentials are optional; per the descriptions below,
      # environment credentials are used when they are omitted.
      access_key:
        type: string
        description: Optional static access key (otherwise uses env credentials)
      secret_key:
        type: string
        description: Optional static secret key (otherwise uses env credentials)
      session_token:
        type: string
        description: Optional session token for temporary credentials
      timeout:
        type: string
        description: Duration string (e.g., "10m")
        default: '10m'
      max_retries:
        type: integer
        minimum: 0
        default: 3
  prewarm_enabled:
    type: boolean
    description: >-
      Enable best-effort prewarming of next task artifacts
      (snapshot/datasets/env image). Default off.
    default: false
  # All three keys are listed in the root `required` array, so each must be a
  # non-empty string; an empty value is rejected for the container paths just
  # as it is for podman_image and worker_id.
  podman_image:
    type: string
    description: Podman image to use
    minLength: 1
  container_workspace:
    type: string
    description: Container workspace path
    minLength: 1
  container_results:
    type: string
    description: Container results path
    minLength: 1
  # GPU exposure settings.
  gpu_devices:
    type: array
    description: >-
      GPU device paths to expose to the container (e.g. ["/dev/dri"]).
    items:
      type: string
  # 'none' is quoted so it is always read as a string.
  gpu_vendor:
    type: string
    description: >-
      GPU vendor/runtime selection for env var injection
      (nvidia|amd|apple|none).
    enum: [nvidia, amd, apple, 'none']
    default: 'none'
  # gpu_visible_devices and gpu_visible_device_ids must not both be set;
  # the root-level allOf enforces this.
  gpu_visible_devices:
    type: array
    description: GPU indices to expose via vendor-specific env (e.g. [0,1]).
    items:
      type: integer
  gpu_visible_device_ids:
    type: array
    description: >-
      NVIDIA GPU UUIDs to expose via CUDA_VISIBLE_DEVICES (e.g. ["GPU-..."]).
      Mutually exclusive with gpu_visible_devices.
    items:
      type: string
  # Apple GPU settings; unknown keys inside `apple_gpu` are rejected.
  apple_gpu:
    type: object
    description: Apple M-series GPU configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      metal_device:
        type: string
        description: Path to Metal device node (e.g. /dev/metal)
      mps_runtime:
        type: string
        description: Path to MPS runtime device node (e.g. /dev/mps)
  # Task lifecycle settings. The duration-valued keys are validated only as
  # strings; their format is not checked by this schema.
  task_lease_duration:
    type: string
    description: Task lease duration
    default: '30m'
  heartbeat_interval:
    type: string
    description: Heartbeat interval
    default: '1m'
  max_retries:
    type: integer
    description: Maximum retry attempts
    minimum: 0
    default: 3
  graceful_timeout:
    type: string
    description: Graceful shutdown timeout
    default: '5m'