# Commit notes:
# - Move ci-test.sh and setup.sh to scripts/
# - Trim docs/src/zig-cli.md to current structure
# - Replace hardcoded secrets with placeholders in configs
# - Update .gitignore to block .env*, secrets/, keys, build artifacts
# - Slim README.md to reflect current CLI/TUI split
# - Add cleanup trap to ci-test.sh
# - Ensure no secrets are committed
#
# File info: 149 lines, 3.3 KiB, YAML
# JSON Schema (draft-07), written as YAML, for the Fetch ML worker
# configuration. Top-level keys not declared under `properties` are rejected
# because `additionalProperties` is false.
$schema: "http://json-schema.org/draft-07/schema#"
title: "Fetch ML Worker Configuration"
type: object
additionalProperties: false

# Keys that every worker configuration must provide.
required:
  - base_path
  - worker_id
  - redis_addr
  - podman_image
  - container_workspace
  - container_results
  - train_script

properties:
  # --- SSH connection to a remote worker ---
  host:
    type: string
    description: SSH host for remote worker
  user:
    type: string
    description: SSH user for remote worker
  ssh_key:
    type: string
    description: Path to SSH private key
  port:
    type: integer
    minimum: 1
    maximum: 65535
    description: SSH port

  # --- Paths ---
  base_path:
    type: string
    description: Base path for worker operations
  train_script:
    type: string
    description: Path to training script

  # --- Redis ---
  redis_addr:
    type: string
    description: Redis server address
  redis_password:
    type: string
    description: Redis password
  redis_db:
    type: integer
    minimum: 0
    default: 0
    description: Redis database number

  known_hosts:
    type: string
    description: Path to SSH known hosts file

  # --- Worker identity and scheduling ---
  worker_id:
    type: string
    minLength: 1
    description: Unique worker identifier
  max_workers:
    type: integer
    minimum: 1
    description: Maximum number of concurrent workers
  poll_interval_seconds:
    type: integer
    minimum: 1
    description: Polling interval in seconds
  local_mode:
    type: boolean
    default: false
    description: Run in local mode without SSH

  # Closed object: only the keys listed below are accepted.
  # NOTE(review): the six child keys (through request_burst) are assumed to
  # belong to `resources`; confirm against the worker's config loader.
  resources:
    type: object
    description: Resource configuration
    additionalProperties: false
    properties:
      max_workers:
        type: integer
        minimum: 1
      desired_rps_per_worker:
        type: integer
        minimum: 1
      requests_per_sec:
        type: integer
        minimum: 1
      podman_cpus:
        type: string
      podman_memory:
        type: string
      request_burst:
        type: integer
        minimum: 1

  # Open object: any keys are accepted (additionalProperties: true).
  auth:
    type: object
    description: Authentication configuration
    additionalProperties: true

  # Closed object: only `enabled` and `listen_addr` are accepted.
  metrics:
    type: object
    description: Metrics configuration
    additionalProperties: false
    properties:
      enabled:
        type: boolean
        default: false
      listen_addr:
        type: string
        default: ":9100"

  # NOTE(review): placed at the top level as a sibling of `metrics` based on
  # its prefixed name; confirm against the worker's config loader.
  metrics_flush_interval:
    type: string
    description: Duration string (e.g., "500ms")
    default: "500ms"

  # --- Data management ---
  data_manager_path:
    type: string
    description: Path to data manager
    default: "./data_manager"
  auto_fetch_data:
    type: boolean
    default: false
    description: Automatically fetch data
  data_dir:
    type: string
    description: Data directory
  dataset_cache_ttl:
    type: string
    description: Dataset cache TTL duration
    default: "30m"

  # --- Container runtime ---
  podman_image:
    type: string
    minLength: 1
    description: Podman image to use
  container_workspace:
    type: string
    description: Container workspace path
  container_results:
    type: string
    description: Container results path
  gpu_access:
    type: boolean
    default: false
    description: Enable GPU access

  # --- Task lifecycle (durations are strings such as "30m") ---
  task_lease_duration:
    type: string
    description: Task lease duration
    default: "30m"
  heartbeat_interval:
    type: string
    description: Heartbeat interval
    default: "1m"
  max_retries:
    type: integer
    minimum: 0
    default: 3
    description: Maximum retry attempts
  graceful_timeout:
    type: string
    description: Graceful shutdown timeout
    default: "5m"