fetch_ml/api/openapi.yaml
Jeremie Fraeys 1f495dfbb7
api: regenerate OpenAPI types and server code
- Update openapi.yaml spec
- Regenerate server_gen.go with oapi-codegen
- Update adapter, routes, and server configuration
2026-03-04 13:23:34 -05:00

1259 lines
33 KiB
YAML

---
# OpenAPI description of the FetchML worker HTTP API.
openapi: "3.0.3"
info:
  title: ML Worker API
  # Markdown overview: authentication rules and the shape of error bodies.
  description: |
    API for managing ML experiment tasks and Jupyter services.
    ## Security
    All endpoints (except health checks) require API key authentication via the
    `X-API-Key` header. Rate limiting is enforced per API key.
    ## Error Handling
    Errors follow a consistent format with machine-readable codes and trace IDs:
    ```json
    {
    "error": "Sanitized error message",
    "code": "ERROR_CODE",
    "trace_id": "uuid-for-support"
    }
    ```
  version: "1.0.0"
  contact:
    name: FetchML Support
# Base URLs clients may target.
servers:
  - url: "http://localhost:9101"
    description: Local development server
  - url: "https://api.fetchml.example.com"
    description: Production server
# Document-wide default: every operation requires the API key scheme unless
# the operation declares its own `security`.
security:
  - ApiKeyAuth: []
paths:
# Liveness probe.
/health:
  get:
    summary: Health check
    description: Returns server health status. No authentication required.
    # Empty requirement list: opts this operation out of the default API key
    # requirement.
    security: []
    responses:
      "200":
        description: Server is healthy
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/HealthResponse"
# Task collection: paged listing and submission.
/v1/tasks:
  get:
    summary: List tasks
    description: List all tasks with optional filtering
    parameters:
      # Status filter and paging controls.
      - name: status
        in: query
        schema:
          type: string
          enum:
            - queued
            - running
            - completed
            - failed
      - name: limit
        in: query
        schema:
          type: integer
          default: 50
          maximum: 1000
      - name: offset
        in: query
        schema:
          type: integer
          default: 0
      - name: user_id
        in: query
        description: Filter by user who submitted the task
        schema:
          type: string
      # NOTE(review): the descriptions of the next three parameters speak of
      # "this task", which reads like task-creation input rather than a list
      # filter — confirm they are intended on the list operation.
      - name: plugin_configs
        in: query
        description: Plugin configurations for this task
        schema:
          type: object
          additionalProperties:
            $ref: "#/components/schemas/PluginConfig"
      - name: node_count
        in: query
        description: Number of nodes for multi-node jobs
        schema:
          type: integer
          minimum: 1
          default: 1
      - name: reservation_id
        in: query
        description: Pre-reserved capacity for this task
        schema:
          type: string
    responses:
      "200":
        description: List of tasks
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/TaskList"
      "400":
        $ref: "#/components/responses/BadRequest"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "429":
        $ref: "#/components/responses/RateLimited"
  post:
    summary: Create task
    description: Submit a new ML experiment task
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/CreateTaskRequest"
    responses:
      # On success the created task is returned in the body.
      "201":
        description: Task created successfully
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Task"
      "400":
        $ref: "#/components/responses/BadRequest"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "422":
        $ref: "#/components/responses/ValidationError"
      "429":
        $ref: "#/components/responses/RateLimited"
# Single task addressed by its identifier.
/v1/tasks/{taskId}:
  get:
    summary: Get task details
    parameters:
      - name: taskId
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: Task details
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Task"
      # The document-wide API key requirement applies here, so an
      # authentication failure is a documented outcome.
      "401":
        $ref: "#/components/responses/Unauthorized"
      "404":
        $ref: "#/components/responses/NotFound"
  delete:
    summary: Cancel/delete task
    parameters:
      - name: taskId
        in: path
        required: true
        schema:
          type: string
    responses:
      # No response body on success.
      "204":
        description: Task cancelled
      "401":
        $ref: "#/components/responses/Unauthorized"
      "404":
        $ref: "#/components/responses/NotFound"
# Aggregate queue counters.
/v1/queue:
  get:
    summary: Queue status
    description: Get current queue statistics
    responses:
      "200":
        description: Queue statistics
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/QueueStats"
      "401":
        $ref: "#/components/responses/Unauthorized"
# Plugin collection.
/v1/plugins:
  get:
    summary: List available plugins
    description: Returns all registered plugins and their status
    tags: [Plugins]
    responses:
      # The body is a bare JSON array of Plugin objects.
      "200":
        description: List of plugins
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: "#/components/schemas/Plugin"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
# Single plugin addressed by name.
/v1/plugins/{pluginName}:
  get:
    summary: Get plugin details
    description: Returns plugin configuration and status
    tags: [Plugins]
    parameters:
      - name: pluginName
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: Plugin details
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Plugin"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
# Read, replace, or disable the configuration of one plugin.
/v1/plugins/{pluginName}/config:
  get:
    summary: Get plugin configuration
    description: Returns plugin configuration
    tags: [Plugins]
    parameters:
      - name: pluginName
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: Plugin configuration
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/PluginConfig"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
  put:
    summary: Update plugin configuration
    description: Update plugin configuration (hot-reload if supported)
    tags: [Plugins]
    parameters:
      - name: pluginName
        in: path
        required: true
        schema:
          type: string
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/PluginConfig"
    responses:
      # Success returns the Plugin object, not just the PluginConfig sent in.
      "200":
        description: Configuration updated
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Plugin"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
      "422":
        $ref: "#/components/responses/ValidationError"
  delete:
    summary: Disable/unload plugin
    description: Disable plugin (may require restart if plugin requires it)
    tags: [Plugins]
    parameters:
      - name: pluginName
        in: path
        required: true
        schema:
          type: string
    responses:
      "204":
        description: Plugin disabled
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
# Health probe for one plugin; reuses the HealthResponse body of /health.
/v1/plugins/{pluginName}/health:
  get:
    summary: Check plugin health
    description: Returns health status of plugin sidecars
    tags: [Plugins]
    parameters:
      - name: pluginName
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: Plugin health
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/HealthResponse"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
# Point-in-time scheduler snapshot.
/v1/scheduler/status:
  get:
    summary: Get scheduler status
    description: Returns queue depths, worker counts, and metrics
    tags: [Scheduler]
    responses:
      "200":
        description: Scheduler status
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/SchedulerStatus"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
# Server-sent-events feed of scheduler state changes.
/v1/scheduler/status/stream:
  get:
    summary: SSE stream of scheduler state changes
    description: Emits events on queue depth changes, worker connect/disconnect, job transitions
    tags:
      - Scheduler
    # The Swagger 2.0 `produces` keyword is not part of the OpenAPI 3.0
    # Operation Object; the media type is declared by the key under
    # `content` below.
    responses:
      "200":
        description: SSE stream
        content:
          text/event-stream:
            schema:
              type: string
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
# Worker collection.
/v1/scheduler/workers:
  get:
    summary: List connected workers
    description: Returns all workers and their capabilities
    tags: [Scheduler]
    responses:
      # The body is a bare JSON array of Worker objects.
      "200":
        description: List of workers
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: "#/components/schemas/Worker"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
# Single worker addressed by its identifier.
/v1/scheduler/workers/{workerId}:
  get:
    summary: Get worker details
    description: Returns detailed worker information
    tags: [Scheduler]
    parameters:
      - name: workerId
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: Worker details
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Worker"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
  delete:
    summary: Disconnect/drain worker
    description: Gracefully drain and disconnect a worker
    tags: [Scheduler]
    parameters:
      - name: workerId
        in: path
        required: true
        schema:
          type: string
    responses:
      # No response body on success.
      "204":
        description: Worker draining initiated
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
# Capacity reservations: listing and creation.
/v1/scheduler/reservations:
  get:
    summary: List active reservations
    description: Returns all active capacity reservations
    tags: [Scheduler]
    responses:
      "200":
        description: List of reservations
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: "#/components/schemas/Reservation"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
  post:
    summary: Create reservation
    description: Reserve capacity for large jobs
    tags: [Scheduler]
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/CreateReservationRequest"
    responses:
      # On success the created reservation is returned in the body.
      "201":
        description: Reservation created
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Reservation"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "422":
        $ref: "#/components/responses/ValidationError"
# Re-prioritise one job.
/v1/scheduler/jobs/{jobId}/priority:
  patch:
    summary: Update job priority
    description: Change the priority of a queued or running job
    tags: [Scheduler]
    parameters:
      - name: jobId
        in: path
        required: true
        schema:
          type: string
    requestBody:
      required: true
      content:
        application/json:
          # Inline body: a single mandatory `priority` with the same 1..10
          # bounds as the `priority` field of the Task schema.
          schema:
            type: object
            required: [priority]
            properties:
              priority:
                type: integer
                minimum: 1
                maximum: 10
    responses:
      "200":
        description: Priority updated
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Task"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
# Server-sent-events feed for one job.
/v1/scheduler/jobs/{jobId}/stream:
  get:
    summary: SSE stream of job progress
    description: Emits events on job state transitions and priority changes
    tags:
      - Scheduler
    # The Swagger 2.0 `produces` keyword is not part of the OpenAPI 3.0
    # Operation Object; the media type is declared by the key under
    # `content` below.
    parameters:
      - name: jobId
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: SSE stream
        content:
          text/event-stream:
            schema:
              type: string
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "404":
        $ref: "#/components/responses/NotFound"
# Paged, filterable view of the audit log.
/v1/audit/events:
  get:
    summary: Query audit events
    description: Filter by time range, event type, user
    tags: [Audit]
    parameters:
      # Time window.
      - name: from
        in: query
        description: Start timestamp
        schema:
          type: string
          format: date-time
      - name: to
        in: query
        description: End timestamp
        schema:
          type: string
          format: date-time
      # Same value list as the `event_type` enum of the AuditEvent schema.
      - name: event_type
        in: query
        description: Filter by event type
        schema:
          type: string
          enum:
            - job_queued
            - job_started
            - job_completed
            - file_access
            - auth_attempt
            - plugin_configured
            - scheduler_drain
            - audit_verified
      - name: user_id
        in: query
        description: Filter by user
        schema:
          type: string
      # Paging controls.
      - name: limit
        in: query
        schema:
          type: integer
          default: 100
          maximum: 1000
      - name: offset
        in: query
        schema:
          type: integer
          default: 0
    responses:
      "200":
        description: List of audit events
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/AuditEventList"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
      "429":
        $ref: "#/components/responses/RateLimited"
# Integrity check over the audit hash chain; takes no request body.
/v1/audit/verify:
  post:
    summary: Verify audit chain integrity
    description: Validates the hash chain for tampering
    tags: [Audit]
    responses:
      "200":
        description: Verification result
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/VerificationResult"
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
# Latest audit chain hash.
/v1/audit/chain-root:
  get:
    summary: Get current chain root hash
    description: Returns the latest event hash for external verification
    tags: [Audit]
    responses:
      "200":
        description: Chain root hash
        content:
          application/json:
            # Inline (unnamed) response object; no property is marked required.
            schema:
              type: object
              properties:
                root_hash:
                  type: string
                timestamp:
                  type: string
                  format: date-time
                total_events:
                  type: integer
      "401":
        $ref: "#/components/responses/Unauthorized"
      "403":
        $ref: "#/components/responses/Forbidden"
# Experiment collection: listing and creation.
/v1/experiments:
  get:
    summary: List experiments
    description: List all experiments
    responses:
      "200":
        description: List of experiments
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: "#/components/schemas/Experiment"
      # The document-wide API key requirement applies here, so an
      # authentication failure is a documented outcome.
      "401":
        $ref: "#/components/responses/Unauthorized"
  post:
    summary: Create experiment
    description: Create a new experiment
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/CreateExperimentRequest"
    responses:
      "201":
        description: Experiment created
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/Experiment"
      "401":
        $ref: "#/components/responses/Unauthorized"
# Jupyter service collection: listing and start-up.
/v1/jupyter/services:
  get:
    summary: List Jupyter services
    responses:
      "200":
        description: List of Jupyter services
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: "#/components/schemas/JupyterService"
      # The document-wide API key requirement applies here, so an
      # authentication failure is a documented outcome.
      "401":
        $ref: "#/components/responses/Unauthorized"
  post:
    summary: Start Jupyter service
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/StartJupyterRequest"
    responses:
      "201":
        description: Jupyter service started
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/JupyterService"
      "401":
        $ref: "#/components/responses/Unauthorized"
# Single Jupyter service addressed by its identifier.
/v1/jupyter/services/{serviceId}:
  delete:
    summary: Stop Jupyter service
    parameters:
      - name: serviceId
        in: path
        required: true
        schema:
          type: string
    responses:
      # No response body on success.
      "204":
        description: Service stopped
      # The document-wide API key requirement applies here, so an
      # authentication failure is a documented outcome.
      "401":
        $ref: "#/components/responses/Unauthorized"
# WebSocket upgrade endpoint.
/ws:
  get:
    summary: WebSocket connection
    # Markdown listing the message types exchanged over the socket.
    description: |
      WebSocket endpoint for real-time task updates.
      ## Message Types
      - `task_update`: Task status changes
      - `task_complete`: Task finished
      - `ping`: Keep-alive (respond with `pong`)
    # Restates the document-wide API key requirement explicitly.
    security:
      - ApiKeyAuth: []
    responses:
      "101":
        description: WebSocket connection established
components:
# Authentication schemes referenced by `security` requirement lists.
securitySchemes:
  ApiKeyAuth:
    # The key is sent in the X-API-Key request header.
    type: apiKey
    name: X-API-Key
    in: header
    description: API key for authentication
schemas:
# Body returned by the server and plugin health operations.
HealthResponse:
  type: object
  properties:
    status:
      type: string
      enum:
        - healthy
        - degraded
        - unhealthy
    version:
      type: string
    timestamp:
      type: string
      format: date-time
# A submitted task as returned by the API. No property is marked required.
Task:
  type: object
  properties:
    # Identity and lifecycle.
    id:
      type: string
      description: Unique task identifier
    job_name:
      type: string
      pattern: '^[a-zA-Z0-9_-]+$'
      maxLength: 64
    status:
      type: string
      enum:
        - queued
        - preparing
        - running
        - collecting
        - completed
        - failed
    priority:
      type: integer
      minimum: 1
      maximum: 10
      default: 5
    created_at:
      type: string
      format: date-time
    started_at:
      type: string
      format: date-time
    ended_at:
      type: string
      format: date-time
    worker_id:
      type: string
    # Outcome.
    error:
      type: string
    output:
      type: string
    # What was run and with which inputs.
    entrypoint:
      type: string
      description: Entrypoint script or command executed for this task
    snapshot_id:
      type: string
    datasets:
      type: array
      items:
        type: string
    # Resource figures.
    cpu:
      type: integer
    memory_gb:
      type: integer
    gpu:
      type: integer
    user_id:
      type: string
    retry_count:
      type: integer
    max_retries:
      type: integer
    # Map whose values are PluginStatus objects.
    plugin_status:
      type: object
      description: Status of tracking plugins for this task
      additionalProperties:
        $ref: "#/components/schemas/PluginStatus"
    node_assignments:
      type: array
      description: Worker node assignments for multi-node jobs
      items:
        $ref: "#/components/schemas/NodeAssignment"
    # Typed `number`, unlike the integer `priority` above.
    priority_aged:
      type: number
      description: Effective priority with aging applied
# Request body for POST /v1/tasks. Only `job_name` is mandatory.
CreateTaskRequest:
  type: object
  required:
    - job_name
  properties:
    job_name:
      type: string
      pattern: '^[a-zA-Z0-9_-]+$'
      maxLength: 64
      description: Unique identifier for the job
    priority:
      type: integer
      minimum: 1
      maximum: 10
      default: 5
    args:
      type: string
      description: Command-line arguments for the entrypoint
    entrypoint:
      type: string
      description: Entrypoint script or command (e.g., train.py, run.sh, /bin/bash -c "echo hello")
      # The OpenAPI 3.0 Schema Object defines the singular `example` keyword;
      # plural `examples` is not a schema keyword in 3.0.x. The full list is
      # kept under a vendor extension so no sample value is lost.
      example: train.py
      x-examples:
        - train.py
        - run.sh
        - /bin/bash -c "python train.py --epochs 10"
    snapshot_id:
      type: string
      description: Reference to experiment snapshot
    # Datasets can be given as plain strings and/or as DatasetSpec objects.
    datasets:
      type: array
      items:
        type: string
    dataset_specs:
      type: array
      items:
        $ref: "#/components/schemas/DatasetSpec"
    # Requested resources.
    cpu:
      type: integer
      description: CPU cores requested
    memory_gb:
      type: integer
      description: Memory (GB) requested
    gpu:
      type: integer
      description: GPUs requested
    # Free-form string-to-string map.
    metadata:
      type: object
      additionalProperties:
        type: string
# Structured dataset reference used by CreateTaskRequest.dataset_specs.
# All four fields are optional strings.
DatasetSpec:
  type: object
  properties:
    name: {type: string}
    source: {type: string}
    sha256: {type: string}
    mount_path: {type: string}
# Paged envelope returned by GET /v1/tasks.
TaskList:
  type: object
  properties:
    tasks:
      type: array
      items:
        $ref: "#/components/schemas/Task"
    # Paging fields, all integers.
    total: {type: integer}
    limit: {type: integer}
    offset: {type: integer}
# Counters returned by GET /v1/queue; every field is an integer.
QueueStats:
  type: object
  properties:
    queued:
      description: Tasks waiting to run
      type: integer
    running:
      description: Tasks currently executing
      type: integer
    completed:
      description: Tasks completed today
      type: integer
    failed:
      description: Tasks failed today
      type: integer
    workers:
      description: Active workers
      type: integer
# An experiment record as returned by the experiments operations.
Experiment:
  type: object
  properties:
    id:
      type: string
    name:
      type: string
    commit_id:
      type: string
    created_at:
      type: string
      format: date-time
    status:
      type: string
      enum:
        - active
        - archived
        - deleted
# Request body for POST /v1/experiments. `name` is mandatory.
CreateExperimentRequest:
  type: object
  required: [name]
  properties:
    name:
      type: string
      maxLength: 128
    # Property literally named "description" (an optional string).
    description:
      type: string
# A Jupyter service instance as returned by the Jupyter operations.
JupyterService:
  type: object
  properties:
    id:
      type: string
    name:
      type: string
    status:
      type: string
      enum:
        - starting
        - running
        - stopping
        - stopped
        - error
    url:
      type: string
      format: uri
    token:
      type: string
    created_at:
      type: string
      format: date-time
# Request body for POST /v1/jupyter/services. `name` is mandatory.
StartJupyterRequest:
  type: object
  required: [name]
  properties:
    name:
      type: string
    workspace:
      type: string
    image:
      type: string
      # Quoted so the colon in the image tag is unmistakably part of the string.
      default: 'jupyter/pytorch:latest'
# Plugin configuration, used as both request and response body.
PluginConfig:
  type: object
  properties:
    enabled:
      type: boolean
    mode:
      type: string
      enum:
        - sidecar
        - remote
        - disabled
    image:
      type: string
    # Open-ended object: any additional keys are accepted.
    settings:
      type: object
      additionalProperties: true
# Status record for one plugin.
PluginStatus:
  type: object
  properties:
    name:
      type: string
    status:
      type: string
      enum:
        - healthy
        - unhealthy
        - starting
        - stopped
    url:
      type: string
    last_check:
      type: string
      format: date-time
# One entry of Task.node_assignments.
NodeAssignment:
  type: object
  properties:
    # Typed as an integer, unlike the string worker_id.
    node_id: {type: integer}
    worker_id: {type: string}
    slot_assigned: {type: boolean}
# A registered plugin together with its status and configuration.
Plugin:
  type: object
  properties:
    name:
      description: Plugin name
      type: string
    enabled:
      description: Whether plugin is enabled
      type: boolean
    mode:
      description: Provisioning mode
      type: string
      enum:
        - sidecar
        - remote
        - disabled
    # NOTE(review): OpenAPI 3.0 ignores properties placed beside `$ref`, so
    # this description may not appear in rendered docs or generated code.
    status:
      $ref: "#/components/schemas/PluginStatus"
      description: Current plugin status
    config:
      $ref: "#/components/schemas/PluginConfig"
    requires_restart:
      description: Whether plugin requires restart on config change
      type: boolean
    version:
      description: Plugin version
      type: string
# A worker as reported by the scheduler operations.
Worker:
  type: object
  properties:
    id:
      description: Worker unique identifier
      type: string
    connected_at:
      description: When worker connected
      type: string
      format: date-time
    last_heartbeat:
      description: Last heartbeat timestamp
      type: string
      format: date-time
    # Hardware description of the worker.
    capabilities:
      type: object
      properties:
        gpu_count:
          description: Number of GPUs
          type: integer
        gpu_type:
          description: GPU type (e.g., A100, H100)
          type: string
        cpu_cores:
          type: integer
        memory_gb:
          type: integer
    # NOTE(review): `slots` is placed here as a sibling of `capabilities`;
    # confirm it is not meant to be nested inside `capabilities`.
    slots:
      type: object
      properties:
        batch_available: {type: integer}
        batch_total: {type: integer}
        service_available: {type: integer}
        service_total: {type: integer}
    active_tasks:
      description: IDs of tasks currently running on this worker
      type: array
      items:
        type: string
    status:
      type: string
      enum:
        - active
        - draining
        - offline
# Snapshot returned by GET /v1/scheduler/status.
SchedulerStatus:
  type: object
  properties:
    # Worker counts.
    workers_total: {type: integer}
    workers_active: {type: integer}
    workers_draining: {type: integer}
    # Queue depths.
    batch_queue_depth: {type: integer}
    service_queue_depth: {type: integer}
    # Task and reservation counts.
    tasks_running: {type: integer}
    tasks_completed_24h: {type: integer}
    reservations_active: {type: integer}
    timestamp:
      type: string
      format: date-time
# A capacity reservation as returned by the reservations operations.
Reservation:
  type: object
  properties:
    id:
      type: string
    user_id:
      type: string
    gpu_count:
      type: integer
    gpu_type:
      type: string
    node_count:
      type: integer
    expires_at:
      type: string
      format: date-time
    status:
      type: string
      enum:
        - active
        - claimed
        - expired
# Request body for POST /v1/scheduler/reservations. `gpu_count` is mandatory.
CreateReservationRequest:
  type: object
  required: [gpu_count]
  properties:
    gpu_count:
      type: integer
      minimum: 1
    gpu_type:
      type: string
    node_count:
      type: integer
      minimum: 1
      default: 1
    # Given in minutes, whereas Reservation.expires_at is a date-time.
    expires_minutes:
      type: integer
      default: 30
# One entry of the audit log.
AuditEvent:
  type: object
  properties:
    timestamp:
      type: string
      format: date-time
    # Same value list as the `event_type` query filter of /v1/audit/events.
    event_type:
      type: string
      enum:
        - job_queued
        - job_started
        - job_completed
        - file_access
        - auth_attempt
        - plugin_configured
        - scheduler_drain
        - audit_verified
    user_id:
      type: string
    resource:
      description: Resource being acted upon
      type: string
    action:
      description: Action performed
      type: string
    success:
      type: boolean
    ip_address:
      type: string
    error:
      type: string
    # Hash-chain linkage.
    prev_hash:
      description: Previous event hash in chain
      type: string
    event_hash:
      description: This event's hash
      type: string
    sequence_num:
      description: Position in audit chain
      type: integer
    # Open-ended object: any additional keys are accepted.
    metadata:
      type: object
      additionalProperties: true
# Paged envelope returned by GET /v1/audit/events.
AuditEventList:
  type: object
  properties:
    events:
      type: array
      items:
        $ref: "#/components/schemas/AuditEvent"
    # Paging fields, all integers.
    total: {type: integer}
    limit: {type: integer}
    offset: {type: integer}
# Result body of POST /v1/audit/verify.
VerificationResult:
  type: object
  properties:
    valid:
      type: boolean
    total_events:
      type: integer
    first_tampered:
      description: Sequence number of first tampered event (if any)
      type: integer
    chain_root_hash:
      type: string
    verified_at:
      type: string
      format: date-time
# Error envelope shared by every reusable error response.
# All three fields are mandatory.
ErrorResponse:
  type: object
  required: [error, code, trace_id]
  properties:
    error:
      description: Sanitized error message
      type: string
    # Machine-readable error code.
    code:
      type: string
      enum:
        - BAD_REQUEST
        - UNAUTHORIZED
        - FORBIDDEN
        - NOT_FOUND
        - CONFLICT
        - RATE_LIMITED
        - INTERNAL_ERROR
        - SERVICE_UNAVAILABLE
        - VALIDATION_ERROR
    trace_id:
      description: Support correlation ID
      type: string
# Reusable response objects. Operations reference them as
# '#/components/responses/<Name>', so this map is a child of `components`.
# Each one carries the ErrorResponse envelope plus a representative example.
responses:
  BadRequest:
    description: Invalid request
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: Invalid request format
          code: BAD_REQUEST
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
  Unauthorized:
    description: Authentication required
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: Invalid or missing API key
          code: UNAUTHORIZED
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
  Forbidden:
    description: Insufficient permissions
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: Insufficient permissions
          code: FORBIDDEN
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
  NotFound:
    description: Resource not found
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: Resource not found
          code: NOT_FOUND
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
  ValidationError:
    description: Validation failed
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: Validation failed
          code: VALIDATION_ERROR
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
  RateLimited:
    description: Too many requests
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: Rate limit exceeded
          code: RATE_LIMITED
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
    # Tells the client how long to wait before retrying.
    headers:
      Retry-After:
        description: Seconds until rate limit resets
        schema:
          type: integer
  InternalError:
    description: Internal server error
    content:
      application/json:
        schema:
          $ref: "#/components/schemas/ErrorResponse"
        example:
          error: An error occurred
          code: INTERNAL_ERROR
          trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
# Tag declarations for the operation groups. `tags` is a field of the root
# OpenAPI Object (the Components Object defines no such field), so it is
# listed after the last `components` child and belongs at the document root.
tags:
  - name: Plugins
    description: Plugin management endpoints
  - name: Scheduler
    description: Scheduler and worker management
  - name: Audit
    description: Audit log and chain verification