fetch_ml/api/openapi.yaml
Jeremie Fraeys 420de879ff
feat(api): integrate scheduler protocol and WebSocket enhancements
Update API layer for scheduler integration:
- WebSocket handlers with scheduler protocol support
- Jobs WebSocket endpoint with priority queue integration
- Validation middleware for scheduler messages
- Server configuration with security hardening
- Protocol definitions for worker-scheduler communication
- Dataset handlers with tenant isolation checks
- Response helpers with audit context
- OpenAPI spec updates for new endpoints
2026-02-26 12:05:57 -05:00

527 lines
13 KiB
YAML

---
# OpenAPI description of the ML Worker API.
openapi: '3.0.3'

info:
  title: ML Worker API
  version: '1.0.0'
  contact:
    name: FetchML Support
  # Markdown overview; kept as a literal block so line breaks survive.
  description: |
    API for managing ML experiment tasks and Jupyter services.
    ## Security
    All endpoints (except health checks) require API key authentication via the
    `X-API-Key` header. Rate limiting is enforced per API key.
    ## Error Handling
    Errors follow a consistent format with machine-readable codes and trace IDs:
    ```json
    {
    "error": "Sanitized error message",
    "code": "ERROR_CODE",
    "trace_id": "uuid-for-support"
    }
    ```

servers:
  - description: Local development server
    url: 'http://localhost:9101'
  - description: Production server
    url: 'https://api.fetchml.example.com'

# Default security requirement; an operation can override it with its own
# `security` entry.
security:
  - ApiKeyAuth: []
paths:
# Health probe. The empty `security` list removes the API-key requirement
# for this operation.
/health:
  get:
    summary: Health check
    description: 'Returns server health status. No authentication required.'
    security: []
    responses:
      '200':
        description: Server is healthy
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/HealthResponse'
# Task collection: listing with optional filters (GET) and submission (POST).
/v1/tasks:
  get:
    summary: List tasks
    description: List all tasks with optional filtering
    parameters:
      - name: status
        in: query
        schema:
          type: string
          # NOTE(review): the Task schema's `status` enum additionally lists
          # `preparing` and `collecting`; confirm whether the filter should
          # accept those values as well.
          enum:
            - queued
            - running
            - completed
            - failed
      - name: limit
        in: query
        schema:
          type: integer
          # Lower bound added so zero/negative page sizes fail validation;
          # previously only the upper bound was declared.
          minimum: 1
          maximum: 1000
          default: 50
      - name: offset
        in: query
        schema:
          type: integer
          # Lower bound added so negative offsets fail validation.
          minimum: 0
          default: 0
    responses:
      '200':
        description: List of tasks
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/TaskList'
      '400':
        $ref: '#/components/responses/BadRequest'
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
  post:
    summary: Create task
    description: Submit a new ML experiment task
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/CreateTaskRequest'
    responses:
      '201':
        description: Task created successfully
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Task'
      '400':
        $ref: '#/components/responses/BadRequest'
      '401':
        $ref: '#/components/responses/Unauthorized'
      '422':
        $ref: '#/components/responses/ValidationError'
      '429':
        $ref: '#/components/responses/RateLimited'
# Single task addressed by its identifier.
/v1/tasks/{taskId}:
  # Declared once at path-item level; it applies to every operation below,
  # replacing the two identical per-operation copies.
  parameters:
    - name: taskId
      in: path
      required: true
      schema:
        type: string
  get:
    summary: Get task details
    responses:
      '200':
        description: Task details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Task'
      # 401/429 added: the API overview states that every endpoint except the
      # health check requires an API key and is rate limited per key.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '404':
        $ref: '#/components/responses/NotFound'
      '429':
        $ref: '#/components/responses/RateLimited'
  delete:
    summary: Cancel/delete task
    responses:
      '204':
        description: Task cancelled
      '401':
        $ref: '#/components/responses/Unauthorized'
      '404':
        $ref: '#/components/responses/NotFound'
      '429':
        $ref: '#/components/responses/RateLimited'
# Read-only queue statistics.
/v1/queue:
  get:
    summary: Queue status
    description: Get current queue statistics
    responses:
      '200':
        description: Queue statistics
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/QueueStats'
      # 401/429 added: the API overview states that every endpoint except the
      # health check requires an API key and is rate limited per key.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
# Experiment collection: listing (GET) and creation (POST).
/v1/experiments:
  get:
    summary: List experiments
    description: List all experiments
    responses:
      '200':
        description: List of experiments
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: '#/components/schemas/Experiment'
      # 401/429 added: the API overview states that every endpoint except the
      # health check requires an API key and is rate limited per key.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
  post:
    summary: Create experiment
    description: Create a new experiment
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/CreateExperimentRequest'
    responses:
      '201':
        description: Experiment created
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Experiment'
      # NOTE(review): unlike POST /v1/tasks, no 400/422 is documented for an
      # invalid body; confirm which error codes the handler returns.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
# Jupyter service collection: listing (GET) and start-up (POST).
/v1/jupyter/services:
  get:
    summary: List Jupyter services
    responses:
      '200':
        description: List of Jupyter services
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: '#/components/schemas/JupyterService'
      # 401/429 added: the API overview states that every endpoint except the
      # health check requires an API key and is rate limited per key.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
  post:
    summary: Start Jupyter service
    requestBody:
      required: true
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/StartJupyterRequest'
    responses:
      '201':
        description: Jupyter service started
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/JupyterService'
      # NOTE(review): unlike POST /v1/tasks, no 400/422 is documented for an
      # invalid body; confirm which error codes the handler returns.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
# Single Jupyter service addressed by its identifier.
/v1/jupyter/services/{serviceId}:
  delete:
    summary: Stop Jupyter service
    parameters:
      - name: serviceId
        in: path
        required: true
        schema:
          type: string
    responses:
      '204':
        description: Service stopped
      # 401/429 added: the API overview states that every endpoint except the
      # health check requires an API key and is rate limited per key.
      '401':
        $ref: '#/components/responses/Unauthorized'
      # NOTE(review): DELETE /v1/tasks/{taskId} documents a 404 for unknown
      # ids but this operation does not; confirm the behaviour for an unknown
      # serviceId.
      '429':
        $ref: '#/components/responses/RateLimited'
# WebSocket upgrade endpoint for task updates.
/ws:
  get:
    summary: WebSocket connection
    description: |
      WebSocket endpoint for real-time task updates.
      ## Message Types
      - `task_update`: Task status changes
      - `task_complete`: Task finished
      - `ping`: Keep-alive (respond with `pong`)
    # Explicitly requires the API key for this operation.
    security:
      - ApiKeyAuth: []
    responses:
      '101':
        description: WebSocket connection established
      # 401/429 added: the API overview states that every endpoint except the
      # health check requires an API key and is rate limited per key.
      '401':
        $ref: '#/components/responses/Unauthorized'
      '429':
        $ref: '#/components/responses/RateLimited'
components:
securitySchemes:
# API key transported in the `X-API-Key` request header.
ApiKeyAuth:
  type: apiKey
  name: X-API-Key
  in: header
  description: API key for authentication
schemas:
# Health report: a status value plus version and timestamp fields.
HealthResponse:
  type: object
  properties:
    status:
      type: string
      enum:
        - healthy
        - degraded
        - unhealthy
    version:
      type: string
    timestamp:
      type: string
      format: date-time
# Task representation: identity, state, timing, outcome and resource fields.
Task:
  type: object
  properties:
    # Identity and scheduling state.
    id:
      type: string
      description: Unique task identifier
    job_name:
      type: string
      maxLength: 64
      pattern: '^[a-zA-Z0-9_-]+$'
    status:
      type: string
      enum:
        - queued
        - preparing
        - running
        - collecting
        - completed
        - failed
    priority:
      type: integer
      minimum: 1
      maximum: 10
      default: 5
    # Timestamps.
    created_at:
      type: string
      format: date-time
    started_at:
      type: string
      format: date-time
    ended_at:
      type: string
      format: date-time
    # Execution details and results.
    worker_id:
      type: string
    error:
      type: string
    output:
      type: string
    snapshot_id:
      type: string
    datasets:
      type: array
      items:
        type: string
    # Resource figures.
    cpu:
      type: integer
    memory_gb:
      type: integer
    gpu:
      type: integer
    # Ownership and retry bookkeeping.
    user_id:
      type: string
    retry_count:
      type: integer
    max_retries:
      type: integer
# Request body for task submission; only `job_name` is mandatory.
CreateTaskRequest:
  type: object
  required:
    - job_name
  properties:
    job_name:
      type: string
      pattern: '^[a-zA-Z0-9_-]+$'
      maxLength: 64
      description: Unique identifier for the job
    priority:
      type: integer
      minimum: 1
      maximum: 10
      default: 5
    args:
      type: string
      description: Command-line arguments for the training script
    snapshot_id:
      type: string
      description: Reference to experiment snapshot
    datasets:
      type: array
      items:
        type: string
    dataset_specs:
      type: array
      items:
        $ref: '#/components/schemas/DatasetSpec'
    # Resource requests: a lower bound of 0 is declared so negative
    # quantities are rejected by schema validation.
    cpu:
      type: integer
      minimum: 0
      description: CPU cores requested
    memory_gb:
      type: integer
      minimum: 0
      description: Memory (GB) requested
    gpu:
      type: integer
      minimum: 0
      description: GPUs requested
    # Free-form string-to-string map.
    metadata:
      type: object
      additionalProperties:
        type: string
# Dataset descriptor; all four fields are optional strings.
DatasetSpec:
  type: object
  properties:
    name:
      type: string
    source:
      type: string
    # NOTE(review): no pattern is declared; presumably a hex digest, confirm
    # the expected encoding before constraining it.
    sha256:
      type: string
    mount_path:
      type: string
# Envelope holding an array of tasks plus total/limit/offset integers.
TaskList:
  type: object
  properties:
    tasks:
      type: array
      items:
        $ref: '#/components/schemas/Task'
    total:
      type: integer
    limit:
      type: integer
    offset:
      type: integer
# Integer counters describing the queue and worker pool.
QueueStats:
  type: object
  properties:
    queued:
      description: Tasks waiting to run
      type: integer
    running:
      description: Tasks currently executing
      type: integer
    completed:
      description: Tasks completed today
      type: integer
    failed:
      description: Tasks failed today
      type: integer
    workers:
      description: Active workers
      type: integer
# Experiment record with id, name, commit id, creation time and status.
Experiment:
  type: object
  properties:
    id:
      type: string
    name:
      type: string
    commit_id:
      type: string
    created_at:
      type: string
      format: date-time
    status:
      type: string
      enum:
        - active
        - archived
        - deleted
# Request body for experiment creation; `name` is mandatory.
CreateExperimentRequest:
  type: object
  required:
    - name
  properties:
    name:
      type: string
      # `required` alone still admits an empty string; minLength closes that
      # gap.
      minLength: 1
      maxLength: 128
    description:
      type: string
# Jupyter service record.
JupyterService:
  type: object
  properties:
    id:
      type: string
    name:
      type: string
    status:
      type: string
      enum:
        - starting
        - running
        - stopping
        - stopped
        - error
    url:
      type: string
      format: uri
    # NOTE(review): this schema is also used for the list response, so the
    # token appears there too; confirm that exposure is intended.
    token:
      type: string
    created_at:
      type: string
      format: date-time
# Request body for starting a Jupyter service; `name` is mandatory.
StartJupyterRequest:
  type: object
  required:
    - name
  properties:
    name:
      type: string
      # `required` alone still admits an empty string; minLength closes that
      # gap.
      minLength: 1
    workspace:
      type: string
    image:
      type: string
      # Quoted because the value contains a colon.
      default: 'jupyter/pytorch:latest'
# Error envelope: message, machine-readable code and trace id, all required.
ErrorResponse:
  type: object
  required:
    - error
    - code
    - trace_id
  properties:
    error:
      type: string
      description: Sanitized error message
    code:
      type: string
      # Block sequence keeps the list within the line-length limit.
      enum:
        - BAD_REQUEST
        - UNAUTHORIZED
        - FORBIDDEN
        - NOT_FOUND
        - CONFLICT
        - RATE_LIMITED
        - INTERNAL_ERROR
        - SERVICE_UNAVAILABLE
        - VALIDATION_ERROR
    trace_id:
      type: string
      description: Support correlation ID
responses:
# Reusable error response carrying the BAD_REQUEST code.
BadRequest:
  description: Invalid request
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: BAD_REQUEST
        error: Invalid request format
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
# Reusable error response carrying the UNAUTHORIZED code.
Unauthorized:
  description: Authentication required
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: UNAUTHORIZED
        error: Invalid or missing API key
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
# Reusable error response carrying the FORBIDDEN code.
# NOTE(review): no operation in the visible part of the spec references it.
Forbidden:
  description: Insufficient permissions
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: FORBIDDEN
        error: Insufficient permissions
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
# Reusable error response carrying the NOT_FOUND code.
NotFound:
  description: Resource not found
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: NOT_FOUND
        error: Resource not found
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
# Reusable error response carrying the VALIDATION_ERROR code.
ValidationError:
  description: Validation failed
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: VALIDATION_ERROR
        error: Validation failed
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
# Reusable error response carrying the RATE_LIMITED code, together with a
# Retry-After header.
RateLimited:
  description: Too many requests
  headers:
    Retry-After:
      description: Seconds until rate limit resets
      schema:
        type: integer
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: RATE_LIMITED
        error: Rate limit exceeded
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
# Reusable error response carrying the INTERNAL_ERROR code.
# NOTE(review): no operation in the visible part of the spec references it.
InternalError:
  description: Internal server error
  content:
    application/json:
      schema:
        $ref: '#/components/schemas/ErrorResponse'
      example:
        code: INTERNAL_ERROR
        error: An error occurred
        trace_id: 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'