# NOTE: page/commit residue captured alongside this file; not part of the API
# specification. Kept as comments so the document parses as YAML.
#   - Makefile: Update build targets for native library integration
#   - build.zig: Add SQLite linking and native hash library support
#   - scripts/build_rsync.sh: Update rsync embedded binary build process
#   - scripts/build_sqlite.sh: Add SQLite constants generation script
#   - src/assets/README.md: Document embedded asset structure
#   - src/utils/rsync_embedded_binary.zig: Update for new build layout
# File size as reported by the page: 559 lines, 14 KiB, YAML.
---
# OpenAPI specification version and API-level metadata.
openapi: 3.0.3
info:
  title: ML Worker API
  # Literal block scalar: the Markdown below is rendered by documentation tools,
  # so its line breaks are preserved as written.
  description: |
    API for managing ML experiment tasks and Jupyter services.

    ## Security
    All endpoints (except health checks) require API key authentication via the
    `X-API-Key` header. Rate limiting is enforced per API key.

    ## Error Handling
    Errors follow a consistent format with machine-readable codes and trace IDs:
    ```json
    {
      "error": "Sanitized error message",
      "code": "ERROR_CODE",
      "trace_id": "uuid-for-support"
    }
    ```
  # Quoted so the value can never be re-typed as a number by a YAML loader.
  version: '1.0.0'
  contact:
    name: FetchML Support
# Base URLs against which the paths in this document are resolved.
servers:
  - url: http://localhost:9101
    description: Local development server
  - url: https://api.fetchml.example.com
    description: Production server
# Document-level default: every operation requires ApiKeyAuth unless the
# operation overrides it (the /health operation does so with `security: []`).
security:
  - ApiKeyAuth: []
# API operations.
#
# Every operation except /health is covered by the API-key requirement, and the
# API description states that rate limiting is enforced per API key. Each
# authenticated operation therefore lists the shared 401 (Unauthorized) and
# 429 (RateLimited) responses, following the /v1/tasks operations.
paths:
  /health:
    get:
      summary: Health check
      description: Returns server health status. No authentication required.
      # Empty requirement list overrides the document-level ApiKeyAuth default.
      security: []
      responses:
        '200':
          description: Server is healthy
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'

  /v1/tasks:
    get:
      summary: List tasks
      description: List all tasks with optional filtering
      parameters:
        - name: status
          in: query
          schema:
            type: string
            # Mirrors the values of Task.status so that every state a task can
            # report can also be used as a filter.
            # NOTE(review): confirm the server accepts `preparing`/`collecting`.
            enum: [queued, preparing, running, collecting, completed, failed]
        - name: limit
          in: query
          schema:
            type: integer
            default: 50
            minimum: 1
            maximum: 1000
        - name: offset
          in: query
          schema:
            type: integer
            default: 0
            minimum: 0
      responses:
        '200':
          description: List of tasks
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskList'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'

    post:
      summary: Create task
      description: Submit a new ML experiment task
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateTaskRequest'
      responses:
        '201':
          description: Task created successfully
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/tasks/{taskId}:
    # Declared once on the path item; applies to every operation below.
    parameters:
      - name: taskId
        in: path
        required: true
        schema:
          type: string
    get:
      summary: Get task details
      responses:
        '200':
          description: Task details
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimited'

    delete:
      summary: Cancel/delete task
      responses:
        '204':
          description: Task cancelled
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/queue:
    get:
      summary: Queue status
      description: Get current queue statistics
      responses:
        '200':
          description: Queue statistics
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/QueueStats'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/experiments:
    get:
      summary: List experiments
      description: List all experiments
      responses:
        '200':
          description: List of experiments
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Experiment'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'

    post:
      summary: Create experiment
      description: Create a new experiment
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateExperimentRequest'
      responses:
        '201':
          description: Experiment created
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Experiment'
        # Error set matches POST /v1/tasks, the other body-accepting create call.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/jupyter/services:
    get:
      summary: List Jupyter services
      responses:
        '200':
          description: List of Jupyter services
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/JupyterService'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'

    post:
      summary: Start Jupyter service
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/StartJupyterRequest'
      responses:
        '201':
          description: Jupyter service started
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/JupyterService'
        # Error set matches POST /v1/tasks, the other body-accepting create call.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/jupyter/services/{serviceId}:
    delete:
      summary: Stop Jupyter service
      parameters:
        - name: serviceId
          in: path
          required: true
          schema:
            type: string
      responses:
        '204':
          description: Service stopped
        '401':
          $ref: '#/components/responses/Unauthorized'
        # Same not-found contract as DELETE /v1/tasks/{taskId}.
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimited'

  /ws:
    get:
      summary: WebSocket connection
      description: |
        WebSocket endpoint for real-time task updates.

        ## Message Types
        - `task_update`: Task status changes
        - `task_complete`: Task finished
        - `ping`: Keep-alive (respond with `pong`)
      # Restates the document-level default explicitly for this endpoint.
      security:
        - ApiKeyAuth: []
      responses:
        '101':
          description: WebSocket connection established
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
# Reusable definitions referenced from `paths` via `$ref`.
components:
  securitySchemes:
    # API key passed in the X-API-Key request header.
    ApiKeyAuth:
      type: apiKey
      in: header
      name: X-API-Key
      description: API key for authentication

  schemas:
    HealthResponse:
      type: object
      properties:
        status:
          type: string
          enum: [healthy, degraded, unhealthy]
        version:
          type: string
        timestamp:
          type: string
          format: date-time

    Task:
      type: object
      properties:
        id:
          type: string
          description: Unique task identifier
        job_name:
          type: string
          pattern: '^[a-zA-Z0-9_-]+$'
          maxLength: 64
        status:
          type: string
          enum: [queued, preparing, running, collecting, completed, failed]
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        created_at:
          type: string
          format: date-time
        started_at:
          type: string
          format: date-time
        ended_at:
          type: string
          format: date-time
        worker_id:
          type: string
        error:
          type: string
        output:
          type: string
        snapshot_id:
          type: string
        datasets:
          type: array
          items:
            type: string
        cpu:
          type: integer
        memory_gb:
          type: integer
        gpu:
          type: integer
        user_id:
          type: string
        retry_count:
          type: integer
        max_retries:
          type: integer

    CreateTaskRequest:
      type: object
      required:
        - job_name
      properties:
        # Same pattern and length limit as Task.job_name.
        job_name:
          type: string
          pattern: '^[a-zA-Z0-9_-]+$'
          maxLength: 64
          description: Unique identifier for the job
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        args:
          type: string
          description: Command-line arguments for the training script
        snapshot_id:
          type: string
          description: Reference to experiment snapshot
        datasets:
          type: array
          items:
            type: string
        dataset_specs:
          type: array
          items:
            $ref: '#/components/schemas/DatasetSpec'
        cpu:
          type: integer
          description: CPU cores requested
        memory_gb:
          type: integer
          description: Memory (GB) requested
        gpu:
          type: integer
          description: GPUs requested
        # Free-form map; every value must be a string.
        metadata:
          type: object
          additionalProperties:
            type: string

    DatasetSpec:
      type: object
      properties:
        name:
          type: string
        source:
          type: string
        sha256:
          type: string
        mount_path:
          type: string

    # Envelope returned by GET /v1/tasks.
    TaskList:
      type: object
      properties:
        tasks:
          type: array
          items:
            $ref: '#/components/schemas/Task'
        total:
          type: integer
        limit:
          type: integer
        offset:
          type: integer

    QueueStats:
      type: object
      properties:
        queued:
          type: integer
          description: Tasks waiting to run
        running:
          type: integer
          description: Tasks currently executing
        completed:
          type: integer
          description: Tasks completed today
        failed:
          type: integer
          description: Tasks failed today
        workers:
          type: integer
          description: Active workers

    Experiment:
      type: object
      properties:
        id:
          type: string
        name:
          type: string
        commit_id:
          type: string
        created_at:
          type: string
          format: date-time
        status:
          type: string
          enum: [active, archived, deleted]

    CreateExperimentRequest:
      type: object
      required:
        - name
      properties:
        name:
          type: string
          maxLength: 128
        description:
          type: string

    JupyterService:
      type: object
      properties:
        id:
          type: string
        name:
          type: string
        status:
          type: string
          enum: [starting, running, stopping, stopped, error]
        url:
          type: string
          format: uri
        token:
          type: string
        created_at:
          type: string
          format: date-time

    StartJupyterRequest:
      type: object
      required:
        - name
      properties:
        name:
          type: string
        workspace:
          type: string
        image:
          type: string
          # Quoted because the value contains `:`; the string is unchanged.
          default: 'jupyter/pytorch:latest'

    # Error body shared by every entry under `responses` below.
    ErrorResponse:
      type: object
      required:
        - error
        - code
        - trace_id
      properties:
        error:
          type: string
          description: Sanitized error message
        code:
          type: string
          enum:
            - BAD_REQUEST
            - UNAUTHORIZED
            - FORBIDDEN
            - NOT_FOUND
            - CONFLICT
            - RATE_LIMITED
            - INTERNAL_ERROR
            - SERVICE_UNAVAILABLE
            - VALIDATION_ERROR
        trace_id:
          type: string
          description: Support correlation ID

  responses:
    BadRequest:
      description: Invalid request
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Invalid request format
            code: BAD_REQUEST
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890

    Unauthorized:
      description: Authentication required
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Invalid or missing API key
            code: UNAUTHORIZED
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890

    Forbidden:
      description: Insufficient permissions
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Insufficient permissions
            code: FORBIDDEN
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890

    NotFound:
      description: Resource not found
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Resource not found
            code: NOT_FOUND
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890

    ValidationError:
      description: Validation failed
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Validation failed
            code: VALIDATION_ERROR
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890

    RateLimited:
      description: Too many requests
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Rate limit exceeded
            code: RATE_LIMITED
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
      headers:
        Retry-After:
          schema:
            type: integer
          description: Seconds until rate limit resets

    InternalError:
      description: Internal server error
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: An error occurred
            code: INTERNAL_ERROR
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890