- Update openapi.yaml spec
- Regenerate server_gen.go with oapi-codegen
- Update adapter, routes, and server configuration
1259 lines
33 KiB
YAML
---
# OpenAPI description of the ML Worker HTTP API.
openapi: 3.0.3

# General metadata; the description is Markdown kept in a literal block scalar.
info:
  title: ML Worker API
  description: |
    API for managing ML experiment tasks and Jupyter services.

    ## Security
    All endpoints (except health checks) require API key authentication via the
    `X-API-Key` header. Rate limiting is enforced per API key.

    ## Error Handling
    Errors follow a consistent format with machine-readable codes and trace IDs:
    ```json
    {
      "error": "Sanitized error message",
      "code": "ERROR_CODE",
      "trace_id": "uuid-for-support"
    }
    ```
  version: 1.0.0
  contact:
    name: FetchML Support

# Base URLs the API is served from.
servers:
  - url: http://localhost:9101
    description: Local development server
  - url: https://api.fetchml.example.com
    description: Production server

# Default security requirement: every operation needs the API key unless the
# operation overrides it with its own `security` entry.
security:
  - ApiKeyAuth: []

# Operations grouped by resource. Shared error payloads are referenced from
# '#/components/responses/*'.
paths:
  # ---- Health ---------------------------------------------------------------
  /health:
    get:
      summary: Health check
      description: Returns server health status. No authentication required.
      # Empty requirement list overrides the document-level API key requirement.
      security: []
      responses:
        '200':
          description: Server is healthy
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'

  # ---- Tasks ----------------------------------------------------------------
  /v1/tasks:
    get:
      summary: List tasks
      description: List all tasks with optional filtering
      # NOTE(review): plugin_configs, node_count and reservation_id are
      # described as per-task settings yet are declared here as query
      # parameters of the list operation -- confirm whether they were meant
      # to be CreateTaskRequest properties instead.
      # NOTE(review): the status filter accepts fewer values than Task.status
      # (no `preparing` / `collecting`) -- confirm this is intentional.
      parameters:
        - name: status
          in: query
          schema:
            type: string
            enum: [queued, running, completed, failed]
        - name: limit
          in: query
          schema:
            type: integer
            default: 50
            maximum: 1000
        - name: offset
          in: query
          schema:
            type: integer
            default: 0
        - name: user_id
          in: query
          schema:
            type: string
          description: Filter by user who submitted the task
        - name: plugin_configs
          in: query
          schema:
            type: object
            additionalProperties:
              $ref: '#/components/schemas/PluginConfig'
          description: Plugin configurations for this task
        - name: node_count
          in: query
          schema:
            type: integer
            minimum: 1
            default: 1
          description: Number of nodes for multi-node jobs
        - name: reservation_id
          in: query
          schema:
            type: string
          description: Pre-reserved capacity for this task
      responses:
        '200':
          description: List of tasks
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskList'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
    post:
      summary: Create task
      description: Submit a new ML experiment task
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateTaskRequest'
      responses:
        '201':
          description: Task created successfully
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/tasks/{taskId}:
    get:
      summary: Get task details
      parameters:
        - name: taskId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Task details
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        '404':
          $ref: '#/components/responses/NotFound'
    delete:
      summary: Cancel/delete task
      parameters:
        - name: taskId
          in: path
          required: true
          schema:
            type: string
      responses:
        '204':
          description: Task cancelled
        '404':
          $ref: '#/components/responses/NotFound'

  /v1/queue:
    get:
      summary: Queue status
      description: Get current queue statistics
      responses:
        '200':
          description: Queue statistics
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/QueueStats'
        '401':
          $ref: '#/components/responses/Unauthorized'

  # ---- Plugins --------------------------------------------------------------
  /v1/plugins:
    get:
      summary: List available plugins
      description: Returns all registered plugins and their status
      tags:
        - Plugins
      responses:
        '200':
          description: List of plugins
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Plugin'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'

  /v1/plugins/{pluginName}:
    get:
      summary: Get plugin details
      description: Returns plugin configuration and status
      tags:
        - Plugins
      parameters:
        - name: pluginName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Plugin details
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Plugin'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'

  /v1/plugins/{pluginName}/config:
    get:
      summary: Get plugin configuration
      description: Returns plugin configuration
      tags:
        - Plugins
      parameters:
        - name: pluginName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Plugin configuration
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/PluginConfig'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    put:
      summary: Update plugin configuration
      description: Update plugin configuration (hot-reload if supported)
      tags:
        - Plugins
      parameters:
        - name: pluginName
          in: path
          required: true
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/PluginConfig'
      responses:
        '200':
          description: Configuration updated
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Plugin'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
        '422':
          $ref: '#/components/responses/ValidationError'
    delete:
      summary: Disable/unload plugin
      description: Disable plugin (may require restart if plugin requires it)
      tags:
        - Plugins
      parameters:
        - name: pluginName
          in: path
          required: true
          schema:
            type: string
      responses:
        '204':
          description: Plugin disabled
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'

  /v1/plugins/{pluginName}/health:
    get:
      summary: Check plugin health
      description: Returns health status of plugin sidecars
      tags:
        - Plugins
      parameters:
        - name: pluginName
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Plugin health
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'

  # ---- Scheduler ------------------------------------------------------------
  /v1/scheduler/status:
    get:
      summary: Get scheduler status
      description: Returns queue depths, worker counts, and metrics
      tags:
        - Scheduler
      responses:
        '200':
          description: Scheduler status
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SchedulerStatus'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'

  /v1/scheduler/status/stream:
    get:
      summary: SSE stream of scheduler state changes
      description: Emits events on queue depth changes, worker connect/disconnect, job transitions
      tags:
        - Scheduler
      # The text/event-stream media type is declared on the 200 response's
      # `content` map; OpenAPI 3 operations have no `produces` keyword.
      responses:
        '200':
          description: SSE stream
          content:
            text/event-stream:
              schema:
                type: string
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'

  /v1/scheduler/workers:
    get:
      summary: List connected workers
      description: Returns all workers and their capabilities
      tags:
        - Scheduler
      responses:
        '200':
          description: List of workers
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Worker'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'

  /v1/scheduler/workers/{workerId}:
    get:
      summary: Get worker details
      description: Returns detailed worker information
      tags:
        - Scheduler
      parameters:
        - name: workerId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: Worker details
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Worker'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'
    delete:
      summary: Disconnect/drain worker
      description: Gracefully drain and disconnect a worker
      tags:
        - Scheduler
      parameters:
        - name: workerId
          in: path
          required: true
          schema:
            type: string
      responses:
        '204':
          description: Worker draining initiated
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'

  /v1/scheduler/reservations:
    get:
      summary: List active reservations
      description: Returns all active capacity reservations
      tags:
        - Scheduler
      responses:
        '200':
          description: List of reservations
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Reservation'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
    post:
      summary: Create reservation
      description: Reserve capacity for large jobs
      tags:
        - Scheduler
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateReservationRequest'
      responses:
        '201':
          description: Reservation created
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Reservation'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '422':
          $ref: '#/components/responses/ValidationError'

  /v1/scheduler/jobs/{jobId}/priority:
    patch:
      summary: Update job priority
      description: Change the priority of a queued or running job
      tags:
        - Scheduler
      parameters:
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - priority
              properties:
                priority:
                  type: integer
                  minimum: 1
                  maximum: 10
      responses:
        '200':
          description: Priority updated
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'

  /v1/scheduler/jobs/{jobId}/stream:
    get:
      summary: SSE stream of job progress
      description: Emits events on job state transitions and priority changes
      tags:
        - Scheduler
      # The text/event-stream media type is declared on the 200 response's
      # `content` map; OpenAPI 3 operations have no `produces` keyword.
      parameters:
        - name: jobId
          in: path
          required: true
          schema:
            type: string
      responses:
        '200':
          description: SSE stream
          content:
            text/event-stream:
              schema:
                type: string
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '404':
          $ref: '#/components/responses/NotFound'

  # ---- Audit ----------------------------------------------------------------
  /v1/audit/events:
    get:
      summary: Query audit events
      description: Filter by time range, event type, user
      tags:
        - Audit
      parameters:
        - name: from
          in: query
          schema:
            type: string
            format: date-time
          description: Start timestamp
        - name: to
          in: query
          schema:
            type: string
            format: date-time
          description: End timestamp
        - name: event_type
          in: query
          schema:
            type: string
            enum:
              - job_queued
              - job_started
              - job_completed
              - file_access
              - auth_attempt
              - plugin_configured
              - scheduler_drain
              - audit_verified
          description: Filter by event type
        - name: user_id
          in: query
          schema:
            type: string
          description: Filter by user
        - name: limit
          in: query
          schema:
            type: integer
            default: 100
            maximum: 1000
        - name: offset
          in: query
          schema:
            type: integer
            default: 0
      responses:
        '200':
          description: List of audit events
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/AuditEventList'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'
        '429':
          $ref: '#/components/responses/RateLimited'

  /v1/audit/verify:
    post:
      summary: Verify audit chain integrity
      description: Validates the hash chain for tampering
      tags:
        - Audit
      responses:
        '200':
          description: Verification result
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/VerificationResult'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'

  /v1/audit/chain-root:
    get:
      summary: Get current chain root hash
      description: Returns the latest event hash for external verification
      tags:
        - Audit
      responses:
        '200':
          description: Chain root hash
          content:
            application/json:
              schema:
                type: object
                properties:
                  root_hash:
                    type: string
                  timestamp:
                    type: string
                    format: date-time
                  total_events:
                    type: integer
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/Forbidden'

  # ---- Experiments ----------------------------------------------------------
  /v1/experiments:
    get:
      summary: List experiments
      description: List all experiments
      responses:
        '200':
          description: List of experiments
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Experiment'
    post:
      summary: Create experiment
      description: Create a new experiment
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateExperimentRequest'
      responses:
        '201':
          description: Experiment created
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Experiment'

  # ---- Jupyter --------------------------------------------------------------
  /v1/jupyter/services:
    get:
      summary: List Jupyter services
      responses:
        '200':
          description: List of Jupyter services
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/JupyterService'
    post:
      summary: Start Jupyter service
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/StartJupyterRequest'
      responses:
        '201':
          description: Jupyter service started
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/JupyterService'

  /v1/jupyter/services/{serviceId}:
    delete:
      summary: Stop Jupyter service
      parameters:
        - name: serviceId
          in: path
          required: true
          schema:
            type: string
      responses:
        '204':
          description: Service stopped

  # ---- WebSocket ------------------------------------------------------------
  /ws:
    get:
      summary: WebSocket connection
      description: |
        WebSocket endpoint for real-time task updates.

        ## Message Types
        - `task_update`: Task status changes
        - `task_complete`: Task finished
        - `ping`: Keep-alive (respond with `pong`)
      security:
        - ApiKeyAuth: []
      responses:
        '101':
          description: WebSocket connection established

# Reusable definitions. Everything referenced through '#/components/...' must
# live under this key: securitySchemes, schemas and the shared error responses.
components:
  securitySchemes:
    ApiKeyAuth:
      type: apiKey
      in: header
      name: X-API-Key
      description: API key for authentication

  schemas:
    HealthResponse:
      type: object
      properties:
        status:
          type: string
          enum: [healthy, degraded, unhealthy]
        version:
          type: string
        timestamp:
          type: string
          format: date-time

    Task:
      type: object
      properties:
        id:
          type: string
          description: Unique task identifier
        job_name:
          type: string
          pattern: '^[a-zA-Z0-9_-]+$'
          maxLength: 64
        status:
          type: string
          enum: [queued, preparing, running, collecting, completed, failed]
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        created_at:
          type: string
          format: date-time
        started_at:
          type: string
          format: date-time
        ended_at:
          type: string
          format: date-time
        worker_id:
          type: string
        error:
          type: string
        output:
          type: string
        entrypoint:
          type: string
          description: Entrypoint script or command executed for this task
        snapshot_id:
          type: string
        datasets:
          type: array
          items:
            type: string
        cpu:
          type: integer
        memory_gb:
          type: integer
        gpu:
          type: integer
        user_id:
          type: string
        retry_count:
          type: integer
        max_retries:
          type: integer
        plugin_status:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/PluginStatus'
          description: Status of tracking plugins for this task
        node_assignments:
          type: array
          items:
            $ref: '#/components/schemas/NodeAssignment'
          description: Worker node assignments for multi-node jobs
        priority_aged:
          type: number
          description: Effective priority with aging applied

    CreateTaskRequest:
      type: object
      required:
        - job_name
      properties:
        job_name:
          type: string
          pattern: '^[a-zA-Z0-9_-]+$'
          maxLength: 64
          description: Unique identifier for the job
        priority:
          type: integer
          minimum: 1
          maximum: 10
          default: 5
        args:
          type: string
          description: Command-line arguments for the entrypoint
        entrypoint:
          type: string
          description: 'Entrypoint script or command (e.g., train.py, run.sh, /bin/bash -c "echo hello")'
          # OpenAPI 3.0.x schemas accept a single `example`; the array form
          # `examples` is not a Schema Object keyword in this version.
          # Other sample values: run.sh,
          # /bin/bash -c "python train.py --epochs 10"
          example: train.py
        snapshot_id:
          type: string
          description: Reference to experiment snapshot
        datasets:
          type: array
          items:
            type: string
        dataset_specs:
          type: array
          items:
            $ref: '#/components/schemas/DatasetSpec'
        cpu:
          type: integer
          description: CPU cores requested
        memory_gb:
          type: integer
          description: Memory (GB) requested
        gpu:
          type: integer
          description: GPUs requested
        metadata:
          type: object
          additionalProperties:
            type: string

    DatasetSpec:
      type: object
      properties:
        name:
          type: string
        source:
          type: string
        sha256:
          type: string
        mount_path:
          type: string

    TaskList:
      type: object
      properties:
        tasks:
          type: array
          items:
            $ref: '#/components/schemas/Task'
        total:
          type: integer
        limit:
          type: integer
        offset:
          type: integer

    QueueStats:
      type: object
      properties:
        queued:
          type: integer
          description: Tasks waiting to run
        running:
          type: integer
          description: Tasks currently executing
        completed:
          type: integer
          description: Tasks completed today
        failed:
          type: integer
          description: Tasks failed today
        workers:
          type: integer
          description: Active workers

    Experiment:
      type: object
      properties:
        id:
          type: string
        name:
          type: string
        commit_id:
          type: string
        created_at:
          type: string
          format: date-time
        status:
          type: string
          enum: [active, archived, deleted]

    CreateExperimentRequest:
      type: object
      required:
        - name
      properties:
        name:
          type: string
          maxLength: 128
        description:
          type: string

    JupyterService:
      type: object
      properties:
        id:
          type: string
        name:
          type: string
        status:
          type: string
          enum: [starting, running, stopping, stopped, error]
        url:
          type: string
          format: uri
        token:
          type: string
        created_at:
          type: string
          format: date-time

    StartJupyterRequest:
      type: object
      required:
        - name
      properties:
        name:
          type: string
        workspace:
          type: string
        image:
          type: string
          default: jupyter/pytorch:latest

    PluginConfig:
      type: object
      properties:
        enabled:
          type: boolean
        mode:
          type: string
          enum: [sidecar, remote, disabled]
        image:
          type: string
        settings:
          type: object
          additionalProperties: true

    PluginStatus:
      type: object
      properties:
        name:
          type: string
        status:
          type: string
          enum: [healthy, unhealthy, starting, stopped]
        url:
          type: string
        last_check:
          type: string
          format: date-time

    NodeAssignment:
      type: object
      properties:
        node_id:
          type: integer
        worker_id:
          type: string
        slot_assigned:
          type: boolean

    Plugin:
      type: object
      properties:
        name:
          type: string
          description: Plugin name
        enabled:
          type: boolean
          description: Whether plugin is enabled
        mode:
          type: string
          enum: [sidecar, remote, disabled]
          description: Provisioning mode
        status:
          # NOTE(review): OpenAPI 3.0 tooling ignores keywords placed beside
          # `$ref`, so this description may not reach generated docs/code.
          $ref: '#/components/schemas/PluginStatus'
          description: Current plugin status
        config:
          $ref: '#/components/schemas/PluginConfig'
        requires_restart:
          type: boolean
          description: Whether plugin requires restart on config change
        version:
          type: string
          description: Plugin version

    Worker:
      type: object
      properties:
        id:
          type: string
          description: Worker unique identifier
        connected_at:
          type: string
          format: date-time
          description: When worker connected
        last_heartbeat:
          type: string
          format: date-time
          description: Last heartbeat timestamp
        capabilities:
          type: object
          properties:
            gpu_count:
              type: integer
              description: Number of GPUs
            gpu_type:
              type: string
              description: GPU type (e.g., A100, H100)
            cpu_cores:
              type: integer
            memory_gb:
              type: integer
        slots:
          type: object
          properties:
            batch_available:
              type: integer
            batch_total:
              type: integer
            service_available:
              type: integer
            service_total:
              type: integer
        active_tasks:
          type: array
          items:
            type: string
          description: IDs of tasks currently running on this worker
        status:
          type: string
          enum: [active, draining, offline]

    SchedulerStatus:
      type: object
      properties:
        workers_total:
          type: integer
        workers_active:
          type: integer
        workers_draining:
          type: integer
        batch_queue_depth:
          type: integer
        service_queue_depth:
          type: integer
        tasks_running:
          type: integer
        tasks_completed_24h:
          type: integer
        reservations_active:
          type: integer
        timestamp:
          type: string
          format: date-time

    Reservation:
      type: object
      properties:
        id:
          type: string
        user_id:
          type: string
        gpu_count:
          type: integer
        gpu_type:
          type: string
        node_count:
          type: integer
        expires_at:
          type: string
          format: date-time
        status:
          type: string
          enum: [active, claimed, expired]

    CreateReservationRequest:
      type: object
      required:
        - gpu_count
      properties:
        gpu_count:
          type: integer
          minimum: 1
        gpu_type:
          type: string
        node_count:
          type: integer
          minimum: 1
          default: 1
        expires_minutes:
          type: integer
          default: 30

    AuditEvent:
      type: object
      properties:
        timestamp:
          type: string
          format: date-time
        event_type:
          type: string
          enum:
            - job_queued
            - job_started
            - job_completed
            - file_access
            - auth_attempt
            - plugin_configured
            - scheduler_drain
            - audit_verified
        user_id:
          type: string
        resource:
          type: string
          description: Resource being acted upon
        action:
          type: string
          description: Action performed
        success:
          type: boolean
        ip_address:
          type: string
        error:
          type: string
        prev_hash:
          type: string
          description: Previous event hash in chain
        event_hash:
          type: string
          description: This event's hash
        sequence_num:
          type: integer
          description: Position in audit chain
        metadata:
          type: object
          additionalProperties: true

    AuditEventList:
      type: object
      properties:
        events:
          type: array
          items:
            $ref: '#/components/schemas/AuditEvent'
        total:
          type: integer
        limit:
          type: integer
        offset:
          type: integer

    VerificationResult:
      type: object
      properties:
        valid:
          type: boolean
        total_events:
          type: integer
        first_tampered:
          type: integer
          description: Sequence number of first tampered event (if any)
        chain_root_hash:
          type: string
        verified_at:
          type: string
          format: date-time

    # Common error envelope returned by every shared error response below.
    ErrorResponse:
      type: object
      required:
        - error
        - code
        - trace_id
      properties:
        error:
          type: string
          description: Sanitized error message
        code:
          type: string
          enum:
            - BAD_REQUEST
            - UNAUTHORIZED
            - FORBIDDEN
            - NOT_FOUND
            - CONFLICT
            - RATE_LIMITED
            - INTERNAL_ERROR
            - SERVICE_UNAVAILABLE
            - VALIDATION_ERROR
        trace_id:
          type: string
          description: Support correlation ID

  # Shared error responses, addressed as '#/components/responses/<Name>'.
  responses:
    BadRequest:
      description: Invalid request
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Invalid request format
            code: BAD_REQUEST
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
    Unauthorized:
      description: Authentication required
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Invalid or missing API key
            code: UNAUTHORIZED
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
    Forbidden:
      description: Insufficient permissions
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Insufficient permissions
            code: FORBIDDEN
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
    NotFound:
      description: Resource not found
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Resource not found
            code: NOT_FOUND
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
    ValidationError:
      description: Validation failed
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Validation failed
            code: VALIDATION_ERROR
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
    RateLimited:
      description: Too many requests
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: Rate limit exceeded
            code: RATE_LIMITED
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
      headers:
        Retry-After:
          schema:
            type: integer
          description: Seconds until rate limit resets
    InternalError:
      description: Internal server error
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            error: An error occurred
            code: INTERNAL_ERROR
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890

# Tag metadata is a document-level field (sibling of `paths` and
# `components`), not a member of `components`.
tags:
  - name: Plugins
    description: Plugin management endpoints
  - name: Scheduler
    description: Scheduler and worker management
  - name: Audit
    description: Audit log and chain verification