# fetch_ml/api/openapi.yaml

---
# OpenAPI specification version this document is written against.
openapi: '3.0.3'
info:
  title: 'ML Worker API'
  version: '1.0.0'
  description: |
    API for managing ML experiment tasks and Jupyter services.

    ## Security
    All endpoints (except health checks) require API key authentication via the
    `X-API-Key` header. Rate limiting is enforced per API key.

    ## Error Handling
    Errors follow a consistent format with machine-readable codes and trace IDs:
    ```json
    {
      "error": "Sanitized error message",
      "code": "ERROR_CODE",
      "trace_id": "uuid-for-support"
    }
    ```
  contact:
    name: 'FetchML Support'
# Base URLs the paths below are resolved against.
servers:
  - description: 'Local development server'
    url: 'http://localhost:9101'
  - description: 'Production server'
    url: 'https://api.fetchml.example.com'
# Document-wide default; an operation can replace it with its own `security`
# list (an empty list, as on /health, removes the requirement).
security:
  - ApiKeyAuth: []
paths:
  # Health probe. The empty `security` list overrides the document-wide
  # API-key requirement for this operation only.
  /health:
    get:
      security: []
      summary: Health check
      description: Returns server health status. No authentication required.
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'
          description: Server is healthy
  # Task collection: list existing tasks (GET) or submit a new one (POST).
  /v1/tasks:
    get:
      summary: List tasks
      description: List all tasks with optional filtering
      parameters:
        - name: status
          in: query
          description: Return only tasks whose status matches this value.
          schema:
            type: string
            # Kept identical to the Task.status enum so that every status a
            # task can report is also usable as a filter value.
            enum:
              - queued
              - preparing
              - running
              - collecting
              - completed
              - failed
        - name: limit
          in: query
          description: Maximum number of tasks to return.
          schema:
            type: integer
            # Lower bound added so zero or negative page sizes fall outside
            # the documented range.
            minimum: 1
            maximum: 1000
            default: 50
        - name: offset
          in: query
          description: Number of tasks to skip before the first one returned.
          schema:
            type: integer
            # Offsets cannot be negative.
            minimum: 0
            default: 0
      responses:
        '200':
          description: List of tasks
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TaskList'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
    post:
      summary: Create task
      description: Submit a new ML experiment task
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateTaskRequest'
      responses:
        '201':
          description: Task created successfully
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'
  # Operations on a single task. `taskId` is declared once at path level so
  # that GET and DELETE share one parameter definition.
  /v1/tasks/{taskId}:
    parameters:
      - name: taskId
        in: path
        required: true
        description: Identifier of the task to operate on.
        schema:
          type: string
    get:
      summary: Get task details
      responses:
        '200':
          description: Task details
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Task'
        # 401 and 429 are listed because the API description states that
        # every non-health endpoint requires an API key and is rate limited.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimited'
    delete:
      summary: Cancel/delete task
      responses:
        '204':
          description: Task cancelled
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimited'
  # Queue statistics snapshot.
  /v1/queue:
    get:
      summary: Queue status
      description: Get current queue statistics
      responses:
        '200':
          description: Queue statistics
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/QueueStats'
        # 401 and 429 are listed because the API description states that
        # every non-health endpoint requires an API key and is rate limited.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
  # Experiment collection: list (GET) and create (POST).
  /v1/experiments:
    get:
      summary: List experiments
      description: List all experiments
      responses:
        '200':
          description: List of experiments
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Experiment'
        # 401 and 429 are listed because the API description states that
        # every non-health endpoint requires an API key and is rate limited.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
    post:
      summary: Create experiment
      description: Create a new experiment
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateExperimentRequest'
      responses:
        '201':
          description: Experiment created
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Experiment'
        # NOTE(review): 400 and 422 mirror POST /v1/tasks; confirm the handler
        # returns them for malformed or invalid request bodies.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'
  # Jupyter service collection: list (GET) and start (POST).
  /v1/jupyter/services:
    get:
      summary: List Jupyter services
      responses:
        '200':
          description: List of Jupyter services
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/JupyterService'
        # 401 and 429 are listed because the API description states that
        # every non-health endpoint requires an API key and is rate limited.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
    post:
      summary: Start Jupyter service
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/StartJupyterRequest'
      responses:
        '201':
          description: Jupyter service started
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/JupyterService'
        # NOTE(review): 400 and 422 mirror POST /v1/tasks; confirm the handler
        # returns them for malformed or invalid request bodies.
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '422':
          $ref: '#/components/responses/ValidationError'
        '429':
          $ref: '#/components/responses/RateLimited'
  # Operations on a single Jupyter service.
  /v1/jupyter/services/{serviceId}:
    delete:
      summary: Stop Jupyter service
      parameters:
        - name: serviceId
          in: path
          required: true
          description: Identifier of the Jupyter service to stop.
          schema:
            type: string
      responses:
        '204':
          description: Service stopped
        # 401 and 429 are listed because the API description states that
        # every non-health endpoint requires an API key and is rate limited.
        '401':
          $ref: '#/components/responses/Unauthorized'
        # NOTE(review): 404 mirrors DELETE /v1/tasks/{taskId}; confirm the
        # handler returns it for unknown service IDs.
        '404':
          $ref: '#/components/responses/NotFound'
        '429':
          $ref: '#/components/responses/RateLimited'
  # WebSocket upgrade endpoint.
  /ws:
    get:
      summary: WebSocket connection
      description: |
        WebSocket endpoint for real-time task updates.

        ## Message Types
        - `task_update`: Task status changes
        - `task_complete`: Task finished
        - `ping`: Keep-alive (respond with `pong`)
      # Restates the document-wide default requirement explicitly.
      security:
        - ApiKeyAuth: []
      responses:
        '101':
          description: WebSocket connection established
        # NOTE(review): 401 and 429 follow from the API description (API key
        # required, per-key rate limiting); confirm the upgrade handler
        # answers with these bodies before switching protocols.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '429':
          $ref: '#/components/responses/RateLimited'
components:
  # Schemes that `security` requirements in this document refer to by name.
  securitySchemes:
    ApiKeyAuth:
      description: API key for authentication
      type: apiKey
      # The key is read from the X-API-Key request header.
      name: X-API-Key
      in: header
  schemas:
    # Response body of GET /health.
    HealthResponse:
      type: object
      properties:
        status:
          type: string
          enum:
            - healthy
            - degraded
            - unhealthy
        version:
          type: string
        timestamp:
          format: date-time
          type: string
    # Task resource returned by the task endpoints and embedded in TaskList.
    Task:
      type: object
      properties:
        id:
          description: Unique task identifier
          type: string
        job_name:
          type: string
          maxLength: 64
          pattern: '^[a-zA-Z0-9_-]+$'
        status:
          type: string
          enum:
            - queued
            - preparing
            - running
            - collecting
            - completed
            - failed
        priority:
          type: integer
          default: 5
          minimum: 1
          maximum: 10
        # Timestamps
        created_at:
          format: date-time
          type: string
        started_at:
          format: date-time
          type: string
        ended_at:
          format: date-time
          type: string
        worker_id:
          type: string
        error:
          type: string
        output:
          type: string
        snapshot_id:
          type: string
        datasets:
          type: array
          items:
            type: string
        # Resource figures
        cpu:
          type: integer
        memory_gb:
          type: integer
        gpu:
          type: integer
        user_id:
          type: string
        # Retry counters
        retry_count:
          type: integer
        max_retries:
          type: integer
    # Request body of POST /v1/tasks; only `job_name` is mandatory.
    CreateTaskRequest:
      type: object
      required:
        - job_name
      properties:
        job_name:
          description: Unique identifier for the job
          type: string
          maxLength: 64
          pattern: '^[a-zA-Z0-9_-]+$'
        priority:
          type: integer
          default: 5
          minimum: 1
          maximum: 10
        args:
          description: Command-line arguments for the training script
          type: string
        snapshot_id:
          description: Reference to experiment snapshot
          type: string
        datasets:
          type: array
          items:
            type: string
        dataset_specs:
          type: array
          items:
            $ref: '#/components/schemas/DatasetSpec'
        # Requested resources
        cpu:
          description: CPU cores requested
          type: integer
        memory_gb:
          description: Memory (GB) requested
          type: integer
        gpu:
          description: GPUs requested
          type: integer
        # Free-form map; every value must be a string.
        metadata:
          type: object
          additionalProperties:
            type: string
    # Item type of CreateTaskRequest.dataset_specs; all fields are optional
    # strings.
    DatasetSpec:
      type: object
      properties:
        name: {type: string}
        source: {type: string}
        sha256: {type: string}
        mount_path: {type: string}
    # Response body of GET /v1/tasks: the returned tasks plus three integer
    # fields named total, limit and offset.
    TaskList:
      type: object
      properties:
        tasks:
          items:
            $ref: '#/components/schemas/Task'
          type: array
        total: {type: integer}
        limit: {type: integer}
        offset: {type: integer}
    # Response body of GET /v1/queue; every field is an integer counter.
    QueueStats:
      type: object
      properties:
        queued:
          description: Tasks waiting to run
          type: integer
        running:
          description: Tasks currently executing
          type: integer
        completed:
          description: Tasks completed today
          type: integer
        failed:
          description: Tasks failed today
          type: integer
        workers:
          description: Active workers
          type: integer
    # Experiment resource returned by the /v1/experiments operations.
    Experiment:
      type: object
      properties:
        id:
          type: string
        name:
          type: string
        commit_id:
          type: string
        created_at:
          format: date-time
          type: string
        status:
          type: string
          enum:
            - active
            - archived
            - deleted
    # Request body of POST /v1/experiments; `name` is mandatory.
    CreateExperimentRequest:
      type: object
      required: [name]
      properties:
        name:
          maxLength: 128
          type: string
        description:
          type: string
    # Jupyter service resource returned by the /v1/jupyter/services operations.
    JupyterService:
      type: object
      properties:
        id:
          type: string
        name:
          type: string
        status:
          type: string
          enum:
            - starting
            - running
            - stopping
            - stopped
            - error
        url:
          format: uri
          type: string
        # NOTE(review): presumably the access token for the service at `url`;
        # confirm whether it should be exposed in list responses.
        token:
          type: string
        created_at:
          format: date-time
          type: string
    # Request body of POST /v1/jupyter/services; `name` is mandatory.
    StartJupyterRequest:
      type: object
      required: [name]
      properties:
        name:
          type: string
        workspace:
          type: string
        image:
          type: string
          # Quoted because the value contains a colon.
          default: 'jupyter/pytorch:latest'
    # Error body shared by every entry under components.responses; all three
    # fields are required.
    ErrorResponse:
      type: object
      required: [error, code, trace_id]
      properties:
        error:
          description: Sanitized error message
          type: string
        code:
          type: string
          enum:
            - BAD_REQUEST
            - UNAUTHORIZED
            - FORBIDDEN
            - NOT_FOUND
            - CONFLICT
            - RATE_LIMITED
            - INTERNAL_ERROR
            - SERVICE_UNAVAILABLE
            - VALIDATION_ERROR
        trace_id:
          description: Support correlation ID
          type: string
  # Reusable error responses. Each carries an ErrorResponse JSON body and an
  # example payload; RateLimited additionally declares a Retry-After header.
  responses:
    BadRequest:
      description: Invalid request
      content:
        application/json:
          example:
            error: Invalid request format
            code: BAD_REQUEST
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    Unauthorized:
      description: Authentication required
      content:
        application/json:
          example:
            error: Invalid or missing API key
            code: UNAUTHORIZED
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    Forbidden:
      description: Insufficient permissions
      content:
        application/json:
          example:
            error: Insufficient permissions
            code: FORBIDDEN
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    NotFound:
      description: Resource not found
      content:
        application/json:
          example:
            error: Resource not found
            code: NOT_FOUND
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    ValidationError:
      description: Validation failed
      content:
        application/json:
          example:
            error: Validation failed
            code: VALIDATION_ERROR
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    RateLimited:
      description: Too many requests
      headers:
        Retry-After:
          description: Seconds until rate limit resets
          schema:
            type: integer
      content:
        application/json:
          example:
            error: Rate limit exceeded
            code: RATE_LIMITED
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    InternalError:
      description: Internal server error
      content:
        application/json:
          example:
            error: An error occurred
            code: INTERNAL_ERROR
            trace_id: a1b2c3d4-e5f6-7890-abcd-ef1234567890
          schema:
            $ref: '#/components/schemas/ErrorResponse'