
Task Queue Architecture

The task queue system enables reliable job processing between the API server and workers using Redis.

Overview

graph LR
    CLI[CLI/Client] -->|WebSocket| API[API Server]
    API -->|Enqueue| Redis[(Redis)]
    Redis -->|Dequeue| Worker[Worker]
    Worker -->|Update Status| Redis

Components

TaskQueue (internal/queue)

Shared package used by both API server and worker for job management.

Task Structure

type Task struct {
    ID        string            // Unique task ID (UUID)
    JobName   string            // User-defined job name  
    Args      string            // Job arguments
    Status    string            // queued, running, completed, failed
    Priority  int64             // Higher = executed first
    CreatedAt time.Time         
    StartedAt *time.Time        
    EndedAt   *time.Time        
    WorkerID  string            
    Error     string            
    Datasets  []string          
    Metadata  map[string]string // commit_id, user, etc
}

TaskQueue Interface

// Initialize queue (named tq so it does not shadow the queue package)
tq, err := queue.NewTaskQueue(queue.Config{
    RedisAddr:     "localhost:6379",
    RedisPassword: "",
    RedisDB:       0,
})

// Add task (API server)
task := &queue.Task{
    ID:       uuid.New().String(),
    JobName:  "train-model",
    Status:   "queued",
    Priority: 5,
    Metadata: map[string]string{
        "commit_id": commitID,
        "user":      username,
    },
}
err = tq.AddTask(task)

// Get next task (Worker)
task, err := tq.GetNextTask()

// Update task status
task.Status = "running"
err = tq.UpdateTask(task)

Data Flow

Job Submission Flow

sequenceDiagram
    participant CLI
    participant API
    participant Redis
    participant Worker

    CLI->>API: Queue Job (WebSocket)
    API->>API: Create Task (UUID)
    API->>Redis: ZADD task:queue
    API->>Redis: SET task:{id}
    API->>CLI: Success Response

    Worker->>Redis: ZPOPMAX task:queue
    Redis->>Worker: Task ID
    Worker->>Redis: GET task:{id}
    Redis->>Worker: Task Data
    Worker->>Worker: Execute Job
    Worker->>Redis: Update Status

Protocol

CLI → API (Binary WebSocket):

[opcode:1][api_key_hash:64][commit_id:64][priority:1][job_name_len:1][job_name:var]

API → Redis:

  - Priority queue: ZADD task:queue {priority} {task_id}
  - Task data: SET task:{id} {json}
  - Status: HSET task:status:{job_name} ...

Worker ← Redis:

  - Poll: ZPOPMAX task:queue 1 (highest priority first)
  - Fetch: GET task:{id}

Redis Data Structures

Keys

task:queue                    # ZSET: priority queue
task:{uuid}                  # STRING: task JSON data
task:status:{job_name}       # HASH: job status
worker:heartbeat             # HASH: worker health
job:metrics:{job_name}       # HASH: job metrics

Priority Queue (ZSET)

ZADD task:queue 10 "uuid-1"   # Priority 10
ZADD task:queue 5  "uuid-2"   # Priority 5
ZPOPMAX task:queue 1          # Returns uuid-1 (highest)

API Server Integration

Initialization

// cmd/api-server/main.go
queueCfg := queue.Config{
    RedisAddr:     cfg.Redis.Addr,
    RedisPassword: cfg.Redis.Password,
    RedisDB:       cfg.Redis.DB,
}
taskQueue, err := queue.NewTaskQueue(queueCfg)

WebSocket Handler

// internal/api/ws.go
func (h *WSHandler) handleQueueJob(conn *websocket.Conn, payload []byte) error {
    // Parse request; the submitting user is resolved from the API key hash
    apiKeyHash, commitID, priority, jobName := parsePayload(payload)
    user := h.userForAPIKeyHash(apiKeyHash) // NOTE(review): confirm the actual lookup helper

    // Create task with unique ID
    taskID := uuid.New().String()
    task := &queue.Task{
        ID:       taskID,
        JobName:  jobName,
        Status:   "queued",
        Priority: int64(priority),
        Metadata: map[string]string{
            "commit_id": commitID,
            "user":      user,
        },
    }

    // Enqueue
    if err := h.queue.AddTask(task); err != nil {
        return h.sendErrorPacket(conn, ErrorCodeDatabaseError, ...)
    }

    return h.sendSuccessPacket(conn, "Job queued")
}

Worker Integration

Task Polling

// cmd/worker/worker_server.go
func (w *Worker) Start() error {
    // ctx is the worker's lifecycle context; WaitForNextTask blocks for up
    // to 5s and returns a nil task when no work is available.
    for {
        task, err := w.queue.WaitForNextTask(ctx, 5*time.Second)
        if err != nil {
            continue // transient Redis errors: retry on the next poll
        }
        if task != nil {
            go w.executeTask(task)
        }
    }
}

Task Execution

func (w *Worker) executeTask(task *queue.Task) {
    // Update status
    now := time.Now()
    task.Status = "running"
    task.StartedAt = &now
    w.queue.UpdateTaskWithMetrics(task, "start")

    // Execute
    err := w.runJob(task)

    // Finalize (only read err.Error() when err is non-nil)
    endTime := time.Now()
    task.EndedAt = &endTime
    if err != nil {
        task.Status = "failed"
        task.Error = err.Error()
    } else {
        task.Status = "completed"
    }
    w.queue.UpdateTaskWithMetrics(task, "final")
}

Configuration

API Server (configs/config.yaml)

redis:
  addr: "localhost:6379"
  password: ""
  db: 0

Worker (configs/worker-config.yaml)

redis:
  addr: "localhost:6379"
  password: ""
  db: 0

metrics_flush_interval: 500ms

Monitoring

Queue Depth

depth, err := queue.QueueDepth()
fmt.Printf("Pending tasks: %d\n", depth)

Worker Heartbeat

// Worker sends heartbeat every 30s
err := queue.Heartbeat(workerID)

Metrics

HGETALL job:metrics:{job_name}
# Returns: timestamp, tasks_start, tasks_final, etc

Error Handling

Task Failures

if err := w.runJob(task); err != nil {
    task.Status = "failed"
    task.Error = err.Error()
    w.queue.UpdateTask(task)
}

Redis Connection Loss

// TaskQueue automatically reconnects
// Workers should implement retry logic
for retries := 0; retries < 3; retries++ {
    task, err := queue.GetNextTask()
    if err == nil {
        break
    }
    backoff := time.Second << retries // exponential: 1s, 2s, 4s
    time.Sleep(backoff)
}

Testing

// tests using miniredis
s, _ := miniredis.Run()
defer s.Close()

tq, _ := queue.NewTaskQueue(queue.Config{
    RedisAddr: s.Addr(),
})

task := &queue.Task{ID: "test-1", JobName: "test"}
tq.AddTask(task)

fetched, _ := tq.GetNextTask()
// assert fetched.ID == "test-1"

Best Practices

  1. Unique Task IDs: Always use UUIDs to avoid conflicts
  2. Metadata: Store commit_id and user in task metadata
  3. Priority: Higher values execute first (0-255 range)
  4. Status Updates: Update status at each lifecycle stage
  5. Error Logging: Store detailed errors in task.Error
  6. Heartbeats: Workers should send heartbeats regularly
  7. Metrics: Use UpdateTaskWithMetrics for atomic updates

For implementation details, see: - internal/queue/task.go - internal/queue/queue.go