fetch_ml/internal/scheduler/hub_capabilities_test.go
Jeremie Fraeys 57787e1e7b
feat(scheduler): implement capability-based routing and hub v2
Add comprehensive capability routing system to scheduler hub:
- Capability-aware worker matching with requirement/offer negotiation
- Hub v2 protocol with structured message types and heartbeat management
- Worker capability advertisement and dynamic routing decisions
- Orphan recovery for disconnected workers with state reconciliation
- Template-based job scheduling with capability constraints

Add extensive test coverage:
- Unit tests for capability routing logic and heartbeat mechanics
- Unit tests for orphan recovery scenarios
- E2E tests for capability routing across multiple workers
- Hub capabilities integration tests
- Scheduler fixture helpers for test setup

Protocol improvements:
- Define structured protocol messages for hub-worker communication
- Add capability matching algorithm with scoring
- Implement graceful worker disconnection handling
2026-03-12 12:00:05 -04:00

357 lines
7.1 KiB
Go

package scheduler
import (
"testing"
"time"
)
// TestCanAdmit_BackendMatching verifies that canAdmit accepts a worker only
// when its GPU backend is compatible with the backend the job requests, and
// that an empty backend requirement matches any worker.
func TestCanAdmit_BackendMatching(t *testing.T) {
	hub := &SchedulerHub{reservations: map[string]*Reservation{}}

	cases := []struct {
		name string
		caps WorkerCapabilities
		spec JobSpec
		want bool
	}{
		{
			name: "backend matches nvidia",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 4},
			spec: JobSpec{GPUBackend: "nvidia", GPUCount: 2},
			want: true,
		},
		{
			name: "backend matches metal",
			caps: WorkerCapabilities{GPUBackend: BackendMetal, GPUCount: 2},
			spec: JobSpec{GPUBackend: "metal", GPUCount: 1},
			want: true,
		},
		{
			name: "backend mismatch nvidia vs metal",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 4},
			spec: JobSpec{GPUBackend: "metal", GPUCount: 1},
			want: false,
		},
		{
			name: "no backend required - any matches",
			caps: WorkerCapabilities{GPUBackend: BackendVulkan, GPUCount: 2},
			spec: JobSpec{GPUBackend: "", GPUCount: 1},
			want: true,
		},
		{
			name: "cpu job on cpu worker",
			caps: WorkerCapabilities{GPUBackend: BackendCPU, GPUCount: 0, CPUCount: 8},
			spec: JobSpec{GPUBackend: "cpu", GPUCount: 0},
			want: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			worker := &WorkerConn{
				capabilities: tc.caps,
				slots:        SlotStatus{BatchTotal: 4},
			}
			task := &Task{ID: "test-task", Spec: tc.spec}
			if got := hub.canAdmit(task, worker); got != tc.want {
				t.Errorf("canAdmit() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestCanAdmit_VRAMRequirements verifies that canAdmit enforces the job's
// minimum-VRAM requirement against the worker's advertised VRAM, and that a
// zero requirement admits any worker.
func TestCanAdmit_VRAMRequirements(t *testing.T) {
	hub := &SchedulerHub{reservations: map[string]*Reservation{}}

	cases := []struct {
		name string
		caps WorkerCapabilities
		spec JobSpec
		want bool
	}{
		{
			name: "sufficient VRAM",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 2, VRAMGB: 32.0},
			spec: JobSpec{MinVRAMGB: 16.0, GPUCount: 1},
			want: true,
		},
		{
			name: "insufficient VRAM",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 2, VRAMGB: 8.0},
			spec: JobSpec{MinVRAMGB: 16.0, GPUCount: 1},
			want: false,
		},
		{
			name: "no VRAM required",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 2, VRAMGB: 8.0},
			spec: JobSpec{MinVRAMGB: 0, GPUCount: 1},
			want: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			worker := &WorkerConn{
				capabilities: tc.caps,
				slots:        SlotStatus{BatchTotal: 4},
			}
			task := &Task{ID: "test-task", Spec: tc.spec}
			if got := hub.canAdmit(task, worker); got != tc.want {
				t.Errorf("canAdmit() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestCanAdmit_CPUCoresRequirements verifies that canAdmit enforces the
// job's minimum CPU-core requirement against the worker's advertised count.
func TestCanAdmit_CPUCoresRequirements(t *testing.T) {
	hub := &SchedulerHub{reservations: map[string]*Reservation{}}

	cases := []struct {
		name string
		caps WorkerCapabilities
		spec JobSpec
		want bool
	}{
		{
			name: "sufficient CPU cores",
			caps: WorkerCapabilities{GPUBackend: BackendCPU, CPUCount: 16},
			spec: JobSpec{MinCPUCores: 8, GPUCount: 0},
			want: true,
		},
		{
			name: "insufficient CPU cores",
			caps: WorkerCapabilities{GPUBackend: BackendCPU, CPUCount: 4},
			spec: JobSpec{MinCPUCores: 8, GPUCount: 0},
			want: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			worker := &WorkerConn{
				capabilities: tc.caps,
				slots:        SlotStatus{BatchTotal: 4},
			}
			task := &Task{ID: "test-task", Spec: tc.spec}
			if got := hub.canAdmit(task, worker); got != tc.want {
				t.Errorf("canAdmit() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestCanAdmit_ReservedGPUs verifies that canAdmit subtracts the GPUs held
// by outstanding reservations before deciding whether a worker has enough
// capacity left for a new task. The hub is seeded with two reservations of
// two GPUs each (four reserved in total).
func TestCanAdmit_ReservedGPUs(t *testing.T) {
	hub := &SchedulerHub{
		reservations: map[string]*Reservation{
			"res-1": {TaskID: "task-1", GPUCount: 2},
			"res-2": {TaskID: "task-2", GPUCount: 2},
		},
	}

	cases := []struct {
		name string
		caps WorkerCapabilities
		spec JobSpec
		want bool
	}{
		{
			// 8 - (2+2) = 4 GPUs still free, job needs 4.
			name: "enough GPUs after reservations",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 8},
			spec: JobSpec{GPUCount: 4},
			want: true,
		},
		{
			// 4 - (2+2) = 0 GPUs free, job needs 2.
			name: "not enough GPUs after reservations",
			caps: WorkerCapabilities{GPUBackend: BackendNVIDIA, GPUCount: 4},
			spec: JobSpec{GPUCount: 2},
			want: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			worker := &WorkerConn{
				capabilities: tc.caps,
				slots:        SlotStatus{BatchTotal: 4},
			}
			task := &Task{ID: "test-task", Spec: tc.spec}
			if got := hub.canAdmit(task, worker); got != tc.want {
				t.Errorf("canAdmit() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestReconcileOrphans_TierGracePeriods verifies that reconcileOrphans
// requeues an orphaned assignment only after its tier-specific grace period
// has elapsed (measured from AssignedAt). A requeue is observed as the entry
// disappearing from pendingAcceptance.
//
// NOTE(review): exact grace durations per tier (30s / 2min / 10min) are
// asserted indirectly via the chosen AssignedAt offsets — confirm against
// the reconcileOrphans implementation if they change.
func TestReconcileOrphans_TierGracePeriods(t *testing.T) {
	cases := []struct {
		name         string
		tier         JobTier
		accepted     bool
		assignedAt   time.Time
		wantRequeued bool
	}{
		{
			name:         "data_processing tier - short grace period",
			tier:         TierDataProcessing,
			accepted:     true,
			assignedAt:   time.Now().Add(-35 * time.Second), // past the 30s grace
			wantRequeued: true,
		},
		{
			name:         "training tier - long grace period",
			tier:         TierTraining,
			accepted:     true,
			assignedAt:   time.Now().Add(-5 * time.Minute), // still within 10min grace
			wantRequeued: false,
		},
		{
			name:         "training tier - past grace period",
			tier:         TierTraining,
			accepted:     true,
			assignedAt:   time.Now().Add(-11 * time.Minute), // past the 10min grace
			wantRequeued: true,
		},
		{
			name:         "evaluation tier - 2min grace",
			tier:         TierEvaluation,
			accepted:     true,
			assignedAt:   time.Now().Add(-3 * time.Minute), // past the 2min grace
			wantRequeued: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Fresh hub per subtest so reconciliation state never leaks
			// between cases.
			hub := &SchedulerHub{
				workers:           map[string]*WorkerConn{},
				pendingAcceptance: map[string]*JobAssignment{},
				batchQueue:        NewPriorityQueue(0.1),
				config:            HubConfig{AcceptanceTimeoutSecs: 60},
			}
			orphan := &Task{
				ID:     "test-task",
				Status: "assigned",
				Spec:   JobSpec{JobTier: tc.tier},
			}
			hub.pendingAcceptance[orphan.ID] = &JobAssignment{
				TaskID:     orphan.ID,
				WorkerID:   "disconnected-worker",
				AssignedAt: tc.assignedAt,
				Accepted:   tc.accepted,
				Task:       orphan,
			}

			hub.reconcileOrphans()

			// Requeued iff the assignment was removed from pendingAcceptance.
			_, stillPending := hub.pendingAcceptance[orphan.ID]
			requeued := !stillPending
			if requeued != tc.wantRequeued {
				t.Errorf("reconcileOrphans() requeued=%v, want=%v", requeued, tc.wantRequeued)
			}
		})
	}
}