// Add comprehensive capability routing system to scheduler hub:
// - Capability-aware worker matching with requirement/offer negotiation
// - Hub v2 protocol with structured message types and heartbeat management
// - Worker capability advertisement and dynamic routing decisions
// - Orphan recovery for disconnected workers with state reconciliation
// - Template-based job scheduling with capability constraints
//
// Test coverage in this file:
// - Unit tests for capability routing logic (backend, VRAM, CPU cores, reservations)
// - Unit tests for orphan recovery grace periods per job tier
package scheduler
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestCanAdmit_BackendMatching(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: make(map[string]*Reservation),
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "backend matches nvidia",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "nvidia",
|
|
GPUCount: 2,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "backend matches metal",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendMetal,
|
|
GPUCount: 2,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "metal",
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "backend mismatch nvidia vs metal",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "metal",
|
|
GPUCount: 1,
|
|
},
|
|
want: false,
|
|
},
|
|
{
|
|
name: "no backend required - any matches",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendVulkan,
|
|
GPUCount: 2,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "",
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "cpu job on cpu worker",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "cpu",
|
|
GPUCount: 0,
|
|
},
|
|
want: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCanAdmit_VRAMRequirements(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: make(map[string]*Reservation),
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "sufficient VRAM",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 32.0,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinVRAMGB: 16.0,
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "insufficient VRAM",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 8.0,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinVRAMGB: 16.0,
|
|
GPUCount: 1,
|
|
},
|
|
want: false,
|
|
},
|
|
{
|
|
name: "no VRAM required",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 8.0,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinVRAMGB: 0,
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCanAdmit_CPUCoresRequirements(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: make(map[string]*Reservation),
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "sufficient CPU cores",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendCPU,
|
|
CPUCount: 16,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinCPUCores: 8,
|
|
GPUCount: 0,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "insufficient CPU cores",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendCPU,
|
|
CPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinCPUCores: 8,
|
|
GPUCount: 0,
|
|
},
|
|
want: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCanAdmit_ReservedGPUs(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: map[string]*Reservation{
|
|
"res-1": {TaskID: "task-1", GPUCount: 2},
|
|
"res-2": {TaskID: "task-2", GPUCount: 2},
|
|
},
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "enough GPUs after reservations",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 8,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUCount: 4,
|
|
},
|
|
want: true, // 8 - (2+2) = 4 available
|
|
},
|
|
{
|
|
name: "not enough GPUs after reservations",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUCount: 2,
|
|
},
|
|
want: false, // 4 - (2+2) = 0 available
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestReconcileOrphans_TierGracePeriods(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
jobTier JobTier
|
|
accepted bool
|
|
assignedAt time.Time
|
|
wantRequeued bool
|
|
}{
|
|
{
|
|
name: "data_processing tier - short grace period",
|
|
jobTier: TierDataProcessing,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-35 * time.Second), // Past 30s grace
|
|
wantRequeued: true,
|
|
},
|
|
{
|
|
name: "training tier - long grace period",
|
|
jobTier: TierTraining,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-5 * time.Minute), // Within 10min grace
|
|
wantRequeued: false,
|
|
},
|
|
{
|
|
name: "training tier - past grace period",
|
|
jobTier: TierTraining,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-11 * time.Minute), // Past 10min grace
|
|
wantRequeued: true,
|
|
},
|
|
{
|
|
name: "evaluation tier - 2min grace",
|
|
jobTier: TierEvaluation,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-3 * time.Minute), // Past 2min grace
|
|
wantRequeued: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
workers: make(map[string]*WorkerConn),
|
|
pendingAcceptance: make(map[string]*JobAssignment),
|
|
batchQueue: NewPriorityQueue(0.1),
|
|
config: HubConfig{
|
|
AcceptanceTimeoutSecs: 60,
|
|
},
|
|
}
|
|
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Status: "assigned",
|
|
Spec: JobSpec{
|
|
JobTier: tt.jobTier,
|
|
},
|
|
}
|
|
|
|
h.pendingAcceptance["test-task"] = &JobAssignment{
|
|
TaskID: "test-task",
|
|
WorkerID: "disconnected-worker",
|
|
AssignedAt: tt.assignedAt,
|
|
Accepted: tt.accepted,
|
|
Task: task,
|
|
}
|
|
|
|
h.reconcileOrphans()
|
|
|
|
// Check if task was requeued
|
|
_, stillPending := h.pendingAcceptance["test-task"]
|
|
wasRequeued := !stillPending
|
|
|
|
if wasRequeued != tt.wantRequeued {
|
|
t.Errorf("reconcileOrphans() requeued=%v, want=%v", wasRequeued, tt.wantRequeued)
|
|
}
|
|
})
|
|
}
|
|
}
|