Add comprehensive capability routing system to scheduler hub:
- Capability-aware worker matching with requirement/offer negotiation
- Hub v2 protocol with structured message types and heartbeat management
- Worker capability advertisement and dynamic routing decisions
- Orphan recovery for disconnected workers with state reconciliation
- Template-based job scheduling with capability constraints

Add extensive test coverage:
- Unit tests for capability routing logic and heartbeat mechanics
- Unit tests for orphan recovery scenarios
- E2E tests for capability routing across multiple workers
- Hub capabilities integration tests
- Scheduler fixture helpers for test setup

Protocol improvements:
- Define structured protocol messages for hub-worker communication
- Add capability matching algorithm with scoring
- Implement graceful worker disconnection handling
352 lines | 11 KiB | Go
package tests
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestCapabilityRoutingE2E_MultiWorkerScenario validates multi-worker capability routing
|
|
func TestCapabilityRoutingE2E_MultiWorkerScenario(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create GPU worker with NVIDIA GPUs
|
|
gpuWorker := fixture.CreateWorker("e2e-gpu-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
VRAMGB: 24.0,
|
|
CPUCount: 8,
|
|
})
|
|
|
|
// Create CPU-only worker
|
|
cpuWorker := fixture.CreateWorker("e2e-cpu-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 16,
|
|
})
|
|
|
|
// Submit training job (needs GPU)
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-training-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
JobTier: scheduler.TierTraining,
|
|
GPUCount: 2,
|
|
GPUBackend: "nvidia",
|
|
MinVRAMGB: 16.0,
|
|
Command: []string{"python", "train.py"},
|
|
})
|
|
|
|
// Submit data processing job (CPU only)
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-data-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
JobTier: scheduler.TierDataProcessing,
|
|
GPUCount: 0,
|
|
Command: []string{"python", "preprocess.py"},
|
|
})
|
|
|
|
// Both workers signal ready to trigger job assignment
|
|
gpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// GPU worker should get training job
|
|
msg1 := gpuWorker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg1.Type, "GPU worker should receive training job")
|
|
|
|
// CPU worker should get data job
|
|
msg2 := cpuWorker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg2.Type, "CPU worker should receive data job")
|
|
}
|
|
|
|
// TestCapabilityRoutingE2E_GPUSelection validates job lands on correct GPU worker
|
|
func TestCapabilityRoutingE2E_GPUSelection(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create worker with 2 GPUs
|
|
worker2GPU := fixture.CreateWorker("e2e-2gpu", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 16.0,
|
|
})
|
|
|
|
// Create worker with 8 GPUs
|
|
worker8GPU := fixture.CreateWorker("e2e-8gpu", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 8,
|
|
VRAMGB: 48.0,
|
|
})
|
|
|
|
// Submit job needing 4 GPUs
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-4gpu-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Both signal ready to trigger assignment
|
|
worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Should go to 8GPU worker (2GPU can't handle it) - poll with retries
|
|
var assignedWorker string
|
|
deadline := time.Now().Add(2 * time.Second)
|
|
for time.Now().Before(deadline) && assignedWorker == "" {
|
|
select {
|
|
case msg := <-worker2GPU.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedWorker = "2gpu"
|
|
}
|
|
case msg := <-worker8GPU.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedWorker = "8gpu"
|
|
}
|
|
default:
|
|
// No message yet, signal ready again
|
|
worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
if assignedWorker == "" {
|
|
t.Fatal("timeout waiting for job assignment")
|
|
}
|
|
|
|
assert.Equal(t, "8gpu", assignedWorker, "4-GPU job should go to 8-GPU worker")
|
|
}
|
|
|
|
// TestCapabilityRoutingE2E_BackendMismatch validates backend requirements are enforced
|
|
func TestCapabilityRoutingE2E_BackendMismatch(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create Metal worker (macOS GPU)
|
|
metalWorker := fixture.CreateWorker("e2e-metal", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendMetal,
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Create NVIDIA worker
|
|
nvidiaWorker := fixture.CreateWorker("e2e-nvidia", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Submit job requiring NVIDIA
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-nvidia-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 2,
|
|
GPUBackend: "nvidia",
|
|
})
|
|
|
|
// Both workers signal ready
|
|
metalWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
nvidiaWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// NVIDIA worker should get the job - poll with retries
|
|
var msg scheduler.Message
|
|
deadline := time.Now().Add(2 * time.Second)
|
|
for time.Now().Before(deadline) && msg.Type != scheduler.MsgJobAssign {
|
|
select {
|
|
case m := <-nvidiaWorker.RecvCh:
|
|
msg = m
|
|
default:
|
|
// No message yet, signal ready again
|
|
metalWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
nvidiaWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
}
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type, "NVIDIA worker should get NVIDIA job")
|
|
|
|
// Metal worker should receive NoWork (not job_assign) - poll to verify
|
|
var metalMsg scheduler.Message
|
|
metalDeadline := time.Now().Add(500 * time.Millisecond)
|
|
for time.Now().Before(metalDeadline) {
|
|
select {
|
|
case m := <-metalWorker.RecvCh:
|
|
metalMsg = m
|
|
default:
|
|
metalWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
if metalMsg.Type == scheduler.MsgNoWork || metalMsg.Type == scheduler.MsgJobAssign {
|
|
break
|
|
}
|
|
}
|
|
|
|
// Metal worker should get NoWork, never job_assign
|
|
assert.NotEqual(t, scheduler.MsgJobAssign, metalMsg.Type, "Metal worker should NOT receive NVIDIA job")
|
|
}
|
|
|
|
// TestCapabilityRoutingE2E_VRAMFiltering validates VRAM requirements filtering
|
|
func TestCapabilityRoutingE2E_VRAMFiltering(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Worker with 8GB VRAM
|
|
worker8GB := fixture.CreateWorker("e2e-8gb-vram", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 8.0,
|
|
})
|
|
|
|
// Worker with 24GB VRAM
|
|
worker24GB := fixture.CreateWorker("e2e-24gb-vram", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 24.0,
|
|
})
|
|
|
|
// Submit job needing 16GB VRAM
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-vram-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 1,
|
|
MinVRAMGB: 16.0,
|
|
})
|
|
|
|
worker8GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker24GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Should go to 24GB worker - poll with retries since scheduler may need time
|
|
var assignedWorker string
|
|
deadline := time.Now().Add(2 * time.Second)
|
|
for time.Now().Before(deadline) && assignedWorker == "" {
|
|
select {
|
|
case msg := <-worker8GB.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedWorker = "8gb"
|
|
}
|
|
case msg := <-worker24GB.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedWorker = "24gb"
|
|
}
|
|
default:
|
|
// No message yet, signal ready again to trigger assignment
|
|
worker8GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker24GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
if assignedWorker == "" {
|
|
t.Fatal("timeout waiting for job assignment")
|
|
}
|
|
|
|
assert.Equal(t, "24gb", assignedWorker, "16GB VRAM job should go to 24GB worker")
|
|
}
|
|
|
|
// TestCapabilityRoutingE2E_GangAllocation validates multi-node jobs across mixed workers
|
|
func TestCapabilityRoutingE2E_GangAllocation(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create workers with different capabilities
|
|
workers := make([]*fixtures.MockWorker, 3)
|
|
workerIDs := []string{"gang-worker-1", "gang-worker-2", "gang-worker-3"}
|
|
|
|
for i, id := range workerIDs {
|
|
workers[i] = fixture.CreateWorker(id, scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 16.0,
|
|
})
|
|
}
|
|
|
|
// Submit multi-node job needing 3 nodes
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-gang-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
NodeCount: 3,
|
|
GPUCount: 1,
|
|
GPUBackend: "nvidia",
|
|
Command: []string{"torchrun", "--nproc_per_node=3", "train.py"},
|
|
})
|
|
|
|
// Workers signal ready after job submission
|
|
for _, worker := range workers {
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
}
|
|
|
|
// All three workers should receive the job assignment
|
|
assignedCount := 0
|
|
deadline := time.After(3 * time.Second)
|
|
|
|
for _, worker := range workers {
|
|
select {
|
|
case msg := <-worker.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedCount++
|
|
}
|
|
case <-deadline:
|
|
// Timeout - continue to next worker
|
|
}
|
|
}
|
|
|
|
// Gang allocation may assign one at a time; verify at least one gets assigned
|
|
assert.GreaterOrEqual(t, assignedCount, 1, "at least one worker should be assigned for gang job")
|
|
}
|
|
|
|
// TestCapabilityRoutingE2E_NoSuitableWorker validates job waits when no worker matches
|
|
func TestCapabilityRoutingE2E_NoSuitableWorker(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create only CPU workers
|
|
cpuWorker := fixture.CreateWorker("e2e-cpu-only", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
})
|
|
|
|
// Submit GPU job first
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "e2e-waiting-gpu-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// CPU worker signals ready after job submission
|
|
cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Wait a moment for any potential assignment
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// CPU worker should receive NoWork (not job_assign) - poll to verify
|
|
var cpuMsg scheduler.Message
|
|
cpuDeadline := time.Now().Add(500 * time.Millisecond)
|
|
for time.Now().Before(cpuDeadline) {
|
|
select {
|
|
case m := <-cpuWorker.RecvCh:
|
|
cpuMsg = m
|
|
default:
|
|
cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
if cpuMsg.Type == scheduler.MsgNoWork || cpuMsg.Type == scheduler.MsgJobAssign {
|
|
break
|
|
}
|
|
}
|
|
|
|
// CPU worker should get NoWork, never job_assign for GPU job
|
|
assert.NotEqual(t, scheduler.MsgJobAssign, cpuMsg.Type, "CPU worker should NOT receive GPU job")
|
|
|
|
// Job should be in queue
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
queueDepth := metrics["queue_depth_batch"].(int)
|
|
assert.GreaterOrEqual(t, queueDepth, 1, "GPU job should be queued waiting for GPU worker")
|
|
}
|