fetch_ml/tests/e2e/capability_routing_e2e_test.go
Jeremie Fraeys 57787e1e7b
feat(scheduler): implement capability-based routing and hub v2
Add comprehensive capability routing system to scheduler hub:
- Capability-aware worker matching with requirement/offer negotiation
- Hub v2 protocol with structured message types and heartbeat management
- Worker capability advertisement and dynamic routing decisions
- Orphan recovery for disconnected workers with state reconciliation
- Template-based job scheduling with capability constraints

Add extensive test coverage:
- Unit tests for capability routing logic and heartbeat mechanics
- Unit tests for orphan recovery scenarios
- E2E tests for capability routing across multiple workers
- Hub capabilities integration tests
- Scheduler fixture helpers for test setup

Protocol improvements:
- Define structured protocol messages for hub-worker communication
- Add capability matching algorithm with scoring
- Implement graceful worker disconnection handling
2026-03-12 12:00:05 -04:00

352 lines
11 KiB
Go

package tests
import (
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/scheduler"
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestCapabilityRoutingE2E_MultiWorkerScenario validates multi-worker capability routing
func TestCapabilityRoutingE2E_MultiWorkerScenario(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// One NVIDIA-equipped worker and one CPU-only worker, so the hub has a
	// clear capability split to route against.
	gpuWorker := fixture.CreateWorker("e2e-gpu-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   4,
		VRAMGB:     24.0,
		CPUCount:   8,
	})
	cpuWorker := fixture.CreateWorker("e2e-cpu-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
		CPUCount:   16,
	})

	// A training job that requires NVIDIA GPUs with a VRAM floor.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:         "e2e-training-job",
		Type:       scheduler.JobTypeBatch,
		SlotPool:   "batch",
		JobTier:    scheduler.TierTraining,
		GPUCount:   2,
		GPUBackend: "nvidia",
		MinVRAMGB:  16.0,
		Command:    []string{"python", "train.py"},
	})

	// A data-processing job with no GPU requirement.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "e2e-data-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
		JobTier:  scheduler.TierDataProcessing,
		GPUCount: 0,
		Command:  []string{"python", "preprocess.py"},
	})

	// Both workers advertise free batch capacity, which triggers assignment.
	freeSlots := scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}
	gpuWorker.SignalReady(freeSlots, "polling")
	cpuWorker.SignalReady(freeSlots, "polling")

	// Each worker should be handed a job compatible with its capabilities.
	gpuMsg := gpuWorker.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, gpuMsg.Type, "GPU worker should receive training job")

	cpuMsg := cpuWorker.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, cpuMsg.Type, "CPU worker should receive data job")
}
// TestCapabilityRoutingE2E_GPUSelection validates job lands on correct GPU worker
func TestCapabilityRoutingE2E_GPUSelection(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// Two NVIDIA workers that differ only in GPU count and VRAM.
	worker2GPU := fixture.CreateWorker("e2e-2gpu", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   2,
		VRAMGB:     16.0,
	})
	worker8GPU := fixture.CreateWorker("e2e-8gpu", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   8,
		VRAMGB:     48.0,
	})

	// Job requests 4 GPUs — more than the smaller worker can provide.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "e2e-4gpu-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
		GPUCount: 4,
	})

	freeSlots := scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}
	worker2GPU.SignalReady(freeSlots, "polling")
	worker8GPU.SignalReady(freeSlots, "polling")

	// Record which worker receives the assignment first.
	var assignedWorker string
	markIfAssigned := func(msg scheduler.Message, label string) {
		if msg.Type == scheduler.MsgJobAssign {
			assignedWorker = label
		}
	}

	// Poll until an assignment arrives or the 2-second budget runs out,
	// re-signaling readiness on every empty pass to nudge the scheduler.
	stopAt := time.Now().Add(2 * time.Second)
	for assignedWorker == "" && time.Now().Before(stopAt) {
		select {
		case msg := <-worker2GPU.RecvCh:
			markIfAssigned(msg, "2gpu")
		case msg := <-worker8GPU.RecvCh:
			markIfAssigned(msg, "8gpu")
		default:
			worker2GPU.SignalReady(freeSlots, "polling")
			worker8GPU.SignalReady(freeSlots, "polling")
			time.Sleep(100 * time.Millisecond)
		}
	}

	if assignedWorker == "" {
		t.Fatal("timeout waiting for job assignment")
	}
	assert.Equal(t, "8gpu", assignedWorker, "4-GPU job should go to 8-GPU worker")
}
// TestCapabilityRoutingE2E_BackendMismatch validates backend requirements are enforced
func TestCapabilityRoutingE2E_BackendMismatch(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// Same GPU count, different backends — only the backend should matter.
	metalWorker := fixture.CreateWorker("e2e-metal", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendMetal,
		GPUCount:   4,
	})
	nvidiaWorker := fixture.CreateWorker("e2e-nvidia", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   4,
	})

	// The job pins the NVIDIA backend explicitly.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:         "e2e-nvidia-job",
		Type:       scheduler.JobTypeBatch,
		SlotPool:   "batch",
		GPUCount:   2,
		GPUBackend: "nvidia",
	})

	freeSlots := scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}
	metalWorker.SignalReady(freeSlots, "polling")
	nvidiaWorker.SignalReady(freeSlots, "polling")

	// Poll the NVIDIA worker until it reports an assignment (or 2s elapse),
	// nudging both workers with fresh ready signals on each empty pass.
	var nvidiaMsg scheduler.Message
	waitUntil := time.Now().Add(2 * time.Second)
	for nvidiaMsg.Type != scheduler.MsgJobAssign && time.Now().Before(waitUntil) {
		select {
		case received := <-nvidiaWorker.RecvCh:
			nvidiaMsg = received
		default:
			metalWorker.SignalReady(freeSlots, "polling")
			nvidiaWorker.SignalReady(freeSlots, "polling")
			time.Sleep(50 * time.Millisecond)
		}
	}
	require.Equal(t, scheduler.MsgJobAssign, nvidiaMsg.Type, "NVIDIA worker should get NVIDIA job")

	// Give the Metal worker a short window to receive something; stop as
	// soon as it reports either no-work or (incorrectly) an assignment.
	var metalMsg scheduler.Message
	metalWaitUntil := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(metalWaitUntil) {
		select {
		case received := <-metalWorker.RecvCh:
			metalMsg = received
		default:
			metalWorker.SignalReady(freeSlots, "polling")
			time.Sleep(50 * time.Millisecond)
		}
		terminal := metalMsg.Type == scheduler.MsgNoWork || metalMsg.Type == scheduler.MsgJobAssign
		if terminal {
			break
		}
	}

	// The Metal worker must never be handed the NVIDIA-pinned job.
	assert.NotEqual(t, scheduler.MsgJobAssign, metalMsg.Type, "Metal worker should NOT receive NVIDIA job")
}
// TestCapabilityRoutingE2E_VRAMFiltering validates VRAM requirements filtering
func TestCapabilityRoutingE2E_VRAMFiltering(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// Identical workers except for VRAM: 8GB vs 24GB.
	worker8GB := fixture.CreateWorker("e2e-8gb-vram", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   2,
		VRAMGB:     8.0,
	})
	worker24GB := fixture.CreateWorker("e2e-24gb-vram", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   2,
		VRAMGB:     24.0,
	})

	// The 16GB floor should exclude the 8GB worker.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:        "e2e-vram-job",
		Type:      scheduler.JobTypeBatch,
		SlotPool:  "batch",
		GPUCount:  1,
		MinVRAMGB: 16.0,
	})

	freeSlots := scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}
	worker8GB.SignalReady(freeSlots, "polling")
	worker24GB.SignalReady(freeSlots, "polling")

	// Poll until one worker is assigned or the 2-second budget expires,
	// re-signaling readiness on empty passes since the scheduler may lag.
	var winner string
	pollDeadline := time.Now().Add(2 * time.Second)
	for winner == "" && time.Now().Before(pollDeadline) {
		select {
		case msg := <-worker8GB.RecvCh:
			if msg.Type == scheduler.MsgJobAssign {
				winner = "8gb"
			}
		case msg := <-worker24GB.RecvCh:
			if msg.Type == scheduler.MsgJobAssign {
				winner = "24gb"
			}
		default:
			worker8GB.SignalReady(freeSlots, "polling")
			worker24GB.SignalReady(freeSlots, "polling")
			time.Sleep(100 * time.Millisecond)
		}
	}

	if winner == "" {
		t.Fatal("timeout waiting for job assignment")
	}
	assert.Equal(t, "24gb", winner, "16GB VRAM job should go to 24GB worker")
}
// TestCapabilityRoutingE2E_GangAllocation validates multi-node jobs across mixed workers
func TestCapabilityRoutingE2E_GangAllocation(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// Three identical NVIDIA workers so the gang job can span all of them.
	workerIDs := []string{"gang-worker-1", "gang-worker-2", "gang-worker-3"}
	workers := make([]*fixtures.MockWorker, len(workerIDs))
	for i, id := range workerIDs {
		workers[i] = fixture.CreateWorker(id, scheduler.WorkerCapabilities{
			GPUBackend: scheduler.BackendNVIDIA,
			GPUCount:   2,
			VRAMGB:     16.0,
		})
	}

	// Submit multi-node job needing 3 nodes
	fixture.SubmitJob(scheduler.JobSpec{
		ID:         "e2e-gang-job",
		Type:       scheduler.JobTypeBatch,
		SlotPool:   "batch",
		NodeCount:  3,
		GPUCount:   1,
		GPUBackend: "nvidia",
		Command:    []string{"torchrun", "--nproc_per_node=3", "train.py"},
	})

	// Workers signal ready after job submission
	for _, worker := range workers {
		worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	}

	// Count assignments within a shared 3-second budget.
	//
	// BUG FIX: the previous version created one time.After channel and
	// received from it inside the per-worker loop. That channel delivers
	// exactly one value; once consumed by an earlier iteration, a later
	// timeout receive would block forever and hang the test. Instead, give
	// each worker a fresh timer sized to the remaining shared budget
	// (a non-positive remainder fires the timer immediately).
	deadline := time.Now().Add(3 * time.Second)
	assignedCount := 0
	for _, worker := range workers {
		timer := time.NewTimer(time.Until(deadline))
		select {
		case msg := <-worker.RecvCh:
			if msg.Type == scheduler.MsgJobAssign {
				assignedCount++
			}
		case <-timer.C:
			// Budget exhausted for this worker — continue to the next.
		}
		timer.Stop()
	}

	// Gang allocation may assign one at a time; verify at least one gets assigned
	assert.GreaterOrEqual(t, assignedCount, 1, "at least one worker should be assigned for gang job")
}
// TestCapabilityRoutingE2E_NoSuitableWorker validates job waits when no worker matches
func TestCapabilityRoutingE2E_NoSuitableWorker(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// Only a CPU worker is registered, so the GPU job below has no match.
	cpuWorker := fixture.CreateWorker("e2e-cpu-only", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
	})

	// Submit GPU job first
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "e2e-waiting-gpu-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
		GPUCount: 4,
	})

	// CPU worker signals ready after job submission
	cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// Wait a moment for any potential assignment
	time.Sleep(100 * time.Millisecond)

	// Poll the CPU worker briefly; stop once it reports either no-work or
	// (incorrectly) a job assignment. Empty passes re-signal readiness.
	var cpuMsg scheduler.Message
	cpuDeadline := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(cpuDeadline) {
		select {
		case m := <-cpuWorker.RecvCh:
			cpuMsg = m
		default:
			cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
			time.Sleep(50 * time.Millisecond)
		}
		if cpuMsg.Type == scheduler.MsgNoWork || cpuMsg.Type == scheduler.MsgJobAssign {
			break
		}
	}

	// CPU worker should get NoWork, never job_assign for GPU job
	assert.NotEqual(t, scheduler.MsgJobAssign, cpuMsg.Type, "CPU worker should NOT receive GPU job")

	// The unmatched job must remain queued. Use a checked type assertion so
	// a missing or differently-typed metric fails the test with a clear
	// message instead of panicking mid-test.
	metrics := fixture.Hub.GetMetricsPayload()
	queueDepth, ok := metrics["queue_depth_batch"].(int)
	require.True(t, ok, "queue_depth_batch metric should be present and an int")
	assert.GreaterOrEqual(t, queueDepth, 1, "GPU job should be queued waiting for GPU worker")
}