package tests

import (
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestCapabilityRoutingE2E_MultiWorkerScenario validates multi-worker capability routing:
// a GPU-bound training job and a CPU-only data job must each land on the matching worker.
func TestCapabilityRoutingE2E_MultiWorkerScenario(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// One NVIDIA GPU worker and one CPU-only worker.
	gpuWorker := fixture.CreateWorker("e2e-gpu-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   4,
		VRAMGB:     24.0,
		CPUCount:   8,
	})
	cpuWorker := fixture.CreateWorker("e2e-cpu-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
		CPUCount:   16,
	})

	// Training job: needs NVIDIA GPUs with at least 16GB VRAM.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:         "e2e-training-job",
		Type:       scheduler.JobTypeBatch,
		SlotPool:   "batch",
		JobTier:    scheduler.TierTraining,
		GPUCount:   2,
		GPUBackend: "nvidia",
		MinVRAMGB:  16.0,
		Command:    []string{"python", "train.py"},
	})
	// Data-processing job: CPU only.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "e2e-data-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
		JobTier:  scheduler.TierDataProcessing,
		GPUCount: 0,
		Command:  []string{"python", "preprocess.py"},
	})

	// Readiness signals from both workers trigger job assignment.
	gpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// Each worker should be handed the job matching its hardware.
	trainMsg := gpuWorker.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, trainMsg.Type, "GPU worker should receive training job")
	dataMsg := cpuWorker.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, dataMsg.Type, "CPU worker should receive data job")
}

// TestCapabilityRoutingE2E_GPUSelection validates job lands on correct GPU worker:
// a 4-GPU job must be routed to the worker that actually has enough GPUs.
func TestCapabilityRoutingE2E_GPUSelection(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// One worker with 2 GPUs, one with 8.
	worker2GPU := fixture.CreateWorker("e2e-2gpu", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   2,
		VRAMGB:     16.0,
	})
	worker8GPU := fixture.CreateWorker("e2e-8gpu", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   8,
		VRAMGB:     48.0,
	})

	// A job that only the 8-GPU worker can satisfy.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "e2e-4gpu-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
		GPUCount: 4,
	})

	worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// Poll both workers until one is assigned or the deadline passes.
	var winner string
	deadline := time.Now().Add(2 * time.Second)
	for winner == "" && time.Now().Before(deadline) {
		select {
		case msg := <-worker2GPU.RecvCh:
			if msg.Type == scheduler.MsgJobAssign {
				winner = "2gpu"
			}
		case msg := <-worker8GPU.RecvCh:
			if msg.Type == scheduler.MsgJobAssign {
				winner = "8gpu"
			}
		default:
			// Nothing yet: nudge the scheduler with fresh ready signals.
			worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
			worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
			time.Sleep(100 * time.Millisecond)
		}
	}
	if winner == "" {
		t.Fatal("timeout waiting for job assignment")
	}
	assert.Equal(t, "8gpu", winner, "4-GPU job should go to 8-GPU worker")
}

// TestCapabilityRoutingE2E_BackendMismatch validates backend requirements are enforced:
// a job pinned to the NVIDIA backend must never be assigned to a Metal worker.
func TestCapabilityRoutingE2E_BackendMismatch(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// One Metal (macOS GPU) worker and one NVIDIA worker.
	metalWorker := fixture.CreateWorker("e2e-metal", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendMetal,
		GPUCount:   4,
	})
	nvidiaWorker := fixture.CreateWorker("e2e-nvidia", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   4,
	})

	// The job explicitly requires the NVIDIA backend.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:         "e2e-nvidia-job",
		Type:       scheduler.JobTypeBatch,
		SlotPool:   "batch",
		GPUCount:   2,
		GPUBackend: "nvidia",
	})

	metalWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	nvidiaWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// Poll until the NVIDIA worker receives the assignment.
	var nvidiaMsg scheduler.Message
	nvidiaDeadline := time.Now().Add(2 * time.Second)
	for time.Now().Before(nvidiaDeadline) && nvidiaMsg.Type != scheduler.MsgJobAssign {
		select {
		case m := <-nvidiaWorker.RecvCh:
			nvidiaMsg = m
		default:
			// Nothing yet: re-signal readiness and retry shortly.
			metalWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
			nvidiaWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
			time.Sleep(50 * time.Millisecond)
		}
	}
	require.Equal(t, scheduler.MsgJobAssign, nvidiaMsg.Type, "NVIDIA worker should get NVIDIA job")

	// The Metal worker must only ever see NoWork, never job_assign.
	var metalMsg scheduler.Message
	metalDeadline := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(metalDeadline) {
		select {
		case m := <-metalWorker.RecvCh:
			metalMsg = m
		default:
			metalWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
			time.Sleep(50 * time.Millisecond)
		}
		if metalMsg.Type == scheduler.MsgNoWork || metalMsg.Type == scheduler.MsgJobAssign {
			break
		}
	}
	assert.NotEqual(t, scheduler.MsgJobAssign, metalMsg.Type, "Metal worker should NOT receive NVIDIA job")
}

// TestCapabilityRoutingE2E_VRAMFiltering validates VRAM requirements filtering
func TestCapabilityRoutingE2E_VRAMFiltering(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() // Worker with 8GB VRAM worker8GB := fixture.CreateWorker("e2e-8gb-vram", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 2, VRAMGB: 8.0, }) // Worker with 24GB VRAM worker24GB := fixture.CreateWorker("e2e-24gb-vram", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 2, VRAMGB: 24.0, }) // Submit job needing 16GB VRAM fixture.SubmitJob(scheduler.JobSpec{ ID: "e2e-vram-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", GPUCount: 1, MinVRAMGB: 16.0, }) worker8GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") worker24GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") // Should go to 24GB worker - poll with retries since scheduler may need time var assignedWorker string deadline := time.Now().Add(2 * time.Second) for time.Now().Before(deadline) && assignedWorker == "" { select { case msg := <-worker8GB.RecvCh: if msg.Type == scheduler.MsgJobAssign { assignedWorker = "8gb" } case msg := <-worker24GB.RecvCh: if msg.Type == scheduler.MsgJobAssign { assignedWorker = "24gb" } default: // No message yet, signal ready again to trigger assignment worker8GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") worker24GB.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") time.Sleep(100 * time.Millisecond) } } if assignedWorker == "" { t.Fatal("timeout waiting for job assignment") } assert.Equal(t, "24gb", assignedWorker, "16GB VRAM job should go to 24GB worker") } // TestCapabilityRoutingE2E_GangAllocation validates multi-node jobs across mixed workers func TestCapabilityRoutingE2E_GangAllocation(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() // Create workers with different capabilities workers 
:= make([]*fixtures.MockWorker, 3) workerIDs := []string{"gang-worker-1", "gang-worker-2", "gang-worker-3"} for i, id := range workerIDs { workers[i] = fixture.CreateWorker(id, scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 2, VRAMGB: 16.0, }) } // Submit multi-node job needing 3 nodes fixture.SubmitJob(scheduler.JobSpec{ ID: "e2e-gang-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", NodeCount: 3, GPUCount: 1, GPUBackend: "nvidia", Command: []string{"torchrun", "--nproc_per_node=3", "train.py"}, }) // Workers signal ready after job submission for _, worker := range workers { worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") } // All three workers should receive the job assignment assignedCount := 0 deadline := time.After(3 * time.Second) for _, worker := range workers { select { case msg := <-worker.RecvCh: if msg.Type == scheduler.MsgJobAssign { assignedCount++ } case <-deadline: // Timeout - continue to next worker } } // Gang allocation may assign one at a time; verify at least one gets assigned assert.GreaterOrEqual(t, assignedCount, 1, "at least one worker should be assigned for gang job") } // TestCapabilityRoutingE2E_NoSuitableWorker validates job waits when no worker matches func TestCapabilityRoutingE2E_NoSuitableWorker(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() // Create only CPU workers cpuWorker := fixture.CreateWorker("e2e-cpu-only", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendCPU, GPUCount: 0, }) // Submit GPU job first fixture.SubmitJob(scheduler.JobSpec{ ID: "e2e-waiting-gpu-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", GPUCount: 4, }) // CPU worker signals ready after job submission cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") // Wait a moment for any potential assignment time.Sleep(100 * time.Millisecond) // CPU worker should receive NoWork (not 
job_assign) - poll to verify var cpuMsg scheduler.Message cpuDeadline := time.Now().Add(500 * time.Millisecond) for time.Now().Before(cpuDeadline) { select { case m := <-cpuWorker.RecvCh: cpuMsg = m default: cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") time.Sleep(50 * time.Millisecond) } if cpuMsg.Type == scheduler.MsgNoWork || cpuMsg.Type == scheduler.MsgJobAssign { break } } // CPU worker should get NoWork, never job_assign for GPU job assert.NotEqual(t, scheduler.MsgJobAssign, cpuMsg.Type, "CPU worker should NOT receive GPU job") // Job should be in queue metrics := fixture.Hub.GetMetricsPayload() queueDepth := metrics["queue_depth_batch"].(int) assert.GreaterOrEqual(t, queueDepth, 1, "GPU job should be queued waiting for GPU worker") }