// Heartbeat tests for the scheduler hub's capability routing system.
//
// Covers: capability-aware worker matching, hub v2 heartbeat mechanics,
// worker capability advertisement at registration, orphan recovery for
// disconnected workers, and slot accounting synchronized via heartbeats.
package scheduler_test
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestHeartbeat_SlotStatusSynchronization validates slot updates via heartbeat
|
|
func TestHeartbeat_SlotStatusSynchronization(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("slot-sync-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
})
|
|
|
|
// Submit a job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "slot-sync-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 0,
|
|
})
|
|
|
|
// Signal ready to trigger assignment
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Worker should receive the job
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type, "worker should receive job")
|
|
|
|
// Accept the job
|
|
worker.AcceptJob("slot-sync-job")
|
|
|
|
// Send heartbeat showing slot is now in use
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
|
|
|
|
// Give time for heartbeat to be processed
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Verify metrics reflect updated slot status
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
slotData, ok := metrics["worker_slots"].(map[string]scheduler.SlotStatus)
|
|
if ok {
|
|
status := slotData["slot-sync-worker"]
|
|
assert.Equal(t, 4, status.BatchTotal, "total slots should remain 4")
|
|
}
|
|
}
|
|
|
|
// TestHeartbeat_LivenessDetection validates worker disconnect on missed heartbeats
|
|
func TestHeartbeat_LivenessDetection(t *testing.T) {
|
|
// Use short heartbeat timeout for faster test
|
|
cfg := fixtures.DefaultHubConfig()
|
|
cfg.AcceptanceTimeoutSecs = 2 // Short timeout for test speed
|
|
|
|
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("liveness-test-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 4,
|
|
})
|
|
|
|
// Register and send initial ready
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Verify worker is connected by checking metrics
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
connectedWorkers := metrics["workers_connected"].(int)
|
|
assert.GreaterOrEqual(t, connectedWorkers, 1, "worker should be connected")
|
|
|
|
// Close worker connection without graceful disconnect (simulates death)
|
|
worker.Close()
|
|
|
|
// Wait for scheduler to detect disconnect
|
|
// The detection happens through connection close, not heartbeat timeout
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
// Verify worker is disconnected by checking metrics changed
|
|
metricsAfter := fixture.Hub.GetMetricsPayload()
|
|
connectedAfter := metricsAfter["workers_connected"].(int)
|
|
assert.Less(t, connectedAfter, connectedWorkers, "worker should be disconnected after close")
|
|
}
|
|
|
|
// TestHeartbeat_AckResponse validates heartbeat acknowledgment
|
|
func TestHeartbeat_AckResponse(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("hb-ack-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Send heartbeat with capability update
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0})
|
|
|
|
// Heartbeat itself doesn't produce a response in current implementation
|
|
// but we verify the connection remains active
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Verify we can still receive messages (connection is alive)
|
|
// Send another ready signal to confirm bidirectional communication works
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_test")
|
|
|
|
// If connection is dead, this would error
|
|
// Verify by sending another ready signal - if connection dead, this would panic or error
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_ack_test")
|
|
msg := worker.RecvTimeout(500 * time.Millisecond)
|
|
// Should get NoWork since no jobs are queued
|
|
assert.Equal(t, scheduler.MsgNoWork, msg.Type, "heartbeat should maintain connection - worker should respond to ready signal")
|
|
}
|
|
|
|
// TestHeartbeat_RegistrationWithCapabilities validates registration includes capabilities
|
|
func TestHeartbeat_RegistrationWithCapabilities(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
caps := scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 8,
|
|
VRAMGB: 48.0,
|
|
CPUCount: 16,
|
|
MemoryGB: 64.0,
|
|
Hostname: "test-gpu-node-01",
|
|
}
|
|
|
|
worker := fixture.CreateWorker("reg-caps-worker", caps)
|
|
|
|
// Registration happens during CreateWorker, verify by submitting GPU job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "reg-caps-job",
|
|
GPUCount: 4,
|
|
GPUBackend: "nvidia",
|
|
MinVRAMGB: 32.0,
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Should receive job because worker has required capabilities
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
assert.Equal(t, scheduler.MsgJobAssign, msg.Type, "registered worker with capabilities should receive GPU job")
|
|
}
|
|
|
|
// TestHeartbeat_DuringActiveJob validates heartbeat works while job is running
|
|
func TestHeartbeat_DuringActiveJob(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("hb-active-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
})
|
|
|
|
// Submit and receive job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "hb-active-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type)
|
|
|
|
// Accept the job
|
|
worker.AcceptJob("hb-active-job")
|
|
|
|
// Send multiple heartbeats while job is "running"
|
|
for i := 0; i < 3; i++ {
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
|
|
// Complete the job
|
|
worker.CompleteJob("hb-active-job", 0, "completed successfully")
|
|
|
|
// Verify job completion was processed by checking worker can receive new jobs
|
|
// Submit another job to verify worker is still functional
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "post-hb-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
})
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
msg2 := worker.RecvTimeout(2 * time.Second)
|
|
assert.Equal(t, scheduler.MsgJobAssign, msg2.Type, "worker should receive new job after heartbeats during active job")
|
|
}
|
|
|
|
// TestHeartbeat_SlotDeallocationOnDisconnect validates slots freed when worker dies
|
|
func TestHeartbeat_SlotDeallocationOnDisconnect(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("slot-dealloc-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
})
|
|
|
|
// Assign a job to the worker
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "slot-dealloc-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type)
|
|
worker.AcceptJob("slot-dealloc-job")
|
|
|
|
// Verify slot is in use (via heartbeat)
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Close connection (simulates worker death)
|
|
worker.Close()
|
|
|
|
// Wait for disconnect to be processed
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
// Trigger orphan reconciliation at boundary
|
|
fixture.Hub.TriggerReconcileOrphans()
|
|
|
|
// At this exact moment, job should be at the boundary
|
|
// Verify state is consistent
|
|
task := fixture.Hub.GetTask("slot-dealloc-job")
|
|
if task != nil {
|
|
// Task may be orphaned or still running depending on exact timing
|
|
assert.True(t, task.Status == "running" || task.Status == "orphaned" || task.Status == "queued",
|
|
"task should be in valid state at grace period boundary, got: %s", task.Status)
|
|
}
|
|
|
|
// Submit another job - should be queueable even though previous worker had a slot "reserved"
|
|
// In a real scenario, the scheduler would detect the disconnect and free the slot
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "slot-dealloc-job-2",
|
|
})
|
|
|
|
// The job should be in the queue waiting for a new worker
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
queueDepth := metrics["queue_depth_batch"].(int)
|
|
assert.GreaterOrEqual(t, queueDepth, 1, "job should be queued waiting for available worker")
|
|
}
|