fetch_ml/tests/unit/scheduler/heartbeat_test.go
Jeremie Fraeys 57787e1e7b
feat(scheduler): implement capability-based routing and hub v2
Add comprehensive capability routing system to scheduler hub:
- Capability-aware worker matching with requirement/offer negotiation
- Hub v2 protocol with structured message types and heartbeat management
- Worker capability advertisement and dynamic routing decisions
- Orphan recovery for disconnected workers with state reconciliation
- Template-based job scheduling with capability constraints

Add extensive test coverage:
- Unit tests for capability routing logic and heartbeat mechanics
- Unit tests for orphan recovery scenarios
- E2E tests for capability routing across multiple workers
- Hub capabilities integration tests
- Scheduler fixture helpers for test setup

Protocol improvements:
- Define structured protocol messages for hub-worker communication
- Add capability matching algorithm with scoring
- Implement graceful worker disconnection handling
2026-03-12 12:00:05 -04:00

257 lines
8.9 KiB
Go

package scheduler_test
import (
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/scheduler"
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestHeartbeat_SlotStatusSynchronization validates that slot usage reported
// via heartbeat is reflected in the hub's metrics payload.
func TestHeartbeat_SlotStatusSynchronization(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	worker := fixture.CreateWorker("slot-sync-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
		CPUCount:   8,
	})

	// Submit a CPU batch job the worker can run.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "slot-sync-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
		GPUCount: 0,
	})

	// Signal ready to trigger assignment.
	worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// Worker should receive the job.
	msg := worker.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, msg.Type, "worker should receive job")

	// Accept the job, then report via heartbeat that one slot is now in use.
	worker.AcceptJob("slot-sync-job")
	worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})

	// Give the hub time to process the heartbeat.
	time.Sleep(100 * time.Millisecond)

	// Verify the metrics reflect the updated slot status. Previously only
	// BatchTotal was asserted, which never changes and therefore did not
	// validate the heartbeat update at all; BatchInUse is the value the
	// heartbeat actually changed.
	metrics := fixture.Hub.GetMetricsPayload()
	slotData, ok := metrics["worker_slots"].(map[string]scheduler.SlotStatus)
	if ok {
		status := slotData["slot-sync-worker"]
		assert.Equal(t, 4, status.BatchTotal, "total slots should remain 4")
		assert.Equal(t, 1, status.BatchInUse, "heartbeat should update in-use slot count")
	}
}
// TestHeartbeat_LivenessDetection validates that the hub detects a dead worker.
// Detection happens through connection close, not a heartbeat timeout, so the
// default hub configuration is used (the previous short AcceptanceTimeoutSecs
// override was unrelated to liveness detection and only misled readers).
func TestHeartbeat_LivenessDetection(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	worker := fixture.CreateWorker("liveness-test-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
		CPUCount:   4,
	})

	// Register and send initial ready.
	worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// Verify the worker is counted as connected.
	metrics := fixture.Hub.GetMetricsPayload()
	connectedWorkers := metrics["workers_connected"].(int)
	assert.GreaterOrEqual(t, connectedWorkers, 1, "worker should be connected")

	// Close the connection without a graceful disconnect (simulates death).
	worker.Close()

	// Give the hub time to observe the closed connection.
	time.Sleep(500 * time.Millisecond)

	// The connected-worker count must have dropped.
	metricsAfter := fixture.Hub.GetMetricsPayload()
	connectedAfter := metricsAfter["workers_connected"].(int)
	assert.Less(t, connectedAfter, connectedWorkers, "worker should be disconnected after close")
}
// TestHeartbeat_AckResponse validates that sending a heartbeat leaves the
// connection usable: a subsequent ready signal still receives a reply.
func TestHeartbeat_AckResponse(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	worker := fixture.CreateWorker("hb-ack-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
	})
	worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// A heartbeat does not produce a direct response in the current
	// implementation; instead we verify the connection remains active.
	worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0})
	time.Sleep(100 * time.Millisecond)

	// One ready signal after the heartbeat is enough to prove bidirectional
	// communication still works (the original test sent two redundant ready
	// signals with duplicated comments).
	worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_ack_test")
	msg := worker.RecvTimeout(500 * time.Millisecond)

	// Should get NoWork since no jobs are queued.
	assert.Equal(t, scheduler.MsgNoWork, msg.Type, "heartbeat should maintain connection - worker should respond to ready signal")
}
// TestHeartbeat_RegistrationWithCapabilities validates that capabilities
// advertised at registration time are honored when routing jobs.
func TestHeartbeat_RegistrationWithCapabilities(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	// Registration happens inside CreateWorker; advertise a well-provisioned
	// NVIDIA GPU node.
	gpuWorker := fixture.CreateWorker("reg-caps-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   8,
		VRAMGB:     48.0,
		CPUCount:   16,
		MemoryGB:   64.0,
		Hostname:   "test-gpu-node-01",
	})

	// Queue a GPU job whose requirements fit within the advertised capacity.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:         "reg-caps-job",
		GPUCount:   4,
		GPUBackend: "nvidia",
		MinVRAMGB:  32.0,
	})

	gpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")

	// The job must be routed to this worker, since it satisfies every
	// requirement in the spec.
	reply := gpuWorker.RecvTimeout(2 * time.Second)
	assert.Equal(t, scheduler.MsgJobAssign, reply.Type, "registered worker with capabilities should receive GPU job")
}
// TestHeartbeat_DuringActiveJob validates that heartbeats sent while a job is
// executing do not disturb the connection or the job lifecycle.
func TestHeartbeat_DuringActiveJob(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	w := fixture.CreateWorker("hb-active-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
	})

	// Queue a batch job and pick it up.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "hb-active-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
	})
	w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	assign := w.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, assign.Type)
	w.AcceptJob("hb-active-job")

	// Stream several heartbeats while the job is "running".
	for beat := 0; beat < 3; beat++ {
		w.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
		time.Sleep(50 * time.Millisecond)
	}

	// Finish the job, then prove the worker is still fully functional by
	// queuing and receiving a second job.
	w.CompleteJob("hb-active-job", 0, "completed successfully")
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "post-hb-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
	})
	w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	followUp := w.RecvTimeout(2 * time.Second)
	assert.Equal(t, scheduler.MsgJobAssign, followUp.Type, "worker should receive new job after heartbeats during active job")
}
// TestHeartbeat_SlotDeallocationOnDisconnect validates that worker state stays
// consistent when a worker dies mid-job: the in-flight task lands in a valid
// state after orphan reconciliation, and new jobs can still be queued.
func TestHeartbeat_SlotDeallocationOnDisconnect(t *testing.T) {
	fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	w := fixture.CreateWorker("slot-dealloc-worker", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendCPU,
		GPUCount:   0,
		CPUCount:   8,
	})

	// Get a job assigned and accepted.
	fixture.SubmitJob(scheduler.JobSpec{
		ID:       "slot-dealloc-job",
		Type:     scheduler.JobTypeBatch,
		SlotPool: "batch",
	})
	w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
	assign := w.RecvTimeout(2 * time.Second)
	require.Equal(t, scheduler.MsgJobAssign, assign.Type)
	w.AcceptJob("slot-dealloc-job")

	// Report the slot as occupied, then let the hub process the heartbeat.
	w.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
	time.Sleep(100 * time.Millisecond)

	// Kill the connection (simulates worker death) and give the hub time to
	// notice, then force an orphan-reconciliation pass.
	w.Close()
	time.Sleep(500 * time.Millisecond)
	fixture.Hub.TriggerReconcileOrphans()

	// Depending on timing, the task may still be running, already orphaned,
	// or re-queued; any of those is a consistent state.
	task := fixture.Hub.GetTask("slot-dealloc-job")
	if task != nil {
		assert.True(t, task.Status == "running" || task.Status == "orphaned" || task.Status == "queued",
			"task should be in valid state at grace period boundary, got: %s", task.Status)
	}

	// A new job should remain queueable even though the dead worker had a
	// slot "reserved"; a real scheduler would free that slot on disconnect.
	fixture.SubmitJob(scheduler.JobSpec{
		ID: "slot-dealloc-job-2",
	})

	// With no worker available, the job sits in the batch queue.
	metrics := fixture.Hub.GetMetricsPayload()
	queueDepth := metrics["queue_depth_batch"].(int)
	assert.GreaterOrEqual(t, queueDepth, 1, "job should be queued waiting for available worker")
}