package scheduler_test import ( "testing" "time" "github.com/jfraeys/fetch_ml/internal/scheduler" fixtures "github.com/jfraeys/fetch_ml/tests/fixtures" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) // TestHeartbeat_SlotStatusSynchronization validates slot updates via heartbeat func TestHeartbeat_SlotStatusSynchronization(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() worker := fixture.CreateWorker("slot-sync-worker", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendCPU, GPUCount: 0, CPUCount: 8, }) // Submit a job fixture.SubmitJob(scheduler.JobSpec{ ID: "slot-sync-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", GPUCount: 0, }) // Signal ready to trigger assignment worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") // Worker should receive the job msg := worker.RecvTimeout(2 * time.Second) require.Equal(t, scheduler.MsgJobAssign, msg.Type, "worker should receive job") // Accept the job worker.AcceptJob("slot-sync-job") // Send heartbeat showing slot is now in use worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}) // Give time for heartbeat to be processed time.Sleep(100 * time.Millisecond) // Verify metrics reflect updated slot status metrics := fixture.Hub.GetMetricsPayload() slotData, ok := metrics["worker_slots"].(map[string]scheduler.SlotStatus) if ok { status := slotData["slot-sync-worker"] assert.Equal(t, 4, status.BatchTotal, "total slots should remain 4") } } // TestHeartbeat_LivenessDetection validates worker disconnect on missed heartbeats func TestHeartbeat_LivenessDetection(t *testing.T) { // Use short heartbeat timeout for faster test cfg := fixtures.DefaultHubConfig() cfg.AcceptanceTimeoutSecs = 2 // Short timeout for test speed fixture := fixtures.NewSchedulerTestFixture(t, cfg) defer fixture.Cleanup() worker := fixture.CreateWorker("liveness-test-worker", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendCPU, GPUCount: 0, CPUCount: 4, }) // Register and send initial ready worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") // Verify worker is connected by checking metrics metrics := fixture.Hub.GetMetricsPayload() connectedWorkers := metrics["workers_connected"].(int) assert.GreaterOrEqual(t, connectedWorkers, 1, "worker should be connected") // Close worker connection without graceful disconnect (simulates death) worker.Close() // Wait for scheduler to detect disconnect // The detection happens through connection close, not heartbeat timeout time.Sleep(500 * time.Millisecond) // Verify worker is disconnected by checking metrics changed metricsAfter := fixture.Hub.GetMetricsPayload() connectedAfter := metricsAfter["workers_connected"].(int) assert.Less(t, connectedAfter, connectedWorkers, "worker should be disconnected after close") } // TestHeartbeat_AckResponse validates heartbeat acknowledgment func TestHeartbeat_AckResponse(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() worker := fixture.CreateWorker("hb-ack-worker", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendCPU, GPUCount: 0, }) worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") // Send heartbeat with capability update worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}) // Heartbeat itself doesn't produce a response in current implementation // but we verify the connection remains active time.Sleep(100 * time.Millisecond) // Verify we can still receive messages (connection is alive) // Send another ready signal to confirm bidirectional communication works worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_test") // If connection is dead, this would error // Verify by sending another ready signal - if connection dead, this would panic or error worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_ack_test") msg := worker.RecvTimeout(500 * time.Millisecond) // Should get NoWork since no jobs are queued assert.Equal(t, scheduler.MsgNoWork, msg.Type, "heartbeat should maintain connection - worker should respond to ready signal") } // TestHeartbeat_RegistrationWithCapabilities validates registration includes capabilities func TestHeartbeat_RegistrationWithCapabilities(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() caps := scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 8, VRAMGB: 48.0, CPUCount: 16, MemoryGB: 64.0, Hostname: "test-gpu-node-01", } worker := fixture.CreateWorker("reg-caps-worker", caps) // Registration happens during CreateWorker, verify by submitting GPU job fixture.SubmitJob(scheduler.JobSpec{ ID: "reg-caps-job", GPUCount: 4, GPUBackend: "nvidia", MinVRAMGB: 32.0, }) worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") // Should receive job because worker has required capabilities msg := worker.RecvTimeout(2 * time.Second) assert.Equal(t, scheduler.MsgJobAssign, msg.Type, "registered worker with capabilities should receive GPU job") } // TestHeartbeat_DuringActiveJob validates heartbeat works while job is running func TestHeartbeat_DuringActiveJob(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() worker := fixture.CreateWorker("hb-active-worker", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendCPU, GPUCount: 0, }) // Submit and receive job fixture.SubmitJob(scheduler.JobSpec{ ID: "hb-active-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", }) worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") msg := worker.RecvTimeout(2 * time.Second) require.Equal(t, scheduler.MsgJobAssign, msg.Type) // Accept the job worker.AcceptJob("hb-active-job") // Send multiple heartbeats while job is "running" for i := 0; i < 3; i++ { worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}) time.Sleep(50 * time.Millisecond) } // Complete the job worker.CompleteJob("hb-active-job", 0, "completed successfully") // Verify job completion was processed by checking worker can receive new jobs // Submit another job to verify worker is still functional fixture.SubmitJob(scheduler.JobSpec{ ID: "post-hb-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", }) worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") msg2 := worker.RecvTimeout(2 * time.Second) assert.Equal(t, scheduler.MsgJobAssign, msg2.Type, "worker should receive new job after heartbeats during active job") } // TestHeartbeat_SlotDeallocationOnDisconnect validates slots freed when worker dies func TestHeartbeat_SlotDeallocationOnDisconnect(t *testing.T) { fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig()) defer fixture.Cleanup() worker := fixture.CreateWorker("slot-dealloc-worker", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendCPU, GPUCount: 0, CPUCount: 8, }) // Assign a job to the worker fixture.SubmitJob(scheduler.JobSpec{ ID: "slot-dealloc-job", Type: scheduler.JobTypeBatch, SlotPool: "batch", }) worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling") msg := worker.RecvTimeout(2 * time.Second) require.Equal(t, scheduler.MsgJobAssign, msg.Type) worker.AcceptJob("slot-dealloc-job") // Verify slot is in use (via heartbeat) worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}) time.Sleep(100 * time.Millisecond) // Close connection (simulates worker death) worker.Close() // Wait for disconnect to be processed time.Sleep(500 * time.Millisecond) // Trigger orphan reconciliation at boundary fixture.Hub.TriggerReconcileOrphans() // At this exact moment, job should be at the boundary // Verify state is consistent task := fixture.Hub.GetTask("slot-dealloc-job") if task != nil { // Task may be orphaned or still running depending on exact timing assert.True(t, task.Status == "running" || task.Status == "orphaned" || task.Status == "queued", "task should be in valid state at grace period boundary, got: %s", task.Status) } // Submit another job - should be queueable even though previous worker had a slot "reserved" // In a real scenario, the scheduler would detect the disconnect and free the slot fixture.SubmitJob(scheduler.JobSpec{ ID: "slot-dealloc-job-2", }) // The job should be in the queue waiting for a new worker metrics := fixture.Hub.GetMetricsPayload() queueDepth := metrics["queue_depth_batch"].(int) assert.GreaterOrEqual(t, queueDepth, 1, "job should be queued waiting for available worker") }