// Heartbeat tests for the scheduler hub's capability routing system.
//
// Covers: capability-aware worker matching, hub v2 heartbeat mechanics,
// worker capability advertisement at registration, orphan recovery for
// disconnected workers, and slot accounting synchronized via heartbeats.
package scheduler_test
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestHeartbeat_SlotStatusSynchronization validates slot updates via heartbeat
|
|
func TestHeartbeat_SlotStatusSynchronization(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("slot-sync-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
})
|
|
|
|
// Submit a job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "slot-sync-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 0,
|
|
})
|
|
|
|
// Signal ready to trigger assignment
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Worker should receive the job
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type, "worker should receive job")
|
|
|
|
// Accept the job
|
|
worker.AcceptJob("slot-sync-job")
|
|
|
|
// Send heartbeat showing slot is now in use
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
|
|
|
|
// Give time for heartbeat to be processed
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Verify metrics reflect updated slot status
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
slotData, ok := metrics["worker_slots"].(map[string]scheduler.SlotStatus)
|
|
if ok {
|
|
status := slotData["slot-sync-worker"]
|
|
assert.Equal(t, 4, status.BatchTotal, "total slots should remain 4")
|
|
}
|
|
}
|
|
|
|
// TestHeartbeat_LivenessDetection validates worker disconnect on missed heartbeats
|
|
func TestHeartbeat_LivenessDetection(t *testing.T) {
|
|
// Use short heartbeat timeout for faster test
|
|
cfg := fixtures.DefaultHubConfig()
|
|
cfg.AcceptanceTimeoutSecs = 2 // Short timeout for test speed
|
|
|
|
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("liveness-test-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 4,
|
|
})
|
|
|
|
// Register and send initial ready
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Verify worker is connected by checking metrics
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
connectedWorkers := metrics["workers_connected"].(int)
|
|
assert.GreaterOrEqual(t, connectedWorkers, 1, "worker should be connected")
|
|
|
|
// Close worker connection without graceful disconnect (simulates death)
|
|
worker.Close()
|
|
|
|
// Wait for scheduler to detect disconnect
|
|
// The detection happens through connection close, not heartbeat timeout
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
// Verify worker is disconnected by checking metrics changed
|
|
metricsAfter := fixture.Hub.GetMetricsPayload()
|
|
connectedAfter := metricsAfter["workers_connected"].(int)
|
|
assert.Less(t, connectedAfter, connectedWorkers, "worker should be disconnected after close")
|
|
}
|
|
|
|
// TestHeartbeat_AckResponse validates heartbeat acknowledgment
|
|
func TestHeartbeat_AckResponse(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("hb-ack-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Send heartbeat with capability update
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0})
|
|
|
|
// Heartbeat itself doesn't produce a response in current implementation
|
|
// but we verify the connection remains active
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Verify we can still receive messages (connection is alive)
|
|
// Send another ready signal to confirm bidirectional communication works
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_test")
|
|
|
|
// If connection is dead, this would error
|
|
// Verify by sending another ready signal - if connection dead, this would panic or error
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "heartbeat_ack_test")
|
|
msg := worker.RecvTimeout(500 * time.Millisecond)
|
|
// Should get NoWork since no jobs are queued
|
|
assert.Equal(t, scheduler.MsgNoWork, msg.Type, "heartbeat should maintain connection - worker should respond to ready signal")
|
|
}
|
|
|
|
// TestHeartbeat_RegistrationWithCapabilities validates registration includes capabilities
|
|
func TestHeartbeat_RegistrationWithCapabilities(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
caps := scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 8,
|
|
VRAMGB: 48.0,
|
|
CPUCount: 16,
|
|
MemoryGB: 64.0,
|
|
Hostname: "test-gpu-node-01",
|
|
}
|
|
|
|
worker := fixture.CreateWorker("reg-caps-worker", caps)
|
|
|
|
// Registration happens during CreateWorker, verify by submitting GPU job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "reg-caps-job",
|
|
GPUCount: 4,
|
|
GPUBackend: "nvidia",
|
|
MinVRAMGB: 32.0,
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Should receive job because worker has required capabilities
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
assert.Equal(t, scheduler.MsgJobAssign, msg.Type, "registered worker with capabilities should receive GPU job")
|
|
}
|
|
|
|
// TestHeartbeat_DuringActiveJob validates heartbeat works while job is running
|
|
func TestHeartbeat_DuringActiveJob(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("hb-active-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
})
|
|
|
|
// Submit and receive job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "hb-active-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type)
|
|
|
|
// Accept the job
|
|
worker.AcceptJob("hb-active-job")
|
|
|
|
// Send multiple heartbeats while job is "running"
|
|
for i := 0; i < 3; i++ {
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
|
|
// Complete the job
|
|
worker.CompleteJob("hb-active-job", 0, "completed successfully")
|
|
|
|
// Verify job completion was processed by checking worker can receive new jobs
|
|
// Submit another job to verify worker is still functional
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "post-hb-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
})
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
msg2 := worker.RecvTimeout(2 * time.Second)
|
|
assert.Equal(t, scheduler.MsgJobAssign, msg2.Type, "worker should receive new job after heartbeats during active job")
|
|
}
|
|
|
|
// TestHeartbeat_SlotDeallocationOnDisconnect validates slots freed when worker dies
|
|
func TestHeartbeat_SlotDeallocationOnDisconnect(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("slot-dealloc-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
})
|
|
|
|
// Assign a job to the worker
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "slot-dealloc-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
})
|
|
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg.Type)
|
|
worker.AcceptJob("slot-dealloc-job")
|
|
|
|
// Verify slot is in use (via heartbeat)
|
|
worker.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1})
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Close connection (simulates worker death)
|
|
worker.Close()
|
|
|
|
// Wait for disconnect to be processed
|
|
time.Sleep(500 * time.Millisecond)
|
|
|
|
// Trigger orphan reconciliation at boundary
|
|
fixture.Hub.TriggerReconcileOrphans()
|
|
|
|
// At this exact moment, job should be at the boundary
|
|
// Verify state is consistent
|
|
task := fixture.Hub.GetTask("slot-dealloc-job")
|
|
if task != nil {
|
|
// Task may be orphaned or still running depending on exact timing
|
|
assert.True(t, task.Status == "running" || task.Status == "orphaned" || task.Status == "queued",
|
|
"task should be in valid state at grace period boundary, got: %s", task.Status)
|
|
}
|
|
|
|
// Submit another job - should be queueable even though previous worker had a slot "reserved"
|
|
// In a real scenario, the scheduler would detect the disconnect and free the slot
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "slot-dealloc-job-2",
|
|
})
|
|
|
|
// The job should be in the queue waiting for a new worker
|
|
metrics := fixture.Hub.GetMetricsPayload()
|
|
queueDepth := metrics["queue_depth_batch"].(int)
|
|
assert.GreaterOrEqual(t, queueDepth, 1, "job should be queued waiting for available worker")
|
|
}
|