Add comprehensive capability routing system to scheduler hub:
- Capability-aware worker matching with requirement/offer negotiation
- Hub v2 protocol with structured message types and heartbeat management
- Worker capability advertisement and dynamic routing decisions
- Orphan recovery for disconnected workers with state reconciliation
- Template-based job scheduling with capability constraints

Add extensive test coverage:
- Unit tests for capability routing logic and heartbeat mechanics
- Unit tests for orphan recovery scenarios
- E2E tests for capability routing across multiple workers
- Hub capabilities integration tests
- Scheduler fixture helpers for test setup

Protocol improvements:
- Define structured protocol messages for hub-worker communication
- Add capability matching algorithm with scoring
- Implement graceful worker disconnection handling
527 lines
15 KiB
Go
527 lines
15 KiB
Go
package scheduler_test
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// TestCapabilityRouting_BackendMatching validates GPU backend compatibility
|
|
func TestCapabilityRouting_BackendMatching(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
workerCaps scheduler.WorkerCapabilities
|
|
jobSpec scheduler.JobSpec
|
|
wantAdmit bool
|
|
}{
|
|
{
|
|
name: "nvidia backend matches nvidia job",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "nvidia-match-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUBackend: "nvidia",
|
|
GPUCount: 2,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "metal backend matches metal job",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendMetal,
|
|
GPUCount: 2,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "metal-match-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
GPUBackend: "metal",
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "nvidia worker rejects metal job",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "metal-reject-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
GPUBackend: "metal",
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: false,
|
|
},
|
|
{
|
|
name: "any backend accepted when job has no preference",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendVulkan,
|
|
GPUCount: 2,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "any-backend-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
GPUBackend: "",
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "cpu worker accepts cpu job",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "cpu-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
GPUBackend: "cpu",
|
|
GPUCount: 0,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "cpu worker rejects gpu job",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "gpu-reject-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
GPUBackend: "nvidia",
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("backend-test-worker", tt.workerCaps)
|
|
|
|
// Submit job first
|
|
fixture.SubmitJob(tt.jobSpec)
|
|
|
|
// Signal ready to trigger job assignment
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
gotAdmit := msg.Type == scheduler.MsgJobAssign
|
|
|
|
if gotAdmit != tt.wantAdmit {
|
|
t.Errorf("backend matching: got admit=%v, want=%v", gotAdmit, tt.wantAdmit)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestCapabilityRouting_VRAMRequirements validates VRAM filtering
|
|
func TestCapabilityRouting_VRAMRequirements(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
workerCaps scheduler.WorkerCapabilities
|
|
jobSpec scheduler.JobSpec
|
|
wantAdmit bool
|
|
}{
|
|
{
|
|
name: "sufficient VRAM - job needs 16GB, worker has 32GB",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 32.0,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "vram-sufficient-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinVRAMGB: 16.0,
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "insufficient VRAM - job needs 16GB, worker has 8GB",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 8.0,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "vram-insufficient-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinVRAMGB: 16.0,
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: false,
|
|
},
|
|
{
|
|
name: "no VRAM requirement - any VRAM accepted",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 4.0,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "no-vram-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinVRAMGB: 0,
|
|
GPUCount: 1,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "multi-GPU VRAM - job needs 48GB, worker has 48GB total (2x24GB)",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 48.0, // Total VRAM across all GPUs
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "multi-vram-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinVRAMGB: 48.0,
|
|
GPUCount: 2,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("vram-test-worker", tt.workerCaps)
|
|
|
|
// Submit job first, then signal ready to trigger assignment
|
|
fixture.SubmitJob(tt.jobSpec)
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
gotAdmit := msg.Type == scheduler.MsgJobAssign
|
|
|
|
if gotAdmit != tt.wantAdmit {
|
|
t.Errorf("VRAM filtering: got admit=%v, want=%v", gotAdmit, tt.wantAdmit)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestCapabilityRouting_CPURequirements validates CPU core filtering
|
|
func TestCapabilityRouting_CPURequirements(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
workerCaps scheduler.WorkerCapabilities
|
|
jobSpec scheduler.JobSpec
|
|
wantAdmit bool
|
|
}{
|
|
{
|
|
name: "sufficient CPU cores - job needs 8, worker has 16",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
CPUCount: 16,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "cpu-sufficient-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinCPUCores: 8,
|
|
GPUCount: 0,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "insufficient CPU cores - job needs 8, worker has 4",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
CPUCount: 4,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "cpu-insufficient-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinCPUCores: 8,
|
|
GPUCount: 0,
|
|
},
|
|
wantAdmit: false,
|
|
},
|
|
{
|
|
name: "no CPU requirement - any CPU count accepted",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
CPUCount: 2,
|
|
},
|
|
jobSpec: scheduler.JobSpec{
|
|
ID: "no-cpu-req-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
MinCPUCores: 0,
|
|
GPUCount: 0,
|
|
},
|
|
wantAdmit: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("cpu-test-worker", tt.workerCaps)
|
|
|
|
// Submit job first, then signal ready to trigger assignment
|
|
fixture.SubmitJob(tt.jobSpec)
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
gotAdmit := msg.Type == scheduler.MsgJobAssign
|
|
|
|
if gotAdmit != tt.wantAdmit {
|
|
t.Errorf("CPU filtering: got admit=%v, want=%v", gotAdmit, tt.wantAdmit)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestCapabilityRouting_MultiGPUPlacement validates multi-GPU job placement
|
|
func TestCapabilityRouting_MultiGPUPlacement(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
workerCaps scheduler.WorkerCapabilities
|
|
jobGPUs int
|
|
wantAdmit bool
|
|
}{
|
|
{
|
|
name: "job needs 4 GPUs, worker has 8 - admitted",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 8,
|
|
},
|
|
jobGPUs: 4,
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "job needs 4 GPUs, worker has 2 - rejected",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
},
|
|
jobGPUs: 4,
|
|
wantAdmit: false,
|
|
},
|
|
{
|
|
name: "job needs 4 GPUs, worker has exactly 4 - admitted",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobGPUs: 4,
|
|
wantAdmit: true,
|
|
},
|
|
{
|
|
name: "job needs 0 GPUs (CPU), worker has GPUs - admitted",
|
|
workerCaps: scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
CPUCount: 8,
|
|
},
|
|
jobGPUs: 0,
|
|
wantAdmit: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
worker := fixture.CreateWorker("multi-gpu-test-worker", tt.workerCaps)
|
|
|
|
// Submit job first, then signal ready to trigger assignment
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "multi-gpu-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: tt.jobGPUs,
|
|
})
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg := worker.RecvTimeout(2 * time.Second)
|
|
gotAdmit := msg.Type == scheduler.MsgJobAssign
|
|
|
|
if gotAdmit != tt.wantAdmit {
|
|
t.Errorf("multi-GPU placement: got admit=%v, want=%v", gotAdmit, tt.wantAdmit)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestCapabilityRouting_ReservedGPUAccounting validates reservation system
|
|
func TestCapabilityRouting_ReservedGPUAccounting(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create worker with 8 GPUs
|
|
worker := fixture.CreateWorker("reservation-test-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 8,
|
|
})
|
|
|
|
// Submit first job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "job-1",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Signal ready to trigger assignment
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
msg1 := worker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg1.Type, "first job should be assigned")
|
|
|
|
// Accept first job to reserve its GPUs
|
|
worker.AcceptJob("job-1")
|
|
|
|
// Submit second job
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "job-2",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Signal ready again to trigger second job assignment
|
|
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}, "polling")
|
|
|
|
// Worker still has 4 GPUs available (8 total - 4 reserved = 4 available)
|
|
// Job needs 4, so it should be assigned
|
|
msg2 := worker.RecvTimeout(2 * time.Second)
|
|
assert.Equal(t, scheduler.MsgJobAssign, msg2.Type, "second job should be assigned - 4 GPUs still available")
|
|
}
|
|
|
|
// TestCapabilityRouting_JobTierPriority validates job tier interactions with capabilities
|
|
func TestCapabilityRouting_JobTierPriority(t *testing.T) {
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create workers with different capabilities
|
|
gpuWorker := fixture.CreateWorker("tier-gpu-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
})
|
|
cpuWorker := fixture.CreateWorker("tier-cpu-worker", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
CPUCount: 8,
|
|
})
|
|
|
|
// Submit training job (high priority tier, needs GPU)
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "training-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
JobTier: scheduler.TierTraining,
|
|
GPUCount: 2,
|
|
})
|
|
|
|
// Submit data processing job (lower priority tier, CPU only)
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "data-job",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
JobTier: scheduler.TierDataProcessing,
|
|
GPUCount: 0,
|
|
})
|
|
|
|
// Signal both workers ready to trigger job assignment
|
|
gpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
cpuWorker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// GPU worker should get training job (it requires GPUs)
|
|
msg1 := gpuWorker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg1.Type, "GPU worker should get training job")
|
|
|
|
// CPU worker should get data job
|
|
msg2 := cpuWorker.RecvTimeout(2 * time.Second)
|
|
require.Equal(t, scheduler.MsgJobAssign, msg2.Type, "CPU worker should get data job")
|
|
}
|
|
|
|
// TestCapabilityRouting_MixedCapabilitiesRace validates race-free capability matching
|
|
func TestCapabilityRouting_MixedCapabilitiesRace(t *testing.T) {
|
|
// This test verifies that when multiple workers with different capabilities
|
|
// are ready, jobs are routed to the correct workers based on requirements
|
|
|
|
fixture := fixtures.NewSchedulerTestFixture(t, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Create workers with different GPU counts
|
|
worker2GPU := fixture.CreateWorker("race-2gpu", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 2,
|
|
})
|
|
worker8GPU := fixture.CreateWorker("race-8gpu", scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 8,
|
|
})
|
|
|
|
// Both signal ready
|
|
worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Submit job needing 4 GPUs
|
|
fixture.SubmitJob(scheduler.JobSpec{
|
|
ID: "race-job-4gpu",
|
|
Type: scheduler.JobTypeBatch,
|
|
SlotPool: "batch",
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Signal ready after job submission to trigger assignment
|
|
worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
|
|
// Should go to 8GPU worker (2GPU can't handle it)
|
|
var assignedWorker *fixtures.MockWorker
|
|
deadline := time.After(2 * time.Second)
|
|
checkTimeout := time.After(100 * time.Millisecond)
|
|
|
|
for assignedWorker == nil {
|
|
select {
|
|
case msg := <-worker2GPU.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedWorker = worker2GPU
|
|
}
|
|
case msg := <-worker8GPU.RecvCh:
|
|
if msg.Type == scheduler.MsgJobAssign {
|
|
assignedWorker = worker8GPU
|
|
}
|
|
case <-checkTimeout:
|
|
// No assignment yet, signal ready again to trigger
|
|
worker2GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
worker8GPU.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
|
|
checkTimeout = time.After(100 * time.Millisecond)
|
|
case <-deadline:
|
|
t.Fatal("timeout waiting for job assignment")
|
|
}
|
|
}
|
|
|
|
assert.Equal(t, worker8GPU, assignedWorker, "4-GPU job should go to 8-GPU worker")
|
|
}
|