// Add comprehensive capability routing system to scheduler hub:
// - Capability-aware worker matching with requirement/offer negotiation
// - Hub v2 protocol with structured message types and heartbeat management
// - Worker capability advertisement and dynamic routing decisions
// - Orphan recovery for disconnected workers with state reconciliation
// - Template-based job scheduling with capability constraints
//
// Test coverage in this file:
// - Unit tests for capability routing logic (backend, VRAM, CPU cores, reservations)
// - Unit tests for orphan recovery grace periods per job tier
package scheduler
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestCanAdmit_BackendMatching(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: make(map[string]*Reservation),
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "backend matches nvidia",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "nvidia",
|
|
GPUCount: 2,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "backend matches metal",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendMetal,
|
|
GPUCount: 2,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "metal",
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "backend mismatch nvidia vs metal",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "metal",
|
|
GPUCount: 1,
|
|
},
|
|
want: false,
|
|
},
|
|
{
|
|
name: "no backend required - any matches",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendVulkan,
|
|
GPUCount: 2,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "",
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "cpu job on cpu worker",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendCPU,
|
|
GPUCount: 0,
|
|
CPUCount: 8,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUBackend: "cpu",
|
|
GPUCount: 0,
|
|
},
|
|
want: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCanAdmit_VRAMRequirements(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: make(map[string]*Reservation),
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "sufficient VRAM",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 32.0,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinVRAMGB: 16.0,
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "insufficient VRAM",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 8.0,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinVRAMGB: 16.0,
|
|
GPUCount: 1,
|
|
},
|
|
want: false,
|
|
},
|
|
{
|
|
name: "no VRAM required",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 2,
|
|
VRAMGB: 8.0,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinVRAMGB: 0,
|
|
GPUCount: 1,
|
|
},
|
|
want: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCanAdmit_CPUCoresRequirements(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: make(map[string]*Reservation),
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "sufficient CPU cores",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendCPU,
|
|
CPUCount: 16,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinCPUCores: 8,
|
|
GPUCount: 0,
|
|
},
|
|
want: true,
|
|
},
|
|
{
|
|
name: "insufficient CPU cores",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendCPU,
|
|
CPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
MinCPUCores: 8,
|
|
GPUCount: 0,
|
|
},
|
|
want: false,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCanAdmit_ReservedGPUs(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
reservations: map[string]*Reservation{
|
|
"res-1": {TaskID: "task-1", GPUCount: 2},
|
|
"res-2": {TaskID: "task-2", GPUCount: 2},
|
|
},
|
|
}
|
|
|
|
tests := []struct {
|
|
name string
|
|
workerCaps WorkerCapabilities
|
|
jobSpec JobSpec
|
|
want bool
|
|
}{
|
|
{
|
|
name: "enough GPUs after reservations",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 8,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUCount: 4,
|
|
},
|
|
want: true, // 8 - (2+2) = 4 available
|
|
},
|
|
{
|
|
name: "not enough GPUs after reservations",
|
|
workerCaps: WorkerCapabilities{
|
|
GPUBackend: BackendNVIDIA,
|
|
GPUCount: 4,
|
|
},
|
|
jobSpec: JobSpec{
|
|
GPUCount: 2,
|
|
},
|
|
want: false, // 4 - (2+2) = 0 available
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
wc := &WorkerConn{
|
|
capabilities: tt.workerCaps,
|
|
slots: SlotStatus{BatchTotal: 4},
|
|
}
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Spec: tt.jobSpec,
|
|
}
|
|
got := h.canAdmit(task, wc)
|
|
if got != tt.want {
|
|
t.Errorf("canAdmit() = %v, want %v", got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestReconcileOrphans_TierGracePeriods(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
jobTier JobTier
|
|
accepted bool
|
|
assignedAt time.Time
|
|
wantRequeued bool
|
|
}{
|
|
{
|
|
name: "data_processing tier - short grace period",
|
|
jobTier: TierDataProcessing,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-35 * time.Second), // Past 30s grace
|
|
wantRequeued: true,
|
|
},
|
|
{
|
|
name: "training tier - long grace period",
|
|
jobTier: TierTraining,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-5 * time.Minute), // Within 10min grace
|
|
wantRequeued: false,
|
|
},
|
|
{
|
|
name: "training tier - past grace period",
|
|
jobTier: TierTraining,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-11 * time.Minute), // Past 10min grace
|
|
wantRequeued: true,
|
|
},
|
|
{
|
|
name: "evaluation tier - 2min grace",
|
|
jobTier: TierEvaluation,
|
|
accepted: true,
|
|
assignedAt: time.Now().Add(-3 * time.Minute), // Past 2min grace
|
|
wantRequeued: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
h := &SchedulerHub{
|
|
workers: make(map[string]*WorkerConn),
|
|
pendingAcceptance: make(map[string]*JobAssignment),
|
|
batchQueue: NewPriorityQueue(0.1),
|
|
config: HubConfig{
|
|
AcceptanceTimeoutSecs: 60,
|
|
},
|
|
}
|
|
|
|
task := &Task{
|
|
ID: "test-task",
|
|
Status: "assigned",
|
|
Spec: JobSpec{
|
|
JobTier: tt.jobTier,
|
|
},
|
|
}
|
|
|
|
h.pendingAcceptance["test-task"] = &JobAssignment{
|
|
TaskID: "test-task",
|
|
WorkerID: "disconnected-worker",
|
|
AssignedAt: tt.assignedAt,
|
|
Accepted: tt.accepted,
|
|
Task: task,
|
|
}
|
|
|
|
h.reconcileOrphans()
|
|
|
|
// Check if task was requeued
|
|
_, stillPending := h.pendingAcceptance["test-task"]
|
|
wasRequeued := !stillPending
|
|
|
|
if wasRequeued != tt.wantRequeued {
|
|
t.Errorf("reconcileOrphans() requeued=%v, want=%v", wasRequeued, tt.wantRequeued)
|
|
}
|
|
})
|
|
}
|
|
}
|