Fix bug in scheduler hub orphan reconciliation: - Move delete(h.pendingAcceptance, taskID) inside the requeue success block - Prevents premature cleanup when requeue fails Add comprehensive test infrastructure: - hub_test_helpers.go: New test helper utilities (78 lines) - Mock scheduler components for isolated testing - Test fixture setup and teardown helpers Refactor and enhance hub capabilities tests: - Significant restructuring of hub_capabilities_test.go (213 lines changed) - Improved test coverage for worker capability matching Add comprehensive orphan recovery tests: - internal/scheduler/orphan_recovery_test.go (451 lines) - Tests orphaned job detection and recovery - Covers requeue logic, timeout handling, state cleanup
78 lines
2.5 KiB
Go
78 lines
2.5 KiB
Go
package scheduler
|
|
|
|
import "time"
|
|
|
|
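// Test helpers that expose internal functionality to tests living outside
// this package (the tests/ directory).
// NOTE(review): Go leaves a file out of regular builds only when its name ends
// in _test.go or it carries a build constraint; no constraint is visible here,
// so confirm these helpers really are compiled for tests only.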
|
|
|
|
// CanAdmitForTest exports canAdmit for testing
|
|
func (h *SchedulerHub) CanAdmitForTest(candidate *Task, worker *WorkerConn) bool {
|
|
return h.canAdmit(candidate, worker)
|
|
}
|
|
|
|
// ReconcileOrphansForTest exports reconcileOrphans for testing
|
|
func (h *SchedulerHub) ReconcileOrphansForTest() {
|
|
h.reconcileOrphans()
|
|
}
|
|
|
|
// SetPendingAcceptanceForTest sets pending acceptance for testing
|
|
func (h *SchedulerHub) SetPendingAcceptanceForTest(taskID string, assignment *JobAssignment) {
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
if h.pendingAcceptance == nil {
|
|
h.pendingAcceptance = make(map[string]*JobAssignment)
|
|
}
|
|
h.pendingAcceptance[taskID] = assignment
|
|
}
|
|
|
|
// GetPendingAcceptanceForTest gets pending acceptance for testing
|
|
func (h *SchedulerHub) GetPendingAcceptanceForTest(taskID string) (*JobAssignment, bool) {
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
a, ok := h.pendingAcceptance[taskID]
|
|
return a, ok
|
|
}
|
|
|
|
// SetWorkerConnForTest creates a WorkerConn for testing with exported fields
|
|
func SetWorkerConnForTest(wc *WorkerConn, caps WorkerCapabilities, slots SlotStatus) {
|
|
wc.capabilities = caps
|
|
wc.slots = slots
|
|
}
|
|
|
|
// NewWorkerConnForTest creates a new WorkerConn for testing
|
|
func NewWorkerConnForTest(caps WorkerCapabilities, slots SlotStatus) *WorkerConn {
|
|
return &WorkerConn{
|
|
capabilities: caps,
|
|
slots: slots,
|
|
}
|
|
}
|
|
|
|
// SetReservationsForTest sets reservations for testing
|
|
func (h *SchedulerHub) SetReservationsForTest(reservations map[string]*Reservation) {
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
h.reservations = reservations
|
|
}
|
|
|
|
// NewTestSchedulerHub creates a scheduler hub for testing
|
|
func NewTestSchedulerHub(cfg HubConfig) *SchedulerHub {
|
|
stateStore, _ := NewStateStore("/tmp/test-scheduler.state")
|
|
return &SchedulerHub{
|
|
workers: make(map[string]*WorkerConn),
|
|
readyWorkers: make(map[string]*WorkerConn),
|
|
batchQueue: NewPriorityQueue(0.1),
|
|
serviceQueue: NewPriorityQueue(0.1),
|
|
reservations: make(map[string]*Reservation),
|
|
multiNodePending: make(map[string]*MultiNodeJob),
|
|
pendingAcceptance: make(map[string]*JobAssignment),
|
|
runningTasks: make(map[string]*Task),
|
|
state: stateStore,
|
|
starvation: &StarvationTracker{
|
|
threshold: time.Duration(cfg.StarvationThresholdMins) * time.Minute,
|
|
},
|
|
metrics: &SchedulerMetrics{
|
|
WorkerSlots: make(map[string]SlotStatus),
|
|
},
|
|
config: cfg,
|
|
}
|
|
}
|