// Package fixtures provides shared test utilities and fixtures for scheduler tests package tests import ( "fmt" "os" "testing" "time" "github.com/jfraeys/fetch_ml/internal/scheduler" "github.com/stretchr/testify/require" ) // SchedulerTestFixture provides a test fixture for scheduler tests type SchedulerTestFixture struct { T testing.TB Hub *scheduler.SchedulerHub Workers map[string]*MockWorker stateDir string } // NewSchedulerTestFixture creates a new scheduler test fixture func NewSchedulerTestFixture(t testing.TB, cfg scheduler.HubConfig) *SchedulerTestFixture { if cfg.BindAddr == "" { cfg.BindAddr = "localhost:0" } // Create isolated state directory per test stateDir, err := os.MkdirTemp("", "fetchml-test-*") require.NoError(t, err) cfg.StateDir = stateDir hub, err := scheduler.NewHub(cfg, nil) require.NoError(t, err) // Start scheduler err = hub.Start() require.NoError(t, err) return &SchedulerTestFixture{ T: t, Hub: hub, Workers: make(map[string]*MockWorker), stateDir: stateDir, } } // CreateWorker creates and registers a new mock worker func (f *SchedulerTestFixture) CreateWorker(workerID string, caps scheduler.WorkerCapabilities) *MockWorker { worker := NewMockWorker(f.T, f.Hub, workerID) worker.Register(caps) f.Workers[workerID] = worker return worker } // SubmitJob submits a job to the scheduler func (f *SchedulerTestFixture) SubmitJob(spec scheduler.JobSpec) { err := f.Hub.SubmitJob(spec) require.NoError(f.T, err) } // GetTask retrieves a task by ID func (f *SchedulerTestFixture) GetTask(taskID string) *scheduler.Task { return f.Hub.GetTask(taskID) } // Cleanup stops the scheduler, closes all workers, and removes state dir func (f *SchedulerTestFixture) Cleanup() { // Close all workers first for _, worker := range f.Workers { worker.Close() } // Then stop the hub f.Hub.Stop() // Clean up isolated state directory if err := os.RemoveAll(f.stateDir); err != nil { // Log cleanup error but don't fail test fmt.Fprintf(os.Stderr, "failed to remove state dir: %v\n", err) } } // DefaultHubConfig returns a default hub configuration for testing func DefaultHubConfig() scheduler.HubConfig { tokens := map[string]string{ "test-token-worker-restart-1": "worker-restart-1", "test-token-mode-switch-worker": "mode-switch-worker", "test-token-mode-switch-worker-2": "mode-switch-worker-2", "test-token-e2e-worker-1": "e2e-worker-1", "test-token-e2e-worker-2": "e2e-worker-2", "test-token-worker-death-test": "worker-death-test", "test-token-worker-split-1": "worker-split-1", "test-token-worker-split-2": "worker-split-2", "test-token-worker-split-3": "worker-split-3", "test-token-worker-timeout": "worker-timeout", "test-token-worker-gang": "worker-gang", "test-token-bench-worker": "bench-worker", "test-token-bench-hb-worker": "bench-hb-worker", "test-token-bench-assign-worker": "bench-assign-worker", // Capability routing test workers "test-token-backend-test-worker": "backend-test-worker", "test-token-vram-test-worker": "vram-test-worker", "test-token-cpu-test-worker": "cpu-test-worker", "test-token-multi-gpu-test-worker": "multi-gpu-test-worker", "test-token-reservation-test-worker": "reservation-test-worker", "test-token-tier-gpu-worker": "tier-gpu-worker", "test-token-tier-cpu-worker": "tier-cpu-worker", "test-token-race-2gpu": "race-2gpu", "test-token-race-8gpu": "race-8gpu", "test-token-slot-sync-worker": "slot-sync-worker", "test-token-liveness-test-worker": "liveness-test-worker", "test-token-hb-ack-worker": "hb-ack-worker", "test-token-reg-caps-worker": "reg-caps-worker", "test-token-hb-active-worker": "hb-active-worker", "test-token-slot-dealloc-worker": "slot-dealloc-worker", "test-token-orphan-test-worker": "orphan-test-worker", "test-token-requeue-worker-1": "requeue-worker-1", "test-token-requeue-worker-2": "requeue-worker-2", "test-token-death-detection-worker": "death-detection-worker", "test-token-cleanup-worker": "cleanup-worker", "test-token-edge-worker": "edge-worker", "test-token-concurrent-worker-1": "concurrent-worker-1", "test-token-concurrent-worker-2": "concurrent-worker-2", "test-token-concurrent-worker-3": "concurrent-worker-3", "test-token-chaos-training-worker": "chaos-training-worker", "test-token-chaos-grace-worker": "chaos-grace-worker", "test-token-chaos-dup-worker": "chaos-dup-worker", "test-token-chaos-boundary-worker": "chaos-boundary-worker", "test-token-chaos-multi-worker-0": "chaos-multi-worker-0", "test-token-chaos-multi-worker-1": "chaos-multi-worker-1", "test-token-chaos-multi-worker-2": "chaos-multi-worker-2", "test-token-chaos-tier-worker-0": "chaos-tier-worker-0", "test-token-chaos-tier-worker-1": "chaos-tier-worker-1", "test-token-chaos-tier-worker-2": "chaos-tier-worker-2", "test-token-e2e-gpu-worker": "e2e-gpu-worker", "test-token-e2e-cpu-worker": "e2e-cpu-worker", "test-token-e2e-2gpu": "e2e-2gpu", "test-token-e2e-8gpu": "e2e-8gpu", "test-token-e2e-metal": "e2e-metal", "test-token-e2e-nvidia": "e2e-nvidia", "test-token-e2e-8gb-vram": "e2e-8gb-vram", "test-token-e2e-24gb-vram": "e2e-24gb-vram", "test-token-gang-worker-1": "gang-worker-1", "test-token-gang-worker-2": "gang-worker-2", "test-token-gang-worker-3": "gang-worker-3", "test-token-e2e-cpu-only": "e2e-cpu-only", } // Add tokens for dynamic benchmark worker IDs (0-999 for each pattern) for i := range 1000 { tokens[fmt.Sprintf("test-token-bench-worker-%d", i)] = fmt.Sprintf("bench-worker-%d", i) tokens[fmt.Sprintf("test-token-bench-multi-worker-%d", i)] = fmt.Sprintf("bench-multi-worker-%d", i) } return scheduler.HubConfig{ BindAddr: "localhost:0", DefaultBatchSlots: 4, StarvationThresholdMins: 5, AcceptanceTimeoutSecs: 5, GangAllocTimeoutSecs: 10, // #nosec G101 -- These are test fixture tokens, not real credentials WorkerTokens: tokens, } } // WaitForTimeout is a helper to wait for a condition with timeout func WaitForTimeout(duration time.Duration, condition func() bool) bool { deadline := time.Now().Add(duration) for time.Now().Before(deadline) { if condition() { return true } time.Sleep(10 * time.Millisecond) } return false }