fetch_ml/tests/fixtures/scheduler_fixture.go
Jeremie Fraeys ca913e8878
feat(scheduler): add test mode config and TLS detection
- Add DisableTLSForTesting to HubConfig for test environments
- Add IsUsingTLS() method to detect scheduler TLS status
- Update MockWorker to auto-select ws:// vs wss:// protocol
- Set DisableTLSForTesting: true in DefaultHubConfig
2026-03-12 14:05:35 -04:00

176 lines
6.6 KiB
Go

// Package tests provides shared test utilities and fixtures for scheduler tests.
package tests
import (
"fmt"
"os"
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/scheduler"
"github.com/stretchr/testify/require"
)
// SchedulerTestFixture bundles a started scheduler hub, the mock workers
// created through it, and the temporary state directory backing the hub.
// Build one with NewSchedulerTestFixture and release it with Cleanup.
type SchedulerTestFixture struct {
// T is the test handle used for assertions and passed to mock workers.
T testing.TB
// Hub is the scheduler hub created and started by NewSchedulerTestFixture.
Hub *scheduler.SchedulerHub
// Workers holds the mock workers created via CreateWorker, keyed by worker ID.
Workers map[string]*MockWorker
// stateDir is the per-fixture temporary directory assigned to the hub's
// StateDir; Cleanup removes it.
stateDir string
}
// NewSchedulerTestFixture creates a scheduler hub from cfg, starts it, and
// returns a fixture wrapping it.
//
// cfg is received by value, so the caller's copy is not modified. An empty
// cfg.BindAddr is replaced with "localhost:0", and cfg.StateDir is always
// overwritten with a fresh temporary directory so each fixture has isolated
// state.
//
// Any setup error fails the test immediately through require.NoError. If hub
// creation or startup fails, the temporary state directory is removed before
// the function exits, because the caller never receives a fixture on which to
// call Cleanup.
func NewSchedulerTestFixture(t testing.TB, cfg scheduler.HubConfig) *SchedulerTestFixture {
	t.Helper()

	if cfg.BindAddr == "" {
		cfg.BindAddr = "localhost:0"
	}

	// Create isolated state directory per test.
	stateDir, err := os.MkdirTemp("", "fetchml-test-*")
	require.NoError(t, err)
	cfg.StateDir = stateDir

	// require.NoError stops the test via FailNow, which still runs deferred
	// calls, so this guard removes the directory on every failure path below.
	started := false
	defer func() {
		if !started {
			// Best effort: the test is already failing for a more relevant reason.
			_ = os.RemoveAll(stateDir)
		}
	}()

	hub, err := scheduler.NewHub(cfg, nil)
	require.NoError(t, err)

	// Start scheduler.
	err = hub.Start()
	require.NoError(t, err)

	started = true
	return &SchedulerTestFixture{
		T:        t,
		Hub:      hub,
		Workers:  make(map[string]*MockWorker),
		stateDir: stateDir,
	}
}
// CreateWorker builds a mock worker for workerID against the fixture's hub,
// registers it with the given capabilities, and stores it in f.Workers under
// workerID. The stored worker is returned so the test can drive it directly.
func (f *SchedulerTestFixture) CreateWorker(workerID string, caps scheduler.WorkerCapabilities) *MockWorker {
	mock := NewMockWorker(f.T, f.Hub, workerID)
	mock.Register(caps)

	f.Workers[workerID] = mock
	return mock
}
// SubmitJob hands spec to the hub and fails the test immediately if the hub
// returns an error.
func (f *SchedulerTestFixture) SubmitJob(spec scheduler.JobSpec) {
	require.NoError(f.T, f.Hub.SubmitJob(spec))
}
// GetTask looks up taskID on the hub and returns whatever the hub reports for
// it, unchanged.
func (f *SchedulerTestFixture) GetTask(taskID string) *scheduler.Task {
	task := f.Hub.GetTask(taskID)
	return task
}
// Cleanup tears the fixture down in three steps: every registered mock worker
// is closed, the hub is stopped, and the temporary state directory is removed.
// A failure to remove the directory is reported on stderr and does not fail
// the test.
func (f *SchedulerTestFixture) Cleanup() {
	// Workers are closed before the hub is stopped.
	for id := range f.Workers {
		f.Workers[id].Close()
	}

	f.Hub.Stop()

	// Remove the isolated state directory last.
	err := os.RemoveAll(f.stateDir)
	if err == nil {
		return
	}
	// Log cleanup error but don't fail test.
	fmt.Fprintf(os.Stderr, "failed to remove state dir: %v\n", err)
}
// DefaultHubConfig returns the hub configuration shared by scheduler tests.
//
// The returned config binds to "localhost:0", disables TLS, and carries a
// worker-token table in which every known test worker ID is reachable through
// the token "test-token-<workerID>". Besides the fixed IDs listed below, the
// table covers the dynamically numbered benchmark workers "bench-worker-N"
// and "bench-multi-worker-N" for N in 0..999.
func DefaultHubConfig() scheduler.HubConfig {
	const (
		tokenPrefix       = "test-token-"
		dynamicPerPattern = 1000
	)

	// Fixed worker IDs used across the test suites.
	workerIDs := []string{
		"worker-restart-1",
		"mode-switch-worker",
		"mode-switch-worker-2",
		"e2e-worker-1",
		"e2e-worker-2",
		"worker-death-test",
		"worker-split-1",
		"worker-split-2",
		"worker-split-3",
		"worker-timeout",
		"worker-gang",
		"bench-worker",
		"bench-hb-worker",
		"bench-assign-worker",
		// Capability routing test workers
		"backend-test-worker",
		"vram-test-worker",
		"cpu-test-worker",
		"multi-gpu-test-worker",
		"reservation-test-worker",
		"tier-gpu-worker",
		"tier-cpu-worker",
		"race-2gpu",
		"race-8gpu",
		"slot-sync-worker",
		"liveness-test-worker",
		"hb-ack-worker",
		"reg-caps-worker",
		"hb-active-worker",
		"slot-dealloc-worker",
		"orphan-test-worker",
		"requeue-worker-1",
		"requeue-worker-2",
		"death-detection-worker",
		"cleanup-worker",
		"edge-worker",
		"concurrent-worker-1",
		"concurrent-worker-2",
		"concurrent-worker-3",
		"chaos-training-worker",
		"chaos-grace-worker",
		"chaos-dup-worker",
		"chaos-boundary-worker",
		"chaos-multi-worker-0",
		"chaos-multi-worker-1",
		"chaos-multi-worker-2",
		"chaos-tier-worker-0",
		"chaos-tier-worker-1",
		"chaos-tier-worker-2",
		"e2e-gpu-worker",
		"e2e-cpu-worker",
		"e2e-2gpu",
		"e2e-8gpu",
		"e2e-metal",
		"e2e-nvidia",
		"e2e-8gb-vram",
		"e2e-24gb-vram",
		"gang-worker-1",
		"gang-worker-2",
		"gang-worker-3",
		"e2e-cpu-only",
	}

	// Name stems of the dynamically numbered benchmark workers.
	dynamicStems := []string{"bench-worker", "bench-multi-worker"}

	tokens := make(map[string]string, len(workerIDs)+len(dynamicStems)*dynamicPerPattern)
	for _, id := range workerIDs {
		tokens[tokenPrefix+id] = id
	}

	// Add tokens for dynamic benchmark worker IDs (0-999 for each pattern).
	for i := 0; i < dynamicPerPattern; i++ {
		for _, stem := range dynamicStems {
			id := fmt.Sprintf("%s-%d", stem, i)
			tokens[tokenPrefix+id] = id
		}
	}

	return scheduler.HubConfig{
		BindAddr:                "localhost:0",
		DefaultBatchSlots:       4,
		StarvationThresholdMins: 5,
		AcceptanceTimeoutSecs:   5,
		GangAllocTimeoutSecs:    10,
		DisableTLSForTesting:    true, // Use ws:// for tests to avoid TLS complexity
		// #nosec G101 -- These are test fixture tokens, not real credentials
		WorkerTokens: tokens,
	}
}
// WaitForTimeout polls condition until it reports true or duration elapses.
//
// It returns true as soon as condition returns true, and false once the
// deadline has passed with condition still false. The condition is always
// evaluated at least once, even when duration is zero or negative, and is
// evaluated one final time after the deadline, so a condition that becomes
// true during the last sleep is not missed. Between evaluations the function
// sleeps for 10ms, or for the time remaining until the deadline if that is
// shorter.
func WaitForTimeout(duration time.Duration, condition func() bool) bool {
	const pollInterval = 10 * time.Millisecond

	deadline := time.Now().Add(duration)
	for {
		// Check before consulting the deadline so a non-positive duration
		// still gets one evaluation.
		if condition() {
			return true
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			return false
		}
		// Never sleep past the deadline.
		if remaining > pollInterval {
			remaining = pollInterval
		}
		time.Sleep(remaining)
	}
}