- Add DisableTLSForTesting to HubConfig for test environments - Add IsUsingTLS() method to detect scheduler TLS status - Update MockWorker to auto-select ws:// vs wss:// protocol - Set DisableTLSForTesting: true in DefaultHubConfig
176 lines
6.6 KiB
Go
176 lines
6.6 KiB
Go
// Package tests provides shared test utilities and fixtures for scheduler tests.
|
|
package tests
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// SchedulerTestFixture provides a test fixture for scheduler tests
|
|
type SchedulerTestFixture struct {
|
|
T testing.TB
|
|
Hub *scheduler.SchedulerHub
|
|
Workers map[string]*MockWorker
|
|
stateDir string
|
|
}
|
|
|
|
// NewSchedulerTestFixture creates a new scheduler test fixture
|
|
func NewSchedulerTestFixture(t testing.TB, cfg scheduler.HubConfig) *SchedulerTestFixture {
|
|
if cfg.BindAddr == "" {
|
|
cfg.BindAddr = "localhost:0"
|
|
}
|
|
|
|
// Create isolated state directory per test
|
|
stateDir, err := os.MkdirTemp("", "fetchml-test-*")
|
|
require.NoError(t, err)
|
|
cfg.StateDir = stateDir
|
|
|
|
hub, err := scheduler.NewHub(cfg, nil)
|
|
require.NoError(t, err)
|
|
|
|
// Start scheduler
|
|
err = hub.Start()
|
|
require.NoError(t, err)
|
|
|
|
return &SchedulerTestFixture{
|
|
T: t,
|
|
Hub: hub,
|
|
Workers: make(map[string]*MockWorker),
|
|
stateDir: stateDir,
|
|
}
|
|
}
|
|
|
|
// CreateWorker creates and registers a new mock worker
|
|
func (f *SchedulerTestFixture) CreateWorker(workerID string, caps scheduler.WorkerCapabilities) *MockWorker {
|
|
worker := NewMockWorker(f.T, f.Hub, workerID)
|
|
worker.Register(caps)
|
|
f.Workers[workerID] = worker
|
|
return worker
|
|
}
|
|
|
|
// SubmitJob submits a job to the scheduler
|
|
func (f *SchedulerTestFixture) SubmitJob(spec scheduler.JobSpec) {
|
|
err := f.Hub.SubmitJob(spec)
|
|
require.NoError(f.T, err)
|
|
}
|
|
|
|
// GetTask retrieves a task by ID
|
|
func (f *SchedulerTestFixture) GetTask(taskID string) *scheduler.Task {
|
|
return f.Hub.GetTask(taskID)
|
|
}
|
|
|
|
// Cleanup stops the scheduler, closes all workers, and removes state dir
|
|
func (f *SchedulerTestFixture) Cleanup() {
|
|
// Close all workers first
|
|
for _, worker := range f.Workers {
|
|
worker.Close()
|
|
}
|
|
// Then stop the hub
|
|
f.Hub.Stop()
|
|
// Clean up isolated state directory
|
|
if err := os.RemoveAll(f.stateDir); err != nil {
|
|
// Log cleanup error but don't fail test
|
|
fmt.Fprintf(os.Stderr, "failed to remove state dir: %v\n", err)
|
|
}
|
|
}
|
|
|
|
// DefaultHubConfig returns a default hub configuration for testing
|
|
func DefaultHubConfig() scheduler.HubConfig {
|
|
tokens := map[string]string{
|
|
"test-token-worker-restart-1": "worker-restart-1",
|
|
"test-token-mode-switch-worker": "mode-switch-worker",
|
|
"test-token-mode-switch-worker-2": "mode-switch-worker-2",
|
|
"test-token-e2e-worker-1": "e2e-worker-1",
|
|
"test-token-e2e-worker-2": "e2e-worker-2",
|
|
"test-token-worker-death-test": "worker-death-test",
|
|
"test-token-worker-split-1": "worker-split-1",
|
|
"test-token-worker-split-2": "worker-split-2",
|
|
"test-token-worker-split-3": "worker-split-3",
|
|
"test-token-worker-timeout": "worker-timeout",
|
|
"test-token-worker-gang": "worker-gang",
|
|
"test-token-bench-worker": "bench-worker",
|
|
"test-token-bench-hb-worker": "bench-hb-worker",
|
|
"test-token-bench-assign-worker": "bench-assign-worker",
|
|
// Capability routing test workers
|
|
"test-token-backend-test-worker": "backend-test-worker",
|
|
"test-token-vram-test-worker": "vram-test-worker",
|
|
"test-token-cpu-test-worker": "cpu-test-worker",
|
|
"test-token-multi-gpu-test-worker": "multi-gpu-test-worker",
|
|
"test-token-reservation-test-worker": "reservation-test-worker",
|
|
"test-token-tier-gpu-worker": "tier-gpu-worker",
|
|
"test-token-tier-cpu-worker": "tier-cpu-worker",
|
|
"test-token-race-2gpu": "race-2gpu",
|
|
"test-token-race-8gpu": "race-8gpu",
|
|
"test-token-slot-sync-worker": "slot-sync-worker",
|
|
"test-token-liveness-test-worker": "liveness-test-worker",
|
|
"test-token-hb-ack-worker": "hb-ack-worker",
|
|
"test-token-reg-caps-worker": "reg-caps-worker",
|
|
"test-token-hb-active-worker": "hb-active-worker",
|
|
"test-token-slot-dealloc-worker": "slot-dealloc-worker",
|
|
"test-token-orphan-test-worker": "orphan-test-worker",
|
|
"test-token-requeue-worker-1": "requeue-worker-1",
|
|
"test-token-requeue-worker-2": "requeue-worker-2",
|
|
"test-token-death-detection-worker": "death-detection-worker",
|
|
"test-token-cleanup-worker": "cleanup-worker",
|
|
"test-token-edge-worker": "edge-worker",
|
|
"test-token-concurrent-worker-1": "concurrent-worker-1",
|
|
"test-token-concurrent-worker-2": "concurrent-worker-2",
|
|
"test-token-concurrent-worker-3": "concurrent-worker-3",
|
|
"test-token-chaos-training-worker": "chaos-training-worker",
|
|
"test-token-chaos-grace-worker": "chaos-grace-worker",
|
|
"test-token-chaos-dup-worker": "chaos-dup-worker",
|
|
"test-token-chaos-boundary-worker": "chaos-boundary-worker",
|
|
"test-token-chaos-multi-worker-0": "chaos-multi-worker-0",
|
|
"test-token-chaos-multi-worker-1": "chaos-multi-worker-1",
|
|
"test-token-chaos-multi-worker-2": "chaos-multi-worker-2",
|
|
"test-token-chaos-tier-worker-0": "chaos-tier-worker-0",
|
|
"test-token-chaos-tier-worker-1": "chaos-tier-worker-1",
|
|
"test-token-chaos-tier-worker-2": "chaos-tier-worker-2",
|
|
"test-token-e2e-gpu-worker": "e2e-gpu-worker",
|
|
"test-token-e2e-cpu-worker": "e2e-cpu-worker",
|
|
"test-token-e2e-2gpu": "e2e-2gpu",
|
|
"test-token-e2e-8gpu": "e2e-8gpu",
|
|
"test-token-e2e-metal": "e2e-metal",
|
|
"test-token-e2e-nvidia": "e2e-nvidia",
|
|
"test-token-e2e-8gb-vram": "e2e-8gb-vram",
|
|
"test-token-e2e-24gb-vram": "e2e-24gb-vram",
|
|
"test-token-gang-worker-1": "gang-worker-1",
|
|
"test-token-gang-worker-2": "gang-worker-2",
|
|
"test-token-gang-worker-3": "gang-worker-3",
|
|
"test-token-e2e-cpu-only": "e2e-cpu-only",
|
|
}
|
|
|
|
// Add tokens for dynamic benchmark worker IDs (0-999 for each pattern)
|
|
for i := range 1000 {
|
|
tokens[fmt.Sprintf("test-token-bench-worker-%d", i)] = fmt.Sprintf("bench-worker-%d", i)
|
|
tokens[fmt.Sprintf("test-token-bench-multi-worker-%d", i)] = fmt.Sprintf("bench-multi-worker-%d", i)
|
|
}
|
|
|
|
return scheduler.HubConfig{
|
|
BindAddr: "localhost:0",
|
|
DefaultBatchSlots: 4,
|
|
StarvationThresholdMins: 5,
|
|
AcceptanceTimeoutSecs: 5,
|
|
GangAllocTimeoutSecs: 10,
|
|
DisableTLSForTesting: true, // Use ws:// for tests to avoid TLS complexity
|
|
// #nosec G101 -- These are test fixture tokens, not real credentials
|
|
WorkerTokens: tokens,
|
|
}
|
|
}
|
|
|
|
// WaitForTimeout polls condition until it returns true or duration has
// elapsed, and reports whether the condition was met.
//
// The condition is always evaluated at least once, even when duration is zero
// or negative, and it is evaluated one final time once the deadline has been
// reached, so a condition that becomes true during the last wait is not
// missed. Between evaluations the helper sleeps for 10ms, shortened to the
// time remaining so the call does not overshoot the deadline.
func WaitForTimeout(duration time.Duration, condition func() bool) bool {
	const pollInterval = 10 * time.Millisecond

	deadline := time.Now().Add(duration)
	for {
		if condition() {
			return true
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			// The check above already ran at or after the deadline.
			return false
		}

		pause := pollInterval
		if remaining < pause {
			pause = remaining
		}
		time.Sleep(pause)
	}
}
|