Add new scheduler component for distributed ML workload orchestration: - Hub-based coordination for multi-worker clusters - Pacing controller for rate limiting job submissions - Priority queue with preemption support - Port allocator for dynamic service discovery - Protocol handlers for worker-scheduler communication - Service manager with OS-specific implementations - Connection management and state persistence - Template system for service deployment Includes comprehensive test suite: - Unit tests for all core components - Integration tests for distributed scenarios - Benchmark tests for performance validation - Mock fixtures for isolated testing Refs: scheduler-architecture.md
118 lines
3.3 KiB
Go
118 lines
3.3 KiB
Go
// Package tests provides shared test utilities and fixtures for scheduler tests.
|
|
package tests
|
|
|
|
import (
|
|
"os"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// testStateDir holds the path of the temporary directory created at package
// initialisation; DefaultHubConfig hands it to the hub as its StateDir.
var testStateDir string

// init creates a fresh temporary directory (pattern "fetchml-test-*" in the
// default temp location) and records its path in testStateDir. It panics if
// the directory cannot be created. Nothing in this block removes the
// directory afterwards.
func init() {
	dir, err := os.MkdirTemp("", "fetchml-test-*")
	if err != nil {
		panic("failed to create test state dir: " + err.Error())
	}
	testStateDir = dir
}
|
|
|
|
// SchedulerTestFixture provides a test fixture for scheduler tests
|
|
type SchedulerTestFixture struct {
|
|
T testing.TB
|
|
Hub *scheduler.SchedulerHub
|
|
Workers map[string]*MockWorker
|
|
}
|
|
|
|
// NewSchedulerTestFixture creates a new scheduler test fixture
|
|
func NewSchedulerTestFixture(t testing.TB, cfg scheduler.HubConfig) *SchedulerTestFixture {
|
|
if cfg.BindAddr == "" {
|
|
cfg.BindAddr = "localhost:0"
|
|
}
|
|
|
|
hub, err := scheduler.NewHub(cfg, nil)
|
|
require.NoError(t, err)
|
|
|
|
// Start scheduler
|
|
err = hub.Start()
|
|
require.NoError(t, err)
|
|
|
|
return &SchedulerTestFixture{
|
|
T: t,
|
|
Hub: hub,
|
|
Workers: make(map[string]*MockWorker),
|
|
}
|
|
}
|
|
|
|
// CreateWorker creates and registers a new mock worker
|
|
func (f *SchedulerTestFixture) CreateWorker(workerID string, caps scheduler.WorkerCapabilities) *MockWorker {
|
|
worker := NewMockWorker(f.T, f.Hub, workerID)
|
|
worker.Register(caps)
|
|
f.Workers[workerID] = worker
|
|
return worker
|
|
}
|
|
|
|
// SubmitJob submits a job to the scheduler
|
|
func (f *SchedulerTestFixture) SubmitJob(spec scheduler.JobSpec) {
|
|
err := f.Hub.SubmitJob(spec)
|
|
require.NoError(f.T, err)
|
|
}
|
|
|
|
// GetTask retrieves a task by ID
|
|
func (f *SchedulerTestFixture) GetTask(taskID string) *scheduler.Task {
|
|
return f.Hub.GetTask(taskID)
|
|
}
|
|
|
|
// Cleanup stops the scheduler and closes all workers
|
|
func (f *SchedulerTestFixture) Cleanup() {
|
|
// Close all workers first
|
|
for _, worker := range f.Workers {
|
|
worker.Close()
|
|
}
|
|
// Then stop the hub
|
|
f.Hub.Stop()
|
|
}
|
|
|
|
// DefaultHubConfig returns a default hub configuration for testing
|
|
func DefaultHubConfig() scheduler.HubConfig {
|
|
return scheduler.HubConfig{
|
|
BindAddr: "localhost:0",
|
|
DefaultBatchSlots: 4,
|
|
StarvationThresholdMins: 5,
|
|
AcceptanceTimeoutSecs: 5,
|
|
GangAllocTimeoutSecs: 10,
|
|
StateDir: testStateDir,
|
|
WorkerTokens: map[string]string{
|
|
"test-token-worker-restart-1": "worker-restart-1",
|
|
"test-token-mode-switch-worker": "mode-switch-worker",
|
|
"test-token-mode-switch-worker-2": "mode-switch-worker-2",
|
|
"test-token-e2e-worker-1": "e2e-worker-1",
|
|
"test-token-e2e-worker-2": "e2e-worker-2",
|
|
"test-token-worker-death-test": "worker-death-test",
|
|
"test-token-worker-split-1": "worker-split-1",
|
|
"test-token-worker-split-2": "worker-split-2",
|
|
"test-token-worker-split-3": "worker-split-3",
|
|
"test-token-worker-timeout": "worker-timeout",
|
|
"test-token-worker-gang": "worker-gang",
|
|
"test-token-bench-worker": "bench-worker",
|
|
"test-token-bench-hb-worker": "bench-hb-worker",
|
|
"test-token-bench-assign-worker": "bench-assign-worker",
|
|
},
|
|
}
|
|
}
|
|
|
|
// WaitForTimeout polls condition until it reports true or duration has
// elapsed, and returns whether the condition was met.
//
// condition is always evaluated at least once, even when duration is zero or
// negative, and is evaluated a final time once the deadline is reached, so a
// condition that becomes true during the last pause is still observed.
// Between evaluations the calling goroutine sleeps for up to 10ms, never past
// the deadline.
func WaitForTimeout(duration time.Duration, condition func() bool) bool {
	const pollInterval = 10 * time.Millisecond

	deadline := time.Now().Add(duration)
	for {
		if condition() {
			return true
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			return false
		}

		// Cap the pause so the final check happens at the deadline rather
		// than up to one full interval after it.
		pause := pollInterval
		if remaining < pause {
			pause = remaining
		}
		time.Sleep(pause)
	}
}
|