// Package stress provides stress tests for the scheduler // These tests validate scheduler behavior under high load and burst conditions. // // To run stress tests: go test -v ./tests/stress/... -run TestStress // These tests are skipped in short mode (go test -short) package stress import ( "encoding/json" "fmt" "runtime" "sync" "testing" "time" "github.com/jfraeys/fetch_ml/internal/scheduler" fixtures "github.com/jfraeys/fetch_ml/tests/fixtures" "github.com/stretchr/testify/assert" ) // TestStress_WorkerConnectBurst tests 30 sequential WebSocket connections // Validates that the scheduler can handle burst worker connections without failure. func TestStress_WorkerConnectBurst(t *testing.T) { if testing.Short() { t.Skip("Skipping stress test in short mode") } cfg := fixtures.DefaultHubConfig() cfg.DefaultBatchSlots = 4 fixture := fixtures.NewSchedulerTestFixture(t, cfg) defer fixture.Cleanup() numWorkers := 30 workers := make([]*fixtures.MockWorker, 0, numWorkers) latencies := make([]time.Duration, 0, numWorkers) // Connect workers sequentially with minimal delay start := time.Now() for i := 0; i < numWorkers; i++ { workerStart := time.Now() workerID := fmt.Sprintf("bench-worker-%d", i) worker := fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 4, CPUCount: 8, }) workers = append(workers, worker) latencies = append(latencies, time.Since(workerStart)) // Small yield to avoid overwhelming the scheduler if i%10 == 9 { time.Sleep(10 * time.Millisecond) } } totalTime := time.Since(start) // Cleanup all workers for _, w := range workers { w.Close() } // Validate p99 latency is under 100ms p99 := calculateP99(latencies) t.Logf("Worker connect burst: %d workers in %v, p99 latency: %v", numWorkers, totalTime, p99) assert.Less(t, p99, 100*time.Millisecond, "p99 connection latency should be under 100ms") assert.Less(t, totalTime, 5*time.Second, "total connect time should be under 5s") } // TestStress_JobSubmissionBurst tests 1K job submissions // Validates that the scheduler can handle burst job submissions without queue overflow. func TestStress_JobSubmissionBurst(t *testing.T) { if testing.Short() { t.Skip("Skipping stress test in short mode") } cfg := fixtures.DefaultHubConfig() fixture := fixtures.NewSchedulerTestFixture(t, cfg) defer fixture.Cleanup() // Create a single worker to receive assignments (use bench-worker-* pattern which has tokens 0-999) worker := fixture.CreateWorker("bench-worker-100", scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 8, CPUCount: 16, }) defer worker.Close() numJobs := 1000 start := time.Now() // Submit 1K jobs for i := range numJobs { jobID := fmt.Sprintf("burst-job-%d", i) fixture.SubmitJob(scheduler.JobSpec{ ID: jobID, Type: scheduler.JobTypeBatch, SlotPool: "batch", GPUCount: 1, JobTier: scheduler.TierTraining, }) } submitTime := time.Since(start) t.Logf("Submitted %d jobs in %v (%.0f jobs/sec)", numJobs, submitTime, float64(numJobs)/submitTime.Seconds()) // Signal worker ready to process some jobs worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: 0}, "ready") // Wait for and accept some assignments accepted := 0 done := time.After(3 * time.Second) for accepted < 10 { select { case <-done: goto doneAccepting default: select { case msg := <-worker.RecvCh: if msg.Type == scheduler.MsgJobAssign { var payload scheduler.JobAssignPayload _ = json.Unmarshal(msg.Payload, &payload) worker.AcceptJob(payload.Spec.ID) accepted++ } case <-time.After(100 * time.Millisecond): worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: accepted}, "still_ready") } } } doneAccepting: t.Logf("Worker accepted %d jobs from burst queue", accepted) assert.Greater(t, accepted, 0, "worker should receive at least some job assignments") } // TestStress_WorkerChurn tests rapid connect/disconnect cycles // Validates that the scheduler properly cleans up resources and doesn't leak memory. func TestStress_WorkerChurn(t *testing.T) { if testing.Short() { t.Skip("Skipping stress test in short mode") } cfg := fixtures.DefaultHubConfig() fixture := fixtures.NewSchedulerTestFixture(t, cfg) defer fixture.Cleanup() cycles := 50 var m1, m2 runtime.MemStats runtime.GC() runtime.ReadMemStats(&m1) for i := range cycles { workerID := fmt.Sprintf("churn-worker-%d", i%10) // Reuse 10 worker IDs // Create worker - the fixture has dynamic tokens for bench-worker patterns workerID = fmt.Sprintf("bench-worker-%d", i) worker := fixtures.NewMockWorker(t, fixture.Hub, workerID) worker.Register(scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 4, }) // Brief connection time.Sleep(20 * time.Millisecond) // Close worker worker.Close() // Small delay between cycles if i%10 == 9 { time.Sleep(50 * time.Millisecond) } } // Force GC and check memory runtime.GC() time.Sleep(100 * time.Millisecond) runtime.ReadMemStats(&m2) // Allow 10MB growth for 50 cycles (200KB per cycle max) growth := int64(m2.HeapAlloc) - int64(m1.HeapAlloc) maxGrowth := int64(10 * 1024 * 1024) // 10MB t.Logf("Worker churn: %d cycles, heap growth: %d bytes", cycles, growth) assert.Less(t, growth, maxGrowth, "memory growth should be bounded (possible leak)") } // TestStress_ConcurrentScheduling tests job queue contention with multiple workers // Validates fair scheduling and lack of race conditions under concurrent load. func TestStress_ConcurrentScheduling(t *testing.T) { if testing.Short() { t.Skip("Skipping stress test in short mode") } cfg := fixtures.DefaultHubConfig() cfg.DefaultBatchSlots = 4 fixture := fixtures.NewSchedulerTestFixture(t, cfg) defer fixture.Cleanup() numWorkers := 10 jobsPerWorker := 20 // Create workers workers := make([]*fixtures.MockWorker, numWorkers) for i := range numWorkers { workerID := fmt.Sprintf("bench-multi-worker-%d", i) workers[i] = fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{ GPUBackend: scheduler.BackendNVIDIA, GPUCount: 4, CPUCount: 8, }) } // Submit jobs concurrently var wg sync.WaitGroup for i := range numWorkers { wg.Add(1) go func(workerIdx int) { defer wg.Done() for j := 0; j < jobsPerWorker; j++ { jobID := fmt.Sprintf("concurrent-job-w%d-j%d", workerIdx, j) fixture.SubmitJob(scheduler.JobSpec{ ID: jobID, Type: scheduler.JobTypeBatch, SlotPool: "batch", GPUCount: 1, JobTier: scheduler.TierDataProcessing, }) } }(i) } wg.Wait() totalJobs := numWorkers * jobsPerWorker t.Logf("Submitted %d jobs from %d workers concurrently", totalJobs, numWorkers) // Signal all workers ready and collect some assignments var assignWg sync.WaitGroup assignmentCounts := make([]int, numWorkers) for i, worker := range workers { assignWg.Add(1) go func(idx int, w *fixtures.MockWorker) { defer assignWg.Done() w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "ready") // Collect assignments for 500ms deadline := time.Now().Add(500 * time.Millisecond) for time.Now().Before(deadline) { select { case msg := <-w.RecvCh: if msg.Type == scheduler.MsgJobAssign { assignmentCounts[idx]++ var payload scheduler.JobAssignPayload _ = json.Unmarshal(msg.Payload, &payload) w.AcceptJob(payload.Spec.ID) // Signal ready again after accepting w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}, "processing") } case <-time.After(50 * time.Millisecond): // Ping ready status w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "still_ready") } } }(i, worker) } assignWg.Wait() totalAssigned := 0 for _, count := range assignmentCounts { totalAssigned += count } t.Logf("Workers received %d total assignments", totalAssigned) assert.Greater(t, totalAssigned, 0, "should have some job assignments") // Cleanup for _, w := range workers { w.Close() } } // calculateP99 returns the 99th percentile latency from a slice of durations func calculateP99(latencies []time.Duration) time.Duration { if len(latencies) == 0 { return 0 } // Simple sort-based approach (not efficient for large N, but fine for stress tests) sorted := make([]time.Duration, len(latencies)) copy(sorted, latencies) for i := range sorted { for j := i + 1; j < len(sorted); j++ { if sorted[i] > sorted[j] { sorted[i], sorted[j] = sorted[j], sorted[i] } } } idx := (len(sorted) * 99) / 100 if idx >= len(sorted) { idx = len(sorted) - 1 } return sorted[idx] }