fetch_ml/tests/stress/scheduler_stress_test.go
Jeremie Fraeys 6af85ddaf6
feat(tests): enable stress and long-running test suites
Stress Tests:
- TestStress_WorkerConnectBurst: 30 workers, p99 latency validation
- TestStress_JobSubmissionBurst: 1K job submissions
- TestStress_WorkerChurn: 50 connect/disconnect cycles, memory leak detection
- TestStress_ConcurrentScheduling: 10 workers x 20 jobs contention

Long-Running Tests:
- TestLongRunning_MemoryLeak: heap growth monitoring
- TestLongRunning_OrphanRecovery: worker death/requeue stability
- TestLongRunning_WebSocketStability: 20 worker connection stability

Infrastructure:
- Add testreport package with JSON output, flaky test tracking
- Add TestTimer for timing/budget enforcement
- Add WaitForEvent, WaitForTaskStatus helpers
- Fix worker IDs to use valid bench-worker token patterns
2026-03-12 14:05:45 -04:00

308 lines
8.6 KiB
Go

// Package stress provides stress tests for the scheduler
// These tests validate scheduler behavior under high load and burst conditions.
//
// To run stress tests: go test -v ./tests/stress/... -run TestStress
// These tests are skipped in short mode (go test -short)
package stress
import (
"encoding/json"
"fmt"
"runtime"
"sync"
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/scheduler"
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
"github.com/stretchr/testify/assert"
)
// TestStress_WorkerConnectBurst tests 30 sequential WebSocket connections
// Validates that the scheduler can handle burst worker connections without failure.
func TestStress_WorkerConnectBurst(t *testing.T) {
if testing.Short() {
t.Skip("Skipping stress test in short mode")
}
cfg := fixtures.DefaultHubConfig()
cfg.DefaultBatchSlots = 4
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
defer fixture.Cleanup()
numWorkers := 30
workers := make([]*fixtures.MockWorker, 0, numWorkers)
latencies := make([]time.Duration, 0, numWorkers)
// Connect workers sequentially with minimal delay
start := time.Now()
for i := 0; i < numWorkers; i++ {
workerStart := time.Now()
workerID := fmt.Sprintf("bench-worker-%d", i)
worker := fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{
GPUBackend: scheduler.BackendNVIDIA,
GPUCount: 4,
CPUCount: 8,
})
workers = append(workers, worker)
latencies = append(latencies, time.Since(workerStart))
// Small yield to avoid overwhelming the scheduler
if i%10 == 9 {
time.Sleep(10 * time.Millisecond)
}
}
totalTime := time.Since(start)
// Cleanup all workers
for _, w := range workers {
w.Close()
}
// Validate p99 latency is under 100ms
p99 := calculateP99(latencies)
t.Logf("Worker connect burst: %d workers in %v, p99 latency: %v", numWorkers, totalTime, p99)
assert.Less(t, p99, 100*time.Millisecond, "p99 connection latency should be under 100ms")
assert.Less(t, totalTime, 5*time.Second, "total connect time should be under 5s")
}
// TestStress_JobSubmissionBurst tests 1K job submissions
// Validates that the scheduler can handle burst job submissions without queue overflow.
func TestStress_JobSubmissionBurst(t *testing.T) {
if testing.Short() {
t.Skip("Skipping stress test in short mode")
}
cfg := fixtures.DefaultHubConfig()
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
defer fixture.Cleanup()
// Create a single worker to receive assignments (use bench-worker-* pattern which has tokens 0-999)
worker := fixture.CreateWorker("bench-worker-100", scheduler.WorkerCapabilities{
GPUBackend: scheduler.BackendNVIDIA,
GPUCount: 8,
CPUCount: 16,
})
defer worker.Close()
numJobs := 1000
start := time.Now()
// Submit 1K jobs
for i := range numJobs {
jobID := fmt.Sprintf("burst-job-%d", i)
fixture.SubmitJob(scheduler.JobSpec{
ID: jobID,
Type: scheduler.JobTypeBatch,
SlotPool: "batch",
GPUCount: 1,
JobTier: scheduler.TierTraining,
})
}
submitTime := time.Since(start)
t.Logf("Submitted %d jobs in %v (%.0f jobs/sec)", numJobs, submitTime, float64(numJobs)/submitTime.Seconds())
// Signal worker ready to process some jobs
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: 0}, "ready")
// Wait for and accept some assignments
accepted := 0
done := time.After(3 * time.Second)
for accepted < 10 {
select {
case <-done:
goto doneAccepting
default:
select {
case msg := <-worker.RecvCh:
if msg.Type == scheduler.MsgJobAssign {
var payload scheduler.JobAssignPayload
_ = json.Unmarshal(msg.Payload, &payload)
worker.AcceptJob(payload.Spec.ID)
accepted++
}
case <-time.After(100 * time.Millisecond):
worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: accepted}, "still_ready")
}
}
}
doneAccepting:
t.Logf("Worker accepted %d jobs from burst queue", accepted)
assert.Greater(t, accepted, 0, "worker should receive at least some job assignments")
}
// TestStress_WorkerChurn tests rapid connect/disconnect cycles
// Validates that the scheduler properly cleans up resources and doesn't leak memory.
func TestStress_WorkerChurn(t *testing.T) {
if testing.Short() {
t.Skip("Skipping stress test in short mode")
}
cfg := fixtures.DefaultHubConfig()
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
defer fixture.Cleanup()
cycles := 50
var m1, m2 runtime.MemStats
runtime.GC()
runtime.ReadMemStats(&m1)
for i := range cycles {
workerID := fmt.Sprintf("churn-worker-%d", i%10) // Reuse 10 worker IDs
// Create worker - the fixture has dynamic tokens for bench-worker patterns
workerID = fmt.Sprintf("bench-worker-%d", i)
worker := fixtures.NewMockWorker(t, fixture.Hub, workerID)
worker.Register(scheduler.WorkerCapabilities{
GPUBackend: scheduler.BackendNVIDIA,
GPUCount: 4,
})
// Brief connection
time.Sleep(20 * time.Millisecond)
// Close worker
worker.Close()
// Small delay between cycles
if i%10 == 9 {
time.Sleep(50 * time.Millisecond)
}
}
// Force GC and check memory
runtime.GC()
time.Sleep(100 * time.Millisecond)
runtime.ReadMemStats(&m2)
// Allow 10MB growth for 50 cycles (200KB per cycle max)
growth := int64(m2.HeapAlloc) - int64(m1.HeapAlloc)
maxGrowth := int64(10 * 1024 * 1024) // 10MB
t.Logf("Worker churn: %d cycles, heap growth: %d bytes", cycles, growth)
assert.Less(t, growth, maxGrowth, "memory growth should be bounded (possible leak)")
}
// TestStress_ConcurrentScheduling tests job queue contention with multiple workers
// Validates fair scheduling and lack of race conditions under concurrent load.
func TestStress_ConcurrentScheduling(t *testing.T) {
if testing.Short() {
t.Skip("Skipping stress test in short mode")
}
cfg := fixtures.DefaultHubConfig()
cfg.DefaultBatchSlots = 4
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
defer fixture.Cleanup()
numWorkers := 10
jobsPerWorker := 20
// Create workers
workers := make([]*fixtures.MockWorker, numWorkers)
for i := range numWorkers {
workerID := fmt.Sprintf("bench-multi-worker-%d", i)
workers[i] = fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{
GPUBackend: scheduler.BackendNVIDIA,
GPUCount: 4,
CPUCount: 8,
})
}
// Submit jobs concurrently
var wg sync.WaitGroup
for i := range numWorkers {
wg.Add(1)
go func(workerIdx int) {
defer wg.Done()
for j := 0; j < jobsPerWorker; j++ {
jobID := fmt.Sprintf("concurrent-job-w%d-j%d", workerIdx, j)
fixture.SubmitJob(scheduler.JobSpec{
ID: jobID,
Type: scheduler.JobTypeBatch,
SlotPool: "batch",
GPUCount: 1,
JobTier: scheduler.TierDataProcessing,
})
}
}(i)
}
wg.Wait()
totalJobs := numWorkers * jobsPerWorker
t.Logf("Submitted %d jobs from %d workers concurrently", totalJobs, numWorkers)
// Signal all workers ready and collect some assignments
var assignWg sync.WaitGroup
assignmentCounts := make([]int, numWorkers)
for i, worker := range workers {
assignWg.Add(1)
go func(idx int, w *fixtures.MockWorker) {
defer assignWg.Done()
w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "ready")
// Collect assignments for 500ms
deadline := time.Now().Add(500 * time.Millisecond)
for time.Now().Before(deadline) {
select {
case msg := <-w.RecvCh:
if msg.Type == scheduler.MsgJobAssign {
assignmentCounts[idx]++
var payload scheduler.JobAssignPayload
_ = json.Unmarshal(msg.Payload, &payload)
w.AcceptJob(payload.Spec.ID)
// Signal ready again after accepting
w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}, "processing")
}
case <-time.After(50 * time.Millisecond):
// Ping ready status
w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "still_ready")
}
}
}(i, worker)
}
assignWg.Wait()
totalAssigned := 0
for _, count := range assignmentCounts {
totalAssigned += count
}
t.Logf("Workers received %d total assignments", totalAssigned)
assert.Greater(t, totalAssigned, 0, "should have some job assignments")
// Cleanup
for _, w := range workers {
w.Close()
}
}
// calculateP99 returns the 99th percentile latency from a slice of durations
func calculateP99(latencies []time.Duration) time.Duration {
if len(latencies) == 0 {
return 0
}
// Simple sort-based approach (not efficient for large N, but fine for stress tests)
sorted := make([]time.Duration, len(latencies))
copy(sorted, latencies)
for i := range sorted {
for j := i + 1; j < len(sorted); j++ {
if sorted[i] > sorted[j] {
sorted[i], sorted[j] = sorted[j], sorted[i]
}
}
}
idx := (len(sorted) * 99) / 100
if idx >= len(sorted) {
idx = len(sorted) - 1
}
return sorted[idx]
}