Stress Tests: - TestStress_WorkerConnectBurst: 30 workers, p99 latency validation - TestStress_JobSubmissionBurst: 1K job submissions - TestStress_WorkerChurn: 50 connect/disconnect cycles, memory leak detection - TestStress_ConcurrentScheduling: 10 workers x 20 jobs contention Long-Running Tests: - TestLongRunning_MemoryLeak: heap growth monitoring - TestLongRunning_OrphanRecovery: worker death/requeue stability - TestLongRunning_WebSocketStability: 20 worker connection stability Infrastructure: - Add testreport package with JSON output, flaky test tracking - Add TestTimer for timing/budget enforcement - Add WaitForEvent, WaitForTaskStatus helpers - Fix worker IDs to use valid bench-worker token patterns
308 lines
8.6 KiB
Go
308 lines
8.6 KiB
Go
// Package stress provides stress tests for the scheduler
|
|
// These tests validate scheduler behavior under high load and burst conditions.
|
|
//
|
|
// To run stress tests: go test -v ./tests/stress/... -run TestStress
|
|
// These tests are skipped in short mode (go test -short)
|
|
package stress
|
|
|
|
import (
	"encoding/json"
	"fmt"
	"runtime"
	"slices"
	"sync"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
	"github.com/stretchr/testify/assert"
)
|
|
|
|
// TestStress_WorkerConnectBurst tests 30 sequential WebSocket connections
|
|
// Validates that the scheduler can handle burst worker connections without failure.
|
|
func TestStress_WorkerConnectBurst(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("Skipping stress test in short mode")
|
|
}
|
|
|
|
cfg := fixtures.DefaultHubConfig()
|
|
cfg.DefaultBatchSlots = 4
|
|
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
|
|
defer fixture.Cleanup()
|
|
|
|
numWorkers := 30
|
|
workers := make([]*fixtures.MockWorker, 0, numWorkers)
|
|
latencies := make([]time.Duration, 0, numWorkers)
|
|
|
|
// Connect workers sequentially with minimal delay
|
|
start := time.Now()
|
|
for i := 0; i < numWorkers; i++ {
|
|
workerStart := time.Now()
|
|
workerID := fmt.Sprintf("bench-worker-%d", i)
|
|
|
|
worker := fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
CPUCount: 8,
|
|
})
|
|
workers = append(workers, worker)
|
|
latencies = append(latencies, time.Since(workerStart))
|
|
|
|
// Small yield to avoid overwhelming the scheduler
|
|
if i%10 == 9 {
|
|
time.Sleep(10 * time.Millisecond)
|
|
}
|
|
}
|
|
totalTime := time.Since(start)
|
|
|
|
// Cleanup all workers
|
|
for _, w := range workers {
|
|
w.Close()
|
|
}
|
|
|
|
// Validate p99 latency is under 100ms
|
|
p99 := calculateP99(latencies)
|
|
t.Logf("Worker connect burst: %d workers in %v, p99 latency: %v", numWorkers, totalTime, p99)
|
|
assert.Less(t, p99, 100*time.Millisecond, "p99 connection latency should be under 100ms")
|
|
assert.Less(t, totalTime, 5*time.Second, "total connect time should be under 5s")
|
|
}
|
|
|
|
// TestStress_JobSubmissionBurst tests 1K job submissions.
// Validates that the scheduler can handle burst job submissions without queue
// overflow, and that a single ready worker subsequently receives assignments
// from the backlog.
func TestStress_JobSubmissionBurst(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping stress test in short mode")
	}

	cfg := fixtures.DefaultHubConfig()
	fixture := fixtures.NewSchedulerTestFixture(t, cfg)
	defer fixture.Cleanup()

	// Create a single worker to receive assignments (use bench-worker-* pattern which has tokens 0-999)
	worker := fixture.CreateWorker("bench-worker-100", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   8,
		CPUCount:   16,
	})
	defer worker.Close()

	numJobs := 1000
	start := time.Now()

	// Submit 1K jobs back-to-back; the scheduler must queue all of them.
	for i := range numJobs {
		jobID := fmt.Sprintf("burst-job-%d", i)
		fixture.SubmitJob(scheduler.JobSpec{
			ID:       jobID,
			Type:     scheduler.JobTypeBatch,
			SlotPool: "batch",
			GPUCount: 1,
			JobTier:  scheduler.TierTraining,
		})
	}
	submitTime := time.Since(start)

	t.Logf("Submitted %d jobs in %v (%.0f jobs/sec)", numJobs, submitTime, float64(numJobs)/submitTime.Seconds())

	// Signal worker ready to process some jobs
	worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: 0}, "ready")

	// Wait for and accept some assignments: drain for up to 3s or until 10
	// jobs are accepted, whichever comes first. The outer select polls the
	// overall deadline without blocking; the inner select waits up to 100ms
	// for a message before re-signaling readiness.
	accepted := 0
	done := time.After(3 * time.Second)
	for accepted < 10 {
		select {
		case <-done:
			goto doneAccepting
		default:
			select {
			case msg := <-worker.RecvCh:
				if msg.Type == scheduler.MsgJobAssign {
					var payload scheduler.JobAssignPayload
					// Payload shape is controlled by the test fixture, so a
					// decode error here is deliberately ignored.
					_ = json.Unmarshal(msg.Payload, &payload)
					worker.AcceptJob(payload.Spec.ID)
					accepted++
				}
			case <-time.After(100 * time.Millisecond):
				// No assignment arrived; nudge the scheduler with a fresh
				// ready signal reflecting current slot usage.
				worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: accepted}, "still_ready")
			}
		}
	}
doneAccepting:

	t.Logf("Worker accepted %d jobs from burst queue", accepted)
	assert.Greater(t, accepted, 0, "worker should receive at least some job assignments")
}
|
|
|
|
// TestStress_WorkerChurn tests rapid connect/disconnect cycles
|
|
// Validates that the scheduler properly cleans up resources and doesn't leak memory.
|
|
func TestStress_WorkerChurn(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("Skipping stress test in short mode")
|
|
}
|
|
|
|
cfg := fixtures.DefaultHubConfig()
|
|
fixture := fixtures.NewSchedulerTestFixture(t, cfg)
|
|
defer fixture.Cleanup()
|
|
|
|
cycles := 50
|
|
var m1, m2 runtime.MemStats
|
|
|
|
runtime.GC()
|
|
runtime.ReadMemStats(&m1)
|
|
|
|
for i := range cycles {
|
|
workerID := fmt.Sprintf("churn-worker-%d", i%10) // Reuse 10 worker IDs
|
|
|
|
// Create worker - the fixture has dynamic tokens for bench-worker patterns
|
|
workerID = fmt.Sprintf("bench-worker-%d", i)
|
|
worker := fixtures.NewMockWorker(t, fixture.Hub, workerID)
|
|
worker.Register(scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: 4,
|
|
})
|
|
|
|
// Brief connection
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
// Close worker
|
|
worker.Close()
|
|
|
|
// Small delay between cycles
|
|
if i%10 == 9 {
|
|
time.Sleep(50 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
// Force GC and check memory
|
|
runtime.GC()
|
|
time.Sleep(100 * time.Millisecond)
|
|
runtime.ReadMemStats(&m2)
|
|
|
|
// Allow 10MB growth for 50 cycles (200KB per cycle max)
|
|
growth := int64(m2.HeapAlloc) - int64(m1.HeapAlloc)
|
|
maxGrowth := int64(10 * 1024 * 1024) // 10MB
|
|
|
|
t.Logf("Worker churn: %d cycles, heap growth: %d bytes", cycles, growth)
|
|
assert.Less(t, growth, maxGrowth, "memory growth should be bounded (possible leak)")
|
|
}
|
|
|
|
// TestStress_ConcurrentScheduling tests job queue contention with multiple workers.
// Validates fair scheduling and lack of race conditions under concurrent load:
// 10 workers, 20 jobs submitted per submitter goroutine (200 jobs total).
func TestStress_ConcurrentScheduling(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping stress test in short mode")
	}

	cfg := fixtures.DefaultHubConfig()
	cfg.DefaultBatchSlots = 4
	fixture := fixtures.NewSchedulerTestFixture(t, cfg)
	defer fixture.Cleanup()

	numWorkers := 10
	jobsPerWorker := 20

	// Create workers
	workers := make([]*fixtures.MockWorker, numWorkers)
	for i := range numWorkers {
		workerID := fmt.Sprintf("bench-multi-worker-%d", i)
		workers[i] = fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{
			GPUBackend: scheduler.BackendNVIDIA,
			GPUCount:   4,
			CPUCount:   8,
		})
	}

	// Submit jobs concurrently: one goroutine per worker index, all pushing
	// into the shared queue at once to create contention.
	var wg sync.WaitGroup
	for i := range numWorkers {
		wg.Add(1)
		go func(workerIdx int) {
			defer wg.Done()
			for j := 0; j < jobsPerWorker; j++ {
				jobID := fmt.Sprintf("concurrent-job-w%d-j%d", workerIdx, j)
				fixture.SubmitJob(scheduler.JobSpec{
					ID:       jobID,
					Type:     scheduler.JobTypeBatch,
					SlotPool: "batch",
					GPUCount: 1,
					JobTier:  scheduler.TierDataProcessing,
				})
			}
		}(i)
	}
	wg.Wait()

	totalJobs := numWorkers * jobsPerWorker
	t.Logf("Submitted %d jobs from %d workers concurrently", totalJobs, numWorkers)

	// Signal all workers ready and collect some assignments.
	// Each goroutine writes only its own index of assignmentCounts, so the
	// slice needs no additional locking.
	var assignWg sync.WaitGroup
	assignmentCounts := make([]int, numWorkers)

	for i, worker := range workers {
		assignWg.Add(1)
		go func(idx int, w *fixtures.MockWorker) {
			defer assignWg.Done()

			w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "ready")

			// Collect assignments for 500ms
			deadline := time.Now().Add(500 * time.Millisecond)
			for time.Now().Before(deadline) {
				select {
				case msg := <-w.RecvCh:
					if msg.Type == scheduler.MsgJobAssign {
						assignmentCounts[idx]++
						var payload scheduler.JobAssignPayload
						// Payload shape is controlled by the fixture; decode
						// errors are deliberately ignored.
						_ = json.Unmarshal(msg.Payload, &payload)
						w.AcceptJob(payload.Spec.ID)
						// Signal ready again after accepting
						w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}, "processing")
					}
				case <-time.After(50 * time.Millisecond):
					// Ping ready status so the scheduler keeps this worker eligible
					w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "still_ready")
				}
			}
		}(i, worker)
	}
	assignWg.Wait()

	totalAssigned := 0
	for _, count := range assignmentCounts {
		totalAssigned += count
	}

	t.Logf("Workers received %d total assignments", totalAssigned)
	assert.Greater(t, totalAssigned, 0, "should have some job assignments")

	// Cleanup
	for _, w := range workers {
		w.Close()
	}
}
|
|
|
|
// calculateP99 returns the 99th percentile latency from a slice of durations
|
|
func calculateP99(latencies []time.Duration) time.Duration {
|
|
if len(latencies) == 0 {
|
|
return 0
|
|
}
|
|
|
|
// Simple sort-based approach (not efficient for large N, but fine for stress tests)
|
|
sorted := make([]time.Duration, len(latencies))
|
|
copy(sorted, latencies)
|
|
for i := range sorted {
|
|
for j := i + 1; j < len(sorted); j++ {
|
|
if sorted[i] > sorted[j] {
|
|
sorted[i], sorted[j] = sorted[j], sorted[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
idx := (len(sorted) * 99) / 100
|
|
if idx >= len(sorted) {
|
|
idx = len(sorted) - 1
|
|
}
|
|
return sorted[idx]
|
|
}
|