fetch_ml/tests/stress/scheduler_stress_test.go

// Package stress provides stress tests for the scheduler
// These tests validate scheduler behavior under high load and burst conditions.
//
// To run stress tests: go test -v ./tests/stress/... -run TestStress
// These tests are skipped in short mode (go test -short)
package stress

import (
	"encoding/json"
	"fmt"
	"runtime"
	"sync"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
	"github.com/stretchr/testify/assert"
)

// TestStress_WorkerConnectBurst tests 30 sequential WebSocket connections
// Validates that the scheduler can handle burst worker connections without failure.
func TestStress_WorkerConnectBurst(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping stress test in short mode")
	}

	cfg := fixtures.DefaultHubConfig()
	cfg.DefaultBatchSlots = 4
	fixture := fixtures.NewSchedulerTestFixture(t, cfg)
	defer fixture.Cleanup()

	numWorkers := 30
	workers := make([]*fixtures.MockWorker, 0, numWorkers)
	latencies := make([]time.Duration, 0, numWorkers)

	// Connect workers sequentially with minimal delay
	start := time.Now()
	for i := 0; i < numWorkers; i++ {
		workerStart := time.Now()
		workerID := fmt.Sprintf("bench-worker-%d", i)

		worker := fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{
			GPUBackend: scheduler.BackendNVIDIA,
			GPUCount:   4,
			CPUCount:   8,
		})
		workers = append(workers, worker)
		latencies = append(latencies, time.Since(workerStart))

		// Small yield to avoid overwhelming the scheduler
		if i%10 == 9 {
			time.Sleep(10 * time.Millisecond)
		}
	}
	totalTime := time.Since(start)

	// Cleanup all workers
	for _, w := range workers {
		w.Close()
	}

	// Validate p99 latency is under 100ms
	p99 := calculateP99(latencies)
	t.Logf("Worker connect burst: %d workers in %v, p99 latency: %v", numWorkers, totalTime, p99)
	assert.Less(t, p99, 100*time.Millisecond, "p99 connection latency should be under 100ms")
	assert.Less(t, totalTime, 5*time.Second, "total connect time should be under 5s")
}

// TestStress_JobSubmissionBurst tests 1K job submissions
// Validates that the scheduler can handle burst job submissions without queue overflow.
func TestStress_JobSubmissionBurst(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping stress test in short mode")
	}

	cfg := fixtures.DefaultHubConfig()
	fixture := fixtures.NewSchedulerTestFixture(t, cfg)
	defer fixture.Cleanup()

	// Create a single worker to receive assignments (use bench-worker-* pattern which has tokens 0-999)
	worker := fixture.CreateWorker("bench-worker-100", scheduler.WorkerCapabilities{
		GPUBackend: scheduler.BackendNVIDIA,
		GPUCount:   8,
		CPUCount:   16,
	})
	defer worker.Close()

	numJobs := 1000
	start := time.Now()

	// Submit 1K jobs
	for i := range numJobs {
		jobID := fmt.Sprintf("burst-job-%d", i)
		fixture.SubmitJob(scheduler.JobSpec{
			ID:       jobID,
			Type:     scheduler.JobTypeBatch,
			SlotPool: "batch",
			GPUCount: 1,
			JobTier:  scheduler.TierTraining,
		})
	}
	submitTime := time.Since(start)

	t.Logf("Submitted %d jobs in %v (%.0f jobs/sec)", numJobs, submitTime, float64(numJobs)/submitTime.Seconds())

	// Signal worker ready to process some jobs
	worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: 0}, "ready")

	// Wait for and accept some assignments
	accepted := 0
	done := time.After(3 * time.Second)
	for accepted < 10 {
		select {
		case <-done:
			goto doneAccepting
		default:
			select {
			case msg := <-worker.RecvCh:
				if msg.Type == scheduler.MsgJobAssign {
					var payload scheduler.JobAssignPayload
					_ = json.Unmarshal(msg.Payload, &payload)
					worker.AcceptJob(payload.Spec.ID)
					accepted++
				}
			case <-time.After(100 * time.Millisecond):
				worker.SignalReady(scheduler.SlotStatus{BatchTotal: 8, BatchInUse: accepted}, "still_ready")
			}
		}
	}
doneAccepting:

	t.Logf("Worker accepted %d jobs from burst queue", accepted)
	assert.Greater(t, accepted, 0, "worker should receive at least some job assignments")
}

// TestStress_WorkerChurn tests rapid connect/disconnect cycles
// Validates that the scheduler properly cleans up resources and doesn't leak memory.
func TestStress_WorkerChurn(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping stress test in short mode")
	}

	cfg := fixtures.DefaultHubConfig()
	fixture := fixtures.NewSchedulerTestFixture(t, cfg)
	defer fixture.Cleanup()

	cycles := 50
	var m1, m2 runtime.MemStats

	runtime.GC()
	runtime.ReadMemStats(&m1)

	for i := range cycles {
		workerID := fmt.Sprintf("churn-worker-%d", i%10) // Reuse 10 worker IDs

		// Create worker - the fixture has dynamic tokens for bench-worker patterns
		workerID = fmt.Sprintf("bench-worker-%d", i)
		worker := fixtures.NewMockWorker(t, fixture.Hub, workerID)
		worker.Register(scheduler.WorkerCapabilities{
			GPUBackend: scheduler.BackendNVIDIA,
			GPUCount:   4,
		})

		// Brief connection
		time.Sleep(20 * time.Millisecond)

		// Close worker
		worker.Close()

		// Small delay between cycles
		if i%10 == 9 {
			time.Sleep(50 * time.Millisecond)
		}
	}

	// Force GC and check memory
	runtime.GC()
	time.Sleep(100 * time.Millisecond)
	runtime.ReadMemStats(&m2)

	// Allow 10MB growth for 50 cycles (200KB per cycle max)
	growth := int64(m2.HeapAlloc) - int64(m1.HeapAlloc)
	maxGrowth := int64(10 * 1024 * 1024) // 10MB

	t.Logf("Worker churn: %d cycles, heap growth: %d bytes", cycles, growth)
	assert.Less(t, growth, maxGrowth, "memory growth should be bounded (possible leak)")
}

// TestStress_ConcurrentScheduling tests job queue contention with multiple workers
// Validates fair scheduling and lack of race conditions under concurrent load.
func TestStress_ConcurrentScheduling(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping stress test in short mode")
	}

	cfg := fixtures.DefaultHubConfig()
	cfg.DefaultBatchSlots = 4
	fixture := fixtures.NewSchedulerTestFixture(t, cfg)
	defer fixture.Cleanup()

	numWorkers := 10
	jobsPerWorker := 20

	// Create workers
	workers := make([]*fixtures.MockWorker, numWorkers)
	for i := range numWorkers {
		workerID := fmt.Sprintf("bench-multi-worker-%d", i)
		workers[i] = fixture.CreateWorker(workerID, scheduler.WorkerCapabilities{
			GPUBackend: scheduler.BackendNVIDIA,
			GPUCount:   4,
			CPUCount:   8,
		})
	}

	// Submit jobs concurrently
	var wg sync.WaitGroup
	for i := range numWorkers {
		wg.Add(1)
		go func(workerIdx int) {
			defer wg.Done()
			for j := 0; j < jobsPerWorker; j++ {
				jobID := fmt.Sprintf("concurrent-job-w%d-j%d", workerIdx, j)
				fixture.SubmitJob(scheduler.JobSpec{
					ID:       jobID,
					Type:     scheduler.JobTypeBatch,
					SlotPool: "batch",
					GPUCount: 1,
					JobTier:  scheduler.TierDataProcessing,
				})
			}
		}(i)
	}
	wg.Wait()

	totalJobs := numWorkers * jobsPerWorker
	t.Logf("Submitted %d jobs from %d workers concurrently", totalJobs, numWorkers)

	// Signal all workers ready and collect some assignments
	var assignWg sync.WaitGroup
	assignmentCounts := make([]int, numWorkers)

	for i, worker := range workers {
		assignWg.Add(1)
		go func(idx int, w *fixtures.MockWorker) {
			defer assignWg.Done()

			w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "ready")

			// Collect assignments for 500ms
			deadline := time.Now().Add(500 * time.Millisecond)
			for time.Now().Before(deadline) {
				select {
				case msg := <-w.RecvCh:
					if msg.Type == scheduler.MsgJobAssign {
						assignmentCounts[idx]++
						var payload scheduler.JobAssignPayload
						_ = json.Unmarshal(msg.Payload, &payload)
						w.AcceptJob(payload.Spec.ID)
						// Signal ready again after accepting
						w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 1}, "processing")
					}
				case <-time.After(50 * time.Millisecond):
					// Ping ready status
					w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "still_ready")
				}
			}
		}(i, worker)
	}
	assignWg.Wait()

	totalAssigned := 0
	for _, count := range assignmentCounts {
		totalAssigned += count
	}

	t.Logf("Workers received %d total assignments", totalAssigned)
	assert.Greater(t, totalAssigned, 0, "should have some job assignments")

	// Cleanup
	for _, w := range workers {
		w.Close()
	}
}

// calculateP99 returns the 99th percentile latency from a slice of durations
func calculateP99(latencies []time.Duration) time.Duration {
	if len(latencies) == 0 {
		return 0
	}

	// Simple sort-based approach (not efficient for large N, but fine for stress tests)
	sorted := make([]time.Duration, len(latencies))
	copy(sorted, latencies)
	for i := range sorted {
		for j := i + 1; j < len(sorted); j++ {
			if sorted[i] > sorted[j] {
				sorted[i], sorted[j] = sorted[j], sorted[i]
			}
		}
	}

	idx := (len(sorted) * 99) / 100
	if idx >= len(sorted) {
		idx = len(sorted) - 1
	}
	return sorted[idx]
}