fetch_ml/tests/benchmarks/scheduler_bench_test.go
Jeremie Fraeys 43e6446587
feat(scheduler): implement multi-tenant job scheduler with gang scheduling
Add new scheduler component for distributed ML workload orchestration:
- Hub-based coordination for multi-worker clusters
- Pacing controller for rate limiting job submissions
- Priority queue with preemption support
- Port allocator for dynamic service discovery
- Protocol handlers for worker-scheduler communication
- Service manager with OS-specific implementations
- Connection management and state persistence
- Template system for service deployment

Includes comprehensive test suite:
- Unit tests for all core components
- Integration tests for distributed scenarios
- Benchmark tests for performance validation
- Mock fixtures for isolated testing

Refs: scheduler-architecture.md
2026-02-26 12:03:23 -05:00

190 lines
4.6 KiB
Go

package benchmarks_test
import (
	"fmt"
	"path/filepath"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
)
// BenchmarkPriorityQueueAdd measures job enqueue performance
// BenchmarkPriorityQueueAdd measures job enqueue performance, cycling
// tasks through 100 distinct priority levels.
func BenchmarkPriorityQueueAdd(b *testing.B) {
	queue := scheduler.NewPriorityQueue(0.1)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		queue.Add(&scheduler.Task{
			ID:       fmt.Sprintf("task-%d", n),
			Priority: n % 100,
		})
	}
}
// BenchmarkPriorityQueueTake measures job dequeue performance
// BenchmarkPriorityQueueTake measures job dequeue performance. The queue
// is filled with b.N tasks before the timer starts so only Take is timed.
func BenchmarkPriorityQueueTake(b *testing.B) {
	queue := scheduler.NewPriorityQueue(0.1)
	// Populate outside the timed region: one task per timed Take call.
	for n := 0; n < b.N; n++ {
		queue.Add(&scheduler.Task{
			ID:       fmt.Sprintf("task-%d", n),
			Priority: n % 100,
		})
	}
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		queue.Take()
	}
}
// BenchmarkPortAllocator measures port allocation performance
// BenchmarkPortAllocator measures port allocation performance.
//
// NOTE(review): the range 10000-20000 holds only ~10001 ports, so for
// large b.N Allocate presumably starts returning errors once the range is
// exhausted — errors are deliberately ignored since only call throughput
// is being measured. Confirm against the allocator's exhaustion behavior.
func BenchmarkPortAllocator(b *testing.B) {
	alloc := scheduler.NewPortAllocator(10000, 20000)
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		port, _ := alloc.Allocate(fmt.Sprintf("service-%d", n))
		_ = port
	}
}
// BenchmarkStateStoreAppend measures state persistence performance
// BenchmarkStateStoreAppend measures state persistence performance by
// appending job-enqueued events, each with a unique task ID, to an
// on-disk state store under a per-benchmark temp directory.
func BenchmarkStateStoreAppend(b *testing.B) {
	// The original discarded this error; a failed open would then panic
	// on a nil store inside the loop with a misleading stack trace.
	store, err := scheduler.NewStateStore(filepath.Join(b.TempDir(), "bench.state"))
	if err != nil {
		b.Fatal(err)
	}
	event := scheduler.StateEvent{
		Type:      scheduler.EventJobEnqueued,
		TaskID:    "bench-task",
		Timestamp: time.Now(),
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		event.TaskID = fmt.Sprintf("bench-task-%d", i)
		store.Append(event)
	}
}
// BenchmarkSchedulerSubmitJob measures job submission throughput
// BenchmarkSchedulerSubmitJob measures job submission throughput against
// a hub constructed directly (no fixture), bound to an ephemeral port.
func BenchmarkSchedulerSubmitJob(b *testing.B) {
	cfg := scheduler.HubConfig{
		BindAddr:                "localhost:0", // ephemeral port avoids collisions
		DefaultBatchSlots:       4,
		StarvationThresholdMins: 5,
		AcceptanceTimeoutSecs:   5,
	}
	hub, err := scheduler.NewHub(cfg, nil)
	if err != nil {
		b.Fatal(err)
	}
	if err := hub.Start(); err != nil {
		b.Fatal(err)
	}
	// Defer Stop only after Start has succeeded; the original deferred it
	// before Start, so a failed Start would invoke Stop on a hub that was
	// never running.
	defer hub.Stop()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		hub.SubmitJob(scheduler.JobSpec{
			ID:   fmt.Sprintf("bench-job-%d", i),
			Type: scheduler.JobTypeBatch,
		})
	}
}
// BenchmarkWorkerRegistration measures worker registration throughput
// BenchmarkWorkerRegistration measures worker registration throughput:
// each iteration registers a fresh mock worker and tears it down again.
func BenchmarkWorkerRegistration(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		w := fixtures.NewMockWorker(b, fixture.Hub, fmt.Sprintf("bench-worker-%d", n))
		w.Register(scheduler.WorkerCapabilities{GPUCount: 0})
		w.Close()
	}
}
// BenchmarkHeartbeatProcessing measures heartbeat handling throughput
// BenchmarkHeartbeatProcessing measures heartbeat handling throughput for
// a single registered worker repeatedly reporting an idle slot status.
func BenchmarkHeartbeatProcessing(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	worker := fixture.CreateWorker("bench-hb-worker", scheduler.WorkerCapabilities{GPUCount: 0})
	// The same idle status is reused for every heartbeat.
	idle := scheduler.SlotStatus{
		BatchTotal: 4,
		BatchInUse: 0,
	}
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		worker.SendHeartbeat(idle)
	}
}
// BenchmarkJobAssignment measures job scheduling latency
// BenchmarkJobAssignment measures end-to-end job scheduling latency:
// submit a batch job, have the worker advertise a free slot, and wait
// (up to 100ms) for the resulting assignment message.
func BenchmarkJobAssignment(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	worker := fixture.CreateWorker("bench-assign-worker", scheduler.WorkerCapabilities{GPUCount: 0})
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		fixture.SubmitJob(scheduler.JobSpec{
			ID:   fmt.Sprintf("bench-assign-%d", n),
			Type: scheduler.JobTypeBatch,
		})
		// Advertising free capacity triggers the hub's assignment path.
		worker.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
		// Block until the assignment arrives (or the 100ms timeout fires).
		worker.RecvTimeout(100 * time.Millisecond)
	}
}
// BenchmarkMultiWorkerScheduling measures scheduling with multiple workers
// BenchmarkMultiWorkerScheduling measures scheduling throughput when a
// pool of workers compete for each submitted batch job.
func BenchmarkMultiWorkerScheduling(b *testing.B) {
	const poolSize = 10
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	pool := make([]*fixtures.MockWorker, poolSize)
	for idx := range pool {
		pool[idx] = fixture.CreateWorker(
			fmt.Sprintf("bench-multi-worker-%d", idx),
			scheduler.WorkerCapabilities{GPUCount: 0},
		)
	}
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		fixture.SubmitJob(scheduler.JobSpec{
			ID:   fmt.Sprintf("bench-multi-%d", n),
			Type: scheduler.JobTypeBatch,
		})
		// Every worker advertises a free slot so the hub has a choice.
		for _, w := range pool {
			w.SignalReady(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0}, "polling")
		}
		// Round-robin receive. NOTE(review): the hub may assign the job to
		// a different worker than the one polled here — confirm the mock
		// worker's RecvTimeout semantics tolerate that.
		pool[n%poolSize].RecvTimeout(100 * time.Millisecond)
	}
}