fetch_ml/tests/benchmarks/worker_churn_bench_test.go
Jeremie Fraeys f827ee522a
test(tracking/plugins): add PodmanInterface and comprehensive plugin tests for 91% coverage
Refactor plugins to use interface for testability:
- Add PodmanInterface to container package (StartContainer, StopContainer, RemoveContainer)
- Update MLflow plugin to use container.PodmanInterface
- Update TensorBoard plugin to use container.PodmanInterface
- Add comprehensive mocked tests for all three plugins (wandb, mlflow, tensorboard)
- Coverage increased from 18% to 91.4%
2026-03-14 16:59:16 -04:00

125 lines
3.5 KiB
Go

// Package benchmarks provides performance benchmarks for the scheduler and queue
package benchmarks_test
import (
	"fmt"
	"sync/atomic"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
)
// BenchmarkWorkerChurn measures worker connection/disconnection throughput.
// It exercises the scheduler's ability to handle rapid worker churn: each
// iteration registers a fresh worker against the shared hub and immediately
// disconnects it.
func BenchmarkWorkerChurn(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()

	b.ReportAllocs()
	// Exclude fixture setup from the timed region.
	b.ResetTimer()

	for n := 0; b.Loop(); n++ {
		id := fmt.Sprintf("churn-worker-%d", n)
		w := fixtures.NewMockWorker(b, fixture.Hub, id)
		w.Register(scheduler.WorkerCapabilities{GPUCount: 0})
		w.Close()
	}
}
// BenchmarkWorkerChurnParallel measures concurrent worker churn: many
// goroutines simultaneously register and disconnect workers against the
// same hub.
//
// Fixes two defects in the original ID construction
// fmt.Sprintf("parallel-worker-%d", b.N, i):
//   - one %d verb but two arguments, yielding malformed IDs such as
//     "parallel-worker-100%!(EXTRA int=0)";
//   - each RunParallel goroutine kept its own counter starting at 0, so
//     worker IDs collided across goroutines.
// A shared atomic counter gives every worker a well-formed, globally
// unique ID.
func BenchmarkWorkerChurnParallel(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	b.ReportAllocs()

	var nextID atomic.Int64
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			workerID := fmt.Sprintf("parallel-worker-%d", nextID.Add(1))
			worker := fixtures.NewMockWorker(b, fixture.Hub, workerID)
			worker.Register(scheduler.WorkerCapabilities{GPUCount: 0})
			worker.Close()
		}
	})
}
// BenchmarkWorkerChurnWithHeartbeat measures churn with active heartbeats.
// Each iteration registers a worker, sends three heartbeats (reporting a
// fixed 4-slot batch with zero slots in use), and then disconnects it.
//
// NOTE(review): the 10ms sleep between heartbeats sits inside the timed
// loop, so each op spends ~30ms sleeping — the benchmark largely measures
// time.Sleep rather than scheduler churn/heartbeat cost. Consider dropping
// the sleep (or moving heartbeat pacing out of the timed region) — confirm
// whether the hub requires a minimum heartbeat interval before removing it.
func BenchmarkWorkerChurnWithHeartbeat(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	b.ReportAllocs()
	for i := 0; b.Loop(); i++ {
		workerID := fmt.Sprintf("hb-worker-%d", i)
		worker := fixtures.NewMockWorker(b, fixture.Hub, workerID)
		worker.Register(scheduler.WorkerCapabilities{GPUCount: 0})
		// Send a few heartbeats before disconnecting
		for range 3 {
			worker.SendHeartbeat(scheduler.SlotStatus{
				BatchTotal: 4,
				BatchInUse: 0,
			})
			time.Sleep(10 * time.Millisecond)
		}
		worker.Close()
	}
}
// BenchmarkWorkerChurnLargeBatch measures batch worker registration and
// disconnection at several batch sizes. Each op brings a full batch of
// workers online and then tears all of them down.
func BenchmarkWorkerChurnLargeBatch(b *testing.B) {
	for _, size := range []int{10, 50, 100, 500} {
		b.Run(fmt.Sprintf("batch-%d", size), func(b *testing.B) {
			fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
			defer fixture.Cleanup()

			b.ReportAllocs()
			// Keep fixture setup out of the measurement.
			b.ResetTimer()

			for iter := 0; b.Loop(); iter++ {
				batch := make([]*fixtures.MockWorker, size)
				// Bring the whole batch online.
				for j := range batch {
					id := fmt.Sprintf("batch-worker-%d-%d", iter, j)
					batch[j] = fixtures.NewMockWorker(b, fixture.Hub, id)
					batch[j].Register(scheduler.WorkerCapabilities{GPUCount: 0})
				}
				// Tear the whole batch down.
				for _, w := range batch {
					w.Close()
				}
			}
			// Report connections per second
			b.ReportMetric(float64(size), "workers/op")
		})
	}
}
// BenchmarkMemoryAllocs tracks memory allocations during worker operations.
// One op covers a worker's full lifecycle: register, send a single
// heartbeat, and disconnect.
func BenchmarkMemoryAllocs(b *testing.B) {
	fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
	defer fixture.Cleanup()
	b.ReportAllocs()
	for n := 0; b.Loop(); n++ {
		w := fixtures.NewMockWorker(b, fixture.Hub, fmt.Sprintf("alloc-worker-%d", n))
		w.Register(scheduler.WorkerCapabilities{GPUCount: 0})
		w.SendHeartbeat(scheduler.SlotStatus{BatchTotal: 4, BatchInUse: 0})
		w.Close()
	}
}