Refactor plugins to use interface for testability: - Add PodmanInterface to container package (StartContainer, StopContainer, RemoveContainer) - Update MLflow plugin to use container.PodmanInterface - Update TensorBoard plugin to use container.PodmanInterface - Add comprehensive mocked tests for all three plugins (wandb, mlflow, tensorboard) - Coverage increased from 18% to 91.4%
125 lines
3.5 KiB
Go
125 lines
3.5 KiB
Go
// Package benchmarks provides performance benchmarks for the scheduler and queue
|
|
package benchmarks_test
|
|
|
|
import (
	"fmt"
	"sync/atomic"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
)
|
|
|
|
// BenchmarkWorkerChurn measures worker connection/disconnection throughput
|
|
// This benchmarks the scheduler's ability to handle rapid worker churn
|
|
func BenchmarkWorkerChurn(b *testing.B) {
|
|
fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
// Reset timer to exclude setup
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
|
|
for i := 0; b.Loop(); i++ {
|
|
workerID := fmt.Sprintf("churn-worker-%d", i)
|
|
worker := fixtures.NewMockWorker(b, fixture.Hub, workerID)
|
|
worker.Register(scheduler.WorkerCapabilities{GPUCount: 0})
|
|
worker.Close()
|
|
}
|
|
}
|
|
|
|
// BenchmarkWorkerChurnParallel measures concurrent worker churn
|
|
func BenchmarkWorkerChurnParallel(b *testing.B) {
|
|
fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
b.ReportAllocs()
|
|
b.RunParallel(func(pb *testing.PB) {
|
|
i := 0
|
|
for pb.Next() {
|
|
workerID := fmt.Sprintf("parallel-worker-%d", b.N, i)
|
|
worker := fixtures.NewMockWorker(b, fixture.Hub, workerID)
|
|
worker.Register(scheduler.WorkerCapabilities{GPUCount: 0})
|
|
worker.Close()
|
|
i++
|
|
}
|
|
})
|
|
}
|
|
|
|
// BenchmarkWorkerChurnWithHeartbeat measures churn with active heartbeats
|
|
func BenchmarkWorkerChurnWithHeartbeat(b *testing.B) {
|
|
fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
b.ReportAllocs()
|
|
|
|
for i := 0; b.Loop(); i++ {
|
|
workerID := fmt.Sprintf("hb-worker-%d", i)
|
|
worker := fixtures.NewMockWorker(b, fixture.Hub, workerID)
|
|
worker.Register(scheduler.WorkerCapabilities{GPUCount: 0})
|
|
|
|
// Send a few heartbeats before disconnecting
|
|
for range 3 {
|
|
worker.SendHeartbeat(scheduler.SlotStatus{
|
|
BatchTotal: 4,
|
|
BatchInUse: 0,
|
|
})
|
|
time.Sleep(10 * time.Millisecond)
|
|
}
|
|
|
|
worker.Close()
|
|
}
|
|
}
|
|
|
|
// BenchmarkWorkerChurnLargeBatch measures batch worker registration/disconnection
|
|
func BenchmarkWorkerChurnLargeBatch(b *testing.B) {
|
|
batchSizes := []int{10, 50, 100, 500}
|
|
|
|
for _, batchSize := range batchSizes {
|
|
b.Run(fmt.Sprintf("batch-%d", batchSize), func(b *testing.B) {
|
|
fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
|
|
for i := 0; b.Loop(); i++ {
|
|
workers := make([]*fixtures.MockWorker, batchSize)
|
|
|
|
// Register all workers
|
|
for j := 0; j < batchSize; j++ {
|
|
workerID := fmt.Sprintf("batch-worker-%d-%d", i, j)
|
|
workers[j] = fixtures.NewMockWorker(b, fixture.Hub, workerID)
|
|
workers[j].Register(scheduler.WorkerCapabilities{GPUCount: 0})
|
|
}
|
|
|
|
// Disconnect all workers
|
|
for _, w := range workers {
|
|
w.Close()
|
|
}
|
|
}
|
|
|
|
// Report connections per second
|
|
b.ReportMetric(float64(batchSize), "workers/op")
|
|
})
|
|
}
|
|
}
|
|
|
|
// BenchmarkMemoryAllocs tracks memory allocations during worker operations
|
|
func BenchmarkMemoryAllocs(b *testing.B) {
|
|
fixture := fixtures.NewSchedulerTestFixture(b, fixtures.DefaultHubConfig())
|
|
defer fixture.Cleanup()
|
|
|
|
b.ReportAllocs()
|
|
|
|
for i := 0; b.Loop(); i++ {
|
|
workerID := fmt.Sprintf("alloc-worker-%d", i)
|
|
worker := fixtures.NewMockWorker(b, fixture.Hub, workerID)
|
|
worker.Register(scheduler.WorkerCapabilities{GPUCount: 0})
|
|
worker.SendHeartbeat(scheduler.SlotStatus{
|
|
BatchTotal: 4,
|
|
BatchInUse: 0,
|
|
})
|
|
worker.Close()
|
|
}
|
|
}
|