fetch_ml/tests/benchmarks/queue_bench_test.go
Jeremie Fraeys 72b4b29ecd
perf: add profiling benchmarks and parallel Go baseline for C++ optimization
Add comprehensive benchmarking suite for C++ optimization targets:
- tests/benchmarks/dataset_hash_bench_test.go - dirOverallSHA256Hex profiling
- tests/benchmarks/queue_bench_test.go - filesystem queue profiling
- tests/benchmarks/artifact_and_snapshot_bench_test.go - scanArtifacts/extractTarGz profiling
- tests/unit/worker/artifacts_test.go - moved from internal/ for clean separation

Add parallel Go implementation as baseline for C++ comparison:
- internal/worker/data_integrity.go: dirOverallSHA256HexParallel() with worker pool
- Benchmarks show 2.1x speedup (3.97ms -> 1.90ms) vs sequential

Exported wrappers for testing:
- ScanArtifacts() - artifact scanning
- ExtractTarGz() - tar.gz extraction
- DirOverallSHA256HexParallel() - parallel hashing

Profiling results (Apple M2 Ultra):
- dirOverallSHA256Hex: 78% syscall overhead (target for mmap C++)
- rebuildIndex: 96% syscall overhead (target for binary index C++)
- scanArtifacts: 87% syscall overhead (target for fast traversal C++)
- extractTarGz: 95% syscall overhead (target for parallel gzip C++)

Related: C++ optimization strategy in memory 5d5f0bb6
2026-02-12 12:04:02 -05:00

112 lines
2.5 KiB
Go

package benchmarks
import (
"testing"
"github.com/jfraeys/fetch_ml/internal/queue"
)
// BenchmarkFilesystemQueueRebuildIndex measures the queue index rebuild hot
// path. The rebuild runs on every task add/update (directory walk, JSON reads,
// task sort), so it is driven here indirectly through AddTask; each timed
// iteration therefore includes the whole AddTask call, not the rebuild alone.
//
// Tier 1 C++ candidate for:
//   - Memory-mapped JSON parsing
//   - Binary index format (instead of JSON)
//   - Zero-copy sorting
func BenchmarkFilesystemQueueRebuildIndex(b *testing.B) {
	fq, err := queue.NewFilesystemQueue(b.TempDir())
	if err != nil {
		b.Fatal(err)
	}
	defer fq.Close()

	// Pre-populate the queue with 100 tasks ("task-00" .. "task-99") spread
	// over ten job names, with priorities running from 100 down to 1.
	for n := 0; n < 100; n++ {
		tens := string(rune('0' + n/10))
		ones := string(rune('0' + n%10))
		seed := &queue.Task{
			ID:       "task-" + tens + ones,
			JobName:  "job-" + tens,
			Priority: int64(100 - n),
		}
		if addErr := fq.AddTask(seed); addErr != nil {
			b.Fatal(addErr)
		}
	}

	// Exclude the seeding above from the reported time and allocations.
	b.ReportAllocs()
	b.ResetTimer()

	for iter := 0; iter < b.N; iter++ {
		// Only ten distinct IDs are used (iter%10), so IDs repeat once b.N
		// exceeds ten.
		// NOTE(review): how AddTask treats a repeated ID is not visible from
		// this file - confirm it overwrites rather than rejects.
		addErr := fq.AddTask(&queue.Task{
			ID:       "bench-task-" + string(rune('0'+iter%10)),
			JobName:  "bench-job",
			Priority: int64(iter),
		})
		if addErr != nil {
			b.Fatal(addErr)
		}
	}
}
// BenchmarkFilesystemQueueClaimNext profiles selection of the next task by
// priority. Despite the "ClaimNext" name, the timed loop calls PeekNextTask.
//
// The queue is seeded with 100 tasks before the timer is reset, so only the
// PeekNextTask calls are measured. Any error returned by PeekNextTask aborts
// the benchmark instead of being silently timed as if it were a successful
// lookup.
func BenchmarkFilesystemQueueClaimNext(b *testing.B) {
	tmpDir := b.TempDir()
	q, err := queue.NewFilesystemQueue(tmpDir)
	if err != nil {
		b.Fatal(err)
	}
	defer q.Close()

	// Seed with tasks "task-00" .. "task-99", priorities 100 down to 1.
	for i := 0; i < 100; i++ {
		task := &queue.Task{
			ID:       "task-" + string(rune('0'+i/10)) + string(rune('0'+i%10)),
			JobName:  "job-" + string(rune('0'+i/10)),
			Priority: int64(100 - i),
		}
		if err := q.AddTask(task); err != nil {
			b.Fatal(err)
		}
	}

	// Exclude seeding from the reported time and allocations.
	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		// Per the original author's note, this triggers ReadDir + JSON
		// unmarshal + sort inside the queue.
		if _, err := q.PeekNextTask(); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkFilesystemQueueGetAllTasks profiles a full task scan: it times
// repeated GetAllTasks calls against a queue pre-populated with 100 tasks.
// Seeding happens before the timer is reset and is not part of the result.
func BenchmarkFilesystemQueueGetAllTasks(b *testing.B) {
	fq, err := queue.NewFilesystemQueue(b.TempDir())
	if err != nil {
		b.Fatal(err)
	}
	defer fq.Close()

	// Pre-populate with tasks "task-00" .. "task-99" across ten job names,
	// priorities running from 100 down to 1.
	for n := 0; n < 100; n++ {
		tens := string(rune('0' + n/10))
		ones := string(rune('0' + n%10))
		seed := &queue.Task{
			ID:       "task-" + tens + ones,
			JobName:  "job-" + tens,
			Priority: int64(100 - n),
		}
		if addErr := fq.AddTask(seed); addErr != nil {
			b.Fatal(addErr)
		}
	}

	b.ReportAllocs()
	b.ResetTimer()

	for iter := 0; iter < b.N; iter++ {
		// The returned tasks are discarded; only the call's cost matters here.
		if _, scanErr := fq.GetAllTasks(); scanErr != nil {
			b.Fatal(scanErr)
		}
	}
}