fetch_ml/tests/benchmarks/dataset_hash_bench_test.go
Jeremie Fraeys be67cb77d3
test(benchmarks): update benchmark tests with job cleanup and improvements
**Payload Performance Test:**
- Add job cleanup after each iteration using DeleteJob()
- Ensure isolated memory measurements between test runs

**All Benchmark Tests:**
- General improvements and maintenance updates
2026-02-23 18:03:54 -05:00

120 lines
2.7 KiB
Go

//go:build !native_libs
// +build !native_libs
package benchmarks
import (
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// BenchmarkDirOverallSHA256Hex profiles the directory hashing hot path.
// This function walks directories, sorts files, and computes SHA256 hashes.
// It's a Tier 1 candidate for C++ optimization via:
// - Memory-mapped file reads
// - Parallel hashing
// - SIMD SHA256 (Intel SHA extensions or ARMv8 crypto)
func BenchmarkDirOverallSHA256Hex(b *testing.B) {
// Create a temp directory structure resembling a dataset
tmpDir := b.TempDir()
// Create nested structure with files of varying sizes
sizes := []int{1024, 10240, 102400, 1024 * 1024} // 1KB to 1MB
for i, size := range sizes {
subdir := filepath.Join(tmpDir, "subdir", string(rune('a'+i)))
if err := os.MkdirAll(subdir, 0750); err != nil {
b.Fatal(err)
}
data := make([]byte, size)
for j := range data {
data[j] = byte(i + j%256)
}
if err := os.WriteFile(filepath.Join(subdir, "data.bin"), data, 0640); err != nil {
b.Fatal(err)
}
}
// Add some small metadata files
metaDir := filepath.Join(tmpDir, "meta")
if err := os.MkdirAll(metaDir, 0750); err != nil {
b.Fatal(err)
}
for i := range 10 {
if err := os.WriteFile(
filepath.Join(metaDir, "file"+string(rune('0'+i))+".json"),
[]byte(`{"key": "value"}`),
0640,
); err != nil {
b.Fatal(err)
}
}
b.ResetTimer()
b.ReportAllocs()
for b.Loop() {
_, err := worker.DirOverallSHA256Hex(tmpDir)
if err != nil {
b.Fatal(err)
}
}
}
// BenchmarkDirOverallSHA256HexLarge profiles directory hashing against a
// larger simulated dataset: 50 files of 100 KiB each (~5 MB in total), laid
// out as data/a..data/z with chunk0.bin in every directory and chunk1.bin in
// the first 24.
//
// The sub-benchmark names "Sequential", "ParallelGo" and "Native" are kept so
// results stay comparable with earlier runs, but all three invoke the same
// worker.DirOverallSHA256Hex entry point with the same input.
//
// NOTE(review): this file carries a !native_libs build constraint, so it is
// not compiled when -tags native_libs is set. The "Native" variant therefore
// cannot exercise a native implementation from this file; a separate file
// built with native_libs would be needed for that comparison.
func BenchmarkDirOverallSHA256HexLarge(b *testing.B) {
	const (
		fileCount = 50
		fileSize  = 100 * 1024
	)

	root := b.TempDir()
	for i := range fileCount {
		// Directories cycle through a..z; the chunk index advances once the
		// alphabet has been used up, which keeps every path unique.
		dir := filepath.Join(root, "data", string(rune('a'+i%26)))
		if err := os.MkdirAll(dir, 0750); err != nil {
			b.Fatal(err)
		}

		payload := make([]byte, fileSize)
		for j := range payload {
			// Deterministic, per-file content; the byte conversion wraps.
			payload[j] = byte(i + j%256)
		}

		name := "chunk" + string(rune('0'+i/26)) + ".bin"
		if err := os.WriteFile(filepath.Join(dir, name), payload, 0640); err != nil {
			b.Fatal(err)
		}
	}

	for _, variant := range []string{"Sequential", "ParallelGo", "Native"} {
		b.Run(variant, func(b *testing.B) {
			// Report throughput relative to the fixture size alongside ns/op.
			b.SetBytes(fileCount * fileSize)
			b.ReportAllocs()
			for b.Loop() {
				if _, err := worker.DirOverallSHA256Hex(root); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}