fetch_ml/tests/benchmarks/artifact_and_snapshot_bench_test.go
Jeremie Fraeys 72b4b29ecd
perf: add profiling benchmarks and parallel Go baseline for C++ optimization
Add comprehensive benchmarking suite for C++ optimization targets:
- tests/benchmarks/dataset_hash_bench_test.go - dirOverallSHA256Hex profiling
- tests/benchmarks/queue_bench_test.go - filesystem queue profiling
- tests/benchmarks/artifact_and_snapshot_bench_test.go - scanArtifacts/extractTarGz profiling
- tests/unit/worker/artifacts_test.go - moved from internal/ for clean separation

Add parallel Go implementation as baseline for C++ comparison:
- internal/worker/data_integrity.go: dirOverallSHA256HexParallel() with worker pool
- Benchmarks show 2.1x speedup (3.97ms -> 1.90ms) vs sequential

Exported wrappers for testing:
- ScanArtifacts() - artifact scanning
- ExtractTarGz() - tar.gz extraction
- DirOverallSHA256HexParallel() - parallel hashing

Profiling results (Apple M2 Ultra):
- dirOverallSHA256Hex: 78% syscall overhead (target for mmap C++)
- rebuildIndex: 96% syscall overhead (target for binary index C++)
- scanArtifacts: 87% syscall overhead (target for fast traversal C++)
- extractTarGz: 95% syscall overhead (target for parallel gzip C++)

Related: C++ optimization strategy in memory 5d5f0bb6
2026-02-12 12:04:02 -05:00

139 lines
2.9 KiB
Go

package benchmarks
import (
"archive/tar"
"bytes"
"compress/gzip"
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// BenchmarkExtractTarGz profiles the tar.gz extraction hot path.
// Called during snapshot resolution - streaming I/O with decompression.
// Tier 1A C++ candidate: parallel decompression, zero-copy extraction.
//
// The archive is built once by createTestArchive. Every iteration extracts it
// into a freshly emptied directory; emptying and recreating that directory
// happens with the benchmark timer stopped so the reported time and
// allocations cover worker.ExtractTarGz only.
func BenchmarkExtractTarGz(b *testing.B) {
	// Create a test tar.gz archive.
	tmpDir := b.TempDir()
	archivePath := filepath.Join(tmpDir, "snapshot.tar.gz")
	if err := createTestArchive(archivePath); err != nil {
		b.Fatal(err)
	}
	extractDir := filepath.Join(tmpDir, "extracted")

	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Reset the destination outside the timed region. A failed cleanup
		// would leave the previous iteration's files behind (or no directory
		// at all), so abort instead of measuring against an unknown state.
		b.StopTimer()
		if err := os.RemoveAll(extractDir); err != nil {
			b.Fatal(err)
		}
		if err := os.MkdirAll(extractDir, 0750); err != nil {
			b.Fatal(err)
		}
		b.StartTimer()

		if err := worker.ExtractTarGz(archivePath, extractDir); err != nil {
			b.Fatal(err)
		}
	}
}
// createTestArchive builds a gzip-compressed tar archive in memory and writes
// it to path with mode 0640.
//
// The archive holds five regular-file entries (a small script, a requirements
// file, a JSON config, a 10MB checkpoint and a log). Each entry's content is
// the repeating byte sequence 0x00..0xFF truncated to the entry size, and each
// header carries mode 0640. No directory entries are written for the nested
// paths. The first error from the tar writer, the gzip writer, or the final
// file write is returned unchanged.
func createTestArchive(path string) error {
	type entry struct {
		name string
		size int
	}
	// Entries of varying sizes, written in this order.
	entries := []entry{
		{name: "train.py", size: 5000},
		{name: "requirements.txt", size: 100},
		{name: "data/config.json", size: 500},
		{name: "checkpoints/model.pt", size: 10000000}, // 10MB
		{name: "logs/output.log", size: 50000},
	}

	archive := new(bytes.Buffer)
	gzw := gzip.NewWriter(archive)
	tarw := tar.NewWriter(gzw)

	for _, e := range entries {
		payload := make([]byte, e.size)
		for off := 0; off < len(payload); off++ {
			payload[off] = byte(off % 256)
		}

		header := tar.Header{
			Name: e.name,
			Size: int64(e.size),
			Mode: 0640,
		}
		if err := tarw.WriteHeader(&header); err != nil {
			return err
		}
		_, err := tarw.Write(payload)
		if err != nil {
			return err
		}
	}

	// Close the tar writer before the gzip writer so the tar footer is
	// flushed into the compressed stream.
	err := tarw.Close()
	if err != nil {
		return err
	}
	err = gzw.Close()
	if err != nil {
		return err
	}
	return os.WriteFile(path, archive.Bytes(), 0640)
}
// BenchmarkScanArtifacts profiles the artifact scanning hot path.
// Uses filepath.WalkDir with repeated d.Info() syscalls.
// Tier 1A C++ candidate: fts(3) traversal, mmap manifest building.
//
// Setup populates a temporary run directory with ten files spread over
// several subdirectories (about 75MB in total); each timed iteration then
// calls worker.ScanArtifacts on that directory and discards the result.
func BenchmarkScanArtifacts(b *testing.B) {
	type fixture struct {
		rel    string
		nbytes int
	}
	// Layout of a realistic run directory, relative to its root.
	layout := []fixture{
		{rel: "run_manifest.json", nbytes: 100},
		{rel: "output.log", nbytes: 1000},
		{rel: "code/train.py", nbytes: 5000},
		{rel: "snapshot/model.pt", nbytes: 100000},
		{rel: "results/metrics.jsonl", nbytes: 50000},
		{rel: "results/history.csv", nbytes: 200000},
		{rel: "checkpoints/best.pt", nbytes: 50000000},
		{rel: "checkpoints/epoch_10.pt", nbytes: 25000000},
		{rel: "plots/loss.png", nbytes: 50000},
		{rel: "plots/accuracy.png", nbytes: 50000},
	}

	root := b.TempDir()
	for _, fx := range layout {
		target := filepath.Join(root, fx.rel)
		parent := filepath.Dir(target)
		if err := os.MkdirAll(parent, 0750); err != nil {
			b.Fatal(err)
		}

		// Fill with the repeating byte sequence 0..255.
		content := make([]byte, fx.nbytes)
		for off := 0; off < len(content); off++ {
			content[off] = byte(off % 256)
		}

		err := os.WriteFile(target, content, 0640)
		if err != nil {
			b.Fatal(err)
		}
	}

	// Fixture creation above is excluded from the measurement.
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		if _, err := worker.ScanArtifacts(root); err != nil {
			b.Fatal(err)
		}
	}
}