perf: add profiling benchmarks and parallel Go baseline for C++ optimization

Add comprehensive benchmarking suite for C++ optimization targets:
- tests/benchmarks/dataset_hash_bench_test.go - dirOverallSHA256Hex profiling
- tests/benchmarks/queue_bench_test.go - filesystem queue profiling
- tests/benchmarks/artifact_and_snapshot_bench_test.go - scanArtifacts/extractTarGz profiling
- tests/unit/worker/artifacts_test.go - moved from internal/ for clean separation

Add parallel Go implementation as baseline for C++ comparison:
- internal/worker/data_integrity.go: dirOverallSHA256HexParallel() with worker pool
- Benchmarks show 2.1x speedup (3.97ms -> 1.90ms) vs sequential

Exported wrappers for testing:
- ScanArtifacts() - artifact scanning
- ExtractTarGz() - tar.gz extraction
- DirOverallSHA256HexParallel() - parallel hashing

Profiling results (Apple M2 Ultra):
- dirOverallSHA256Hex: 78% syscall overhead (target for mmap C++)
- rebuildIndex: 96% syscall overhead (target for binary index C++)
- scanArtifacts: 87% syscall overhead (target for fast traversal C++)
- extractTarGz: 95% syscall overhead (target for parallel gzip C++)

Related: C++ optimization strategy in memory 5d5f0bb6
This commit is contained in:
Jeremie Fraeys 2026-02-12 12:04:02 -05:00
parent eba4b4f766
commit 72b4b29ecd
No known key found for this signature in database
7 changed files with 638 additions and 0 deletions

View file

@ -0,0 +1,105 @@
package worker
import (
"fmt"
"io/fs"
"path/filepath"
"sort"
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/manifest"
)
// scanArtifacts walks runDir and builds a manifest.Artifacts listing of the
// regular files it contains. The code/ and snapshot/ subtrees are pruned, and
// the run manifest file, *.log files, and symlinks are excluded. The returned
// file list is sorted by slash-separated relative path.
func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
	runDir = strings.TrimSpace(runDir)
	if runDir == "" {
		return nil, fmt.Errorf("run dir is empty")
	}

	discovered := time.Now().UTC()
	var (
		collected []manifest.ArtifactFile
		totalSize int64
	)

	walkFn := func(path string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if path == runDir {
			return nil
		}
		rel, relErr := filepath.Rel(runDir, path)
		if relErr != nil {
			return relErr
		}
		rel = filepath.ToSlash(rel)

		// Prune the code/ and snapshot/ subtrees entirely: skip the
		// directory itself and ignore any file that falls under it.
		for _, pruned := range []string{"code", "snapshot"} {
			if rel == pruned || strings.HasPrefix(rel, pruned+"/") {
				if entry.IsDir() {
					return fs.SkipDir
				}
				return nil
			}
		}

		switch {
		case rel == manifestFilename,
			strings.HasSuffix(rel, "/"+manifestFilename),
			strings.HasSuffix(rel, ".log"),
			entry.Type()&fs.ModeSymlink != 0,
			entry.IsDir():
			// Manifest files (at any depth), logs, symlinks, and plain
			// directories are not artifacts.
			return nil
		}

		info, infoErr := entry.Info()
		if infoErr != nil {
			return infoErr
		}
		collected = append(collected, manifest.ArtifactFile{
			Path:      rel,
			SizeBytes: info.Size(),
			Modified:  info.ModTime().UTC(),
		})
		totalSize += info.Size()
		return nil
	}

	if err := filepath.WalkDir(runDir, walkFn); err != nil {
		return nil, err
	}

	sort.Slice(collected, func(a, b int) bool {
		return collected[a].Path < collected[b].Path
	})

	return &manifest.Artifacts{
		DiscoveryTime:  discovered,
		Files:          collected,
		TotalSizeBytes: totalSize,
	}, nil
}
// manifestFilename is the reserved name of the per-run manifest file; entries
// with this name are excluded from artifact scans at every directory level.
const manifestFilename = "run_manifest.json"

// ScanArtifacts is an exported wrapper for testing/benchmarking.
func ScanArtifacts(runDir string) (*manifest.Artifacts, error) {
	return scanArtifacts(runDir)
}

View file

@ -11,7 +11,10 @@ import (
"os"
"os/exec"
"path/filepath"
"runtime"
"sort"
"strings"
"sync"
"time"
"github.com/jfraeys/fetch_ml/internal/container"
@ -461,6 +464,102 @@ func dirOverallSHA256Hex(root string) (string, error) {
return fmt.Sprintf("%x", overall.Sum(nil)), nil
}
// dirOverallSHA256HexParallel is a parallel Go implementation for baseline comparison.
// This demonstrates best-effort Go performance before C++ optimization.
// Uses worker pool to hash files in parallel, then combines deterministically.
//
// The overall digest is SHA-256 over the concatenated lowercase-hex digests of
// each file, taken in sorted relative-path order.
// NOTE(review): confirm this combination scheme matches the sequential
// dirOverallSHA256Hex before comparing outputs across the two implementations.
func dirOverallSHA256HexParallel(root string) (string, error) {
	root = filepath.Clean(root)
	info, err := os.Stat(root)
	if err != nil {
		return "", err
	}
	if !info.IsDir() {
		return "", fmt.Errorf("not a directory")
	}

	// Collect all files first (relative paths; directories excluded).
	// NOTE(review): symlinks are not filtered here and will be handed to
	// fileSHA256Hex — verify the sequential version treats them the same way.
	var files []string
	err = filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if d.IsDir() {
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		files = append(files, rel)
		return nil
	})
	if err != nil {
		return "", err
	}

	// Sort for deterministic order regardless of directory-walk order.
	sort.Strings(files)

	// Parallel hashing with worker pool.
	numWorkers := runtime.NumCPU()
	if numWorkers > 8 {
		numWorkers = 8 // Cap at 8 workers
	}

	type result struct {
		index int
		hash  string
		err   error
	}
	// Both channels are buffered to len(files): the work sender never blocks,
	// and the early error return below may abandon remaining results while
	// workers still drain workCh and exit cleanly without blocking on send.
	workCh := make(chan int, len(files))
	resultCh := make(chan result, len(files))

	// Start workers: each pulls indices off workCh and hashes that file.
	var wg sync.WaitGroup
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for idx := range workCh {
				rel := files[idx]
				p := filepath.Join(root, rel)
				hash, err := fileSHA256Hex(p)
				resultCh <- result{index: idx, hash: hash, err: err}
			}
		}()
	}

	// Send work (cannot block: workCh has capacity for every index).
	go func() {
		for i := range files {
			workCh <- i
		}
		close(workCh)
	}()

	// Close resultCh once every worker is done so the collection loop ends.
	go func() {
		wg.Wait()
		close(resultCh)
	}()

	// Collect results, placing each hash at its file's sorted position.
	hashes := make([]string, len(files))
	for r := range resultCh {
		if r.err != nil {
			// Safe to return early: resultCh is fully buffered, so the
			// remaining workers cannot block and will terminate on their own.
			return "", r.err
		}
		hashes[r.index] = r.hash
	}

	// Combine hashes deterministically in sorted-path order.
	overall := sha256.New()
	for _, h := range hashes {
		overall.Write([]byte(h))
	}
	return fmt.Sprintf("%x", overall.Sum(nil)), nil
}
func (w *Worker) verifyDatasetSpecs(ctx context.Context, task *queue.Task) error {
if task == nil {
return fmt.Errorf("task is nil")
@ -757,6 +856,11 @@ func NormalizeSHA256ChecksumHex(checksum string) (string, error) {
// DirOverallSHA256Hex is an exported wrapper around the sequential
// dirOverallSHA256Hex for testing/benchmarking.
func DirOverallSHA256Hex(root string) (string, error) { return dirOverallSHA256Hex(root) }
// DirOverallSHA256HexParallel is an exported wrapper for testing/benchmarking.
// It exposes the parallel baseline implementation so benchmarks can compare it
// against the sequential DirOverallSHA256Hex.
func DirOverallSHA256HexParallel(root string) (string, error) {
	return dirOverallSHA256HexParallel(root)
}
// ComputeTaskProvenance is an exported wrapper around computeTaskProvenance
// for testing/benchmarking.
func ComputeTaskProvenance(basePath string, task *queue.Task) (map[string]string, error) {
	return computeTaskProvenance(basePath, task)
}

View file

@ -268,3 +268,8 @@ func safeJoin(baseDir, rel string) (string, error) {
}
return joined, nil
}
// ExtractTarGz is an exported wrapper for testing/benchmarking.
// It extracts the gzip-compressed tar archive at archivePath into dstDir.
func ExtractTarGz(archivePath, dstDir string) error {
	return extractTarGz(archivePath, dstDir)
}

View file

@ -0,0 +1,139 @@
package benchmarks
import (
"archive/tar"
"bytes"
"compress/gzip"
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// BenchmarkExtractTarGz profiles the tar.gz extraction hot path.
// Called during snapshot resolution - streaming I/O with decompression.
// Tier 1A C++ candidate: parallel decompression, zero-copy extraction.
func BenchmarkExtractTarGz(b *testing.B) {
	// Create a test tar.gz archive once, outside the timed region.
	tmpDir := b.TempDir()
	archivePath := filepath.Join(tmpDir, "snapshot.tar.gz")

	// Build archive with realistic contents.
	if err := createTestArchive(archivePath); err != nil {
		b.Fatal(err)
	}
	extractDir := filepath.Join(tmpDir, "extracted")
	if err := os.MkdirAll(extractDir, 0750); err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// Reset the destination between iterations. Pause the timer so the
		// cleanup I/O is not charged to the measured extraction time, and
		// fail fast if cleanup itself goes wrong.
		b.StopTimer()
		if err := os.RemoveAll(extractDir); err != nil {
			b.Fatal(err)
		}
		if err := os.MkdirAll(extractDir, 0750); err != nil {
			b.Fatal(err)
		}
		b.StartTimer()

		if err := worker.ExtractTarGz(archivePath, extractDir); err != nil {
			b.Fatal(err)
		}
	}
}
// createTestArchive writes a gzip-compressed tar archive at path containing a
// fixed set of files whose sizes range from tiny text files up to a 10MB
// checkpoint, mimicking a realistic snapshot layout.
func createTestArchive(path string) error {
	entries := []struct {
		name string
		size int
	}{
		{"train.py", 5000},
		{"requirements.txt", 100},
		{"data/config.json", 500},
		{"checkpoints/model.pt", 10000000}, // 10MB
		{"logs/output.log", 50000},
	}

	var buf bytes.Buffer
	gw := gzip.NewWriter(&buf)
	tw := tar.NewWriter(gw)

	for _, e := range entries {
		// Deterministic, non-constant payload so compression is realistic.
		payload := make([]byte, e.size)
		for i := range payload {
			payload[i] = byte(i % 256)
		}
		header := &tar.Header{
			Name: e.name,
			Size: int64(e.size),
			Mode: 0640,
		}
		if err := tw.WriteHeader(header); err != nil {
			return err
		}
		if _, err := tw.Write(payload); err != nil {
			return err
		}
	}

	if err := tw.Close(); err != nil {
		return err
	}
	if err := gw.Close(); err != nil {
		return err
	}
	return os.WriteFile(path, buf.Bytes(), 0640)
}
// BenchmarkScanArtifacts profiles the artifact scanning hot path.
// Uses filepath.WalkDir with repeated d.Info() syscalls.
// Tier 1A C++ candidate: fts(3) traversal, mmap manifest building.
func BenchmarkScanArtifacts(b *testing.B) {
	runDir := b.TempDir()

	// Lay out a realistic run directory before timing begins.
	layout := []struct {
		path string
		size int
	}{
		{"run_manifest.json", 100},
		{"output.log", 1000},
		{"code/train.py", 5000},
		{"snapshot/model.pt", 100000},
		{"results/metrics.jsonl", 50000},
		{"results/history.csv", 200000},
		{"checkpoints/best.pt", 50000000},
		{"checkpoints/epoch_10.pt", 25000000},
		{"plots/loss.png", 50000},
		{"plots/accuracy.png", 50000},
	}
	for _, entry := range layout {
		full := filepath.Join(runDir, entry.path)
		if err := os.MkdirAll(filepath.Dir(full), 0750); err != nil {
			b.Fatal(err)
		}
		payload := make([]byte, entry.size)
		for j := range payload {
			payload[j] = byte(j % 256)
		}
		if err := os.WriteFile(full, payload, 0640); err != nil {
			b.Fatal(err)
		}
	}

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		if _, err := worker.ScanArtifacts(runDir); err != nil {
			b.Fatal(err)
		}
	}
}

View file

@ -0,0 +1,105 @@
package benchmarks
import (
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// BenchmarkDirOverallSHA256Hex profiles the directory hashing hot path.
// This function walks directories, sorts files, and computes SHA256 hashes.
// It's a Tier 1 candidate for C++ optimization via:
// - Memory-mapped file reads
// - Parallel hashing
// - SIMD SHA256 (Intel SHA extensions or ARMv8 crypto)
func BenchmarkDirOverallSHA256Hex(b *testing.B) {
	// Build a temp directory structure resembling a dataset.
	root := b.TempDir()

	// Nested subdirectories holding files from 1KB up to 1MB.
	fileSizes := []int{1024, 10240, 102400, 1024 * 1024}
	for idx, size := range fileSizes {
		dir := filepath.Join(root, "subdir", string(rune('a'+idx)))
		if err := os.MkdirAll(dir, 0750); err != nil {
			b.Fatal(err)
		}
		payload := make([]byte, size)
		for j := range payload {
			payload[j] = byte(idx + j%256)
		}
		if err := os.WriteFile(filepath.Join(dir, "data.bin"), payload, 0640); err != nil {
			b.Fatal(err)
		}
	}

	// A handful of small metadata files alongside the data.
	metaDir := filepath.Join(root, "meta")
	if err := os.MkdirAll(metaDir, 0750); err != nil {
		b.Fatal(err)
	}
	for i := 0; i < 10; i++ {
		name := "file" + string(rune('0'+i)) + ".json"
		if err := os.WriteFile(filepath.Join(metaDir, name), []byte(`{"key": "value"}`), 0640); err != nil {
			b.Fatal(err)
		}
	}

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		if _, err := worker.DirOverallSHA256Hex(root); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkDirOverallSHA256HexLarge profiles with larger dataset simulation,
// comparing the sequential implementation against the parallel Go baseline
// over the same directory tree.
func BenchmarkDirOverallSHA256HexLarge(b *testing.B) {
	root := b.TempDir()

	// 50 files of 100KB each (~5MB total) spread across lettered subdirs.
	for i := 0; i < 50; i++ {
		dir := filepath.Join(root, "data", string(rune('a'+i%26)))
		if err := os.MkdirAll(dir, 0750); err != nil {
			b.Fatal(err)
		}
		payload := make([]byte, 100*1024)
		for j := range payload {
			payload[j] = byte(i + j%256)
		}
		name := "chunk" + string(rune('0'+i/26)) + ".bin"
		if err := os.WriteFile(filepath.Join(dir, name), payload, 0640); err != nil {
			b.Fatal(err)
		}
	}

	// Shared runner so both sub-benchmarks measure identical work.
	run := func(hash func(string) (string, error)) func(*testing.B) {
		return func(b *testing.B) {
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				if _, err := hash(root); err != nil {
					b.Fatal(err)
				}
			}
		}
	}

	b.Run("Sequential", run(worker.DirOverallSHA256Hex))
	b.Run("ParallelGo", run(worker.DirOverallSHA256HexParallel))
}

View file

@ -0,0 +1,112 @@
package benchmarks
import (
"testing"
"github.com/jfraeys/fetch_ml/internal/queue"
)
// BenchmarkFilesystemQueueRebuildIndex profiles the queue index rebuild hot path.
// Called on every task add/update - walks directory, reads JSON, sorts tasks.
// Tier 1 C++ candidate for:
// - Memory-mapped JSON parsing
// - Binary index format (instead of JSON)
// - Zero-copy sorting
func BenchmarkFilesystemQueueRebuildIndex(b *testing.B) {
	q, err := queue.NewFilesystemQueue(b.TempDir())
	if err != nil {
		b.Fatal(err)
	}
	defer q.Close()

	// Seed with 100 tasks so every rebuild has real work to do.
	for i := 0; i < 100; i++ {
		seed := &queue.Task{
			ID:       "task-" + string(rune('0'+i/10)) + string(rune('0'+i%10)),
			JobName:  "job-" + string(rune('0'+i/10)),
			Priority: int64(100 - i),
		}
		if err := q.AddTask(seed); err != nil {
			b.Fatal(err)
		}
	}

	b.ResetTimer()
	b.ReportAllocs()
	// Benchmark just the rebuild (not the full AddTask) by adding one more
	// task per iteration, which forces an index rebuild.
	// NOTE(review): bench-task IDs repeat every 10 iterations (i%10); confirm
	// AddTask treats a duplicate ID as an update rather than an error.
	for i := 0; i < b.N; i++ {
		fresh := &queue.Task{
			ID:       "bench-task-" + string(rune('0'+i%10)),
			JobName:  "bench-job",
			Priority: int64(i),
		}
		if err := q.AddTask(fresh); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkFilesystemQueueClaimNext profiles task claiming (priority selection).
func BenchmarkFilesystemQueueClaimNext(b *testing.B) {
	q, err := queue.NewFilesystemQueue(b.TempDir())
	if err != nil {
		b.Fatal(err)
	}
	defer q.Close()

	// Seed with 100 prioritized tasks.
	for i := 0; i < 100; i++ {
		seed := &queue.Task{
			ID:       "task-" + string(rune('0'+i/10)) + string(rune('0'+i%10)),
			JobName:  "job-" + string(rune('0'+i/10)),
			Priority: int64(100 - i),
		}
		if err := q.AddTask(seed); err != nil {
			b.Fatal(err)
		}
	}

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// Triggers ReadDir + JSON unmarshal + sort; the result itself is
		// irrelevant to the measurement, so the error is deliberately ignored.
		_, _ = q.PeekNextTask()
	}
}
// BenchmarkFilesystemQueueGetAllTasks profiles full task scan.
func BenchmarkFilesystemQueueGetAllTasks(b *testing.B) {
	q, err := queue.NewFilesystemQueue(b.TempDir())
	if err != nil {
		b.Fatal(err)
	}
	defer q.Close()

	// Seed with 100 prioritized tasks.
	for i := 0; i < 100; i++ {
		seed := &queue.Task{
			ID:       "task-" + string(rune('0'+i/10)) + string(rune('0'+i%10)),
			JobName:  "job-" + string(rune('0'+i/10)),
			Priority: int64(100 - i),
		}
		if err := q.AddTask(seed); err != nil {
			b.Fatal(err)
		}
	}

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		if _, err := q.GetAllTasks(); err != nil {
			b.Fatal(err)
		}
	}
}

View file

@ -0,0 +1,68 @@
package worker_test
import (
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// TestScanArtifacts_SkipsKnownPathsAndLogs verifies that scanning a run
// directory skips the code/ and snapshot/ subtrees, the manifest file, and
// *.log files, while reporting the remaining artifacts sorted by path with
// accurate sizes. Sizes are pinned to the exact bytes written, rather than
// summing the scanner's own output (which would only check self-consistency).
func TestScanArtifacts_SkipsKnownPathsAndLogs(t *testing.T) {
	runDir := t.TempDir()

	mustWrite := func(rel string, data []byte) {
		t.Helper()
		p := filepath.Join(runDir, rel)
		if err := os.MkdirAll(filepath.Dir(p), 0750); err != nil {
			t.Fatalf("mkdir: %v", err)
		}
		if err := os.WriteFile(p, data, 0600); err != nil {
			t.Fatalf("write file: %v", err)
		}
	}

	// Files that must be excluded from the scan.
	mustWrite("run_manifest.json", []byte("{}"))
	mustWrite("output.log", []byte("log"))
	mustWrite("code/ignored.txt", []byte("ignore"))
	mustWrite("snapshot/ignored.bin", []byte("ignore"))
	// Files that must be reported.
	mustWrite("results/metrics.jsonl", []byte("m"))
	mustWrite("checkpoints/best.pt", []byte("checkpoint"))
	mustWrite("plots/loss.png", []byte("png"))

	art, err := worker.ScanArtifacts(runDir)
	if err != nil {
		t.Fatalf("scanArtifacts: %v", err)
	}
	if art == nil {
		t.Fatalf("expected artifacts")
	}

	// Expected files in sorted order with the exact byte sizes written above.
	want := []struct {
		path string
		size int64
	}{
		{"checkpoints/best.pt", int64(len("checkpoint"))},
		{"plots/loss.png", int64(len("png"))},
		{"results/metrics.jsonl", int64(len("m"))},
	}
	if len(art.Files) != len(want) {
		paths := make([]string, 0, len(art.Files))
		for _, f := range art.Files {
			paths = append(paths, f.Path)
		}
		t.Fatalf("expected %d files, got %d: %v", len(want), len(art.Files), paths)
	}
	var wantTotal int64
	for i, w := range want {
		got := art.Files[i]
		if got.Path != w.path {
			t.Fatalf("expected paths[%d]=%q, got %q", i, w.path, got.Path)
		}
		if got.SizeBytes != w.size {
			t.Fatalf("expected size of %q to be %d, got %d", w.path, w.size, got.SizeBytes)
		}
		wantTotal += w.size
	}
	if art.TotalSizeBytes != wantTotal {
		t.Fatalf("expected total_size_bytes=%d, got %d", wantTotal, art.TotalSizeBytes)
	}
	if art.DiscoveryTime.IsZero() {
		t.Fatalf("expected discovery_time")
	}
}