fetch_ml/tests/unit/worker/artifacts_test.go
Jeremie Fraeys 72b4b29ecd
perf: add profiling benchmarks and parallel Go baseline for C++ optimization
Add comprehensive benchmarking suite for C++ optimization targets:
- tests/benchmarks/dataset_hash_bench_test.go - dirOverallSHA256Hex profiling
- tests/benchmarks/queue_bench_test.go - filesystem queue profiling
- tests/benchmarks/artifact_and_snapshot_bench_test.go - scanArtifacts/extractTarGz profiling
- tests/unit/worker/artifacts_test.go - moved from internal/ for clean separation

Add parallel Go implementation as baseline for C++ comparison:
- internal/worker/data_integrity.go: dirOverallSHA256HexParallel() with worker pool
- Benchmarks show 2.1x speedup (3.97ms -> 1.90ms) vs sequential

Exported wrappers for testing:
- ScanArtifacts() - artifact scanning
- ExtractTarGz() - tar.gz extraction
- DirOverallSHA256HexParallel() - parallel hashing

Profiling results (Apple M2 Ultra):
- dirOverallSHA256Hex: 78% syscall overhead (target for mmap C++)
- rebuildIndex: 96% syscall overhead (target for binary index C++)
- scanArtifacts: 87% syscall overhead (target for fast traversal C++)
- extractTarGz: 95% syscall overhead (target for parallel gzip C++)

Related: C++ optimization strategy in memory 5d5f0bb6
2026-02-12 12:04:02 -05:00

68 lines
1.6 KiB
Go

package worker_test
import (
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// TestScanArtifacts_SkipsKnownPathsAndLogs populates a temporary run directory
// with a manifest, a log file, entries under code/ and snapshot/, and three
// files under results/, checkpoints/ and plots/. It then checks that
// worker.ScanArtifacts reports exactly those three files in the expected
// order, that TotalSizeBytes equals the sum of the reported per-file sizes,
// and that DiscoveryTime is set.
func TestScanArtifacts_SkipsKnownPathsAndLogs(t *testing.T) {
	runDir := t.TempDir()

	// mustWrite creates rel under runDir, making any missing parent
	// directories first, and aborts the test if either step fails.
	mustWrite := func(rel string, data []byte) {
		// Attribute failures to the mustWrite(...) call site instead of the
		// lines inside this closure.
		t.Helper()
		p := filepath.Join(runDir, rel)
		if err := os.MkdirAll(filepath.Dir(p), 0750); err != nil {
			t.Fatalf("mkdir: %v", err)
		}
		if err := os.WriteFile(p, data, 0600); err != nil {
			t.Fatalf("write file: %v", err)
		}
	}

	// Entries the scan is expected to leave out of the result.
	mustWrite("run_manifest.json", []byte("{}"))
	mustWrite("output.log", []byte("log"))
	mustWrite("code/ignored.txt", []byte("ignore"))
	mustWrite("snapshot/ignored.bin", []byte("ignore"))

	// Entries the scan is expected to report.
	mustWrite("results/metrics.jsonl", []byte("m"))
	mustWrite("checkpoints/best.pt", []byte("checkpoint"))
	mustWrite("plots/loss.png", []byte("png"))

	art, err := worker.ScanArtifacts(runDir)
	if err != nil {
		t.Fatalf("scanArtifacts: %v", err)
	}
	if art == nil {
		t.Fatal("expected artifacts")
	}

	// Collect the reported paths and sum the reported sizes so the aggregate
	// field can be cross-checked against the individual entries.
	paths := make([]string, 0, len(art.Files))
	var total int64
	for _, f := range art.Files {
		paths = append(paths, f.Path)
		total += f.SizeBytes
	}

	// The comparison below is positional, so the test also pins the order in
	// which files are returned.
	want := []string{
		"checkpoints/best.pt",
		"plots/loss.png",
		"results/metrics.jsonl",
	}
	if len(paths) != len(want) {
		t.Fatalf("expected %d files, got %d: %v", len(want), len(paths), paths)
	}
	for i := range want {
		if paths[i] != want[i] {
			t.Fatalf("expected paths[%d]=%q, got %q", i, want[i], paths[i])
		}
	}

	if art.TotalSizeBytes != total {
		t.Fatalf("expected total_size_bytes=%d, got %d", total, art.TotalSizeBytes)
	}
	if art.DiscoveryTime.IsZero() {
		t.Fatal("expected discovery_time")
	}
}