fetch_ml/internal/worker/artifacts.go
Jeremie Fraeys 72b4b29ecd
perf: add profiling benchmarks and parallel Go baseline for C++ optimization
Add comprehensive benchmarking suite for C++ optimization targets:
- tests/benchmarks/dataset_hash_bench_test.go - dirOverallSHA256Hex profiling
- tests/benchmarks/queue_bench_test.go - filesystem queue profiling
- tests/benchmarks/artifact_and_snapshot_bench_test.go - scanArtifacts/extractTarGz profiling
- tests/unit/worker/artifacts_test.go - moved from internal/ for clean separation

Add parallel Go implementation as baseline for C++ comparison:
- internal/worker/data_integrity.go: dirOverallSHA256HexParallel() with worker pool
- Benchmarks show 2.1x speedup (3.97ms -> 1.90ms) vs sequential

Exported wrappers for testing:
- ScanArtifacts() - artifact scanning
- ExtractTarGz() - tar.gz extraction
- DirOverallSHA256HexParallel() - parallel hashing

Profiling results (Apple M2 Ultra):
- dirOverallSHA256Hex: 78% syscall overhead (target for mmap C++)
- rebuildIndex: 96% syscall overhead (target for binary index C++)
- scanArtifacts: 87% syscall overhead (target for fast traversal C++)
- extractTarGz: 95% syscall overhead (target for parallel gzip C++)

Related: C++ optimization strategy in memory 5d5f0bb6
2026-02-12 12:04:02 -05:00

105 lines
1.8 KiB
Go

package worker
import (
"fmt"
"io/fs"
"path/filepath"
"sort"
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/manifest"
)
// scanArtifacts walks runDir and returns a manifest entry for every regular
// file that counts as a run artifact.
//
// The following entries are left out of the result:
//   - "code" and "snapshot" directly under runDir, together with everything
//     beneath them (those directories are not descended into);
//   - any entry named manifestFilename, at any depth;
//   - any entry whose name ends in ".log";
//   - symlinks and every other non-regular entry (FIFOs, sockets, devices).
//     Directories themselves are never listed, only the files inside them.
//
// Paths in the result are relative to runDir, use forward slashes, and are
// sorted in ascending byte order. DiscoveryTime is the UTC time taken just
// before the walk starts; TotalSizeBytes is the sum of the listed sizes.
//
// An error is returned when runDir is empty or whitespace-only, or when the
// walk fails. Walk errors are passed through unwrapped.
func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
	runDir = strings.TrimSpace(runDir)
	if runDir == "" {
		return nil, fmt.Errorf("run dir is empty")
	}

	var files []manifest.ArtifactFile
	var total int64

	now := time.Now().UTC()

	err := filepath.WalkDir(runDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		// The root itself is never an artifact.
		if path == runDir {
			return nil
		}

		rel, err := filepath.Rel(runDir, path)
		if err != nil {
			return err
		}
		// Normalise separators so the prefix/suffix checks below and the
		// recorded paths are identical on every platform.
		rel = filepath.ToSlash(rel)

		switch {
		case rel == "code", strings.HasPrefix(rel, "code/"),
			rel == "snapshot", strings.HasPrefix(rel, "snapshot/"):
			// Prune the whole subtree instead of visiting each child.
			if d.IsDir() {
				return fs.SkipDir
			}
			return nil
		case rel == manifestFilename, strings.HasSuffix(rel, "/"+manifestFilename):
			return nil
		case strings.HasSuffix(rel, ".log"):
			return nil
		case !d.Type().IsRegular():
			// Directories (still descended into by WalkDir), symlinks, and
			// special files such as FIFOs, sockets and devices: none of
			// these has content that can be recorded as an artifact.
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return err
		}

		files = append(files, manifest.ArtifactFile{
			Path:      rel,
			SizeBytes: info.Size(),
			Modified:  info.ModTime().UTC(),
		})
		total += info.Size()
		return nil
	})
	if err != nil {
		return nil, err
	}

	// WalkDir order is lexical per directory, which is not the same as
	// ordering by full relative path; sort so the output is stable.
	sort.Slice(files, func(i, j int) bool {
		return files[i].Path < files[j].Path
	})

	return &manifest.Artifacts{
		DiscoveryTime:  now,
		Files:          files,
		TotalSizeBytes: total,
	}, nil
}
// manifestFilename is the file name that scanArtifacts leaves out of artifact
// listings, both directly under the run directory and inside subdirectories.
// NOTE(review): presumably the name of the per-run manifest file itself;
// confirm against the code that writes it.
const manifestFilename = "run_manifest.json"
// ScanArtifacts exposes scanArtifacts to code outside this package, such as
// tests and benchmarks. The argument and both results are passed through
// unchanged.
func ScanArtifacts(runDir string) (*manifest.Artifacts, error) {
	artifacts, err := scanArtifacts(runDir)
	return artifacts, err
}