Add comprehensive benchmarking suite for C++ optimization targets: - tests/benchmarks/dataset_hash_bench_test.go - dirOverallSHA256Hex profiling - tests/benchmarks/queue_bench_test.go - filesystem queue profiling - tests/benchmarks/artifact_and_snapshot_bench_test.go - scanArtifacts/extractTarGz profiling - tests/unit/worker/artifacts_test.go - moved from internal/ for clean separation Add parallel Go implementation as baseline for C++ comparison: - internal/worker/data_integrity.go: dirOverallSHA256HexParallel() with worker pool - Benchmarks show 2.1x speedup (3.97ms -> 1.90ms) vs sequential Exported wrappers for testing: - ScanArtifacts() - artifact scanning - ExtractTarGz() - tar.gz extraction - DirOverallSHA256HexParallel() - parallel hashing Profiling results (Apple M2 Ultra): - dirOverallSHA256Hex: 78% syscall overhead (target for mmap C++) - rebuildIndex: 96% syscall overhead (target for binary index C++) - scanArtifacts: 87% syscall overhead (target for fast traversal C++) - extractTarGz: 95% syscall overhead (target for parallel gzip C++) Related: C++ optimization strategy in memory 5d5f0bb6
105 lines
1.8 KiB
Go
105 lines
1.8 KiB
Go
package worker
|
|
|
|
import (
|
|
"fmt"
|
|
"io/fs"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/manifest"
|
|
)
|
|
|
|
func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
|
runDir = strings.TrimSpace(runDir)
|
|
if runDir == "" {
|
|
return nil, fmt.Errorf("run dir is empty")
|
|
}
|
|
|
|
var files []manifest.ArtifactFile
|
|
var total int64
|
|
|
|
now := time.Now().UTC()
|
|
|
|
err := filepath.WalkDir(runDir, func(path string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if path == runDir {
|
|
return nil
|
|
}
|
|
|
|
rel, err := filepath.Rel(runDir, path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rel = filepath.ToSlash(rel)
|
|
|
|
if rel == "code" || strings.HasPrefix(rel, "code/") {
|
|
if d.IsDir() {
|
|
return fs.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
|
|
if d.IsDir() {
|
|
return fs.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if rel == manifestFilename {
|
|
return nil
|
|
}
|
|
if strings.HasSuffix(rel, "/"+manifestFilename) {
|
|
return nil
|
|
}
|
|
|
|
if strings.HasSuffix(rel, ".log") {
|
|
return nil
|
|
}
|
|
|
|
if d.Type()&fs.ModeSymlink != 0 {
|
|
return nil
|
|
}
|
|
|
|
if d.IsDir() {
|
|
return nil
|
|
}
|
|
|
|
info, err := d.Info()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
files = append(files, manifest.ArtifactFile{
|
|
Path: rel,
|
|
SizeBytes: info.Size(),
|
|
Modified: info.ModTime().UTC(),
|
|
})
|
|
total += info.Size()
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
sort.Slice(files, func(i, j int) bool {
|
|
return files[i].Path < files[j].Path
|
|
})
|
|
|
|
return &manifest.Artifacts{
|
|
DiscoveryTime: now,
|
|
Files: files,
|
|
TotalSizeBytes: total,
|
|
}, nil
|
|
}
|
|
|
|
// manifestFilename is the base name that scanArtifacts leaves out of its
// results, both at the run directory root and in any subdirectory.
const manifestFilename = "run_manifest.json"
|
|
|
|
// ScanArtifacts is an exported wrapper for testing/benchmarking.
|
|
func ScanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
|
return scanArtifacts(runDir)
|
|
}
|