fetch_ml/tests/benchmarks/artifact_and_snapshot_bench_test.go
Jeremie Fraeys 9434f4c8e6
feat(security): Artifact ingestion caps enforcement
Add MaxArtifactFiles and MaxArtifactTotalBytes to SandboxConfig:
- Default MaxArtifactFiles: 10,000 (configurable via SecurityDefaults)
- Default MaxArtifactTotalBytes: 100GB (configurable via SecurityDefaults)
- ApplySecurityDefaults() sets defaults if not specified

Enforce caps in scanArtifacts() during directory walk:
- Returns error immediately when MaxArtifactFiles exceeded
- Returns error immediately when MaxArtifactTotalBytes exceeded
- Prevents resource exhaustion attacks from malicious artifact trees

Update all call sites to pass SandboxConfig for cap enforcement:
- Native bridge libs updated to pass caps argument
- Benchmark tests updated with nil caps (unlimited for benchmarks)
- Unit tests updated with nil caps

Closes: artifact ingestion caps items from security plan
2026-02-23 19:43:28 -05:00

139 lines
3 KiB
Go

package benchmarks
import (
"archive/tar"
"bytes"
"compress/gzip"
"os"
"path/filepath"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// BenchmarkExtractTarGz profiles the tar.gz extraction hot path.
// Called during snapshot resolution - streaming I/O with decompression.
// Tier 1A C++ candidate: parallel decompression, zero-copy extraction.
//
// The archive is built once, before timing starts, by createTestArchive.
// Every iteration extracts into a freshly emptied directory. The directory
// reset runs with the timer paused so that only worker.ExtractTarGz is
// measured, and a failed reset aborts the benchmark instead of being ignored.
func BenchmarkExtractTarGz(b *testing.B) {
	tmpDir := b.TempDir()
	archivePath := filepath.Join(tmpDir, "snapshot.tar.gz")

	// Build archive with realistic contents.
	if err := createTestArchive(archivePath); err != nil {
		b.Fatal(err)
	}

	// Created inside the loop: RemoveAll on a missing path is a no-op, so the
	// first iteration needs no separate setup.
	extractDir := filepath.Join(tmpDir, "extracted")

	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Reset the extract dir off the clock; cleanup cost is not part of
		// the extraction path being profiled.
		b.StopTimer()
		if err := os.RemoveAll(extractDir); err != nil {
			b.Fatal(err)
		}
		if err := os.MkdirAll(extractDir, 0750); err != nil {
			b.Fatal(err)
		}
		b.StartTimer()

		if err := worker.ExtractTarGz(archivePath, extractDir); err != nil {
			b.Fatal(err)
		}
	}
}
// createTestArchive writes a gzip-compressed tar archive to path.
//
// The archive contains five entries, in a fixed order, ranging from 100 bytes
// to 10 MB. Each entry has mode 0640 and a payload made of the repeating byte
// sequence 0, 1, ..., 255. The archive is assembled fully in memory and then
// written with one os.WriteFile call using file mode 0640.
//
// It returns the first error reported by the tar writer, the gzip writer, or
// the final file write.
func createTestArchive(path string) error {
	type entry struct {
		name string
		size int
	}
	entries := []entry{
		{"train.py", 5000},
		{"requirements.txt", 100},
		{"data/config.json", 500},
		{"checkpoints/model.pt", 10000000}, // 10MB
		{"logs/output.log", 50000},
	}

	// pattern returns n bytes cycling through 0..255.
	pattern := func(n int) []byte {
		out := make([]byte, n)
		for off := range out {
			out[off] = byte(off % 256)
		}
		return out
	}

	var archive bytes.Buffer
	gzWriter := gzip.NewWriter(&archive)
	tarWriter := tar.NewWriter(gzWriter)

	for _, e := range entries {
		header := tar.Header{
			Name: e.name,
			Size: int64(e.size),
			Mode: 0640,
		}
		if err := tarWriter.WriteHeader(&header); err != nil {
			return err
		}
		if _, err := tarWriter.Write(pattern(e.size)); err != nil {
			return err
		}
	}

	// Close the tar layer first so its trailer passes through gzip, then
	// close gzip to flush the compressed stream into the buffer.
	if err := tarWriter.Close(); err != nil {
		return err
	}
	if err := gzWriter.Close(); err != nil {
		return err
	}

	return os.WriteFile(path, archive.Bytes(), 0640)
}
// BenchmarkScanArtifacts measures worker.ScanArtifacts over a synthetic run
// directory. The scan is noted as a hot path that uses filepath.WalkDir with
// repeated d.Info() syscalls.
// Tier 1A C++ candidate: fts(3) traversal, mmap manifest building.
//
// Setup writes ten files (100 bytes up to 50 MB) across several
// subdirectories of a temp dir, each filled with the repeating byte sequence
// 0..255. Setup finishes before the timer is reset, so only the scan calls
// are measured.
func BenchmarkScanArtifacts(b *testing.B) {
	type fixture struct {
		rel    string
		nbytes int
	}
	// Realistic run directory layout.
	layout := []fixture{
		{"run_manifest.json", 100},
		{"output.log", 1000},
		{"code/train.py", 5000},
		{"snapshot/model.pt", 100000},
		{"results/metrics.jsonl", 50000},
		{"results/history.csv", 200000},
		{"checkpoints/best.pt", 50000000},
		{"checkpoints/epoch_10.pt", 25000000},
		{"plots/loss.png", 50000},
		{"plots/accuracy.png", 50000},
	}

	root := b.TempDir()
	for _, fx := range layout {
		target := filepath.Join(root, fx.rel)
		// Entries may live in subdirectories; make sure the parent exists.
		if err := os.MkdirAll(filepath.Dir(target), 0750); err != nil {
			b.Fatal(err)
		}

		content := make([]byte, fx.nbytes)
		for pos := range content {
			content[pos] = byte(pos % 256)
		}
		if err := os.WriteFile(target, content, 0640); err != nil {
			b.Fatal(err)
		}
	}

	b.ResetTimer()
	b.ReportAllocs()
	for iter := 0; iter < b.N; iter++ {
		// The final argument (caps) is nil here.
		// NOTE(review): presumably nil means no artifact limits are enforced - confirm against ScanArtifacts.
		if _, err := worker.ScanArtifacts(root, false, nil); err != nil {
			b.Fatal(err)
		}
	}
}