// fetch_ml/tests/benchmarks/artifact_and_snapshot_bench_test.go

package benchmarks

import (
	"archive/tar"
	"bytes"
	"compress/gzip"
	"os"
	"path/filepath"
	"testing"

	"github.com/jfraeys/fetch_ml/internal/worker"
)

// BenchmarkExtractTarGz profiles the tar.gz extraction hot path.
// Called during snapshot resolution - streaming I/O with decompression.
// Tier 1A C++ candidate: parallel decompression, zero-copy extraction.
//
// Every iteration extracts the same fixture archive into a freshly emptied
// directory. The directory reset runs with the timer stopped, so the reported
// time and allocations cover worker.ExtractTarGz only, and a failure while
// resetting aborts the benchmark instead of being silently ignored.
func BenchmarkExtractTarGz(b *testing.B) {
	tmpDir := b.TempDir()
	archivePath := filepath.Join(tmpDir, "snapshot.tar.gz")

	// Build the fixture archive once; ResetTimer below keeps its cost out of
	// the measurement.
	if err := createTestArchive(archivePath); err != nil {
		b.Fatal(err)
	}

	// The directory itself is (re)created at the top of each iteration.
	extractDir := filepath.Join(tmpDir, "extracted")

	b.ReportAllocs()
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// Start from an empty destination without charging the cleanup of the
		// previous iteration's output to the extraction being measured.
		b.StopTimer()
		if err := os.RemoveAll(extractDir); err != nil {
			b.Fatal(err)
		}
		if err := os.MkdirAll(extractDir, 0750); err != nil {
			b.Fatal(err)
		}
		b.StartTimer()

		if err := worker.ExtractTarGz(archivePath, extractDir); err != nil {
			b.Fatal(err)
		}
	}
}

// createTestArchive assembles a gzip-compressed tar archive in memory and
// writes it to path with mode 0640. The archive holds five regular-file
// entries (no directory entries) ranging from 100 bytes to 10 MB; each entry's
// content is the repeating byte sequence 0, 1, ..., 255. The first error from
// the tar writer, the gzip writer, or the final file write is returned as-is.
func createTestArchive(path string) error {
	type entry struct {
		name string
		size int
	}

	// A mix of small text-like files and one large checkpoint.
	entries := []entry{
		{name: "train.py", size: 5000},
		{name: "requirements.txt", size: 100},
		{name: "data/config.json", size: 500},
		{name: "checkpoints/model.pt", size: 10000000}, // 10MB
		{name: "logs/output.log", size: 50000},
	}

	var archive bytes.Buffer
	gzWriter := gzip.NewWriter(&archive)
	tarWriter := tar.NewWriter(gzWriter)

	for _, e := range entries {
		header := tar.Header{
			Name: e.name,
			Size: int64(e.size),
			Mode: 0640,
		}
		if err := tarWriter.WriteHeader(&header); err != nil {
			return err
		}

		// Deterministic payload: byte value equals the offset modulo 256.
		payload := make([]byte, e.size)
		for off := range payload {
			payload[off] = byte(off % 256)
		}
		if _, err := tarWriter.Write(payload); err != nil {
			return err
		}
	}

	// Close the tar stream before the gzip stream so the tar trailer is
	// compressed and both layers are flushed into the buffer.
	if err := tarWriter.Close(); err != nil {
		return err
	}
	if err := gzWriter.Close(); err != nil {
		return err
	}

	return os.WriteFile(path, archive.Bytes(), 0640)
}

// BenchmarkScanArtifacts profiles the artifact scanning hot path by calling
// worker.ScanArtifacts repeatedly on a synthetic run directory.
// The scanner reportedly relies on filepath.WalkDir with repeated d.Info()
// calls; that implementation is not visible from this file.
// Tier 1A C++ candidate: fts(3) traversal, mmap manifest building.
//
// The fixture tree contains ten files across several subdirectories, from
// 100 bytes up to 50 MB, and is built once before the timer is reset.
func BenchmarkScanArtifacts(b *testing.B) {
	type fixture struct {
		relPath string
		nbytes  int
	}

	// Layout resembling the output of a training run.
	layout := []fixture{
		{relPath: "run_manifest.json", nbytes: 100},
		{relPath: "output.log", nbytes: 1000},
		{relPath: "code/train.py", nbytes: 5000},
		{relPath: "snapshot/model.pt", nbytes: 100000},
		{relPath: "results/metrics.jsonl", nbytes: 50000},
		{relPath: "results/history.csv", nbytes: 200000},
		{relPath: "checkpoints/best.pt", nbytes: 50000000},
		{relPath: "checkpoints/epoch_10.pt", nbytes: 25000000},
		{relPath: "plots/loss.png", nbytes: 50000},
		{relPath: "plots/accuracy.png", nbytes: 50000},
	}

	runDir := b.TempDir()
	for _, fx := range layout {
		target := filepath.Join(runDir, fx.relPath)
		if err := os.MkdirAll(filepath.Dir(target), 0750); err != nil {
			b.Fatal(err)
		}

		// Fill with the repeating byte sequence 0..255.
		content := make([]byte, fx.nbytes)
		for idx := range content {
			content[idx] = byte(idx % 256)
		}
		if err := os.WriteFile(target, content, 0640); err != nil {
			b.Fatal(err)
		}
	}

	// Exclude fixture creation from the measurement.
	b.ResetTimer()
	b.ReportAllocs()

	for iter := 0; iter < b.N; iter++ {
		if _, err := worker.ScanArtifacts(runDir, false); err != nil {
			b.Fatal(err)
		}
	}
}