From a239f3a14fa2bb7af3c2776d55a7039fd40d3f5b Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Thu, 5 Mar 2026 14:41:14 -0500 Subject: [PATCH] test(consistency): add dataset hash consistency test suite Add cross-implementation consistency tests for dataset hash functionality: ## Test Fixtures - Single file, nested directories, and multiple file test cases - Expected hashes in JSON format for validation ## Test Infrastructure - harness.go: Common test utilities and reference implementation runner - dataset_hash_test.go: Consistency test cases comparing implementations - cmd/update.go: Tool to regenerate expected hashes from reference ## Purpose Ensures hash implementations (Go, C++, Zig) produce identical results across all supported platforms and implementations. --- tests/fixtures/consistency/README.md | 40 ++ .../02_single_file/input/test.txt | 1 + .../dataset_hash/03_nested/input/root.txt | 1 + .../03_nested/input/subdir1/file1.txt | 1 + .../03_nested/input/subdir1/subdir2/deep.txt | 1 + .../04_multiple_files/input/file_a.txt | 1 + .../04_multiple_files/input/file_b.txt | 1 + .../04_multiple_files/input/file_c.txt | 1 + .../dataset_hash/expected_hashes.json | 48 ++ tests/integration/consistency/cmd/update.go | 97 ++++ .../consistency/dataset_hash_test.go | 213 ++++++++ tests/integration/consistency/harness.go | 472 ++++++++++++++++++ 12 files changed, 877 insertions(+) create mode 100644 tests/fixtures/consistency/README.md create mode 100644 tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt create mode 100644 tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt create mode 100644 tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt create mode 100644 tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt create mode 100644 tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt create mode 100644 
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt
 create mode 100644 tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
 create mode 100644 tests/fixtures/consistency/dataset_hash/expected_hashes.json
 create mode 100644 tests/integration/consistency/cmd/update.go
 create mode 100644 tests/integration/consistency/dataset_hash_test.go
 create mode 100644 tests/integration/consistency/harness.go

diff --git a/tests/fixtures/consistency/README.md b/tests/fixtures/consistency/README.md
new file mode 100644
index 0000000..aa72a7b
--- /dev/null
+++ b/tests/fixtures/consistency/README.md
@@ -0,0 +1,40 @@
+# Consistency Test Fixtures
+
+This directory contains canonical test fixtures for cross-implementation consistency testing.
+
+Each implementation (native C++, Go, Zig) must produce identical outputs for these fixtures.
+
+## Algorithm Specification
+
+### Dataset Hash Algorithm v1
+
+1. Recursively collect all regular files (not symlinks, not directories)
+2. Skip hidden files (names starting with '.')
+3. Sort file paths lexicographically (full relative paths)
+4. For each file:
+   - Compute SHA256 of file contents
+   - Convert to lowercase hex (64 chars)
+5. Combine: SHA256(concatenation of all file hashes in sorted order)
+6. Return lowercase hex (64 chars)
+
+**Empty directory**: Returns SHA256 of empty string:
+`e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
+
+### Directory Structure
+
+```
+dataset_hash/
+├── 01_empty_dir/        # Empty directory (keep a hidden .gitkeep so git tracks it; dotfiles are excluded from hashing)
+├── 02_single_file/      # One file with "hello world"
+├── 03_nested/           # Nested directories
+├── 04_multiple_files/   # Several files at the root level
+└── expected_hashes.json # All expected outputs
+```
+
+## Adding New Fixtures
+
+1. Create directory with `input/` subdirectory
+2. Add files to `input/`
+3. Compute expected hash using reference implementation
+4. Add entry to `expected_hashes.json`
+5. 
Document any special considerations in `README.md` diff --git a/tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt b/tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt @@ -0,0 +1 @@ +hello world diff --git a/tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt b/tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt new file mode 100644 index 0000000..6ee1742 --- /dev/null +++ b/tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt @@ -0,0 +1 @@ +file a content diff --git a/tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt b/tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt new file mode 100644 index 0000000..5c2d55e --- /dev/null +++ b/tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt @@ -0,0 +1 @@ +file b content diff --git a/tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt b/tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt new file mode 100644 index 0000000..c9d61a6 --- /dev/null +++ b/tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt @@ -0,0 +1 @@ +file c content diff --git a/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt b/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt new file mode 100644 index 0000000..8f001d2 --- /dev/null +++ b/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt @@ -0,0 +1 @@ +first file content here diff --git a/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt b/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt new file mode 100644 index 0000000..2a1506f --- /dev/null +++ 
b/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt
@@ -0,0 +1 @@
+second file content here
diff --git a/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt b/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
new file mode 100644
index 0000000..f39703e
--- /dev/null
+++ b/tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
@@ -0,0 +1 @@
+third file content here
diff --git a/tests/fixtures/consistency/dataset_hash/expected_hashes.json b/tests/fixtures/consistency/dataset_hash/expected_hashes.json
new file mode 100644
index 0000000..1be662d
--- /dev/null
+++ b/tests/fixtures/consistency/dataset_hash/expected_hashes.json
@@ -0,0 +1,48 @@
+{
+  "version": "1.0.0",
+  "algorithm": "Dataset Hash Algorithm v1",
+  "description": "SHA256 of concatenated file hashes (sorted lexicographically)",
+  "fixtures": [
+    {
+      "id": "01_empty_dir",
+      "name": "Empty Directory",
+      "description": "Directory with no files",
+      "expected_hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+      "files": []
+    },
+    {
+      "id": "02_single_file",
+      "name": "Single File",
+      "description": "Directory with one file containing 'hello world' followed by a trailing newline",
+      "expected_hash": "6dd7e8e932ea9d58555d7fee44a9b01a9bd7448e986636b728ee3711b01f37ce",
+      "files": [
+        {
+          "path": "test.txt",
+          "content_hash": "a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447"
+        }
+      ]
+    },
+    {
+      "id": "03_nested",
+      "name": "Nested Directories",
+      "description": "Multiple levels of subdirectories",
+      "expected_hash": "ba539800f8b98db5c7403773737ed92c71589e60b415d6a2556cb267a19fa0e0",
+      "files": [
+        {"path": "root.txt", "content_hash": "0c572ee02055d28c45d0616bc31484af3912fb14ff231f5fe23000fb6747f561"},
+        {"path": "subdir1/file1.txt", "content_hash": "89daac0d129ad5569989efcca1763e74de4431d1a3b081a68d53aa23e1cf2c3f"},
+        {"path": "subdir1/subdir2/deep.txt", "content_hash": 
"728312d971fd4d1aa9720531f0e495d33fda5c71562643fd814d0cff46689d4a"} + ] + }, + { + "id": "04_multiple_files", + "name": "Multiple Files", + "description": "Directory with several files at root level", + "expected_hash": "b2aca3c5daf9b5c46d96bfc78c4fb221c3b045798336c7c226937f10ac1257a5", + "files": [ + {"path": "file_a.txt", "content_hash": "a2ba67db2bf4d822fc687c98c96db8e83284abd9f069a7e958aaae0e36490903"}, + {"path": "file_b.txt", "content_hash": "0c0370cff9c241b6c1869edf309da41f6711e94cabf3d8d99044dc500189d15a"}, + {"path": "file_c.txt", "content_hash": "e1f1e0b4750c7f7af8527ce285442cb45a337a7b83a97381430fd99587f79948"} + ] + } + ] +} diff --git a/tests/integration/consistency/cmd/update.go b/tests/integration/consistency/cmd/update.go new file mode 100644 index 0000000..83e329c --- /dev/null +++ b/tests/integration/consistency/cmd/update.go @@ -0,0 +1,97 @@ +// Command update computes expected hashes for fixtures using the reference Go implementation +package main + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + + "github.com/jfraeys/fetch_ml/tests/integration/consistency" +) + +func main() { + fixturesDir := filepath.Join("tests", "fixtures", "consistency") + + // Load current expected hashes + expectedPath := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json") + data, err := os.ReadFile(expectedPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to read expected hashes: %v\n", err) + os.Exit(1) + } + + var expected consistency.ExpectedHashes + if err := json.Unmarshal(data, &expected); err != nil { + fmt.Fprintf(os.Stderr, "Failed to parse expected hashes: %v\n", err) + os.Exit(1) + } + + // Use Go implementation as reference + goImpl := consistency.NewGoImpl() + + updated := false + for i, fixture := range expected.Fixtures { + fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixture.ID, "input") + + // Check if fixture exists + if _, err := os.Stat(fixturePath); os.IsNotExist(err) { + fmt.Printf("Skipping %s: 
fixture not found at %s\n", fixture.ID, fixturePath) + continue + } + + // Compute hash using reference implementation + hash, err := goImpl.HashDataset(fixturePath) + if err != nil { + fmt.Printf("Error hashing %s: %v\n", fixture.ID, err) + continue + } + + // Update if different or TODO + if fixture.ExpectedHash == "TODO_COMPUTE" { + fmt.Printf("%s: computed %s\n", fixture.ID, hash) + expected.Fixtures[i].ExpectedHash = hash + updated = true + } else if fixture.ExpectedHash != hash { + fmt.Printf("%s: updated %s -> %s\n", fixture.ID, fixture.ExpectedHash, hash) + expected.Fixtures[i].ExpectedHash = hash + updated = true + } else { + fmt.Printf("%s: unchanged (%s)\n", fixture.ID, hash) + } + + // Compute individual file hashes + for j, file := range fixture.Files { + if file.ContentHash == "TODO" || file.ContentHash == "" { + filePath := filepath.Join(fixturePath, file.Path) + fileHash, err := goImpl.HashFile(filePath) + if err != nil { + fmt.Printf(" %s: error - %v\n", file.Path, err) + continue + } + fmt.Printf(" %s: %s\n", file.Path, fileHash) + expected.Fixtures[i].Files[j].ContentHash = fileHash + updated = true + } + } + } + + if !updated { + fmt.Println("\nNo updates needed.") + return + } + + // Write updated hashes + output, err := json.MarshalIndent(expected, "", " ") + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to marshal updated hashes: %v\n", err) + os.Exit(1) + } + + if err := os.WriteFile(expectedPath, output, 0644); err != nil { + fmt.Fprintf(os.Stderr, "Failed to write updated hashes: %v\n", err) + os.Exit(1) + } + + fmt.Println("\nUpdated expected_hashes.json") +} diff --git a/tests/integration/consistency/dataset_hash_test.go b/tests/integration/consistency/dataset_hash_test.go new file mode 100644 index 0000000..5b28b8c --- /dev/null +++ b/tests/integration/consistency/dataset_hash_test.go @@ -0,0 +1,213 @@ +package consistency + +import ( + "os" + "path/filepath" + "testing" +) + +// TestDatasetHashConsistency verifies all 
implementations produce identical hashes +func TestDatasetHashConsistency(t *testing.T) { + fixturesDir := filepath.Join("..", "..", "fixtures", "consistency") + + expected, err := LoadExpectedHashes(fixturesDir) + if err != nil { + t.Fatalf("Failed to load expected hashes: %v", err) + } + + // Initialize implementations + impls := []Implementation{ + NewNativeImpl(), + NewGoImpl(), + NewZigImpl(), + } + + // Check which implementations are available + availableCount := 0 + for _, impl := range impls { + if impl.Available() { + availableCount++ + t.Logf("Implementation available: %s", impl.Name()) + } else { + t.Logf("Implementation not available: %s", impl.Name()) + } + } + + if availableCount < 2 { + t.Skip("Need at least 2 implementations for consistency testing") + } + + // Test each fixture + for _, fixture := range expected.Fixtures { + t.Run(fixture.ID, func(t *testing.T) { + testFixture(t, fixturesDir, &fixture, impls) + }) + } +} + +// TestDatasetHashSmoke runs a quick smoke test +func TestDatasetHashSmoke(t *testing.T) { + fixturesDir := filepath.Join("..", "..", "fixtures", "consistency") + + // Just test single file fixture for quick validation + fixturePath := filepath.Join(fixturesDir, "dataset_hash", "02_single_file", "input") + + // Verify fixture exists + if _, err := os.Stat(fixturePath); os.IsNotExist(err) { + t.Skipf("Fixture not found: %s", fixturePath) + } + + impls := []Implementation{ + NewNativeImpl(), + NewGoImpl(), + NewZigImpl(), + } + + results, err := ComputeAllHashes(fixturePath, impls) + if err != nil { + t.Logf("Errors during hash computation: %v", err) + } + + expected := "6dd7e8e932ea9d58555d7fee44a9b01a9bd7448e986636b728ee3711b01f37ce" + match, mismatches := CompareHashes(results, expected) + + t.Logf("\n%s", FormatHashComparison(results, expected)) + + if !match { + for _, m := range mismatches { + t.Errorf("Mismatch: %s", m) + } + } +} + +// TestCrossImplEquivalence compares implementations against each other +func 
TestCrossImplEquivalence(t *testing.T) { + fixturesDir := filepath.Join("..", "..", "fixtures", "consistency") + + impls := []Implementation{ + NewGoImpl(), + NewNativeImpl(), + NewZigImpl(), + } + + // Find first available implementation as reference + var reference Implementation + for _, impl := range impls { + if impl.Available() { + reference = impl + break + } + } + + if reference == nil { + t.Skip("No implementations available") + } + + t.Logf("Using %s as reference implementation", reference.Name()) + + // Test fixtures + fixtures := []string{ + "02_single_file", + "03_nested", + "04_multiple_files", + } + + for _, fixtureName := range fixtures { + t.Run(fixtureName, func(t *testing.T) { + fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixtureName, "input") + + if _, err := os.Stat(fixturePath); os.IsNotExist(err) { + t.Skipf("Fixture not found: %s", fixturePath) + } + + // Get reference hash + refHash, err := reference.HashDataset(fixturePath) + if err != nil { + t.Fatalf("Reference implementation failed: %v", err) + } + + // Compare all other implementations + for _, impl := range impls { + if impl == reference || !impl.Available() { + continue + } + + hash, err := impl.HashDataset(fixturePath) + if err != nil { + t.Errorf("%s failed: %v", impl.Name(), err) + continue + } + + if hash != refHash { + t.Errorf("%s mismatch: got %s, reference (%s) has %s", + impl.Name(), hash, reference.Name(), refHash) + } else { + t.Logf("%s matches reference ✓", impl.Name()) + } + } + }) + } +} + +// TestEmptyDirectory specifically tests empty directory handling +func TestEmptyDirectory(t *testing.T) { + fixturesDir := filepath.Join("..", "..", "fixtures", "consistency") + fixturePath := filepath.Join(fixturesDir, "dataset_hash", "01_empty_dir", "input") + + expected := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + + impls := []Implementation{ + NewGoImpl(), + NewNativeImpl(), + NewZigImpl(), + } + + for _, impl := range impls { + if 
!impl.Available() { + continue + } + + t.Run(impl.Name(), func(t *testing.T) { + hash, err := impl.HashDataset(fixturePath) + if err != nil { + t.Fatalf("Failed to hash empty directory: %v", err) + } + + if hash != expected { + t.Errorf("Empty directory hash mismatch: got %s, expected %s", hash, expected) + } + }) + } +} + +// testFixture tests a single fixture against all implementations +func testFixture(t *testing.T, fixturesDir string, fixture *Fixture, impls []Implementation) { + fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixture.ID, "input") + + // Verify fixture exists + if _, err := os.Stat(fixturePath); os.IsNotExist(err) { + t.Skipf("Fixture not found: %s", fixturePath) + } + + // Skip fixtures with TODO expected hashes + if fixture.ExpectedHash == "TODO_COMPUTE" { + t.Skipf("Fixture %s has uncomputed expected hash", fixture.ID) + } + + results, err := ComputeAllHashes(fixturePath, impls) + if err != nil { + t.Logf("Errors during hash computation: %v", err) + } + + match, mismatches := CompareHashes(results, fixture.ExpectedHash) + + // Log comparison for debugging + t.Logf("\nFixture: %s - %s", fixture.ID, fixture.Name) + t.Logf("\n%s", FormatHashComparison(results, fixture.ExpectedHash)) + + if !match { + for _, m := range mismatches { + t.Errorf("%s", m) + } + } +} diff --git a/tests/integration/consistency/harness.go b/tests/integration/consistency/harness.go new file mode 100644 index 0000000..ac12bd5 --- /dev/null +++ b/tests/integration/consistency/harness.go @@ -0,0 +1,472 @@ +// Package consistency provides cross-implementation consistency testing +// for native C++, Go, and Zig implementations. 
+//
+//go:build cgo
+// +build cgo
+
+package consistency
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"sort"
+	"strings"
+	"sync"
+	"unsafe"
+)
+
+// #cgo darwin LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
+// #cgo linux LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
+// #include "../../../native/dataset_hash/dataset_hash.h"
+// #include <stdlib.h>
+import "C"
+
+var (
+	nativeCtx     *C.fh_context_t
+	nativeCtxOnce sync.Once
+)
+
+// Implementation defines the interface for a hashing implementation
+type Implementation interface {
+	Name() string
+	HashDataset(path string) (string, error)
+	HashFile(path string) (string, error)
+	Available() bool
+}
+
+// NativeImpl wraps the native C++ library via CGO
+type NativeImpl struct {
+	available bool
+}
+
+// GoImpl uses the pure Go implementation
+type GoImpl struct{}
+
+// ZigImpl executes the Zig CLI as a subprocess
+type ZigImpl struct {
+	cliPath   string
+	available bool
+}
+
+// Fixture represents a test case with known expected output
+type Fixture struct {
+	ID           string        `json:"id"`
+	Name         string        `json:"name"`
+	Description  string        `json:"description"`
+	ExpectedHash string        `json:"expected_hash"`
+	Files        []FixtureFile `json:"files"`
+}
+
+// FixtureFile represents a file in a fixture
+type FixtureFile struct {
+	Path        string `json:"path"`
+	ContentHash string `json:"content_hash"`
+}
+
+// ExpectedHashes is the root structure of expected_hashes.json
+type ExpectedHashes struct {
+	Version     string    `json:"version"`
+	Algorithm   string    `json:"algorithm"`
+	Description string    `json:"description"`
+	Fixtures    []Fixture `json:"fixtures"`
+}
+
+// Name returns the implementation name
+func (n *NativeImpl) Name() string { return "native_c++" }
+
+// Available returns true if native libraries are available
+func (n *NativeImpl) Available() bool { 
return n.available } + +// HashDataset computes the hash of a directory using native library +func (n *NativeImpl) HashDataset(path string) (string, error) { + if !n.available { + return "", fmt.Errorf("native library not available") + } + + // Call the native library through worker package + return hashWithNative(path) +} + +// HashFile computes the hash of a single file using native library +func (n *NativeImpl) HashFile(path string) (string, error) { + if !n.available { + return "", fmt.Errorf("native library not available") + } + + return hashFileWithNative(path) +} + +// Name returns the implementation name +func (g *GoImpl) Name() string { return "go_pure" } + +// Available always returns true for Go implementation +func (g *GoImpl) Available() bool { return true } + +// HashDataset computes the hash of a directory using pure Go +func (g *GoImpl) HashDataset(path string) (string, error) { + return hashDirGo(path) +} + +// HashFile computes the hash of a single file using pure Go +func (g *GoImpl) HashFile(path string) (string, error) { + return hashFileGo(path) +} + +// Name returns the implementation name +func (z *ZigImpl) Name() string { return "zig_cli" } + +// Available returns true if Zig CLI is found +func (z *ZigImpl) Available() bool { return z.available } + +// HashDataset computes the hash of a directory using Zig CLI +// Uses 'dataset verify' command which auto-hashes the dataset +func (z *ZigImpl) HashDataset(path string) (string, error) { + if !z.available { + return "", fmt.Errorf("zig CLI not available at %s", z.cliPath) + } + + // Convert to absolute path to avoid PathTraversalAttempt error + absPath, err := filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("failed to get absolute path: %w", err) + } + + // Use dataset verify --dry-run to get the hash without verifying + cmd := exec.Command(z.cliPath, "dataset", "verify", absPath, "--dry-run") + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("zig CLI 
failed: %w (output: %s)", err, string(output)) + } + + // Parse output to extract hash + return parseZigHashOutput(string(output)) +} + +// HashFile computes the hash of a single file using Zig CLI +func (z *ZigImpl) HashFile(path string) (string, error) { + // Zig CLI doesn't have a single file hash command, so we compute it directly + return hashFileGo(path) +} + +// NewNativeImpl creates a new native implementation wrapper +func NewNativeImpl() *NativeImpl { + return &NativeImpl{ + available: checkNativeAvailable(), + } +} + +// NewGoImpl creates a new Go implementation wrapper +func NewGoImpl() *GoImpl { + return &GoImpl{} +} + +// NewZigImpl creates a new Zig implementation wrapper +func NewZigImpl() *ZigImpl { + cliPath := findZigCLI() + return &ZigImpl{ + cliPath: cliPath, + available: cliPath != "", + } +} + +// LoadExpectedHashes loads the expected hash values from fixtures +func LoadExpectedHashes(fixturesDir string) (*ExpectedHashes, error) { + path := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json") + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read expected hashes: %w", err) + } + + var expected ExpectedHashes + if err := json.Unmarshal(data, &expected); err != nil { + return nil, fmt.Errorf("failed to parse expected hashes: %w", err) + } + + return &expected, nil +} + +// ComputeExpectedHash computes the expected hash for a fixture using reference algorithm +func ComputeExpectedHash(fixturePath string) (string, error) { + return hashDirGo(fixturePath) +} + +// hashDirGo is the reference Go implementation +func hashDirGo(root string) (string, error) { + root = filepath.Clean(root) + info, err := os.Stat(root) + if err != nil { + return "", err + } + if !info.IsDir() { + return "", fmt.Errorf("not a directory: %s", root) + } + + var files []string + err = filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error { + if walkErr != nil { + return walkErr + } + if d.IsDir() { + 
return nil
+		}
+
+		// Skip hidden files
+		rel, err := filepath.Rel(root, path)
+		if err != nil {
+			return err
+		}
+		if strings.HasPrefix(filepath.Base(rel), ".") {
+			return nil
+		}
+
+		files = append(files, rel)
+		return nil
+	})
+	if err != nil {
+		return "", err
+	}
+
+	// Deterministic order
+	sort.Strings(files)
+
+	// Hash file hashes to avoid holding all bytes
+	overall := sha256.New()
+	for _, rel := range files {
+		p := filepath.Join(root, rel)
+		sum, err := hashFileGo(p)
+		if err != nil {
+			return "", err
+		}
+		overall.Write([]byte(sum))
+	}
+	return hex.EncodeToString(overall.Sum(nil)), nil
+}
+
+// hashFileGo computes the lowercase-hex SHA256 of a file's contents.
+func hashFileGo(path string) (string, error) {
+	f, err := os.Open(filepath.Clean(path))
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	h := sha256.New()
+	// io.Copy propagates read errors; the previous manual read loop broke
+	// on ANY error (including non-EOF failures), silently returning the
+	// hash of a truncated prefix instead of an error.
+	if _, err := io.Copy(h, f); err != nil {
+		return "", err
+	}
+
+	return hex.EncodeToString(h.Sum(nil)), nil
+}
+
+// checkNativeAvailable checks if native libraries are available
+// Called from tests/integration/consistency/, so native/build is at ../../../native/build
+func checkNativeAvailable() bool {
+	libDir := filepath.Join("..", "..", "..", "native", "build")
+
+	if runtime.GOOS == "darwin" {
+		_, err := os.Stat(filepath.Join(libDir, "libdataset_hash.dylib"))
+		return err == nil
+	}
+	_, err := os.Stat(filepath.Join(libDir, "libdataset_hash.so"))
+	return err == nil
+}
+
+// findZigCLI locates the Zig CLI binary
+// Called from tests/integration/consistency/, so cli/ is at ../../../cli/
+func findZigCLI() string {
+	// Check zig-out/bin for the "ml-" binary name prefix
+	zigBinDir := filepath.Join("..", "..", "..", "cli", "zig-out", "bin")
+	if entries, err := os.ReadDir(zigBinDir); err == nil {
+		for _, entry := range entries {
+			if strings.HasPrefix(entry.Name(), "ml-") && !entry.IsDir() {
+				return filepath.Join(zigBinDir, 
entry.Name()) + } + } + } + + // Fallback: check .zig-cache + cacheDir := filepath.Join("..", "..", "..", "cli", ".zig-cache", "o") + if entries, err := os.ReadDir(cacheDir); err == nil { + for _, entry := range entries { + if entry.IsDir() { + subEntries, _ := os.ReadDir(filepath.Join(cacheDir, entry.Name())) + for _, sub := range subEntries { + if strings.HasPrefix(sub.Name(), "ml-") && !sub.IsDir() { + return filepath.Join(cacheDir, entry.Name(), sub.Name()) + } + } + } + } + } + + // Try PATH + if path, err := exec.LookPath("ml"); err == nil { + return path + } + + return "" +} + +// parseZigHashOutput extracts the hash from Zig CLI 'dataset verify' output +// Expected format contains hash information from the verify command +func parseZigHashOutput(output string) (string, error) { + // Look for hash in the output - various formats possible + lines := strings.SplitSeq(output, "\n") + for line := range lines { + // Try to find a 64-character hex string (SHA256) + fields := strings.FieldsSeq(line) + for field := range fields { + field = strings.TrimSpace(field) + // Check if it looks like a SHA256 hash (64 hex chars) + if len(field) == 64 { + // Verify it's valid hex + if _, err := hex.DecodeString(field); err == nil { + return field, nil + } + } + } + } + return "", fmt.Errorf("could not parse hash from output: %s", output) +} + +// hashWithNative calls the native library via CGO +func hashWithNative(path string) (string, error) { + ctx := getNativeHashContext() + + croot := C.CString(path) + defer C.free(unsafe.Pointer(croot)) + + result := C.fh_hash_directory_combined(ctx, croot) + if result == nil { + err := C.fh_last_error(ctx) + if err != nil { + return "", fmt.Errorf("native hash failed: %s", C.GoString(err)) + } + return "", fmt.Errorf("native hash failed") + } + defer C.fh_free_string(result) + + return C.GoString(result), nil +} + +// hashFileWithNative calls the native library for single file +func hashFileWithNative(path string) (string, error) { + 
ctx := getNativeHashContext() + + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + + result := C.fh_hash_file(ctx, cpath) + if result == nil { + err := C.fh_last_error(ctx) + if err != nil { + return "", fmt.Errorf("native file hash failed: %s", C.GoString(err)) + } + return "", fmt.Errorf("native file hash failed") + } + defer C.fh_free_string(result) + + return C.GoString(result), nil +} + +// getNativeHashContext initializes and returns the native hash context +func getNativeHashContext() *C.fh_context_t { + nativeCtxOnce.Do(func() { + nativeCtx = C.fh_init(C.uint32_t(runtime.NumCPU())) + }) + return nativeCtx +} + +// ComputeAllHashes runs all available implementations on a path +func ComputeAllHashes(path string, impls []Implementation) (map[string]string, error) { + results := make(map[string]string) + var errors []string + + for _, impl := range impls { + if !impl.Available() { + results[impl.Name()] = "[not available]" + continue + } + + hash, err := impl.HashDataset(path) + if err != nil { + errors = append(errors, fmt.Sprintf("%s: %v", impl.Name(), err)) + results[impl.Name()] = fmt.Sprintf("[error: %v]", err) + } else { + results[impl.Name()] = hash + } + } + + if len(errors) > 0 { + return results, fmt.Errorf("errors: %s", strings.Join(errors, "; ")) + } + return results, nil +} + +// CompareHashes checks if all hashes match the expected value +func CompareHashes(results map[string]string, expected string) (bool, []string) { + var mismatches []string + + for name, hash := range results { + if strings.HasPrefix(hash, "[") { + // Not available or error - skip comparison + continue + } + + if !strings.EqualFold(hash, expected) { + mismatches = append(mismatches, + fmt.Sprintf("%s: got %s, expected %s", name, hash, expected)) + } + } + + return len(mismatches) == 0, mismatches +} + +// FormatHashComparison creates a readable comparison of hashes +func FormatHashComparison(results map[string]string, expected string) string { + var buf 
bytes.Buffer + + fmt.Fprintf(&buf, "Hash Comparison:\n") + fmt.Fprintf(&buf, " Expected: %s\n", expected) + fmt.Fprintf(&buf, "\n") + + maxNameLen := 0 + for name := range results { + if len(name) > maxNameLen { + maxNameLen = len(name) + } + } + + for name, hash := range results { + padding := strings.Repeat(" ", maxNameLen-len(name)) + match := " " + if !strings.HasPrefix(hash, "[") { + if strings.EqualFold(hash, expected) { + match = "✓" + } else { + match = "✗" + } + } + fmt.Fprintf(&buf, " %s:%s %s %s\n", name, padding, match, hash) + } + + return buf.String() +}