test(consistency): add dataset hash consistency test suite
Add cross-implementation consistency tests for dataset hash functionality:

## Test Fixtures
- Single file, nested directories, and multiple file test cases
- Expected hashes in JSON format for validation

## Test Infrastructure
- harness.go: Common test utilities and reference implementation runner
- dataset_hash_test.go: Consistency test cases comparing implementations
- cmd/update.go: Tool to regenerate expected hashes from reference

## Purpose
Ensures hash implementations (Go, C++, Zig) produce identical results across all supported platforms and implementations.
This commit is contained in:
parent
8e5af0da2d
commit
a239f3a14f
12 changed files with 877 additions and 0 deletions
40
tests/fixtures/consistency/README.md
vendored
Normal file
40
tests/fixtures/consistency/README.md
vendored
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Consistency Test Fixtures
|
||||
|
||||
This directory contains canonical test fixtures for cross-implementation consistency testing.
|
||||
|
||||
Each implementation (native C++, Go, Zig) must produce identical outputs for these fixtures.
|
||||
|
||||
## Algorithm Specification
|
||||
|
||||
### Dataset Hash Algorithm v1
|
||||
|
||||
1. Recursively collect all regular files (not symlinks, not directories)
|
||||
2. Skip hidden files (names starting with '.')
|
||||
3. Sort file paths lexicographically (full relative paths)
|
||||
4. For each file:
|
||||
- Compute SHA256 of file contents
|
||||
- Convert to lowercase hex (64 chars)
|
||||
5. Combine: SHA256(concatenation of all file hashes in sorted order)
|
||||
6. Return lowercase hex (64 chars)
|
||||
|
||||
**Empty directory**: Returns SHA256 of empty string:
|
||||
`e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
dataset_hash/
|
||||
├── 01_empty_dir/ # Empty directory
|
||||
├── 02_single_file/ # One file with "hello world"
|
||||
├── 03_nested/ # Nested directories
|
||||
├── 04_multiple_files/ # Several files at root level
|
||||
└── expected_hashes.json # All expected outputs
|
||||
```
|
||||
|
||||
## Adding New Fixtures
|
||||
|
||||
1. Create directory with `input/` subdirectory
|
||||
2. Add files to `input/`
|
||||
3. Compute expected hash using reference implementation
|
||||
4. Add entry to `expected_hashes.json`
|
||||
5. Document any special considerations in `README.md`
|
||||
1
tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
hello world
|
||||
1
tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
file a content
|
||||
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
file b content
|
||||
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
file c content
|
||||
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
first file content here
|
||||
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
second file content here
|
||||
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
third file content here
|
||||
48
tests/fixtures/consistency/dataset_hash/expected_hashes.json
vendored
Normal file
48
tests/fixtures/consistency/dataset_hash/expected_hashes.json
vendored
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
"version": "1.0.0",
|
||||
"algorithm": "Dataset Hash Algorithm v1",
|
||||
"description": "SHA256 of concatenated file hashes (sorted lexicographically)",
|
||||
"fixtures": [
|
||||
{
|
||||
"id": "01_empty_dir",
|
||||
"name": "Empty Directory",
|
||||
"description": "Directory with no files",
|
||||
"expected_hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
||||
"files": []
|
||||
},
|
||||
{
|
||||
"id": "02_single_file",
|
||||
"name": "Single File",
|
||||
"description": "Directory with one file containing 'hello world' (no trailing newline)",
|
||||
"expected_hash": "6dd7e8e932ea9d58555d7fee44a9b01a9bd7448e986636b728ee3711b01f37ce",
|
||||
"files": [
|
||||
{
|
||||
"path": "test.txt",
|
||||
"content_hash": "a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "03_nested",
|
||||
"name": "Nested Directories",
|
||||
"description": "Multiple levels of subdirectories",
|
||||
"expected_hash": "ba539800f8b98db5c7403773737ed92c71589e60b415d6a2556cb267a19fa0e0",
|
||||
"files": [
|
||||
{"path": "root.txt", "content_hash": "0c572ee02055d28c45d0616bc31484af3912fb14ff231f5fe23000fb6747f561"},
|
||||
{"path": "subdir1/file1.txt", "content_hash": "89daac0d129ad5569989efcca1763e74de4431d1a3b081a68d53aa23e1cf2c3f"},
|
||||
{"path": "subdir1/subdir2/deep.txt", "content_hash": "728312d971fd4d1aa9720531f0e495d33fda5c71562643fd814d0cff46689d4a"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "04_multiple_files",
|
||||
"name": "Multiple Files",
|
||||
"description": "Directory with several files at root level",
|
||||
"expected_hash": "b2aca3c5daf9b5c46d96bfc78c4fb221c3b045798336c7c226937f10ac1257a5",
|
||||
"files": [
|
||||
{"path": "file_a.txt", "content_hash": "a2ba67db2bf4d822fc687c98c96db8e83284abd9f069a7e958aaae0e36490903"},
|
||||
{"path": "file_b.txt", "content_hash": "0c0370cff9c241b6c1869edf309da41f6711e94cabf3d8d99044dc500189d15a"},
|
||||
{"path": "file_c.txt", "content_hash": "e1f1e0b4750c7f7af8527ce285442cb45a337a7b83a97381430fd99587f79948"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
97
tests/integration/consistency/cmd/update.go
Normal file
97
tests/integration/consistency/cmd/update.go
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
// Command update computes expected hashes for fixtures using the reference Go implementation
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/tests/integration/consistency"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fixturesDir := filepath.Join("tests", "fixtures", "consistency")
|
||||
|
||||
// Load current expected hashes
|
||||
expectedPath := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json")
|
||||
data, err := os.ReadFile(expectedPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to read expected hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
var expected consistency.ExpectedHashes
|
||||
if err := json.Unmarshal(data, &expected); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to parse expected hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Use Go implementation as reference
|
||||
goImpl := consistency.NewGoImpl()
|
||||
|
||||
updated := false
|
||||
for i, fixture := range expected.Fixtures {
|
||||
fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixture.ID, "input")
|
||||
|
||||
// Check if fixture exists
|
||||
if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
|
||||
fmt.Printf("Skipping %s: fixture not found at %s\n", fixture.ID, fixturePath)
|
||||
continue
|
||||
}
|
||||
|
||||
// Compute hash using reference implementation
|
||||
hash, err := goImpl.HashDataset(fixturePath)
|
||||
if err != nil {
|
||||
fmt.Printf("Error hashing %s: %v\n", fixture.ID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update if different or TODO
|
||||
if fixture.ExpectedHash == "TODO_COMPUTE" {
|
||||
fmt.Printf("%s: computed %s\n", fixture.ID, hash)
|
||||
expected.Fixtures[i].ExpectedHash = hash
|
||||
updated = true
|
||||
} else if fixture.ExpectedHash != hash {
|
||||
fmt.Printf("%s: updated %s -> %s\n", fixture.ID, fixture.ExpectedHash, hash)
|
||||
expected.Fixtures[i].ExpectedHash = hash
|
||||
updated = true
|
||||
} else {
|
||||
fmt.Printf("%s: unchanged (%s)\n", fixture.ID, hash)
|
||||
}
|
||||
|
||||
// Compute individual file hashes
|
||||
for j, file := range fixture.Files {
|
||||
if file.ContentHash == "TODO" || file.ContentHash == "" {
|
||||
filePath := filepath.Join(fixturePath, file.Path)
|
||||
fileHash, err := goImpl.HashFile(filePath)
|
||||
if err != nil {
|
||||
fmt.Printf(" %s: error - %v\n", file.Path, err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf(" %s: %s\n", file.Path, fileHash)
|
||||
expected.Fixtures[i].Files[j].ContentHash = fileHash
|
||||
updated = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !updated {
|
||||
fmt.Println("\nNo updates needed.")
|
||||
return
|
||||
}
|
||||
|
||||
// Write updated hashes
|
||||
output, err := json.MarshalIndent(expected, "", " ")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to marshal updated hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(expectedPath, output, 0644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to write updated hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Println("\nUpdated expected_hashes.json")
|
||||
}
|
||||
213
tests/integration/consistency/dataset_hash_test.go
Normal file
213
tests/integration/consistency/dataset_hash_test.go
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
package consistency
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestDatasetHashConsistency verifies all implementations produce identical
// hashes for every fixture listed in expected_hashes.json. It skips unless at
// least two implementations are available, since a single implementation
// cannot be cross-checked.
func TestDatasetHashConsistency(t *testing.T) {
	// Relative to tests/integration/consistency/ (the package directory).
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")

	expected, err := LoadExpectedHashes(fixturesDir)
	if err != nil {
		t.Fatalf("Failed to load expected hashes: %v", err)
	}

	// Initialize implementations (native C++, pure Go reference, Zig CLI).
	impls := []Implementation{
		NewNativeImpl(),
		NewGoImpl(),
		NewZigImpl(),
	}

	// Check which implementations are available on this machine; the Go
	// reference is always available, the others depend on build artifacts.
	availableCount := 0
	for _, impl := range impls {
		if impl.Available() {
			availableCount++
			t.Logf("Implementation available: %s", impl.Name())
		} else {
			t.Logf("Implementation not available: %s", impl.Name())
		}
	}

	if availableCount < 2 {
		t.Skip("Need at least 2 implementations for consistency testing")
	}

	// Test each fixture as its own subtest so failures are attributable.
	for _, fixture := range expected.Fixtures {
		t.Run(fixture.ID, func(t *testing.T) {
			testFixture(t, fixturesDir, &fixture, impls)
		})
	}
}

// TestDatasetHashSmoke runs a quick smoke test against the single-file
// fixture only, with its expected hash inlined (see expected_hashes.json,
// fixture "02_single_file").
func TestDatasetHashSmoke(t *testing.T) {
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")

	// Just test single file fixture for quick validation
	fixturePath := filepath.Join(fixturesDir, "dataset_hash", "02_single_file", "input")

	// Verify fixture exists
	if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
		t.Skipf("Fixture not found: %s", fixturePath)
	}

	impls := []Implementation{
		NewNativeImpl(),
		NewGoImpl(),
		NewZigImpl(),
	}

	// Best-effort: per-implementation failures are recorded in results and
	// logged here rather than failing the test outright.
	results, err := ComputeAllHashes(fixturePath, impls)
	if err != nil {
		t.Logf("Errors during hash computation: %v", err)
	}

	expected := "6dd7e8e932ea9d58555d7fee44a9b01a9bd7448e986636b728ee3711b01f37ce"
	match, mismatches := CompareHashes(results, expected)

	t.Logf("\n%s", FormatHashComparison(results, expected))

	if !match {
		for _, m := range mismatches {
			t.Errorf("Mismatch: %s", m)
		}
	}
}

// TestCrossImplEquivalence compares implementations against each other
// rather than against stored values: the first available implementation is
// the reference and every other available one must agree with it.
func TestCrossImplEquivalence(t *testing.T) {
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")

	impls := []Implementation{
		NewGoImpl(),
		NewNativeImpl(),
		NewZigImpl(),
	}

	// Find first available implementation as reference (GoImpl is listed
	// first and is always available, so it is the usual reference).
	var reference Implementation
	for _, impl := range impls {
		if impl.Available() {
			reference = impl
			break
		}
	}

	if reference == nil {
		t.Skip("No implementations available")
	}

	t.Logf("Using %s as reference implementation", reference.Name())

	// Fixtures to cross-check (01_empty_dir is covered by TestEmptyDirectory).
	fixtures := []string{
		"02_single_file",
		"03_nested",
		"04_multiple_files",
	}

	for _, fixtureName := range fixtures {
		t.Run(fixtureName, func(t *testing.T) {
			fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixtureName, "input")

			if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
				t.Skipf("Fixture not found: %s", fixturePath)
			}

			// Get reference hash
			refHash, err := reference.HashDataset(fixturePath)
			if err != nil {
				t.Fatalf("Reference implementation failed: %v", err)
			}

			// Compare all other available implementations against it.
			for _, impl := range impls {
				if impl == reference || !impl.Available() {
					continue
				}

				hash, err := impl.HashDataset(fixturePath)
				if err != nil {
					t.Errorf("%s failed: %v", impl.Name(), err)
					continue
				}

				if hash != refHash {
					t.Errorf("%s mismatch: got %s, reference (%s) has %s",
						impl.Name(), hash, reference.Name(), refHash)
				} else {
					t.Logf("%s matches reference ✓", impl.Name())
				}
			}
		})
	}
}

// TestEmptyDirectory specifically tests empty directory handling: per the
// algorithm spec, an empty directory hashes to SHA256 of the empty string.
func TestEmptyDirectory(t *testing.T) {
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")
	fixturePath := filepath.Join(fixturesDir, "dataset_hash", "01_empty_dir", "input")

	// SHA256("") — the well-known empty-input digest.
	expected := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"

	impls := []Implementation{
		NewGoImpl(),
		NewNativeImpl(),
		NewZigImpl(),
	}

	for _, impl := range impls {
		if !impl.Available() {
			continue
		}

		t.Run(impl.Name(), func(t *testing.T) {
			hash, err := impl.HashDataset(fixturePath)
			if err != nil {
				t.Fatalf("Failed to hash empty directory: %v", err)
			}

			if hash != expected {
				t.Errorf("Empty directory hash mismatch: got %s, expected %s", hash, expected)
			}
		})
	}
}

// testFixture tests a single fixture against all implementations, comparing
// every available implementation's output to the fixture's stored hash.
func testFixture(t *testing.T, fixturesDir string, fixture *Fixture, impls []Implementation) {
	fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixture.ID, "input")

	// Verify fixture exists
	if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
		t.Skipf("Fixture not found: %s", fixturePath)
	}

	// Skip fixtures whose expected hash has not been generated yet
	// (see cmd/update.go, which fills in "TODO_COMPUTE" placeholders).
	if fixture.ExpectedHash == "TODO_COMPUTE" {
		t.Skipf("Fixture %s has uncomputed expected hash", fixture.ID)
	}

	// Best-effort: individual implementation errors are logged, and the
	// corresponding result entries are skipped by CompareHashes.
	results, err := ComputeAllHashes(fixturePath, impls)
	if err != nil {
		t.Logf("Errors during hash computation: %v", err)
	}

	match, mismatches := CompareHashes(results, fixture.ExpectedHash)

	// Log comparison for debugging
	t.Logf("\nFixture: %s - %s", fixture.ID, fixture.Name)
	t.Logf("\n%s", FormatHashComparison(results, fixture.ExpectedHash))

	if !match {
		for _, m := range mismatches {
			t.Errorf("%s", m)
		}
	}
}
|
||||
472
tests/integration/consistency/harness.go
Normal file
472
tests/integration/consistency/harness.go
Normal file
|
|
@ -0,0 +1,472 @@
|
|||
// Package consistency provides cross-implementation consistency testing
|
||||
// for native C++, Go, and Zig implementations.
|
||||
//
|
||||
//go:build cgo
|
||||
// +build cgo
|
||||
|
||||
package consistency
|
||||
|
||||
import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"unsafe"
)
|
||||
|
||||
// #cgo darwin LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
|
||||
// #cgo linux LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
|
||||
// #include "../../../native/dataset_hash/dataset_hash.h"
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
|
||||
var (
	// nativeCtx is the process-wide native library context, created lazily by
	// getNativeHashContext and shared by all NativeImpl calls.
	nativeCtx     *C.fh_context_t
	nativeCtxOnce sync.Once
)

// Implementation defines the interface for a hashing implementation under
// test. All implementations must produce identical output for identical
// input directories.
type Implementation interface {
	// Name returns a short identifier used in logs and result maps.
	Name() string
	// HashDataset returns the lowercase-hex combined hash of a directory.
	HashDataset(path string) (string, error)
	// HashFile returns the lowercase-hex SHA256 of a single file.
	HashFile(path string) (string, error)
	// Available reports whether this implementation can run on this machine.
	Available() bool
}

// NativeImpl wraps the native C++ library via CGO.
type NativeImpl struct {
	available bool // set at construction from checkNativeAvailable
}

// GoImpl uses the pure Go implementation. It is always available and serves
// as the reference for the other languages.
type GoImpl struct{}

// ZigImpl executes the Zig CLI as a subprocess.
type ZigImpl struct {
	cliPath   string // path to the CLI binary; empty when not found
	available bool
}

// Fixture represents a test case with known expected output.
type Fixture struct {
	ID           string        `json:"id"`
	Name         string        `json:"name"`
	Description  string        `json:"description"`
	ExpectedHash string        `json:"expected_hash"` // lowercase hex, or "TODO_COMPUTE"
	Files        []FixtureFile `json:"files"`
}

// FixtureFile represents a single file inside a fixture's input directory.
type FixtureFile struct {
	Path        string `json:"path"`         // relative to the fixture's input/ dir
	ContentHash string `json:"content_hash"` // lowercase-hex SHA256 of the contents
}

// ExpectedHashes is the root structure of expected_hashes.json.
type ExpectedHashes struct {
	Version     string    `json:"version"`
	Algorithm   string    `json:"algorithm"`
	Description string    `json:"description"`
	Fixtures    []Fixture `json:"fixtures"`
}
||||
|
||||
// Name returns the implementation name used in logs and result maps.
func (n *NativeImpl) Name() string { return "native_c++" }

// Available returns true if the native shared library was found at
// construction time.
func (n *NativeImpl) Available() bool { return n.available }

// HashDataset computes the combined hash of a directory using the native
// C++ library via CGO.
func (n *NativeImpl) HashDataset(path string) (string, error) {
	if !n.available {
		return "", fmt.Errorf("native library not available")
	}

	// Delegate to the CGO wrapper around fh_hash_directory_combined.
	return hashWithNative(path)
}

// HashFile computes the hash of a single file using the native library.
func (n *NativeImpl) HashFile(path string) (string, error) {
	if !n.available {
		return "", fmt.Errorf("native library not available")
	}

	return hashFileWithNative(path)
}

// Name returns the implementation name.
func (g *GoImpl) Name() string { return "go_pure" }

// Available always returns true: the pure Go reference needs no external
// build artifacts.
func (g *GoImpl) Available() bool { return true }

// HashDataset computes the combined hash of a directory using the pure Go
// reference implementation.
func (g *GoImpl) HashDataset(path string) (string, error) {
	return hashDirGo(path)
}

// HashFile computes the SHA256 of a single file using pure Go.
func (g *GoImpl) HashFile(path string) (string, error) {
	return hashFileGo(path)
}

// Name returns the implementation name.
func (z *ZigImpl) Name() string { return "zig_cli" }

// Available returns true if a Zig CLI binary was located.
func (z *ZigImpl) Available() bool { return z.available }

// HashDataset computes the hash of a directory by shelling out to the Zig
// CLI's 'dataset verify' command, which auto-hashes the dataset, and then
// scraping the digest from its output.
func (z *ZigImpl) HashDataset(path string) (string, error) {
	if !z.available {
		return "", fmt.Errorf("zig CLI not available at %s", z.cliPath)
	}

	// Convert to absolute path to avoid the CLI's PathTraversalAttempt error.
	absPath, err := filepath.Abs(path)
	if err != nil {
		return "", fmt.Errorf("failed to get absolute path: %w", err)
	}

	// --dry-run reports the hash without actually verifying anything.
	cmd := exec.Command(z.cliPath, "dataset", "verify", absPath, "--dry-run")
	output, err := cmd.CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("zig CLI failed: %w (output: %s)", err, string(output))
	}

	// The CLI's output format is not fixed; scan it for a 64-char hex digest.
	return parseZigHashOutput(string(output))
}

// HashFile computes the hash of a single file.
// The Zig CLI has no single-file hash command, so this falls back to the Go
// reference — it therefore does NOT exercise the Zig file-hash code path.
func (z *ZigImpl) HashFile(path string) (string, error) {
	return hashFileGo(path)
}
|
||||
|
||||
// NewNativeImpl creates a native implementation wrapper, probing the build
// tree for the shared library to decide availability.
func NewNativeImpl() *NativeImpl {
	return &NativeImpl{
		available: checkNativeAvailable(),
	}
}

// NewGoImpl creates a new Go implementation wrapper.
func NewGoImpl() *GoImpl {
	return &GoImpl{}
}

// NewZigImpl creates a Zig implementation wrapper, searching the build
// outputs and PATH for the CLI binary; availability reflects the search
// result.
func NewZigImpl() *ZigImpl {
	cliPath := findZigCLI()
	return &ZigImpl{
		cliPath:   cliPath,
		available: cliPath != "",
	}
}

// LoadExpectedHashes loads the expected hash values from
// <fixturesDir>/dataset_hash/expected_hashes.json.
func LoadExpectedHashes(fixturesDir string) (*ExpectedHashes, error) {
	path := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json")
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read expected hashes: %w", err)
	}

	var expected ExpectedHashes
	if err := json.Unmarshal(data, &expected); err != nil {
		return nil, fmt.Errorf("failed to parse expected hashes: %w", err)
	}

	return &expected, nil
}

// ComputeExpectedHash computes the expected hash for a fixture using the
// reference (pure Go) algorithm.
func ComputeExpectedHash(fixturePath string) (string, error) {
	return hashDirGo(fixturePath)
}
|
||||
|
||||
// hashDirGo is the reference Go implementation
|
||||
func hashDirGo(root string) (string, error) {
|
||||
root = filepath.Clean(root)
|
||||
info, err := os.Stat(root)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return "", fmt.Errorf("not a directory: %s", root)
|
||||
}
|
||||
|
||||
var files []string
|
||||
err = filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error {
|
||||
if walkErr != nil {
|
||||
return walkErr
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip hidden files
|
||||
rel, err := filepath.Rel(root, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.HasPrefix(filepath.Base(rel), ".") {
|
||||
return nil
|
||||
}
|
||||
|
||||
files = append(files, rel)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Deterministic order
|
||||
sort.Strings(files)
|
||||
|
||||
// Hash file hashes to avoid holding all bytes
|
||||
overall := sha256.New()
|
||||
for _, rel := range files {
|
||||
p := filepath.Join(root, rel)
|
||||
sum, err := hashFileGo(p)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
overall.Write([]byte(sum))
|
||||
}
|
||||
return hex.EncodeToString(overall.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// hashFileGo computes SHA256 of a file
|
||||
func hashFileGo(path string) (string, error) {
|
||||
f, err := os.Open(filepath.Clean(path))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
h := sha256.New()
|
||||
if _, err := h.Write([]byte{}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
buf := make([]byte, 64*1024)
|
||||
for {
|
||||
n, err := f.Read(buf)
|
||||
if n > 0 {
|
||||
h.Write(buf[:n])
|
||||
}
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return hex.EncodeToString(h.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// checkNativeAvailable reports whether the native dataset-hash shared
// library has been built. The relative path assumes tests run from
// tests/integration/consistency/, putting the build tree at
// ../../../native/build.
func checkNativeAvailable() bool {
	lib := "libdataset_hash.so"
	if runtime.GOOS == "darwin" {
		lib = "libdataset_hash.dylib"
	}
	_, err := os.Stat(filepath.Join("..", "..", "..", "native", "build", lib))
	return err == nil
}
|
||||
|
||||
// findZigCLI locates the Zig CLI binary ("ml-<os>-<arch>" in the build
// outputs, or "ml" on PATH) and returns its path, or "" when none is found.
// Relative paths assume the test runs from tests/integration/consistency/,
// so cli/ is at ../../../cli/.
func findZigCLI() string {
	cliRoot := filepath.Join("..", "..", "..", "cli")

	// Preferred: an installed binary in zig-out/bin matching ml-<os>-<arch>.
	binDir := filepath.Join(cliRoot, "zig-out", "bin")
	if entries, err := os.ReadDir(binDir); err == nil {
		for _, e := range entries {
			if !e.IsDir() && strings.HasPrefix(e.Name(), "ml-") {
				return filepath.Join(binDir, e.Name())
			}
		}
	}

	// Fallback: scan the zig build cache for an output binary.
	cacheDir := filepath.Join(cliRoot, ".zig-cache", "o")
	if entries, err := os.ReadDir(cacheDir); err == nil {
		for _, e := range entries {
			if !e.IsDir() {
				continue
			}
			subEntries, _ := os.ReadDir(filepath.Join(cacheDir, e.Name()))
			for _, sub := range subEntries {
				if !sub.IsDir() && strings.HasPrefix(sub.Name(), "ml-") {
					return filepath.Join(cacheDir, e.Name(), sub.Name())
				}
			}
		}
	}

	// Last resort: an installed "ml" somewhere on PATH.
	if p, err := exec.LookPath("ml"); err == nil {
		return p
	}

	return ""
}
|
||||
|
||||
// parseZigHashOutput extracts a SHA256 digest from Zig CLI 'dataset verify'
// output. The CLI's output format is not pinned down, so every
// whitespace-separated token is scanned and the first one that is exactly
// 64 valid hex characters is returned.
func parseZigHashOutput(output string) (string, error) {
	for _, line := range strings.Split(output, "\n") {
		for _, token := range strings.Fields(line) {
			if len(token) != 64 {
				continue
			}
			// Confirm the 64-character token really is hex before trusting it.
			if _, err := hex.DecodeString(token); err == nil {
				return token, nil
			}
		}
	}
	return "", fmt.Errorf("could not parse hash from output: %s", output)
}
|
||||
|
||||
// hashWithNative computes the combined dataset hash for a directory by
// calling the native C++ library via CGO. On failure it surfaces the
// library's last error message when one is set.
func hashWithNative(path string) (string, error) {
	ctx := getNativeHashContext()

	croot := C.CString(path)
	defer C.free(unsafe.Pointer(croot))

	result := C.fh_hash_directory_combined(ctx, croot)
	if result == nil {
		// nil result signals failure; ask the context for a detailed message.
		err := C.fh_last_error(ctx)
		if err != nil {
			return "", fmt.Errorf("native hash failed: %s", C.GoString(err))
		}
		return "", fmt.Errorf("native hash failed")
	}
	// The library allocates the returned string; release it after copying.
	defer C.fh_free_string(result)

	return C.GoString(result), nil
}

// hashFileWithNative computes a single file's hash via the native library.
// Mirrors hashWithNative's error and memory handling.
func hashFileWithNative(path string) (string, error) {
	ctx := getNativeHashContext()

	cpath := C.CString(path)
	defer C.free(unsafe.Pointer(cpath))

	result := C.fh_hash_file(ctx, cpath)
	if result == nil {
		err := C.fh_last_error(ctx)
		if err != nil {
			return "", fmt.Errorf("native file hash failed: %s", C.GoString(err))
		}
		return "", fmt.Errorf("native file hash failed")
	}
	defer C.fh_free_string(result)

	return C.GoString(result), nil
}

// getNativeHashContext lazily initializes the shared native hash context,
// sized to the number of CPUs. It is created once and never freed; it lives
// for the duration of the test process.
// NOTE(review): fh_init's return value is not checked — if initialization
// can fail and return NULL, callers would pass a nil context into the hash
// calls above; confirm against the native API.
func getNativeHashContext() *C.fh_context_t {
	nativeCtxOnce.Do(func() {
		nativeCtx = C.fh_init(C.uint32_t(runtime.NumCPU()))
	})
	return nativeCtx
}
|
||||
|
||||
// ComputeAllHashes runs all available implementations on a path
|
||||
func ComputeAllHashes(path string, impls []Implementation) (map[string]string, error) {
|
||||
results := make(map[string]string)
|
||||
var errors []string
|
||||
|
||||
for _, impl := range impls {
|
||||
if !impl.Available() {
|
||||
results[impl.Name()] = "[not available]"
|
||||
continue
|
||||
}
|
||||
|
||||
hash, err := impl.HashDataset(path)
|
||||
if err != nil {
|
||||
errors = append(errors, fmt.Sprintf("%s: %v", impl.Name(), err))
|
||||
results[impl.Name()] = fmt.Sprintf("[error: %v]", err)
|
||||
} else {
|
||||
results[impl.Name()] = hash
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) > 0 {
|
||||
return results, fmt.Errorf("errors: %s", strings.Join(errors, "; "))
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// CompareHashes reports whether every successfully computed hash equals the
// expected value (case-insensitively). Entries whose value is a bracketed
// status marker ("[not available]", "[error: ...]") are skipped. The second
// return value holds a human-readable description of each mismatch.
func CompareHashes(results map[string]string, expected string) (bool, []string) {
	var mismatches []string

	for name, got := range results {
		// Bracketed entries are status markers, not hashes — skip comparison.
		if strings.HasPrefix(got, "[") {
			continue
		}
		if strings.EqualFold(got, expected) {
			continue
		}
		mismatches = append(mismatches,
			fmt.Sprintf("%s: got %s, expected %s", name, got, expected))
	}

	return len(mismatches) == 0, mismatches
}
|
||||
|
||||
// FormatHashComparison creates a readable comparison of hashes
|
||||
func FormatHashComparison(results map[string]string, expected string) string {
|
||||
var buf bytes.Buffer
|
||||
|
||||
fmt.Fprintf(&buf, "Hash Comparison:\n")
|
||||
fmt.Fprintf(&buf, " Expected: %s\n", expected)
|
||||
fmt.Fprintf(&buf, "\n")
|
||||
|
||||
maxNameLen := 0
|
||||
for name := range results {
|
||||
if len(name) > maxNameLen {
|
||||
maxNameLen = len(name)
|
||||
}
|
||||
}
|
||||
|
||||
for name, hash := range results {
|
||||
padding := strings.Repeat(" ", maxNameLen-len(name))
|
||||
match := " "
|
||||
if !strings.HasPrefix(hash, "[") {
|
||||
if strings.EqualFold(hash, expected) {
|
||||
match = "✓"
|
||||
} else {
|
||||
match = "✗"
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&buf, " %s:%s %s %s\n", name, padding, match, hash)
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
Loading…
Reference in a new issue