test(consistency): add dataset hash consistency test suite
Add cross-implementation consistency tests for dataset hash functionality:

## Test Fixtures
- Single file, nested directories, and multiple file test cases
- Expected hashes in JSON format for validation

## Test Infrastructure
- harness.go: Common test utilities and reference implementation runner
- dataset_hash_test.go: Consistency test cases comparing implementations
- cmd/update.go: Tool to regenerate expected hashes from reference

## Purpose
Ensures hash implementations (Go, C++, Zig) produce identical results across all supported platforms and implementations.
This commit is contained in:
parent
8e5af0da2d
commit
a239f3a14f
12 changed files with 877 additions and 0 deletions
40
tests/fixtures/consistency/README.md
vendored
Normal file
40
tests/fixtures/consistency/README.md
vendored
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Consistency Test Fixtures
|
||||
|
||||
This directory contains canonical test fixtures for cross-implementation consistency testing.
|
||||
|
||||
Each implementation (native C++, Go, Zig) must produce identical outputs for these fixtures.
|
||||
|
||||
## Algorithm Specification
|
||||
|
||||
### Dataset Hash Algorithm v1
|
||||
|
||||
1. Recursively collect all regular files (not symlinks, not directories)
|
||||
2. Skip hidden files (names starting with '.')
|
||||
3. Sort file paths lexicographically (full relative paths)
|
||||
4. For each file:
|
||||
- Compute SHA256 of file contents
|
||||
- Convert to lowercase hex (64 chars)
|
||||
5. Combine: SHA256(concatenation of all file hashes in sorted order)
|
||||
6. Return lowercase hex (64 chars)
|
||||
|
||||
**Empty directory**: Returns SHA256 of empty string:
|
||||
`e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
dataset_hash/
|
||||
├── 01_empty_dir/ # Empty directory
|
||||
├── 02_single_file/ # One file with "hello world"
|
||||
├── 03_nested/ # Nested directories
|
||||
├── 04_multiple_files/ # Several files at root level
|
||||
└── expected_hashes.json # All expected outputs
|
||||
```
|
||||
|
||||
## Adding New Fixtures
|
||||
|
||||
1. Create directory with `input/` subdirectory
|
||||
2. Add files to `input/`
|
||||
3. Compute expected hash using reference implementation
|
||||
4. Add entry to `expected_hashes.json`
|
||||
5. Document any special considerations in `README.md`
|
||||
1
tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/02_single_file/input/test.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
hello world
|
||||
1
tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/03_nested/input/root.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
file a content
|
||||
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/file1.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
file b content
|
||||
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/03_nested/input/subdir1/subdir2/deep.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
file c content
|
||||
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_a.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
first file content here
|
||||
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_b.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
second file content here
|
||||
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
vendored
Normal file
1
tests/fixtures/consistency/dataset_hash/04_multiple_files/input/file_c.txt
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
third file content here
|
||||
48
tests/fixtures/consistency/dataset_hash/expected_hashes.json
vendored
Normal file
48
tests/fixtures/consistency/dataset_hash/expected_hashes.json
vendored
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
"version": "1.0.0",
|
||||
"algorithm": "Dataset Hash Algorithm v1",
|
||||
"description": "SHA256 of concatenated file hashes (sorted lexicographically)",
|
||||
"fixtures": [
|
||||
{
|
||||
"id": "01_empty_dir",
|
||||
"name": "Empty Directory",
|
||||
"description": "Directory with no files",
|
||||
"expected_hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
|
||||
"files": []
|
||||
},
|
||||
{
|
||||
"id": "02_single_file",
|
||||
"name": "Single File",
|
||||
"description": "Directory with one file containing 'hello world' (no trailing newline)",
|
||||
"expected_hash": "6dd7e8e932ea9d58555d7fee44a9b01a9bd7448e986636b728ee3711b01f37ce",
|
||||
"files": [
|
||||
{
|
||||
"path": "test.txt",
|
||||
"content_hash": "a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "03_nested",
|
||||
"name": "Nested Directories",
|
||||
"description": "Multiple levels of subdirectories",
|
||||
"expected_hash": "ba539800f8b98db5c7403773737ed92c71589e60b415d6a2556cb267a19fa0e0",
|
||||
"files": [
|
||||
{"path": "root.txt", "content_hash": "0c572ee02055d28c45d0616bc31484af3912fb14ff231f5fe23000fb6747f561"},
|
||||
{"path": "subdir1/file1.txt", "content_hash": "89daac0d129ad5569989efcca1763e74de4431d1a3b081a68d53aa23e1cf2c3f"},
|
||||
{"path": "subdir1/subdir2/deep.txt", "content_hash": "728312d971fd4d1aa9720531f0e495d33fda5c71562643fd814d0cff46689d4a"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "04_multiple_files",
|
||||
"name": "Multiple Files",
|
||||
"description": "Directory with several files at root level",
|
||||
"expected_hash": "b2aca3c5daf9b5c46d96bfc78c4fb221c3b045798336c7c226937f10ac1257a5",
|
||||
"files": [
|
||||
{"path": "file_a.txt", "content_hash": "a2ba67db2bf4d822fc687c98c96db8e83284abd9f069a7e958aaae0e36490903"},
|
||||
{"path": "file_b.txt", "content_hash": "0c0370cff9c241b6c1869edf309da41f6711e94cabf3d8d99044dc500189d15a"},
|
||||
{"path": "file_c.txt", "content_hash": "e1f1e0b4750c7f7af8527ce285442cb45a337a7b83a97381430fd99587f79948"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
97
tests/integration/consistency/cmd/update.go
Normal file
97
tests/integration/consistency/cmd/update.go
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
// Command update computes expected hashes for fixtures using the reference Go implementation
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/tests/integration/consistency"
|
||||
)
|
||||
|
||||
func main() {
|
||||
fixturesDir := filepath.Join("tests", "fixtures", "consistency")
|
||||
|
||||
// Load current expected hashes
|
||||
expectedPath := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json")
|
||||
data, err := os.ReadFile(expectedPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to read expected hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
var expected consistency.ExpectedHashes
|
||||
if err := json.Unmarshal(data, &expected); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to parse expected hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Use Go implementation as reference
|
||||
goImpl := consistency.NewGoImpl()
|
||||
|
||||
updated := false
|
||||
for i, fixture := range expected.Fixtures {
|
||||
fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixture.ID, "input")
|
||||
|
||||
// Check if fixture exists
|
||||
if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
|
||||
fmt.Printf("Skipping %s: fixture not found at %s\n", fixture.ID, fixturePath)
|
||||
continue
|
||||
}
|
||||
|
||||
// Compute hash using reference implementation
|
||||
hash, err := goImpl.HashDataset(fixturePath)
|
||||
if err != nil {
|
||||
fmt.Printf("Error hashing %s: %v\n", fixture.ID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Update if different or TODO
|
||||
if fixture.ExpectedHash == "TODO_COMPUTE" {
|
||||
fmt.Printf("%s: computed %s\n", fixture.ID, hash)
|
||||
expected.Fixtures[i].ExpectedHash = hash
|
||||
updated = true
|
||||
} else if fixture.ExpectedHash != hash {
|
||||
fmt.Printf("%s: updated %s -> %s\n", fixture.ID, fixture.ExpectedHash, hash)
|
||||
expected.Fixtures[i].ExpectedHash = hash
|
||||
updated = true
|
||||
} else {
|
||||
fmt.Printf("%s: unchanged (%s)\n", fixture.ID, hash)
|
||||
}
|
||||
|
||||
// Compute individual file hashes
|
||||
for j, file := range fixture.Files {
|
||||
if file.ContentHash == "TODO" || file.ContentHash == "" {
|
||||
filePath := filepath.Join(fixturePath, file.Path)
|
||||
fileHash, err := goImpl.HashFile(filePath)
|
||||
if err != nil {
|
||||
fmt.Printf(" %s: error - %v\n", file.Path, err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf(" %s: %s\n", file.Path, fileHash)
|
||||
expected.Fixtures[i].Files[j].ContentHash = fileHash
|
||||
updated = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !updated {
|
||||
fmt.Println("\nNo updates needed.")
|
||||
return
|
||||
}
|
||||
|
||||
// Write updated hashes
|
||||
output, err := json.MarshalIndent(expected, "", " ")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to marshal updated hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(expectedPath, output, 0644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to write updated hashes: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Println("\nUpdated expected_hashes.json")
|
||||
}
|
||||
213
tests/integration/consistency/dataset_hash_test.go
Normal file
213
tests/integration/consistency/dataset_hash_test.go
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
package consistency
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestDatasetHashConsistency verifies all implementations produce identical
// hashes for every fixture listed in expected_hashes.json. It skips unless at
// least two implementations are available, since a single implementation
// cannot be cross-checked.
func TestDatasetHashConsistency(t *testing.T) {
	// Relative to tests/integration/consistency/ (the package directory).
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")

	expected, err := LoadExpectedHashes(fixturesDir)
	if err != nil {
		t.Fatalf("Failed to load expected hashes: %v", err)
	}

	// Initialize implementations (native C++, pure Go reference, Zig CLI).
	impls := []Implementation{
		NewNativeImpl(),
		NewGoImpl(),
		NewZigImpl(),
	}

	// Check which implementations are available on this machine; the Go
	// reference is always available, the others depend on build artifacts.
	availableCount := 0
	for _, impl := range impls {
		if impl.Available() {
			availableCount++
			t.Logf("Implementation available: %s", impl.Name())
		} else {
			t.Logf("Implementation not available: %s", impl.Name())
		}
	}

	if availableCount < 2 {
		t.Skip("Need at least 2 implementations for consistency testing")
	}

	// Test each fixture as its own subtest so failures are attributable.
	for _, fixture := range expected.Fixtures {
		t.Run(fixture.ID, func(t *testing.T) {
			testFixture(t, fixturesDir, &fixture, impls)
		})
	}
}

// TestDatasetHashSmoke runs a quick smoke test against the single-file
// fixture only, with its expected hash inlined (see expected_hashes.json,
// fixture "02_single_file").
func TestDatasetHashSmoke(t *testing.T) {
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")

	// Just test single file fixture for quick validation
	fixturePath := filepath.Join(fixturesDir, "dataset_hash", "02_single_file", "input")

	// Verify fixture exists
	if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
		t.Skipf("Fixture not found: %s", fixturePath)
	}

	impls := []Implementation{
		NewNativeImpl(),
		NewGoImpl(),
		NewZigImpl(),
	}

	// Best-effort: per-implementation failures are recorded in results and
	// logged here rather than failing the test outright.
	results, err := ComputeAllHashes(fixturePath, impls)
	if err != nil {
		t.Logf("Errors during hash computation: %v", err)
	}

	expected := "6dd7e8e932ea9d58555d7fee44a9b01a9bd7448e986636b728ee3711b01f37ce"
	match, mismatches := CompareHashes(results, expected)

	t.Logf("\n%s", FormatHashComparison(results, expected))

	if !match {
		for _, m := range mismatches {
			t.Errorf("Mismatch: %s", m)
		}
	}
}

// TestCrossImplEquivalence compares implementations against each other
// rather than against stored values: the first available implementation is
// the reference and every other available one must agree with it.
func TestCrossImplEquivalence(t *testing.T) {
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")

	impls := []Implementation{
		NewGoImpl(),
		NewNativeImpl(),
		NewZigImpl(),
	}

	// Find first available implementation as reference (GoImpl is listed
	// first and is always available, so it is the usual reference).
	var reference Implementation
	for _, impl := range impls {
		if impl.Available() {
			reference = impl
			break
		}
	}

	if reference == nil {
		t.Skip("No implementations available")
	}

	t.Logf("Using %s as reference implementation", reference.Name())

	// Fixtures to cross-check (01_empty_dir is covered by TestEmptyDirectory).
	fixtures := []string{
		"02_single_file",
		"03_nested",
		"04_multiple_files",
	}

	for _, fixtureName := range fixtures {
		t.Run(fixtureName, func(t *testing.T) {
			fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixtureName, "input")

			if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
				t.Skipf("Fixture not found: %s", fixturePath)
			}

			// Get reference hash
			refHash, err := reference.HashDataset(fixturePath)
			if err != nil {
				t.Fatalf("Reference implementation failed: %v", err)
			}

			// Compare all other available implementations against it.
			for _, impl := range impls {
				if impl == reference || !impl.Available() {
					continue
				}

				hash, err := impl.HashDataset(fixturePath)
				if err != nil {
					t.Errorf("%s failed: %v", impl.Name(), err)
					continue
				}

				if hash != refHash {
					t.Errorf("%s mismatch: got %s, reference (%s) has %s",
						impl.Name(), hash, reference.Name(), refHash)
				} else {
					t.Logf("%s matches reference ✓", impl.Name())
				}
			}
		})
	}
}

// TestEmptyDirectory specifically tests empty directory handling: per the
// algorithm spec, an empty directory hashes to SHA256 of the empty string.
func TestEmptyDirectory(t *testing.T) {
	fixturesDir := filepath.Join("..", "..", "fixtures", "consistency")
	fixturePath := filepath.Join(fixturesDir, "dataset_hash", "01_empty_dir", "input")

	// SHA256("") — the well-known empty-input digest.
	expected := "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"

	impls := []Implementation{
		NewGoImpl(),
		NewNativeImpl(),
		NewZigImpl(),
	}

	for _, impl := range impls {
		if !impl.Available() {
			continue
		}

		t.Run(impl.Name(), func(t *testing.T) {
			hash, err := impl.HashDataset(fixturePath)
			if err != nil {
				t.Fatalf("Failed to hash empty directory: %v", err)
			}

			if hash != expected {
				t.Errorf("Empty directory hash mismatch: got %s, expected %s", hash, expected)
			}
		})
	}
}

// testFixture tests a single fixture against all implementations, comparing
// every available implementation's output to the fixture's stored hash.
func testFixture(t *testing.T, fixturesDir string, fixture *Fixture, impls []Implementation) {
	fixturePath := filepath.Join(fixturesDir, "dataset_hash", fixture.ID, "input")

	// Verify fixture exists
	if _, err := os.Stat(fixturePath); os.IsNotExist(err) {
		t.Skipf("Fixture not found: %s", fixturePath)
	}

	// Skip fixtures whose expected hash has not been generated yet
	// (see cmd/update.go, which fills in "TODO_COMPUTE" placeholders).
	if fixture.ExpectedHash == "TODO_COMPUTE" {
		t.Skipf("Fixture %s has uncomputed expected hash", fixture.ID)
	}

	// Best-effort: individual implementation errors are logged, and the
	// corresponding result entries are skipped by CompareHashes.
	results, err := ComputeAllHashes(fixturePath, impls)
	if err != nil {
		t.Logf("Errors during hash computation: %v", err)
	}

	match, mismatches := CompareHashes(results, fixture.ExpectedHash)

	// Log comparison for debugging
	t.Logf("\nFixture: %s - %s", fixture.ID, fixture.Name)
	t.Logf("\n%s", FormatHashComparison(results, fixture.ExpectedHash))

	if !match {
		for _, m := range mismatches {
			t.Errorf("%s", m)
		}
	}
}
|
||||
472
tests/integration/consistency/harness.go
Normal file
472
tests/integration/consistency/harness.go
Normal file
|
|
@ -0,0 +1,472 @@
|
|||
// Package consistency provides cross-implementation consistency testing
|
||||
// for native C++, Go, and Zig implementations.
|
||||
//
|
||||
//go:build cgo
|
||||
// +build cgo
|
||||
|
||||
package consistency
|
||||
|
||||
import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"unsafe"
)
|
||||
|
||||
// #cgo darwin LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
|
||||
// #cgo linux LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
|
||||
// #include "../../../native/dataset_hash/dataset_hash.h"
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
|
||||
var (
	// nativeCtx is the process-wide native library context, created lazily by
	// getNativeHashContext and shared by all NativeImpl calls.
	nativeCtx     *C.fh_context_t
	nativeCtxOnce sync.Once
)

// Implementation defines the interface for a hashing implementation under
// test. All implementations must produce identical output for identical
// input directories.
type Implementation interface {
	// Name returns a short identifier used in logs and result maps.
	Name() string
	// HashDataset returns the lowercase-hex combined hash of a directory.
	HashDataset(path string) (string, error)
	// HashFile returns the lowercase-hex SHA256 of a single file.
	HashFile(path string) (string, error)
	// Available reports whether this implementation can run on this machine.
	Available() bool
}

// NativeImpl wraps the native C++ library via CGO.
type NativeImpl struct {
	available bool // set at construction from checkNativeAvailable
}

// GoImpl uses the pure Go implementation. It is always available and serves
// as the reference for the other languages.
type GoImpl struct{}

// ZigImpl executes the Zig CLI as a subprocess.
type ZigImpl struct {
	cliPath   string // path to the CLI binary; empty when not found
	available bool
}

// Fixture represents a test case with known expected output.
type Fixture struct {
	ID           string        `json:"id"`
	Name         string        `json:"name"`
	Description  string        `json:"description"`
	ExpectedHash string        `json:"expected_hash"` // lowercase hex, or "TODO_COMPUTE"
	Files        []FixtureFile `json:"files"`
}

// FixtureFile represents a single file inside a fixture's input directory.
type FixtureFile struct {
	Path        string `json:"path"`         // relative to the fixture's input/ dir
	ContentHash string `json:"content_hash"` // lowercase-hex SHA256 of the contents
}

// ExpectedHashes is the root structure of expected_hashes.json.
type ExpectedHashes struct {
	Version     string    `json:"version"`
	Algorithm   string    `json:"algorithm"`
	Description string    `json:"description"`
	Fixtures    []Fixture `json:"fixtures"`
}
||||
|
||||
// Name returns the implementation name used in logs and result maps.
func (n *NativeImpl) Name() string { return "native_c++" }

// Available returns true if the native shared library was found at
// construction time.
func (n *NativeImpl) Available() bool { return n.available }

// HashDataset computes the combined hash of a directory using the native
// C++ library via CGO.
func (n *NativeImpl) HashDataset(path string) (string, error) {
	if !n.available {
		return "", fmt.Errorf("native library not available")
	}

	// Delegate to the CGO wrapper around fh_hash_directory_combined.
	return hashWithNative(path)
}

// HashFile computes the hash of a single file using the native library.
func (n *NativeImpl) HashFile(path string) (string, error) {
	if !n.available {
		return "", fmt.Errorf("native library not available")
	}

	return hashFileWithNative(path)
}

// Name returns the implementation name.
func (g *GoImpl) Name() string { return "go_pure" }

// Available always returns true: the pure Go reference needs no external
// build artifacts.
func (g *GoImpl) Available() bool { return true }

// HashDataset computes the combined hash of a directory using the pure Go
// reference implementation.
func (g *GoImpl) HashDataset(path string) (string, error) {
	return hashDirGo(path)
}

// HashFile computes the SHA256 of a single file using pure Go.
func (g *GoImpl) HashFile(path string) (string, error) {
	return hashFileGo(path)
}

// Name returns the implementation name.
func (z *ZigImpl) Name() string { return "zig_cli" }

// Available returns true if a Zig CLI binary was located.
func (z *ZigImpl) Available() bool { return z.available }

// HashDataset computes the hash of a directory by shelling out to the Zig
// CLI's 'dataset verify' command, which auto-hashes the dataset, and then
// scraping the digest from its output.
func (z *ZigImpl) HashDataset(path string) (string, error) {
	if !z.available {
		return "", fmt.Errorf("zig CLI not available at %s", z.cliPath)
	}

	// Convert to absolute path to avoid the CLI's PathTraversalAttempt error.
	absPath, err := filepath.Abs(path)
	if err != nil {
		return "", fmt.Errorf("failed to get absolute path: %w", err)
	}

	// --dry-run reports the hash without actually verifying anything.
	cmd := exec.Command(z.cliPath, "dataset", "verify", absPath, "--dry-run")
	output, err := cmd.CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("zig CLI failed: %w (output: %s)", err, string(output))
	}

	// The CLI's output format is not fixed; scan it for a 64-char hex digest.
	return parseZigHashOutput(string(output))
}

// HashFile computes the hash of a single file.
// The Zig CLI has no single-file hash command, so this falls back to the Go
// reference — it therefore does NOT exercise the Zig file-hash code path.
func (z *ZigImpl) HashFile(path string) (string, error) {
	return hashFileGo(path)
}
|
||||
|
||||
// NewNativeImpl creates a native implementation wrapper, probing the build
// tree for the shared library to decide availability.
func NewNativeImpl() *NativeImpl {
	return &NativeImpl{
		available: checkNativeAvailable(),
	}
}

// NewGoImpl creates a new Go implementation wrapper.
func NewGoImpl() *GoImpl {
	return &GoImpl{}
}

// NewZigImpl creates a Zig implementation wrapper, searching the build
// outputs and PATH for the CLI binary; availability reflects the search
// result.
func NewZigImpl() *ZigImpl {
	cliPath := findZigCLI()
	return &ZigImpl{
		cliPath:   cliPath,
		available: cliPath != "",
	}
}

// LoadExpectedHashes loads the expected hash values from
// <fixturesDir>/dataset_hash/expected_hashes.json.
func LoadExpectedHashes(fixturesDir string) (*ExpectedHashes, error) {
	path := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json")
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read expected hashes: %w", err)
	}

	var expected ExpectedHashes
	if err := json.Unmarshal(data, &expected); err != nil {
		return nil, fmt.Errorf("failed to parse expected hashes: %w", err)
	}

	return &expected, nil
}

// ComputeExpectedHash computes the expected hash for a fixture using the
// reference (pure Go) algorithm.
func ComputeExpectedHash(fixturePath string) (string, error) {
	return hashDirGo(fixturePath)
}
|
||||
|
||||
// hashDirGo is the reference Go implementation
|
||||
func hashDirGo(root string) (string, error) {
|
||||
root = filepath.Clean(root)
|
||||
info, err := os.Stat(root)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return "", fmt.Errorf("not a directory: %s", root)
|
||||
}
|
||||
|
||||
var files []string
|
||||
err = filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error {
|
||||
if walkErr != nil {
|
||||
return walkErr
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip hidden files
|
||||
rel, err := filepath.Rel(root, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.HasPrefix(filepath.Base(rel), ".") {
|
||||
return nil
|
||||
}
|
||||
|
||||
files = append(files, rel)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Deterministic order
|
||||
sort.Strings(files)
|
||||
|
||||
// Hash file hashes to avoid holding all bytes
|
||||
overall := sha256.New()
|
||||
for _, rel := range files {
|
||||
p := filepath.Join(root, rel)
|
||||
sum, err := hashFileGo(p)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
overall.Write([]byte(sum))
|
||||
}
|
||||
return hex.EncodeToString(overall.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// hashFileGo computes SHA256 of a file
|
||||
func hashFileGo(path string) (string, error) {
|
||||
f, err := os.Open(filepath.Clean(path))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
h := sha256.New()
|
||||
if _, err := h.Write([]byte{}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
buf := make([]byte, 64*1024)
|
||||
for {
|
||||
n, err := f.Read(buf)
|
||||
if n > 0 {
|
||||
h.Write(buf[:n])
|
||||
}
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return hex.EncodeToString(h.Sum(nil)), nil
|
||||
}
|
||||
|
||||
// checkNativeAvailable reports whether the native dataset-hash shared
// library has been built. The relative path assumes tests run from
// tests/integration/consistency/, putting the build tree at
// ../../../native/build.
func checkNativeAvailable() bool {
	lib := "libdataset_hash.so"
	if runtime.GOOS == "darwin" {
		lib = "libdataset_hash.dylib"
	}
	_, err := os.Stat(filepath.Join("..", "..", "..", "native", "build", lib))
	return err == nil
}
|
||||
|
||||
// findZigCLI locates the Zig CLI binary ("ml-<os>-<arch>" in the build
// outputs, or "ml" on PATH) and returns its path, or "" when none is found.
// Relative paths assume the test runs from tests/integration/consistency/,
// so cli/ is at ../../../cli/.
func findZigCLI() string {
	cliRoot := filepath.Join("..", "..", "..", "cli")

	// Preferred: an installed binary in zig-out/bin matching ml-<os>-<arch>.
	binDir := filepath.Join(cliRoot, "zig-out", "bin")
	if entries, err := os.ReadDir(binDir); err == nil {
		for _, e := range entries {
			if !e.IsDir() && strings.HasPrefix(e.Name(), "ml-") {
				return filepath.Join(binDir, e.Name())
			}
		}
	}

	// Fallback: scan the zig build cache for an output binary.
	cacheDir := filepath.Join(cliRoot, ".zig-cache", "o")
	if entries, err := os.ReadDir(cacheDir); err == nil {
		for _, e := range entries {
			if !e.IsDir() {
				continue
			}
			subEntries, _ := os.ReadDir(filepath.Join(cacheDir, e.Name()))
			for _, sub := range subEntries {
				if !sub.IsDir() && strings.HasPrefix(sub.Name(), "ml-") {
					return filepath.Join(cacheDir, e.Name(), sub.Name())
				}
			}
		}
	}

	// Last resort: an installed "ml" somewhere on PATH.
	if p, err := exec.LookPath("ml"); err == nil {
		return p
	}

	return ""
}
|
||||
|
||||
// parseZigHashOutput extracts a SHA256 digest from Zig CLI 'dataset verify'
// output. The CLI's output format is not pinned down, so every
// whitespace-separated token is scanned and the first one that is exactly
// 64 valid hex characters is returned.
func parseZigHashOutput(output string) (string, error) {
	for _, line := range strings.Split(output, "\n") {
		for _, token := range strings.Fields(line) {
			if len(token) != 64 {
				continue
			}
			// Confirm the 64-character token really is hex before trusting it.
			if _, err := hex.DecodeString(token); err == nil {
				return token, nil
			}
		}
	}
	return "", fmt.Errorf("could not parse hash from output: %s", output)
}
|
||||
|
||||
// hashWithNative computes the combined dataset hash for a directory by
// calling the native C++ library via CGO. On failure it surfaces the
// library's last error message when one is set.
func hashWithNative(path string) (string, error) {
	ctx := getNativeHashContext()

	croot := C.CString(path)
	defer C.free(unsafe.Pointer(croot))

	result := C.fh_hash_directory_combined(ctx, croot)
	if result == nil {
		// nil result signals failure; ask the context for a detailed message.
		err := C.fh_last_error(ctx)
		if err != nil {
			return "", fmt.Errorf("native hash failed: %s", C.GoString(err))
		}
		return "", fmt.Errorf("native hash failed")
	}
	// The library allocates the returned string; release it after copying.
	defer C.fh_free_string(result)

	return C.GoString(result), nil
}

// hashFileWithNative computes a single file's hash via the native library.
// Mirrors hashWithNative's error and memory handling.
func hashFileWithNative(path string) (string, error) {
	ctx := getNativeHashContext()

	cpath := C.CString(path)
	defer C.free(unsafe.Pointer(cpath))

	result := C.fh_hash_file(ctx, cpath)
	if result == nil {
		err := C.fh_last_error(ctx)
		if err != nil {
			return "", fmt.Errorf("native file hash failed: %s", C.GoString(err))
		}
		return "", fmt.Errorf("native file hash failed")
	}
	defer C.fh_free_string(result)

	return C.GoString(result), nil
}

// getNativeHashContext lazily initializes the shared native hash context,
// sized to the number of CPUs. It is created once and never freed; it lives
// for the duration of the test process.
// NOTE(review): fh_init's return value is not checked — if initialization
// can fail and return NULL, callers would pass a nil context into the hash
// calls above; confirm against the native API.
func getNativeHashContext() *C.fh_context_t {
	nativeCtxOnce.Do(func() {
		nativeCtx = C.fh_init(C.uint32_t(runtime.NumCPU()))
	})
	return nativeCtx
}
|
||||
|
||||
// ComputeAllHashes runs all available implementations on a path
|
||||
func ComputeAllHashes(path string, impls []Implementation) (map[string]string, error) {
|
||||
results := make(map[string]string)
|
||||
var errors []string
|
||||
|
||||
for _, impl := range impls {
|
||||
if !impl.Available() {
|
||||
results[impl.Name()] = "[not available]"
|
||||
continue
|
||||
}
|
||||
|
||||
hash, err := impl.HashDataset(path)
|
||||
if err != nil {
|
||||
errors = append(errors, fmt.Sprintf("%s: %v", impl.Name(), err))
|
||||
results[impl.Name()] = fmt.Sprintf("[error: %v]", err)
|
||||
} else {
|
||||
results[impl.Name()] = hash
|
||||
}
|
||||
}
|
||||
|
||||
if len(errors) > 0 {
|
||||
return results, fmt.Errorf("errors: %s", strings.Join(errors, "; "))
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// CompareHashes reports whether every successfully computed hash equals the
// expected value (case-insensitively). Entries whose value is a bracketed
// status marker ("[not available]", "[error: ...]") are skipped. The second
// return value holds a human-readable description of each mismatch.
func CompareHashes(results map[string]string, expected string) (bool, []string) {
	var mismatches []string

	for name, got := range results {
		// Bracketed entries are status markers, not hashes — skip comparison.
		if strings.HasPrefix(got, "[") {
			continue
		}
		if strings.EqualFold(got, expected) {
			continue
		}
		mismatches = append(mismatches,
			fmt.Sprintf("%s: got %s, expected %s", name, got, expected))
	}

	return len(mismatches) == 0, mismatches
}
|
||||
|
||||
// FormatHashComparison creates a readable comparison of hashes
|
||||
func FormatHashComparison(results map[string]string, expected string) string {
|
||||
var buf bytes.Buffer
|
||||
|
||||
fmt.Fprintf(&buf, "Hash Comparison:\n")
|
||||
fmt.Fprintf(&buf, " Expected: %s\n", expected)
|
||||
fmt.Fprintf(&buf, "\n")
|
||||
|
||||
maxNameLen := 0
|
||||
for name := range results {
|
||||
if len(name) > maxNameLen {
|
||||
maxNameLen = len(name)
|
||||
}
|
||||
}
|
||||
|
||||
for name, hash := range results {
|
||||
padding := strings.Repeat(" ", maxNameLen-len(name))
|
||||
match := " "
|
||||
if !strings.HasPrefix(hash, "[") {
|
||||
if strings.EqualFold(hash, expected) {
|
||||
match = "✓"
|
||||
} else {
|
||||
match = "✗"
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&buf, " %s:%s %s %s\n", name, padding, match, hash)
|
||||
}
|
||||
|
||||
return buf.String()
|
||||
}
|
||||
Loading…
Reference in a new issue