fetch_ml/tests/integration/consistency/harness.go
Jeremie Fraeys a239f3a14f
test(consistency): add dataset hash consistency test suite
Add cross-implementation consistency tests for dataset hash functionality:

## Test Fixtures
- Single file, nested directories, and multiple file test cases
- Expected hashes in JSON format for validation

## Test Infrastructure
- harness.go: Common test utilities and reference implementation runner
- dataset_hash_test.go: Consistency test cases comparing implementations
- cmd/update.go: Tool to regenerate expected hashes from reference

## Purpose
Ensures hash implementations (Go, C++, Zig) produce identical results
across all supported platforms and implementations.
2026-03-05 14:41:14 -05:00

472 lines
12 KiB
Go

// Package consistency provides cross-implementation consistency testing
// for native C++, Go, and Zig implementations.
//
//go:build cgo
// +build cgo
package consistency
import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"unsafe"
)
// #cgo darwin LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
// #cgo linux LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
// #include "../../../native/dataset_hash/dataset_hash.h"
// #include <stdlib.h>
import "C"
// Package-level state shared by every call into the native hashing library.
var (
// nativeCtx holds the value returned by C.fh_init. It is assigned inside
// getNativeHashContext and passed to each native hash call.
nativeCtx *C.fh_context_t
// nativeCtxOnce makes the assignment of nativeCtx happen at most once.
nativeCtxOnce sync.Once
)
// Implementation is the common interface over the hashing backends that the
// consistency tests compare (NativeImpl, GoImpl and ZigImpl in this file).
type Implementation interface {
// Name returns a short identifier for the backend. ComputeAllHashes uses it
// as the key of its result map.
Name() string
// HashDataset returns the hash of the directory at path.
HashDataset(path string) (string, error)
// HashFile returns the hash of the single file at path.
HashFile(path string) (string, error)
// Available reports whether the backend can be used. ComputeAllHashes skips
// backends for which this returns false.
Available() bool
}
// NativeImpl wraps the native C++ library via CGO.
type NativeImpl struct {
// available gates HashDataset and HashFile. NewNativeImpl sets it from
// checkNativeAvailable.
available bool
}
// GoImpl uses the pure Go implementation (hashDirGo / hashFileGo in this
// file). It carries no state.
type GoImpl struct{}
// ZigImpl executes the Zig CLI as a subprocess.
type ZigImpl struct {
// cliPath is the executable passed to exec.Command. NewZigImpl fills it from
// findZigCLI, which returns "" when no binary is found.
cliPath string
// available gates HashDataset. NewZigImpl sets it to cliPath != "".
available bool
}
// Fixture represents a test case with known expected output. It is one
// element of ExpectedHashes.Fixtures and is decoded from JSON.
//
// NOTE(review): the per-field meanings below are inferred from the field
// names and JSON keys only; confirm against expected_hashes.json.
type Fixture struct {
// ID is read from the "id" key.
ID string `json:"id"`
// Name is read from the "name" key.
Name string `json:"name"`
// Description is read from the "description" key.
Description string `json:"description"`
// ExpectedHash is read from the "expected_hash" key; presumably the dataset
// hash every implementation must produce for this fixture.
ExpectedHash string `json:"expected_hash"`
// Files is read from the "files" key.
Files []FixtureFile `json:"files"`
}
// FixtureFile represents a file in a fixture. It is one element of
// Fixture.Files and is decoded from JSON.
type FixtureFile struct {
// Path is read from the "path" key; presumably relative to the fixture
// directory (TODO confirm).
Path string `json:"path"`
// ContentHash is read from the "content_hash" key; presumably the hash of the
// file's contents (TODO confirm).
ContentHash string `json:"content_hash"`
}
// ExpectedHashes is the root structure of expected_hashes.json, as decoded by
// LoadExpectedHashes.
type ExpectedHashes struct {
// Version is read from the "version" key.
Version string `json:"version"`
// Algorithm is read from the "algorithm" key.
Algorithm string `json:"algorithm"`
// Description is read from the "description" key.
Description string `json:"description"`
// Fixtures is read from the "fixtures" key.
Fixtures []Fixture `json:"fixtures"`
}
// Name returns the fixed identifier "native_c++" for this implementation.
func (n *NativeImpl) Name() string { return "native_c++" }
// Available reports the value of the available flag stored on n.
// NewNativeImpl sets that flag from checkNativeAvailable.
func (n *NativeImpl) Available() bool { return n.available }
// HashDataset computes the hash of the directory at path using the native
// library.
//
// When n is marked available the call is forwarded to hashWithNative;
// otherwise an error is returned without touching the native library.
func (n *NativeImpl) HashDataset(path string) (string, error) {
	if n.available {
		return hashWithNative(path)
	}
	return "", fmt.Errorf("native library not available")
}
// HashFile computes the hash of the single file at path using the native
// library.
//
// When n is marked available the call is forwarded to hashFileWithNative;
// otherwise an error is returned without touching the native library.
func (n *NativeImpl) HashFile(path string) (string, error) {
	if n.available {
		return hashFileWithNative(path)
	}
	return "", fmt.Errorf("native library not available")
}
// Name returns the fixed identifier "go_pure" for this implementation.
func (g *GoImpl) Name() string { return "go_pure" }
// Available always returns true: the Go implementation has no external
// dependency to probe for.
func (g *GoImpl) Available() bool { return true }
// HashDataset computes the hash of the directory at path by delegating to
// hashDirGo, and returns its result and error unchanged.
func (g *GoImpl) HashDataset(path string) (string, error) {
return hashDirGo(path)
}
// HashFile computes the hash of the single file at path by delegating to
// hashFileGo, and returns its result and error unchanged.
func (g *GoImpl) HashFile(path string) (string, error) {
return hashFileGo(path)
}
// Name returns the fixed identifier "zig_cli" for this implementation.
func (z *ZigImpl) Name() string { return "zig_cli" }
// Available reports the value of the available flag stored on z.
// NewZigImpl sets that flag to true when findZigCLI returned a non-empty path.
func (z *ZigImpl) Available() bool { return z.available }
// HashDataset computes the hash of the directory at path by running the Zig
// CLI as `<cli> dataset verify <abs path> --dry-run` and extracting the hash
// from what the command prints.
//
// It returns an error when the CLI is not available, when path cannot be made
// absolute, when the command fails (the captured output is included in the
// error), or when parseZigHashOutput finds no hash in the output.
func (z *ZigImpl) HashDataset(path string) (string, error) {
	if !z.available {
		return "", fmt.Errorf("zig CLI not available at %s", z.cliPath)
	}

	// The CLI is given an absolute path to avoid its PathTraversalAttempt error.
	abs, absErr := filepath.Abs(path)
	if absErr != nil {
		return "", fmt.Errorf("failed to get absolute path: %w", absErr)
	}

	// --dry-run makes `dataset verify` report the hash without verifying.
	args := []string{"dataset", "verify", abs, "--dry-run"}
	raw, runErr := exec.Command(z.cliPath, args...).CombinedOutput()
	text := string(raw)
	if runErr != nil {
		return "", fmt.Errorf("zig CLI failed: %w (output: %s)", runErr, text)
	}
	return parseZigHashOutput(text)
}
// HashFile returns the hash of the single file at path.
//
// It does not invoke the Zig CLI: the file is hashed in-process with
// hashFileGo, so the result is the same as GoImpl.HashFile and the
// available flag on z is not consulted.
func (z *ZigImpl) HashFile(path string) (string, error) {
// Zig CLI doesn't have a single file hash command, so we compute it directly
return hashFileGo(path)
}
// NewNativeImpl creates a native implementation wrapper. Availability is
// probed once, here, via checkNativeAvailable and stored on the result.
func NewNativeImpl() *NativeImpl {
	impl := new(NativeImpl)
	impl.available = checkNativeAvailable()
	return impl
}
// NewGoImpl creates a new Go implementation wrapper. GoImpl has no fields, so
// the returned value needs no further setup.
func NewGoImpl() *GoImpl {
return &GoImpl{}
}
// NewZigImpl creates a Zig implementation wrapper. The CLI is located once,
// here, via findZigCLI; the wrapper is marked available only when a non-empty
// path was found.
func NewZigImpl() *ZigImpl {
	impl := &ZigImpl{cliPath: findZigCLI()}
	impl.available = impl.cliPath != ""
	return impl
}
// LoadExpectedHashes reads and decodes
// <fixturesDir>/dataset_hash/expected_hashes.json.
//
// It returns a wrapped error when the file cannot be read or when its
// contents are not valid JSON for ExpectedHashes; in both cases the returned
// pointer is nil.
func LoadExpectedHashes(fixturesDir string) (*ExpectedHashes, error) {
	raw, err := os.ReadFile(filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json"))
	if err != nil {
		return nil, fmt.Errorf("failed to read expected hashes: %w", err)
	}

	parsed := new(ExpectedHashes)
	if err = json.Unmarshal(raw, parsed); err != nil {
		return nil, fmt.Errorf("failed to parse expected hashes: %w", err)
	}
	return parsed, nil
}
// ComputeExpectedHash computes the expected hash for the fixture directory at
// fixturePath by delegating to hashDirGo, the Go reference implementation in
// this file. The result and error are returned unchanged.
func ComputeExpectedHash(fixturePath string) (string, error) {
return hashDirGo(fixturePath)
}
// hashDirGo is the reference Go implementation of the dataset hash.
//
// Algorithm, as implemented here:
//  1. Walk root and collect the root-relative path of every non-directory
//     entry whose own name does not start with ".". Only the entry's base
//     name is tested, so files inside dot-directories are still collected.
//  2. Sort the relative paths with sort.Strings.
//  3. Hash each file with hashFileGo and feed the resulting hex digest
//     strings, in sorted order, into one SHA-256.
//
// The return value is the hex encoding of that outer SHA-256. Paths themselves
// are not hashed; they only determine the order. An error is returned when
// root cannot be stat'ed, is not a directory, the walk fails, or any file
// fails to hash.
func hashDirGo(root string) (string, error) {
	root = filepath.Clean(root)

	st, err := os.Stat(root)
	switch {
	case err != nil:
		return "", err
	case !st.IsDir():
		return "", fmt.Errorf("not a directory: %s", root)
	}

	var relPaths []string
	collect := func(path string, d os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if d.IsDir() {
			return nil
		}
		rel, relErr := filepath.Rel(root, path)
		if relErr != nil {
			return relErr
		}
		// Hidden files (base name starting with ".") are left out.
		if !strings.HasPrefix(filepath.Base(rel), ".") {
			relPaths = append(relPaths, rel)
		}
		return nil
	}
	if err := filepath.WalkDir(root, collect); err != nil {
		return "", err
	}

	// Sorting makes the result independent of directory enumeration order.
	sort.Strings(relPaths)

	// Combine per-file digests rather than file bytes so no file content
	// needs to be held beyond its own hashing.
	combined := sha256.New()
	for _, rel := range relPaths {
		digest, err := hashFileGo(filepath.Join(root, rel))
		if err != nil {
			return "", err
		}
		combined.Write([]byte(digest))
	}
	return hex.EncodeToString(combined.Sum(nil)), nil
}
// hashFileGo computes the SHA-256 of the file at path and returns it as a
// lowercase hex string.
//
// The file is streamed through the hash, so memory use does not grow with
// file size. Any error from opening or reading the file is returned and no
// digest is produced; in particular a read that fails part-way (or a path
// that turns out to be a directory) yields an error instead of the digest of
// whatever bytes happened to be read before the failure.
func hashFileGo(path string) (string, error) {
	f, err := os.Open(filepath.Clean(path))
	if err != nil {
		return "", err
	}
	// The handle is read-only, so a Close error cannot lose data.
	defer f.Close()

	h := sha256.New()
	// io.Copy treats io.EOF as normal completion and reports every other
	// read error.
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}
// checkNativeAvailable reports whether the native dataset_hash shared library
// exists under ../../../native/build, resolved against the current working
// directory (the tests run from tests/integration/consistency/).
//
// It looks for libdataset_hash.dylib on darwin and libdataset_hash.so on
// every other GOOS. Only os.Stat is used, so the file is not loaded.
func checkNativeAvailable() bool {
	libName := "libdataset_hash.so"
	if runtime.GOOS == "darwin" {
		libName = "libdataset_hash.dylib"
	}
	_, statErr := os.Stat(filepath.Join("..", "..", "..", "native", "build", libName))
	return statErr == nil
}
// findZigCLI locates the Zig CLI binary and returns its path, or "" when none
// is found. Paths are resolved against the current working directory (the
// tests run from tests/integration/consistency/, so cli/ is ../../../cli).
//
// Search order, first hit wins:
//  1. a non-directory entry named "ml-*" in cli/zig-out/bin;
//  2. a non-directory entry named "ml-*" one level below cli/.zig-cache/o;
//  3. an executable named "ml" found via PATH.
func findZigCLI() string {
	cliDir := filepath.Join("..", "..", "..", "cli")
	looksLikeCLI := func(e os.DirEntry) bool {
		return !e.IsDir() && strings.HasPrefix(e.Name(), "ml-")
	}

	// Build output: binaries follow the ml-<os>-<arch> pattern.
	binDir := filepath.Join(cliDir, "zig-out", "bin")
	if binEntries, err := os.ReadDir(binDir); err == nil {
		for _, e := range binEntries {
			if looksLikeCLI(e) {
				return filepath.Join(binDir, e.Name())
			}
		}
	}

	// Fallback: the Zig build cache.
	cacheDir := filepath.Join(cliDir, ".zig-cache", "o")
	if objDirs, err := os.ReadDir(cacheDir); err == nil {
		for _, objDir := range objDirs {
			if !objDir.IsDir() {
				continue
			}
			// Best effort: an unreadable cache subdirectory is skipped.
			artifacts, _ := os.ReadDir(filepath.Join(cacheDir, objDir.Name()))
			for _, a := range artifacts {
				if looksLikeCLI(a) {
					return filepath.Join(cacheDir, objDir.Name(), a.Name())
				}
			}
		}
	}

	// Last resort: whatever "ml" resolves to on PATH.
	found, err := exec.LookPath("ml")
	if err != nil {
		return ""
	}
	return found
}
// parseZigHashOutput extracts the hash from the Zig CLI 'dataset verify'
// output.
//
// The output format is not assumed: the text is split on whitespace and the
// first token that is exactly 64 bytes long and decodes as hexadecimal (the
// shape of a SHA-256 digest) is returned unchanged. If no token qualifies, an
// error containing the full output is returned.
func parseZigHashOutput(output string) (string, error) {
	// Newlines are whitespace too, so splitting the whole text visits the
	// tokens in the same order as going line by line.
	for _, token := range strings.Fields(output) {
		if len(token) != 64 {
			continue
		}
		if _, decodeErr := hex.DecodeString(token); decodeErr != nil {
			continue
		}
		return token, nil
	}
	return "", fmt.Errorf("could not parse hash from output: %s", output)
}
// hashWithNative hashes the directory at path by calling
// C.fh_hash_directory_combined with the shared native context.
//
// A nil result from the C call is treated as failure; the error text comes
// from C.fh_last_error when that returns a non-nil string. On success the C
// string is copied into a Go string and then released with C.fh_free_string.
func hashWithNative(path string) (string, error) {
	ctx := getNativeHashContext()

	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	digest := C.fh_hash_directory_combined(ctx, cPath)
	if digest == nil {
		if msg := C.fh_last_error(ctx); msg != nil {
			return "", fmt.Errorf("native hash failed: %s", C.GoString(msg))
		}
		return "", fmt.Errorf("native hash failed")
	}

	// Copy first, then hand the C buffer back to the library.
	out := C.GoString(digest)
	C.fh_free_string(digest)
	return out, nil
}
// hashFileWithNative hashes the single file at path by calling C.fh_hash_file
// with the shared native context.
//
// A nil result from the C call is treated as failure; the error text comes
// from C.fh_last_error when that returns a non-nil string. On success the C
// string is copied into a Go string and then released with C.fh_free_string.
func hashFileWithNative(path string) (string, error) {
	ctx := getNativeHashContext()

	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	digest := C.fh_hash_file(ctx, cPath)
	if digest == nil {
		if msg := C.fh_last_error(ctx); msg != nil {
			return "", fmt.Errorf("native file hash failed: %s", C.GoString(msg))
		}
		return "", fmt.Errorf("native file hash failed")
	}

	// Copy first, then hand the C buffer back to the library.
	out := C.GoString(digest)
	C.fh_free_string(digest)
	return out, nil
}
// getNativeHashContext returns the shared native hash context, creating it on
// first use.
//
// nativeCtxOnce ensures C.fh_init runs at most once; it is called with
// runtime.NumCPU() as its only argument. Whatever fh_init returned is handed
// back unchecked, so callers receive nil if fh_init returned nil.
func getNativeHashContext() *C.fh_context_t {
	nativeCtxOnce.Do(func() {
		cpus := C.uint32_t(runtime.NumCPU())
		nativeCtx = C.fh_init(cpus)
	})
	return nativeCtx
}
// ComputeAllHashes runs HashDataset(path) on every implementation in impls
// and returns one entry per implementation name.
//
// Each map value is either the hash, "[not available]" when the
// implementation reports it cannot run, or "[error: <err>]" when hashing
// failed. The map is always returned; the error is non-nil when at least one
// implementation failed and lists every failure, joined with "; ".
func ComputeAllHashes(path string, impls []Implementation) (map[string]string, error) {
	results := make(map[string]string)
	var failures []string

	for _, impl := range impls {
		if !impl.Available() {
			results[impl.Name()] = "[not available]"
			continue
		}
		hash, err := impl.HashDataset(path)
		if err != nil {
			failures = append(failures, fmt.Sprintf("%s: %v", impl.Name(), err))
			results[impl.Name()] = fmt.Sprintf("[error: %v]", err)
			continue
		}
		results[impl.Name()] = hash
	}

	if len(failures) == 0 {
		return results, nil
	}
	return results, fmt.Errorf("errors: %s", strings.Join(failures, "; "))
}
// CompareHashes checks every hash in results against expected.
//
// results maps an implementation name to either a hash or a bracketed
// placeholder such as "[not available]" or "[error: ...]" (the shape produced
// by ComputeAllHashes). Values starting with "[" are skipped. Hashes are
// compared case-insensitively.
//
// It returns true when no compared hash differs, together with one
// "<name>: got <hash>, expected <expected>" message per mismatch. Messages
// are ordered by implementation name so the output is the same on every run
// regardless of map iteration order.
func CompareHashes(results map[string]string, expected string) (bool, []string) {
	names := make([]string, 0, len(results))
	for name := range results {
		names = append(names, name)
	}
	sort.Strings(names)

	var mismatches []string
	for _, name := range names {
		hash := results[name]
		if strings.HasPrefix(hash, "[") {
			// Not available or error - nothing to compare.
			continue
		}
		if !strings.EqualFold(hash, expected) {
			mismatches = append(mismatches,
				fmt.Sprintf("%s: got %s, expected %s", name, hash, expected))
		}
	}
	return len(mismatches) == 0, mismatches
}
// FormatHashComparison renders results as a readable report.
//
// The report starts with a header and the expected hash, followed by one line
// per implementation: the name, padding so the columns line up, a marker, and
// the value. The marker is "✓" when the hash equals expected
// (case-insensitively), "✗" when it differs, and a blank for bracketed
// placeholder values such as "[not available]" or "[error: ...]".
//
// Lines are ordered by implementation name so the report is identical on
// every run regardless of map iteration order.
func FormatHashComparison(results map[string]string, expected string) string {
	names := make([]string, 0, len(results))
	maxNameLen := 0
	for name := range results {
		names = append(names, name)
		if len(name) > maxNameLen {
			maxNameLen = len(name)
		}
	}
	sort.Strings(names)

	var buf bytes.Buffer
	buf.WriteString("Hash Comparison:\n")
	fmt.Fprintf(&buf, " Expected: %s\n", expected)
	buf.WriteString("\n")

	for _, name := range names {
		hash := results[name]
		marker := " "
		switch {
		case strings.HasPrefix(hash, "["):
			// Placeholder, not a hash: leave the marker blank.
		case strings.EqualFold(hash, expected):
			marker = "✓"
		default:
			marker = "✗"
		}
		padding := strings.Repeat(" ", maxNameLen-len(name))
		fmt.Fprintf(&buf, " %s:%s %s %s\n", name, padding, marker, hash)
	}
	return buf.String()
}