Add cross-implementation consistency tests for dataset hash functionality:

## Test Fixtures
- Single-file, nested-directory, and multiple-file test cases
- Expected hashes in JSON format for validation

## Test Infrastructure
- harness.go: Common test utilities and reference implementation runner
- dataset_hash_test.go: Consistency test cases comparing implementations
- cmd/update.go: Tool to regenerate expected hashes from the reference implementation

## Purpose
Ensures the hash implementations (Go, C++, Zig) produce identical results on all supported platforms.
472 lines
12 KiB
Go
472 lines
12 KiB
Go
// Package consistency provides cross-implementation consistency testing
|
|
// for native C++, Go, and Zig implementations.
|
|
//
|
|
//go:build cgo
|
|
// +build cgo
|
|
|
|
package consistency
|
|
|
|
import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"unsafe"
)
|
|
|
|
// #cgo darwin LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
|
|
// #cgo linux LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash
|
|
// #include "../../../native/dataset_hash/dataset_hash.h"
|
|
// #include <stdlib.h>
|
|
import "C"
|
|
|
|
var (
|
|
nativeCtx *C.fh_context_t
|
|
nativeCtxOnce sync.Once
|
|
)
|
|
|
|
// Implementation is the contract every hashing backend under test fulfils.
type Implementation interface {
	// Name returns the label used to key this backend's results.
	Name() string
	// HashDataset returns the hash of the directory at path.
	HashDataset(path string) (string, error)
	// HashFile returns the hash of the single file at path.
	HashFile(path string) (string, error)
	// Available reports whether the backend can be used; ComputeAllHashes
	// skips backends for which it returns false.
	Available() bool
}
|
|
|
|
// NativeImpl is the backend that hashes through the native C++ library via
// cgo.
type NativeImpl struct {
	// available records the result of the library lookup done at construction.
	available bool
}
|
|
|
|
// GoImpl is the backend that hashes with the pure Go reference code in this
// package. It carries no state.
type GoImpl struct{}
|
|
|
|
// ZigImpl is the backend that hashes by running the Zig CLI as a subprocess.
type ZigImpl struct {
	// cliPath is the binary to execute; empty when no CLI was located.
	cliPath string
	// available is true when cliPath is non-empty.
	available bool
}
|
|
|
|
// Fixture represents a test case with known expected output
|
|
type Fixture struct {
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
Description string `json:"description"`
|
|
ExpectedHash string `json:"expected_hash"`
|
|
Files []FixtureFile `json:"files"`
|
|
}
|
|
|
|
// FixtureFile describes a single file listed under a fixture's "files" key.
type FixtureFile struct {
	// Path is decoded from the "path" key.
	Path string `json:"path"`
	// ContentHash is decoded from the "content_hash" key.
	ContentHash string `json:"content_hash"`
}
|
|
|
|
// ExpectedHashes is the root structure of expected_hashes.json
|
|
type ExpectedHashes struct {
|
|
Version string `json:"version"`
|
|
Algorithm string `json:"algorithm"`
|
|
Description string `json:"description"`
|
|
Fixtures []Fixture `json:"fixtures"`
|
|
}
|
|
|
|
// Name returns the implementation name
|
|
func (n *NativeImpl) Name() string { return "native_c++" }
|
|
|
|
// Available returns true if native libraries are available
|
|
func (n *NativeImpl) Available() bool { return n.available }
|
|
|
|
// HashDataset computes the hash of a directory using native library
|
|
func (n *NativeImpl) HashDataset(path string) (string, error) {
|
|
if !n.available {
|
|
return "", fmt.Errorf("native library not available")
|
|
}
|
|
|
|
// Call the native library through worker package
|
|
return hashWithNative(path)
|
|
}
|
|
|
|
// HashFile computes the hash of a single file using native library
|
|
func (n *NativeImpl) HashFile(path string) (string, error) {
|
|
if !n.available {
|
|
return "", fmt.Errorf("native library not available")
|
|
}
|
|
|
|
return hashFileWithNative(path)
|
|
}
|
|
|
|
// Name returns the implementation name
|
|
func (g *GoImpl) Name() string { return "go_pure" }
|
|
|
|
// Available always returns true for Go implementation
|
|
func (g *GoImpl) Available() bool { return true }
|
|
|
|
// HashDataset computes the hash of a directory using pure Go
|
|
func (g *GoImpl) HashDataset(path string) (string, error) {
|
|
return hashDirGo(path)
|
|
}
|
|
|
|
// HashFile computes the hash of a single file using pure Go
|
|
func (g *GoImpl) HashFile(path string) (string, error) {
|
|
return hashFileGo(path)
|
|
}
|
|
|
|
// Name returns the implementation name
|
|
func (z *ZigImpl) Name() string { return "zig_cli" }
|
|
|
|
// Available returns true if Zig CLI is found
|
|
func (z *ZigImpl) Available() bool { return z.available }
|
|
|
|
// HashDataset computes the hash of a directory using Zig CLI
|
|
// Uses 'dataset verify' command which auto-hashes the dataset
|
|
func (z *ZigImpl) HashDataset(path string) (string, error) {
|
|
if !z.available {
|
|
return "", fmt.Errorf("zig CLI not available at %s", z.cliPath)
|
|
}
|
|
|
|
// Convert to absolute path to avoid PathTraversalAttempt error
|
|
absPath, err := filepath.Abs(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to get absolute path: %w", err)
|
|
}
|
|
|
|
// Use dataset verify --dry-run to get the hash without verifying
|
|
cmd := exec.Command(z.cliPath, "dataset", "verify", absPath, "--dry-run")
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
return "", fmt.Errorf("zig CLI failed: %w (output: %s)", err, string(output))
|
|
}
|
|
|
|
// Parse output to extract hash
|
|
return parseZigHashOutput(string(output))
|
|
}
|
|
|
|
// HashFile computes the hash of a single file using Zig CLI
|
|
func (z *ZigImpl) HashFile(path string) (string, error) {
|
|
// Zig CLI doesn't have a single file hash command, so we compute it directly
|
|
return hashFileGo(path)
|
|
}
|
|
|
|
// NewNativeImpl creates a new native implementation wrapper
|
|
func NewNativeImpl() *NativeImpl {
|
|
return &NativeImpl{
|
|
available: checkNativeAvailable(),
|
|
}
|
|
}
|
|
|
|
// NewGoImpl creates a new Go implementation wrapper
|
|
func NewGoImpl() *GoImpl {
|
|
return &GoImpl{}
|
|
}
|
|
|
|
// NewZigImpl creates a new Zig implementation wrapper
|
|
func NewZigImpl() *ZigImpl {
|
|
cliPath := findZigCLI()
|
|
return &ZigImpl{
|
|
cliPath: cliPath,
|
|
available: cliPath != "",
|
|
}
|
|
}
|
|
|
|
// LoadExpectedHashes loads the expected hash values from fixtures
|
|
func LoadExpectedHashes(fixturesDir string) (*ExpectedHashes, error) {
|
|
path := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json")
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read expected hashes: %w", err)
|
|
}
|
|
|
|
var expected ExpectedHashes
|
|
if err := json.Unmarshal(data, &expected); err != nil {
|
|
return nil, fmt.Errorf("failed to parse expected hashes: %w", err)
|
|
}
|
|
|
|
return &expected, nil
|
|
}
|
|
|
|
// ComputeExpectedHash computes the expected hash for a fixture using reference algorithm
|
|
func ComputeExpectedHash(fixturePath string) (string, error) {
|
|
return hashDirGo(fixturePath)
|
|
}
|
|
|
|
// hashDirGo is the reference Go implementation
|
|
func hashDirGo(root string) (string, error) {
|
|
root = filepath.Clean(root)
|
|
info, err := os.Stat(root)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if !info.IsDir() {
|
|
return "", fmt.Errorf("not a directory: %s", root)
|
|
}
|
|
|
|
var files []string
|
|
err = filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error {
|
|
if walkErr != nil {
|
|
return walkErr
|
|
}
|
|
if d.IsDir() {
|
|
return nil
|
|
}
|
|
|
|
// Skip hidden files
|
|
rel, err := filepath.Rel(root, path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if strings.HasPrefix(filepath.Base(rel), ".") {
|
|
return nil
|
|
}
|
|
|
|
files = append(files, rel)
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// Deterministic order
|
|
sort.Strings(files)
|
|
|
|
// Hash file hashes to avoid holding all bytes
|
|
overall := sha256.New()
|
|
for _, rel := range files {
|
|
p := filepath.Join(root, rel)
|
|
sum, err := hashFileGo(p)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
overall.Write([]byte(sum))
|
|
}
|
|
return hex.EncodeToString(overall.Sum(nil)), nil
|
|
}
|
|
|
|
// hashFileGo returns the lowercase hex-encoded SHA-256 digest of the file at
// path.
//
// The content is streamed through the hasher, so memory use does not grow
// with file size. Any failure to open or read the file is returned as an
// error; a digest of partially read data is never reported as success.
func hashFileGo(path string) (string, error) {
	f, err := os.Open(filepath.Clean(path))
	if err != nil {
		return "", err
	}
	defer f.Close()

	h := sha256.New()
	// io.Copy treats io.EOF as normal termination and surfaces every other
	// read error, so an I/O failure cannot masquerade as a valid hash.
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}
|
|
|
|
// checkNativeAvailable reports whether the native shared library exists
// under ../../../native/build relative to the current working directory.
//
// The harness is run from tests/integration/consistency/, which is what
// makes that relative path point at the repository's native/build. It looks
// for libdataset_hash.dylib on darwin and libdataset_hash.so everywhere else.
func checkNativeAvailable() bool {
	libName := "libdataset_hash.so"
	if runtime.GOOS == "darwin" {
		libName = "libdataset_hash.dylib"
	}

	_, err := os.Stat(filepath.Join("..", "..", "..", "native", "build", libName))
	return err == nil
}
|
|
|
|
// findZigCLI returns the path of the Zig CLI binary, or "" if none is found.
//
// Paths are relative to the current working directory; the harness is run
// from tests/integration/consistency/, so cli/ is at ../../../cli/. The
// search order is:
//
//  1. the first non-directory entry named "ml-*" in cli/zig-out/bin;
//  2. the first non-directory entry named "ml-*" in any immediate
//     subdirectory of cli/.zig-cache/o;
//  3. an executable named "ml" on PATH.
func findZigCLI() string {
	// pick returns the joined path of the first "ml-" prefixed non-directory
	// entry, or "" when there is none.
	pick := func(dir string, entries []os.DirEntry) string {
		for _, e := range entries {
			if strings.HasPrefix(e.Name(), "ml-") && !e.IsDir() {
				return filepath.Join(dir, e.Name())
			}
		}
		return ""
	}

	// Build output: zig-out/bin holds binaries named ml-<os>-<arch>.
	binDir := filepath.Join("..", "..", "..", "cli", "zig-out", "bin")
	if entries, err := os.ReadDir(binDir); err == nil {
		if found := pick(binDir, entries); found != "" {
			return found
		}
	}

	// Fallback: look one level down inside the Zig cache.
	cacheRoot := filepath.Join("..", "..", "..", "cli", ".zig-cache", "o")
	if buckets, err := os.ReadDir(cacheRoot); err == nil {
		for _, bucket := range buckets {
			if !bucket.IsDir() {
				continue
			}
			bucketDir := filepath.Join(cacheRoot, bucket.Name())
			// Best effort: a read error here just means whatever entries
			// were returned (possibly none) are inspected.
			children, _ := os.ReadDir(bucketDir)
			if found := pick(bucketDir, children); found != "" {
				return found
			}
		}
	}

	// Last resort: an installed "ml" on PATH.
	if onPath, err := exec.LookPath("ml"); err == nil {
		return onPath
	}
	return ""
}
|
|
|
|
// parseZigHashOutput pulls a SHA-256 digest out of the text printed by the
// Zig CLI's "dataset verify" command.
//
// The output format is not fixed, so the text is scanned token by token
// (split on whitespace, in order of appearance) and the first token that is
// exactly 64 characters long and decodes as hexadecimal is returned as is.
// An error quoting the full output is returned when no token qualifies.
func parseZigHashOutput(output string) (string, error) {
	const sha256HexLen = 64

	for _, token := range strings.Fields(output) {
		if len(token) != sha256HexLen {
			continue
		}
		if _, err := hex.DecodeString(token); err != nil {
			// Right length but not hex; keep looking.
			continue
		}
		return token, nil
	}
	return "", fmt.Errorf("could not parse hash from output: %s", output)
}
|
|
|
|
// hashWithNative calls the native library via CGO
|
|
func hashWithNative(path string) (string, error) {
|
|
ctx := getNativeHashContext()
|
|
|
|
croot := C.CString(path)
|
|
defer C.free(unsafe.Pointer(croot))
|
|
|
|
result := C.fh_hash_directory_combined(ctx, croot)
|
|
if result == nil {
|
|
err := C.fh_last_error(ctx)
|
|
if err != nil {
|
|
return "", fmt.Errorf("native hash failed: %s", C.GoString(err))
|
|
}
|
|
return "", fmt.Errorf("native hash failed")
|
|
}
|
|
defer C.fh_free_string(result)
|
|
|
|
return C.GoString(result), nil
|
|
}
|
|
|
|
// hashFileWithNative calls the native library for single file
|
|
func hashFileWithNative(path string) (string, error) {
|
|
ctx := getNativeHashContext()
|
|
|
|
cpath := C.CString(path)
|
|
defer C.free(unsafe.Pointer(cpath))
|
|
|
|
result := C.fh_hash_file(ctx, cpath)
|
|
if result == nil {
|
|
err := C.fh_last_error(ctx)
|
|
if err != nil {
|
|
return "", fmt.Errorf("native file hash failed: %s", C.GoString(err))
|
|
}
|
|
return "", fmt.Errorf("native file hash failed")
|
|
}
|
|
defer C.fh_free_string(result)
|
|
|
|
return C.GoString(result), nil
|
|
}
|
|
|
|
// getNativeHashContext initializes and returns the native hash context
|
|
func getNativeHashContext() *C.fh_context_t {
|
|
nativeCtxOnce.Do(func() {
|
|
nativeCtx = C.fh_init(C.uint32_t(runtime.NumCPU()))
|
|
})
|
|
return nativeCtx
|
|
}
|
|
|
|
// ComputeAllHashes runs all available implementations on a path
|
|
func ComputeAllHashes(path string, impls []Implementation) (map[string]string, error) {
|
|
results := make(map[string]string)
|
|
var errors []string
|
|
|
|
for _, impl := range impls {
|
|
if !impl.Available() {
|
|
results[impl.Name()] = "[not available]"
|
|
continue
|
|
}
|
|
|
|
hash, err := impl.HashDataset(path)
|
|
if err != nil {
|
|
errors = append(errors, fmt.Sprintf("%s: %v", impl.Name(), err))
|
|
results[impl.Name()] = fmt.Sprintf("[error: %v]", err)
|
|
} else {
|
|
results[impl.Name()] = hash
|
|
}
|
|
}
|
|
|
|
if len(errors) > 0 {
|
|
return results, fmt.Errorf("errors: %s", strings.Join(errors, "; "))
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
// CompareHashes checks every real hash in results against expected.
//
// results maps implementation name to hash as produced by ComputeAllHashes.
// Entries whose value starts with "[" are placeholders ("[not available]",
// "[error: ...]") and are skipped. Comparison is case-insensitive.
//
// It returns true when no compared entry differs, together with one
// "name: got X, expected Y" message per differing entry. Messages are
// ordered by implementation name so the output is the same on every run
// regardless of map iteration order.
func CompareHashes(results map[string]string, expected string) (bool, []string) {
	names := make([]string, 0, len(results))
	for name := range results {
		names = append(names, name)
	}
	sort.Strings(names)

	var mismatches []string
	for _, name := range names {
		hash := results[name]
		if strings.HasPrefix(hash, "[") {
			// Not available or error - nothing to compare.
			continue
		}
		if !strings.EqualFold(hash, expected) {
			mismatches = append(mismatches,
				fmt.Sprintf("%s: got %s, expected %s", name, hash, expected))
		}
	}

	return len(mismatches) == 0, mismatches
}
|
|
|
|
// FormatHashComparison renders results as a human-readable report.
//
// The report starts with a header and the expected hash, followed by one row
// per implementation: the name, padding so the columns line up, a marker,
// and the hash. The marker is "✓" for a case-insensitive match with
// expected, "✗" for a mismatch, and a blank for placeholder values (those
// starting with "[").
//
// Rows are sorted by implementation name so the report is identical on every
// run regardless of map iteration order.
func FormatHashComparison(results map[string]string, expected string) string {
	names := make([]string, 0, len(results))
	maxNameLen := 0
	for name := range results {
		names = append(names, name)
		if len(name) > maxNameLen {
			maxNameLen = len(name)
		}
	}
	sort.Strings(names)

	var buf bytes.Buffer

	fmt.Fprintf(&buf, "Hash Comparison:\n")
	fmt.Fprintf(&buf, " Expected: %s\n", expected)
	fmt.Fprintf(&buf, "\n")

	for _, name := range names {
		hash := results[name]
		padding := strings.Repeat(" ", maxNameLen-len(name))

		match := " "
		if !strings.HasPrefix(hash, "[") {
			if strings.EqualFold(hash, expected) {
				match = "✓"
			} else {
				match = "✗"
			}
		}
		fmt.Fprintf(&buf, " %s:%s %s %s\n", name, padding, match, hash)
	}

	return buf.String()
}
|