// Package consistency provides cross-implementation consistency testing // for native C++, Go, and Zig implementations. // //go:build cgo // +build cgo package consistency import ( "bytes" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "os" "os/exec" "path/filepath" "runtime" "sort" "strings" "sync" "unsafe" ) // #cgo darwin LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash // #cgo linux LDFLAGS: -L${SRCDIR}/../../../native/build -Wl,-rpath,${SRCDIR}/../../../native/build -ldataset_hash // #include "../../../native/dataset_hash/dataset_hash.h" // #include import "C" var ( nativeCtx *C.fh_context_t nativeCtxOnce sync.Once ) // Implementation defines the interface for a hashing implementation type Implementation interface { Name() string HashDataset(path string) (string, error) HashFile(path string) (string, error) Available() bool } // NativeImpl wraps the native C++ library via CGO type NativeImpl struct { available bool } // GoImpl uses the pure Go implementation type GoImpl struct{} // ZigImpl executes the Zig CLI as a subprocess type ZigImpl struct { cliPath string available bool } // Fixture represents a test case with known expected output type Fixture struct { ID string `json:"id"` Name string `json:"name"` Description string `json:"description"` ExpectedHash string `json:"expected_hash"` Files []FixtureFile `json:"files"` } // FixtureFile represents a file in a fixture type FixtureFile struct { Path string `json:"path"` ContentHash string `json:"content_hash"` } // ExpectedHashes is the root structure of expected_hashes.json type ExpectedHashes struct { Version string `json:"version"` Algorithm string `json:"algorithm"` Description string `json:"description"` Fixtures []Fixture `json:"fixtures"` } // Name returns the implementation name func (n *NativeImpl) Name() string { return "native_c++" } // Available returns true if native libraries are available func (n *NativeImpl) Available() bool { return n.available } // HashDataset computes the hash of a directory using native library func (n *NativeImpl) HashDataset(path string) (string, error) { if !n.available { return "", fmt.Errorf("native library not available") } // Call the native library through worker package return hashWithNative(path) } // HashFile computes the hash of a single file using native library func (n *NativeImpl) HashFile(path string) (string, error) { if !n.available { return "", fmt.Errorf("native library not available") } return hashFileWithNative(path) } // Name returns the implementation name func (g *GoImpl) Name() string { return "go_pure" } // Available always returns true for Go implementation func (g *GoImpl) Available() bool { return true } // HashDataset computes the hash of a directory using pure Go func (g *GoImpl) HashDataset(path string) (string, error) { return hashDirGo(path) } // HashFile computes the hash of a single file using pure Go func (g *GoImpl) HashFile(path string) (string, error) { return hashFileGo(path) } // Name returns the implementation name func (z *ZigImpl) Name() string { return "zig_cli" } // Available returns true if Zig CLI is found func (z *ZigImpl) Available() bool { return z.available } // HashDataset computes the hash of a directory using Zig CLI // Uses 'dataset verify' command which auto-hashes the dataset func (z *ZigImpl) HashDataset(path string) (string, error) { if !z.available { return "", fmt.Errorf("zig CLI not available at %s", z.cliPath) } // Convert to absolute path to avoid PathTraversalAttempt error absPath, err := filepath.Abs(path) if err != nil { return "", fmt.Errorf("failed to get absolute path: %w", err) } // Use dataset verify --dry-run to get the hash without verifying cmd := exec.Command(z.cliPath, "dataset", "verify", absPath, "--dry-run") output, err := cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("zig CLI failed: %w (output: %s)", err, string(output)) } // Parse output to extract hash return parseZigHashOutput(string(output)) } // HashFile computes the hash of a single file using Zig CLI func (z *ZigImpl) HashFile(path string) (string, error) { // Zig CLI doesn't have a single file hash command, so we compute it directly return hashFileGo(path) } // NewNativeImpl creates a new native implementation wrapper func NewNativeImpl() *NativeImpl { return &NativeImpl{ available: checkNativeAvailable(), } } // NewGoImpl creates a new Go implementation wrapper func NewGoImpl() *GoImpl { return &GoImpl{} } // NewZigImpl creates a new Zig implementation wrapper func NewZigImpl() *ZigImpl { cliPath := findZigCLI() return &ZigImpl{ cliPath: cliPath, available: cliPath != "", } } // LoadExpectedHashes loads the expected hash values from fixtures func LoadExpectedHashes(fixturesDir string) (*ExpectedHashes, error) { path := filepath.Join(fixturesDir, "dataset_hash", "expected_hashes.json") data, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("failed to read expected hashes: %w", err) } var expected ExpectedHashes if err := json.Unmarshal(data, &expected); err != nil { return nil, fmt.Errorf("failed to parse expected hashes: %w", err) } return &expected, nil } // ComputeExpectedHash computes the expected hash for a fixture using reference algorithm func ComputeExpectedHash(fixturePath string) (string, error) { return hashDirGo(fixturePath) } // hashDirGo is the reference Go implementation func hashDirGo(root string) (string, error) { root = filepath.Clean(root) info, err := os.Stat(root) if err != nil { return "", err } if !info.IsDir() { return "", fmt.Errorf("not a directory: %s", root) } var files []string err = filepath.WalkDir(root, func(path string, d os.DirEntry, walkErr error) error { if walkErr != nil { return walkErr } if d.IsDir() { return nil } // Skip hidden files rel, err := filepath.Rel(root, path) if err != nil { return err } if strings.HasPrefix(filepath.Base(rel), ".") { return nil } files = append(files, rel) return nil }) if err != nil { return "", err } // Deterministic order sort.Strings(files) // Hash file hashes to avoid holding all bytes overall := sha256.New() for _, rel := range files { p := filepath.Join(root, rel) sum, err := hashFileGo(p) if err != nil { return "", err } overall.Write([]byte(sum)) } return hex.EncodeToString(overall.Sum(nil)), nil } // hashFileGo computes SHA256 of a file func hashFileGo(path string) (string, error) { f, err := os.Open(filepath.Clean(path)) if err != nil { return "", err } defer f.Close() h := sha256.New() if _, err := h.Write([]byte{}); err != nil { return "", err } buf := make([]byte, 64*1024) for { n, err := f.Read(buf) if n > 0 { h.Write(buf[:n]) } if err != nil { break } } return hex.EncodeToString(h.Sum(nil)), nil } // checkNativeAvailable checks if native libraries are available // Called from tests/integration/consistency/, so native/build is at ../../../native/build func checkNativeAvailable() bool { libDir := filepath.Join("..", "..", "..", "native", "build") if runtime.GOOS == "darwin" { _, err := os.Stat(filepath.Join(libDir, "libdataset_hash.dylib")) return err == nil } _, err := os.Stat(filepath.Join(libDir, "libdataset_hash.so")) return err == nil } // findZigCLI locates the Zig CLI binary // Called from tests/integration/consistency/, so cli/ is at ../../../cli/ func findZigCLI() string { // Check zig-out/bin for ml-- pattern zigBinDir := filepath.Join("..", "..", "..", "cli", "zig-out", "bin") if entries, err := os.ReadDir(zigBinDir); err == nil { for _, entry := range entries { if strings.HasPrefix(entry.Name(), "ml-") && !entry.IsDir() { return filepath.Join(zigBinDir, entry.Name()) } } } // Fallback: check .zig-cache cacheDir := filepath.Join("..", "..", "..", "cli", ".zig-cache", "o") if entries, err := os.ReadDir(cacheDir); err == nil { for _, entry := range entries { if entry.IsDir() { subEntries, _ := os.ReadDir(filepath.Join(cacheDir, entry.Name())) for _, sub := range subEntries { if strings.HasPrefix(sub.Name(), "ml-") && !sub.IsDir() { return filepath.Join(cacheDir, entry.Name(), sub.Name()) } } } } } // Try PATH if path, err := exec.LookPath("ml"); err == nil { return path } return "" } // parseZigHashOutput extracts the hash from Zig CLI 'dataset verify' output // Expected format contains hash information from the verify command func parseZigHashOutput(output string) (string, error) { // Look for hash in the output - various formats possible lines := strings.SplitSeq(output, "\n") for line := range lines { // Try to find a 64-character hex string (SHA256) fields := strings.FieldsSeq(line) for field := range fields { field = strings.TrimSpace(field) // Check if it looks like a SHA256 hash (64 hex chars) if len(field) == 64 { // Verify it's valid hex if _, err := hex.DecodeString(field); err == nil { return field, nil } } } } return "", fmt.Errorf("could not parse hash from output: %s", output) } // hashWithNative calls the native library via CGO func hashWithNative(path string) (string, error) { ctx := getNativeHashContext() croot := C.CString(path) defer C.free(unsafe.Pointer(croot)) result := C.fh_hash_directory_combined(ctx, croot) if result == nil { err := C.fh_last_error(ctx) if err != nil { return "", fmt.Errorf("native hash failed: %s", C.GoString(err)) } return "", fmt.Errorf("native hash failed") } defer C.fh_free_string(result) return C.GoString(result), nil } // hashFileWithNative calls the native library for single file func hashFileWithNative(path string) (string, error) { ctx := getNativeHashContext() cpath := C.CString(path) defer C.free(unsafe.Pointer(cpath)) result := C.fh_hash_file(ctx, cpath) if result == nil { err := C.fh_last_error(ctx) if err != nil { return "", fmt.Errorf("native file hash failed: %s", C.GoString(err)) } return "", fmt.Errorf("native file hash failed") } defer C.fh_free_string(result) return C.GoString(result), nil } // getNativeHashContext initializes and returns the native hash context func getNativeHashContext() *C.fh_context_t { nativeCtxOnce.Do(func() { nativeCtx = C.fh_init(C.uint32_t(runtime.NumCPU())) }) return nativeCtx } // ComputeAllHashes runs all available implementations on a path func ComputeAllHashes(path string, impls []Implementation) (map[string]string, error) { results := make(map[string]string) var errors []string for _, impl := range impls { if !impl.Available() { results[impl.Name()] = "[not available]" continue } hash, err := impl.HashDataset(path) if err != nil { errors = append(errors, fmt.Sprintf("%s: %v", impl.Name(), err)) results[impl.Name()] = fmt.Sprintf("[error: %v]", err) } else { results[impl.Name()] = hash } } if len(errors) > 0 { return results, fmt.Errorf("errors: %s", strings.Join(errors, "; ")) } return results, nil } // CompareHashes checks if all hashes match the expected value func CompareHashes(results map[string]string, expected string) (bool, []string) { var mismatches []string for name, hash := range results { if strings.HasPrefix(hash, "[") { // Not available or error - skip comparison continue } if !strings.EqualFold(hash, expected) { mismatches = append(mismatches, fmt.Sprintf("%s: got %s, expected %s", name, hash, expected)) } } return len(mismatches) == 0, mismatches } // FormatHashComparison creates a readable comparison of hashes func FormatHashComparison(results map[string]string, expected string) string { var buf bytes.Buffer fmt.Fprintf(&buf, "Hash Comparison:\n") fmt.Fprintf(&buf, " Expected: %s\n", expected) fmt.Fprintf(&buf, "\n") maxNameLen := 0 for name := range results { if len(name) > maxNameLen { maxNameLen = len(name) } } for name, hash := range results { padding := strings.Repeat(" ", maxNameLen-len(name)) match := " " if !strings.HasPrefix(hash, "[") { if strings.EqualFold(hash, expected) { match = "✓" } else { match = "✗" } } fmt.Fprintf(&buf, " %s:%s %s %s\n", name, padding, match, hash) } return buf.String() }