From cb018934e126dffabe7100c361ec5eb86c88e18b Mon Sep 17 00:00:00 2001
From: Jeremie Fraeys
Date: Thu, 5 Mar 2026 10:57:39 -0500
Subject: [PATCH] feat(cli): add shared dataset_hash utility and automatic hashing

Created utils/dataset_hash.zig:
- computeDatasetHash(allocator, path) -> [64]u8
- Returns fixed 64-char hex string (stack allocated)
- Provides verifyDatasetIntegrity() for hash comparison
- Enables testing against native C++ implementations

Updated dataset.zig:
- verifyDataset() now automatically computes hash during verification
- Uses utils/dataset_hash.zig for hash computation
- Hash displayed in JSON output for reference
- No separate 'dataset hash' command needed

Benefits:
- Single source of truth for dataset hashing
- Testable independently for correctness verification
- Automatic during dataset verify operation
---
 cli/src/commands/dataset.zig   | 22 ++----------
 cli/src/utils/dataset_hash.zig | 64 ++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 19 deletions(-)
 create mode 100644 cli/src/utils/dataset_hash.zig

diff --git a/cli/src/commands/dataset.zig b/cli/src/commands/dataset.zig
index e1a3e06..d6e57de 100644
--- a/cli/src/commands/dataset.zig
+++ b/cli/src/commands/dataset.zig
@@ -417,16 +417,15 @@ fn verifyDataset(allocator: std.mem.Allocator, target: []const u8, options: *con
         total_size += stat.size;
     }
 
-    // Compute SHA256 hash using pure Zig implementation
+    // Compute SHA256 hash using shared utility
     const hash = blk: {
-        const hash_mod = @import("../utils/hash.zig");
-        break :blk hash_mod.hashDirectoryToHex(allocator, path) catch |err| {
+        const hash_util = @import("../utils/dataset_hash.zig");
+        break :blk hash_util.computeDatasetHash(allocator, path) catch |err| {
             std.debug.print("Hash computation failed: {s}\n", .{@errorName(err)});
             // Continue without hash - verification still succeeded
             break :blk null;
         };
     };
-    // hash is [64]u8 array (stack allocated), not heap allocated - no need to free
 
     if (options.json) {
         const stdout_file = std.fs.File{ .handle = std.posix.STDOUT_FILENO };
@@ -464,21 +463,6 @@ fn verifyDataset(allocator: std.mem.Allocator, target: []const u8, options: *con
     }
 }
 
-fn hashDataset(allocator: std.mem.Allocator, path: []const u8) !void {
-    std.debug.print("Computing SHA256 hash for: {s}\n", .{path});
-
-    const hash_mod = @import("../utils/hash.zig");
-
-    // Compute hash using pure Zig implementation
-    const hash = hash_mod.hashDirectoryToHex(allocator, path) catch |err| {
-        std.debug.print("Hash computation failed: {s}\n", .{@errorName(err)});
-        return err;
-    };
-
-    // Print result
-    std.debug.print("SHA256: {s}\n", .{hash});
-}
-
 fn writeJSONString(writer: anytype, s: []const u8) !void {
     try writer.writeByte('"');
     for (s) |c| {
diff --git a/cli/src/utils/dataset_hash.zig b/cli/src/utils/dataset_hash.zig
new file mode 100644
index 0000000..67bebca
--- /dev/null
+++ b/cli/src/utils/dataset_hash.zig
@@ -0,0 +1,64 @@
+const std = @import("std");
+const hash = @import("hash.zig");
+const io = @import("io.zig");
+
+/// Errors that can occur during dataset hashing.
+/// Note: `computeDatasetHash` has an inferred error set, so it can also
+/// return filesystem errors that are not listed here.
+pub const HashError = error{
+    PathTraversalAttempt,
+    NotAFile,
+    EmptyDirectory,
+    MaxDepthExceeded,
+    OutOfMemory,
+};
+
+/// Compute SHA256 hash of a dataset directory.
+/// Returns the 64-char hex string of the hash by value (no need to free).
+///
+/// Fails with `error.PathTraversalAttempt` when a component of `path` is `..`
+/// and with `error.NotAFile` when `path` is not a directory (the name is kept
+/// for compatibility with `HashError`). Errors from `statFile` and
+/// `hash.hashDirectoryToHex` are propagated unchanged.
+pub fn computeDatasetHash(allocator: std.mem.Allocator, path: []const u8) ![64]u8 {
+    // Reject `..` only as a whole path component, so names such as
+    // `data..v2` are still accepted. Both `/` and `\` count as separators.
+    var components = std.mem.tokenizeAny(u8, path, "/\\");
+    while (components.next()) |component| {
+        if (std.mem.eql(u8, component, "..")) return error.PathTraversalAttempt;
+    }
+
+    // The path must exist and be a directory.
+    const stat = try std.fs.cwd().statFile(path);
+    if (stat.kind != .directory) return error.NotAFile;
+
+    // Compute hash using existing hash module
+    return hash.hashDirectoryToHex(allocator, path);
+}
+
+/// Format hash result for display.
+/// Currently returns `hash_str` unchanged.
+pub fn formatHashResult(hash_str: []const u8) []const u8 {
+    return hash_str;
+}
+
+/// Verify dataset integrity by comparing hashes.
+///
+/// Computes the hash of `path`. When `expected_hash` is given, `valid` tells
+/// whether it equals the computed hex digest (ASCII case is ignored, so
+/// upper-case hex from other tools still matches). When it is null, `valid`
+/// is always true. Errors from `computeDatasetHash` are propagated.
+pub fn verifyDatasetIntegrity(
+    allocator: std.mem.Allocator,
+    path: []const u8,
+    expected_hash: ?[]const u8,
+) !struct { hash: [64]u8, valid: bool } {
+    const computed_hash = try computeDatasetHash(allocator, path);
+
+    const valid = if (expected_hash) |expected|
+        std.ascii.eqlIgnoreCase(&computed_hash, expected)
+    else
+        true;
+
+    return .{ .hash = computed_hash, .valid = valid };
+}