feat(cli): add shared dataset_hash utility and automatic hashing

Created utils/dataset_hash.zig:
- computeDatasetHash(allocator, path) -> [64]u8
- Returns a fixed 64-char hex string by value (stack allocated, nothing to free)
- Provides verifyDatasetIntegrity() for hash comparison
- Enables testing against native C++ implementations

Updated dataset.zig:
- verifyDataset() now automatically computes hash during verification
- Uses utils/dataset_hash.zig for hash computation
- Hash displayed in JSON output for reference
- No separate 'dataset hash' command needed

Benefits:
- Single source of truth for dataset hashing
- Testable independently for correctness verification
- Automatic during dataset verify operation
This commit is contained in:
Jeremie Fraeys 2026-03-05 10:57:39 -05:00
parent e2673be8b5
commit cb018934e1
No known key found for this signature in database
2 changed files with 57 additions and 19 deletions

View file

@ -417,16 +417,15 @@ fn verifyDataset(allocator: std.mem.Allocator, target: []const u8, options: *con
total_size += stat.size;
}
// Compute SHA256 hash using pure Zig implementation
// Compute SHA256 hash using shared utility
const hash = blk: {
const hash_mod = @import("../utils/hash.zig");
break :blk hash_mod.hashDirectoryToHex(allocator, path) catch |err| {
const hash_util = @import("../utils/dataset_hash.zig");
break :blk hash_util.computeDatasetHash(allocator, path) catch |err| {
std.debug.print("Hash computation failed: {s}\n", .{@errorName(err)});
// Continue without hash - verification still succeeded
break :blk null;
};
};
// hash is [64]u8 array (stack allocated), not heap allocated - no need to free
if (options.json) {
const stdout_file = std.fs.File{ .handle = std.posix.STDOUT_FILENO };
@ -464,21 +463,6 @@ fn verifyDataset(allocator: std.mem.Allocator, target: []const u8, options: *con
}
}
/// Compute the SHA256 hash of the dataset at `path` and report it.
///
/// Progress, the resulting digest, and any failure message are all emitted
/// through `std.debug.print`. If hashing fails, the failure is printed and
/// the same error is returned to the caller.
fn hashDataset(allocator: std.mem.Allocator, path: []const u8) !void {
    const hasher = @import("../utils/hash.zig");

    std.debug.print("Computing SHA256 hash for: {s}\n", .{path});

    // Branch on the error union directly: payload on success, error otherwise.
    if (hasher.hashDirectoryToHex(allocator, path)) |digest| {
        std.debug.print("SHA256: {s}\n", .{digest});
    } else |failure| {
        std.debug.print("Hash computation failed: {s}\n", .{@errorName(failure)});
        return failure;
    }
}
fn writeJSONString(writer: anytype, s: []const u8) !void {
try writer.writeByte('"');
for (s) |c| {

View file

@ -0,0 +1,54 @@
const std = @import("std");
const hash = @import("hash.zig");
const io = @import("io.zig");
/// Errors that can occur during dataset hashing
///
/// NOTE(review): no function signature visible in this file names this set
/// explicitly; `computeDatasetHash` uses an inferred error set. Confirm whether
/// callers are expected to reference `HashError` directly.
pub const HashError = error{
/// Returned by `computeDatasetHash` when the path fails its `..` check.
PathTraversalAttempt,
/// Returned by `computeDatasetHash` when the path exists but is not a
/// directory (despite the name).
NotAFile,
/// Not returned by any code visible here; presumably produced by the
/// underlying hash module - TODO confirm.
EmptyDirectory,
/// Not returned by any code visible here; presumably produced by the
/// underlying hash module - TODO confirm.
MaxDepthExceeded,
/// Allocation failure; not returned directly by code visible here.
OutOfMemory,
};
/// Compute the SHA256 hash of a dataset directory.
///
/// Returns the digest as a 64-character hex string in a fixed-size array that
/// is returned by value, so there is nothing for the caller to free.
///
/// Errors:
/// - `error.PathTraversalAttempt` if any component of `path` is exactly `..`.
/// - `error.NotAFile` if `path` exists but is not a directory (the name is
///   kept unchanged so existing callers that match on it keep working).
/// - Any error from `statFile` or `hash.hashDirectoryToHex` is propagated.
pub fn computeDatasetHash(allocator: std.mem.Allocator, path: []const u8) ![64]u8 {
    // Reject parent-directory components only. A plain substring search for
    // ".." would also refuse legitimate names such as "images..v2". Both '/'
    // and '\\' are treated as separators so the check stays conservative
    // regardless of the host platform's path conventions.
    var components = std.mem.tokenizeAny(u8, path, "/\\");
    while (components.next()) |component| {
        if (std.mem.eql(u8, component, "..")) return error.PathTraversalAttempt;
    }

    // The path must exist and must be a directory.
    const stat = try std.fs.cwd().statFile(path);
    if (stat.kind != .directory) {
        return error.NotAFile;
    }

    // Delegate the actual hashing to the existing hash module.
    return hash.hashDirectoryToHex(allocator, path);
}
/// Produce the display form of a hash string.
///
/// This is currently a pass-through: the returned slice is the input slice
/// itself, with no copying or reformatting.
pub fn formatHashResult(hash_str: []const u8) []const u8 {
    const display: []const u8 = hash_str;
    return display;
}
/// Verify dataset integrity by comparing its computed hash with an expected one.
///
/// Computes the hash of `path` with `computeDatasetHash` and returns it
/// together with a `valid` flag:
/// - if `expected_hash` is null there is nothing to compare against and
///   `valid` is true;
/// - otherwise `valid` is true only when `expected_hash` equals the computed
///   hex string, ignoring ASCII letter case. A length mismatch yields false.
///
/// Any error from `computeDatasetHash` is propagated to the caller.
pub fn verifyDatasetIntegrity(
    allocator: std.mem.Allocator,
    path: []const u8,
    expected_hash: ?[]const u8,
) !struct { hash: [64]u8, valid: bool } {
    const computed_hash = try computeDatasetHash(allocator, path);

    // Hex digests denote the same bytes in either letter case, so an expected
    // value written in uppercase by another tool must not fail verification.
    const valid = if (expected_hash) |expected|
        std.ascii.eqlIgnoreCase(&computed_hash, expected)
    else
        true;

    return .{ .hash = computed_hash, .valid = valid };
}