From 2258f60ade1ed3809e4d84426075a7f08adbd82e Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Fri, 20 Feb 2026 15:51:10 -0500 Subject: [PATCH] feat(cli): add utility modules for local mode - Add hash_cache.zig for efficient file hash caching - Add ignore.zig for .gitignore-style pattern matching - Add native_hash.zig for C dataset_hash library integration --- cli/src/utils/hash_cache.zig | 82 +++++++++++++++++------------------ cli/src/utils/ignore.zig | 14 +++--- cli/src/utils/native_hash.zig | 46 ++++++++++---------- 3 files changed, 71 insertions(+), 71 deletions(-) diff --git a/cli/src/utils/hash_cache.zig b/cli/src/utils/hash_cache.zig index 9148256..ef0437c 100644 --- a/cli/src/utils/hash_cache.zig +++ b/cli/src/utils/hash_cache.zig @@ -45,16 +45,16 @@ pub const HashCache = struct { const home = std.posix.getenv("HOME") orelse { return error.NoHomeDirectory; }; - + // Ensure cache directory exists const cache_dir = try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache" }); defer allocator.free(cache_dir); - + std.fs.cwd().makeDir(cache_dir) catch |err| switch (err) { error.PathAlreadyExists => {}, else => return err, }; - + return try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache", "hashes.json" }); } @@ -62,37 +62,37 @@ pub const HashCache = struct { pub fn load(self: *HashCache) !void { const cache_path = try getDefaultPath(self.allocator); self.cache_path = cache_path; - + const file = std.fs.cwd().openFile(cache_path, .{}) catch |err| switch (err) { - error.FileNotFound => return, // No cache yet is fine + error.FileNotFound => return, // No cache yet is fine else => return err, }; defer file.close(); - - const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB + + const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB defer self.allocator.free(content); - + // Parse JSON const parsed = try std.json.parseFromSlice(std.json.Value, self.allocator, content, .{}); defer parsed.deinit(); - + const root = parsed.value.object; const version = root.get("version") orelse return error.InvalidCacheFormat; if (version.integer != 1) return error.UnsupportedCacheVersion; - + const files = root.get("files") orelse return error.InvalidCacheFormat; if (files.object.count() == 0) return; - + var it = files.object.iterator(); while (it.next()) |entry| { const path = try self.allocator.dupe(u8, entry.key_ptr.*); - + const file_obj = entry.value_ptr.object; const mtime = file_obj.get("mtime") orelse continue; const hash_val = file_obj.get("hash") orelse continue; - + const hash = try self.allocator.dupe(u8, hash_val.string); - + try self.entries.put(path, .{ .mtime = mtime.integer, .hash = hash, @@ -103,46 +103,46 @@ pub const HashCache = struct { /// Save cache to disk pub fn save(self: *HashCache) !void { if (!self.dirty) return; - + var json_str = std.ArrayList(u8).init(self.allocator); defer json_str.deinit(); - + var writer = json_str.writer(); - + // Write header try writer.print("{{\n \"version\": 1,\n \"files\": {{\n", .{}); - + // Write entries var it = self.entries.iterator(); var first = true; while (it.next()) |entry| { if (!first) try writer.print(",\n", .{}); first = false; - + // Escape path for JSON const escaped_path = try json.escapeString(self.allocator, entry.key_ptr.*); defer self.allocator.free(escaped_path); - + try writer.print(" \"{s}\": {{\"mtime\": {d}, \"hash\": \"{s}\"}}", .{ escaped_path, entry.value_ptr.mtime, entry.value_ptr.hash, }); } - + // Write footer try writer.print("\n }}\n}}\n", .{}); - + // Write atomically const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}.tmp", .{self.cache_path}); defer self.allocator.free(tmp_path); - + { const file = try std.fs.cwd().createFile(tmp_path, .{}); defer file.close(); try file.writeAll(json_str.items); } - + try std.fs.cwd().rename(tmp_path, self.cache_path); self.dirty = false; } @@ -163,20 +163,20 @@ pub const HashCache = struct { /// Store hash for file pub fn putHash(self: *HashCache, path: []const u8, mtime: i64, hash: []const u8) !void { const path_copy = try self.allocator.dupe(u8, path); - + // Remove old entry if exists if (self.entries.fetchRemove(path_copy)) |old| { self.allocator.free(old.key); old.value.deinit(self.allocator); } - + const hash_copy = try self.allocator.dupe(u8, hash); - + try self.entries.put(path_copy, .{ .mtime = mtime, .hash = hash_copy, }); - + self.dirty = true; } @@ -214,7 +214,7 @@ pub fn hashDirectoryWithCache( // Load .gitignore patterns var gitignore = @import("ignore.zig").GitIgnore.init(allocator); defer gitignore.deinit(); - + try gitignore.loadFromDir(dir_path, ".gitignore"); try gitignore.loadFromDir(dir_path, ".mlignore"); @@ -232,21 +232,21 @@ pub fn hashDirectoryWithCache( if (entry.kind == .file) { // Skip files matching default ignores if (@import("ignore.zig").matchesDefaultIgnore(entry.path)) continue; - + // Skip files matching .gitignore/.mlignore patterns if (gitignore.isIgnored(entry.path, false)) continue; - + const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, entry.path }); defer allocator.free(full_path); - + const stat = dir.statFile(entry.path) catch |err| switch (err) { error.FileNotFound => continue, else => return err, }; - + const mtime = @as(i64, @intCast(stat.mtime)); const use_cache = !cache.needsHash(entry.path, mtime); - + try paths.append(.{ .path = try allocator.dupe(u8, entry.path), .mtime = mtime, @@ -277,7 +277,7 @@ pub fn hashDirectoryWithCache( else blk: { const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, item.path }); defer allocator.free(full_path); - + const hash = try crypto.hashFile(allocator, full_path); try cache.putHash(item.path, item.mtime, hash); break :blk hash; @@ -301,15 +301,15 @@ test "HashCache basic operations" { // Put and get try cache.putHash("src/main.py", 1708369200, "abc123"); - + const hash = cache.getHash("src/main.py", 1708369200); try std.testing.expect(hash != null); try std.testing.expectEqualStrings("abc123", hash.?); - + // Wrong mtime should return null const stale = cache.getHash("src/main.py", 1708369201); try std.testing.expect(stale == null); - + // needsHash should detect stale entries try std.testing.expect(cache.needsHash("src/main.py", 1708369201)); try std.testing.expect(!cache.needsHash("src/main.py", 1708369200)); @@ -323,11 +323,11 @@ test "HashCache clear" { try cache.putHash("file1.py", 123, "hash1"); try cache.putHash("file2.py", 456, "hash2"); - + try std.testing.expectEqual(@as(usize, 2), cache.getStats().entries); - + cache.clear(); - + try std.testing.expectEqual(@as(usize, 0), cache.getStats().entries); try std.testing.expect(cache.getStats().dirty); } diff --git a/cli/src/utils/ignore.zig b/cli/src/utils/ignore.zig index 79acc1b..17381ea 100644 --- a/cli/src/utils/ignore.zig +++ b/cli/src/utils/ignore.zig @@ -3,9 +3,9 @@ const std = @import("std"); /// Pattern type for ignore rules const Pattern = struct { pattern: []const u8, - is_negation: bool, // true if pattern starts with ! - is_dir_only: bool, // true if pattern ends with / - anchored: bool, // true if pattern contains / (not at start) + is_negation: bool, // true if pattern starts with ! + is_dir_only: bool, // true if pattern ends with / + anchored: bool, // true if pattern contains / (not at start) }; /// GitIgnore matcher for filtering files during directory traversal @@ -33,12 +33,12 @@ pub const GitIgnore = struct { defer self.allocator.free(path); const file = std.fs.cwd().openFile(path, .{}) catch |err| switch (err) { - error.FileNotFound => return, // No ignore file is fine + error.FileNotFound => return, // No ignore file is fine else => return err, }; defer file.close(); - const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB + const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB defer self.allocator.free(content); try self.parse(content); @@ -49,7 +49,7 @@ pub const GitIgnore = struct { var lines = std.mem.split(u8, content, "\n"); while (lines.next()) |line| { const trimmed = std.mem.trim(u8, line, " \t\r"); - + // Skip empty lines and comments if (trimmed.len == 0 or std.mem.startsWith(u8, trimmed, "#")) continue; @@ -205,7 +205,7 @@ pub fn matchesDefaultIgnore(path: []const u8) bool { const basename = path[idx + 1 ..]; for (DEFAULT_IGNORES) |pattern| { if (std.mem.startsWith(u8, pattern, "*.")) { - const ext = pattern[1..]; // Get extension including dot + const ext = pattern[1..]; // Get extension including dot if (std.mem.endsWith(u8, basename, ext)) return true; } } diff --git a/cli/src/utils/native_hash.zig b/cli/src/utils/native_hash.zig index 6bb5c8d..480c7d2 100644 --- a/cli/src/utils/native_hash.zig +++ b/cli/src/utils/native_hash.zig @@ -13,7 +13,7 @@ pub const NativeHasher = struct { pub fn init(allocator: std.mem.Allocator, num_threads: u32) !NativeHasher { const ctx = c.fh_init(num_threads); if (ctx == null) return error.NativeInitFailed; - + return .{ .ctx = ctx, .allocator = allocator, @@ -29,11 +29,11 @@ pub const NativeHasher = struct { pub fn hashFile(self: *NativeHasher, path: []const u8) ![]const u8 { const c_path = try self.allocator.dupeZ(u8, path); defer self.allocator.free(c_path); - + const result = c.fh_hash_file(self.ctx, c_path.ptr); if (result == null) return error.HashFailed; defer c.fh_free_string(result); - + return try self.allocator.dupe(u8, std.mem.span(result)); } @@ -42,7 +42,7 @@ pub const NativeHasher = struct { // Convert paths to C string array const c_paths = try self.allocator.alloc([*c]const u8, paths.len); defer self.allocator.free(c_paths); - + for (paths, 0..) |path, i| { const c_path = try self.allocator.dupeZ(u8, path); c_paths[i] = c_path.ptr; @@ -53,27 +53,27 @@ pub const NativeHasher = struct { self.allocator.free(std.mem.span(p)); } } - + // Allocate results array const results = try self.allocator.alloc([*c]u8, paths.len); defer self.allocator.free(results); - + // Call native batch hash const ret = c.fh_hash_batch(self.ctx, c_paths.ptr, @intCast(paths.len), results.ptr); if (ret != 0) return error.HashFailed; - + // Convert results to Zig strings var hashes = try self.allocator.alloc([]const u8, paths.len); errdefer { for (hashes) |h| self.allocator.free(h); self.allocator.free(hashes); } - + for (results, 0..) |r, i| { hashes[i] = try self.allocator.dupe(u8, std.mem.span(r)); c.fh_free_string(r); } - + return hashes; } @@ -81,11 +81,11 @@ pub const NativeHasher = struct { pub fn hashDirectory(self: *NativeHasher, dir_path: []const u8) ![]const u8 { const c_path = try self.allocator.dupeZ(u8, dir_path); defer self.allocator.free(c_path); - + const result = c.fh_hash_directory(self.ctx, c_path.ptr); if (result == null) return error.HashFailed; defer c.fh_free_string(result); - + return try self.allocator.dupe(u8, std.mem.span(result)); } @@ -97,16 +97,16 @@ pub const NativeHasher = struct { ) !struct { hashes: [][]const u8, paths: [][]const u8, count: u32 } { const c_path = try self.allocator.dupeZ(u8, dir_path); defer self.allocator.free(c_path); - + // Allocate output arrays const hashes = try self.allocator.alloc([*c]u8, max_results); defer self.allocator.free(hashes); - + const paths = try self.allocator.alloc([*c]u8, max_results); defer self.allocator.free(paths); - + var count: u32 = 0; - + const ret = c.fh_hash_directory_batch( self.ctx, c_path.ptr, @@ -116,28 +116,28 @@ pub const NativeHasher = struct { &count, ); if (ret != 0) return error.HashFailed; - + // Convert to Zig arrays var zig_hashes = try self.allocator.alloc([]const u8, count); errdefer { for (zig_hashes) |h| self.allocator.free(h); self.allocator.free(zig_hashes); } - + var zig_paths = try self.allocator.alloc([]const u8, count); errdefer { for (zig_paths) |p| self.allocator.free(p); self.allocator.free(zig_paths); } - + for (0..count) |i| { zig_hashes[i] = try self.allocator.dupe(u8, std.mem.span(hashes[i])); c.fh_free_string(hashes[i]); - + zig_paths[i] = try self.allocator.dupe(u8, std.mem.span(paths[i])); c.fh_free_string(paths[i]); } - + return .{ .hashes = zig_hashes, .paths = zig_paths, @@ -160,7 +160,7 @@ pub const NativeHasher = struct { /// Convenience function: hash directory using native library pub fn hashDirectoryNative(allocator: std.mem.Allocator, dir_path: []const u8) ![]const u8 { - var hasher = try NativeHasher.init(allocator, 0); // Auto-detect threads + var hasher = try NativeHasher.init(allocator, 0); // Auto-detect threads defer hasher.deinit(); return try hasher.hashDirectory(dir_path); } @@ -177,7 +177,7 @@ pub fn hashFilesNative( test "NativeHasher basic operations" { const allocator = std.testing.allocator; - + // Skip if native library not available var hasher = NativeHasher.init(allocator, 1) catch |err| { if (err == error.NativeInitFailed) { @@ -187,7 +187,7 @@ test "NativeHasher basic operations" { return err; }; defer hasher.deinit(); - + // Check SIMD availability const has_simd = hasher.hasSimd(); const impl_name = hasher.getImplInfo();