From 2258f60ade1ed3809e4d84426075a7f08adbd82e Mon Sep 17 00:00:00 2001
From: Jeremie Fraeys <jfaeys@gmail.com>
Date: Fri, 20 Feb 2026 15:51:10 -0500
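Subject: [PATCH] style(cli): normalize whitespace in utility modules for local mode

Formatting-only change; no functional differences.

- hash_cache.zig (file hash caching): strip trailing whitespace from blank
  lines and use a single space before trailing comments
- ignore.zig (.gitignore-style pattern matching): strip trailing whitespace
  and use a single space before trailing comments
- native_hash.zig (C dataset_hash library integration): strip trailing
  whitespace and use a single space before trailing comments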
---
 cli/src/utils/hash_cache.zig  | 82 +++++++++++++++++------------------
 cli/src/utils/ignore.zig      | 14 +++---
 cli/src/utils/native_hash.zig | 46 ++++++++++----------
 3 files changed, 71 insertions(+), 71 deletions(-)

diff --git a/cli/src/utils/hash_cache.zig b/cli/src/utils/hash_cache.zig
index 9148256..ef0437c 100644
--- a/cli/src/utils/hash_cache.zig
+++ b/cli/src/utils/hash_cache.zig
@@ -45,16 +45,16 @@ pub const HashCache = struct {
         const home = std.posix.getenv("HOME") orelse {
             return error.NoHomeDirectory;
         };
-        
+
         // Ensure cache directory exists
         const cache_dir = try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache" });
         defer allocator.free(cache_dir);
-        
+
         std.fs.cwd().makeDir(cache_dir) catch |err| switch (err) {
             error.PathAlreadyExists => {},
             else => return err,
         };
-        
+
         return try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache", "hashes.json" });
     }
 
@@ -62,37 +62,37 @@ pub const HashCache = struct {
     pub fn load(self: *HashCache) !void {
         const cache_path = try getDefaultPath(self.allocator);
         self.cache_path = cache_path;
-        
+
         const file = std.fs.cwd().openFile(cache_path, .{}) catch |err| switch (err) {
-            error.FileNotFound => return,  // No cache yet is fine
+            error.FileNotFound => return, // No cache yet is fine
             else => return err,
         };
         defer file.close();
-        
-        const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024);  // Max 10MB
+
+        const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
         defer self.allocator.free(content);
-        
+
         // Parse JSON
         const parsed = try std.json.parseFromSlice(std.json.Value, self.allocator, content, .{});
         defer parsed.deinit();
-        
+
         const root = parsed.value.object;
         const version = root.get("version") orelse return error.InvalidCacheFormat;
         if (version.integer != 1) return error.UnsupportedCacheVersion;
-        
+
         const files = root.get("files") orelse return error.InvalidCacheFormat;
         if (files.object.count() == 0) return;
-        
+
         var it = files.object.iterator();
         while (it.next()) |entry| {
             const path = try self.allocator.dupe(u8, entry.key_ptr.*);
-            
+
             const file_obj = entry.value_ptr.object;
             const mtime = file_obj.get("mtime") orelse continue;
             const hash_val = file_obj.get("hash") orelse continue;
-            
+
             const hash = try self.allocator.dupe(u8, hash_val.string);
-            
+
             try self.entries.put(path, .{
                 .mtime = mtime.integer,
                 .hash = hash,
@@ -103,46 +103,46 @@ pub const HashCache = struct {
     /// Save cache to disk
     pub fn save(self: *HashCache) !void {
         if (!self.dirty) return;
-        
+
         var json_str = std.ArrayList(u8).init(self.allocator);
         defer json_str.deinit();
-        
+
         var writer = json_str.writer();
-        
+
         // Write header
         try writer.print("{{\n  \"version\": 1,\n  \"files\": {{\n", .{});
-        
+
         // Write entries
         var it = self.entries.iterator();
         var first = true;
         while (it.next()) |entry| {
             if (!first) try writer.print(",\n", .{});
             first = false;
-            
+
             // Escape path for JSON
             const escaped_path = try json.escapeString(self.allocator, entry.key_ptr.*);
             defer self.allocator.free(escaped_path);
-            
+
             try writer.print("    \"{s}\": {{\"mtime\": {d}, \"hash\": \"{s}\"}}", .{
                 escaped_path,
                 entry.value_ptr.mtime,
                 entry.value_ptr.hash,
             });
         }
-        
+
         // Write footer
         try writer.print("\n  }}\n}}\n", .{});
-        
+
         // Write atomically
         const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}.tmp", .{self.cache_path});
         defer self.allocator.free(tmp_path);
-        
+
         {
             const file = try std.fs.cwd().createFile(tmp_path, .{});
             defer file.close();
             try file.writeAll(json_str.items);
         }
-        
+
         try std.fs.cwd().rename(tmp_path, self.cache_path);
         self.dirty = false;
     }
@@ -163,20 +163,20 @@ pub const HashCache = struct {
     /// Store hash for file
     pub fn putHash(self: *HashCache, path: []const u8, mtime: i64, hash: []const u8) !void {
         const path_copy = try self.allocator.dupe(u8, path);
-        
+
         // Remove old entry if exists
         if (self.entries.fetchRemove(path_copy)) |old| {
             self.allocator.free(old.key);
             old.value.deinit(self.allocator);
         }
-        
+
         const hash_copy = try self.allocator.dupe(u8, hash);
-        
+
         try self.entries.put(path_copy, .{
             .mtime = mtime,
             .hash = hash_copy,
         });
-        
+
         self.dirty = true;
     }
 
@@ -214,7 +214,7 @@ pub fn hashDirectoryWithCache(
     // Load .gitignore patterns
     var gitignore = @import("ignore.zig").GitIgnore.init(allocator);
     defer gitignore.deinit();
-    
+
     try gitignore.loadFromDir(dir_path, ".gitignore");
     try gitignore.loadFromDir(dir_path, ".mlignore");
 
@@ -232,21 +232,21 @@ pub fn hashDirectoryWithCache(
         if (entry.kind == .file) {
             // Skip files matching default ignores
             if (@import("ignore.zig").matchesDefaultIgnore(entry.path)) continue;
-            
+
             // Skip files matching .gitignore/.mlignore patterns
             if (gitignore.isIgnored(entry.path, false)) continue;
-            
+
             const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, entry.path });
             defer allocator.free(full_path);
-            
+
             const stat = dir.statFile(entry.path) catch |err| switch (err) {
                 error.FileNotFound => continue,
                 else => return err,
             };
-            
+
             const mtime = @as(i64, @intCast(stat.mtime));
             const use_cache = !cache.needsHash(entry.path, mtime);
-            
+
             try paths.append(.{
                 .path = try allocator.dupe(u8, entry.path),
                 .mtime = mtime,
@@ -277,7 +277,7 @@ pub fn hashDirectoryWithCache(
         else blk: {
             const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, item.path });
             defer allocator.free(full_path);
-            
+
             const hash = try crypto.hashFile(allocator, full_path);
             try cache.putHash(item.path, item.mtime, hash);
             break :blk hash;
@@ -301,15 +301,15 @@ test "HashCache basic operations" {
 
     // Put and get
     try cache.putHash("src/main.py", 1708369200, "abc123");
-    
+
     const hash = cache.getHash("src/main.py", 1708369200);
     try std.testing.expect(hash != null);
     try std.testing.expectEqualStrings("abc123", hash.?);
-    
+
     // Wrong mtime should return null
     const stale = cache.getHash("src/main.py", 1708369201);
     try std.testing.expect(stale == null);
-    
+
     // needsHash should detect stale entries
     try std.testing.expect(cache.needsHash("src/main.py", 1708369201));
     try std.testing.expect(!cache.needsHash("src/main.py", 1708369200));
@@ -323,11 +323,11 @@ test "HashCache clear" {
 
     try cache.putHash("file1.py", 123, "hash1");
     try cache.putHash("file2.py", 456, "hash2");
-    
+
     try std.testing.expectEqual(@as(usize, 2), cache.getStats().entries);
-    
+
     cache.clear();
-    
+
     try std.testing.expectEqual(@as(usize, 0), cache.getStats().entries);
     try std.testing.expect(cache.getStats().dirty);
 }
diff --git a/cli/src/utils/ignore.zig b/cli/src/utils/ignore.zig
index 79acc1b..17381ea 100644
--- a/cli/src/utils/ignore.zig
+++ b/cli/src/utils/ignore.zig
@@ -3,9 +3,9 @@ const std = @import("std");
 /// Pattern type for ignore rules
 const Pattern = struct {
     pattern: []const u8,
-    is_negation: bool,  // true if pattern starts with !
-    is_dir_only: bool,  // true if pattern ends with /
-    anchored: bool,     // true if pattern contains / (not at start)
+    is_negation: bool, // true if pattern starts with !
+    is_dir_only: bool, // true if pattern ends with /
+    anchored: bool, // true if pattern contains / (not at start)
 };
 
 /// GitIgnore matcher for filtering files during directory traversal
@@ -33,12 +33,12 @@ pub const GitIgnore = struct {
         defer self.allocator.free(path);
 
         const file = std.fs.cwd().openFile(path, .{}) catch |err| switch (err) {
-            error.FileNotFound => return,  // No ignore file is fine
+            error.FileNotFound => return, // No ignore file is fine
             else => return err,
         };
         defer file.close();
 
-        const content = try file.readToEndAlloc(self.allocator, 1024 * 1024);  // Max 1MB
+        const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
         defer self.allocator.free(content);
 
         try self.parse(content);
@@ -49,7 +49,7 @@ pub const GitIgnore = struct {
         var lines = std.mem.split(u8, content, "\n");
         while (lines.next()) |line| {
             const trimmed = std.mem.trim(u8, line, " \t\r");
-            
+
             // Skip empty lines and comments
             if (trimmed.len == 0 or std.mem.startsWith(u8, trimmed, "#")) continue;
 
@@ -205,7 +205,7 @@ pub fn matchesDefaultIgnore(path: []const u8) bool {
         const basename = path[idx + 1 ..];
         for (DEFAULT_IGNORES) |pattern| {
             if (std.mem.startsWith(u8, pattern, "*.")) {
-                const ext = pattern[1..];  // Get extension including dot
+                const ext = pattern[1..]; // Get extension including dot
                 if (std.mem.endsWith(u8, basename, ext)) return true;
             }
         }
diff --git a/cli/src/utils/native_hash.zig b/cli/src/utils/native_hash.zig
index 6bb5c8d..480c7d2 100644
--- a/cli/src/utils/native_hash.zig
+++ b/cli/src/utils/native_hash.zig
@@ -13,7 +13,7 @@ pub const NativeHasher = struct {
     pub fn init(allocator: std.mem.Allocator, num_threads: u32) !NativeHasher {
         const ctx = c.fh_init(num_threads);
         if (ctx == null) return error.NativeInitFailed;
-        
+
         return .{
             .ctx = ctx,
             .allocator = allocator,
@@ -29,11 +29,11 @@ pub const NativeHasher = struct {
     pub fn hashFile(self: *NativeHasher, path: []const u8) ![]const u8 {
         const c_path = try self.allocator.dupeZ(u8, path);
         defer self.allocator.free(c_path);
-        
+
         const result = c.fh_hash_file(self.ctx, c_path.ptr);
         if (result == null) return error.HashFailed;
         defer c.fh_free_string(result);
-        
+
         return try self.allocator.dupe(u8, std.mem.span(result));
     }
 
@@ -42,7 +42,7 @@ pub const NativeHasher = struct {
         // Convert paths to C string array
         const c_paths = try self.allocator.alloc([*c]const u8, paths.len);
         defer self.allocator.free(c_paths);
-        
+
         for (paths, 0..) |path, i| {
             const c_path = try self.allocator.dupeZ(u8, path);
             c_paths[i] = c_path.ptr;
@@ -53,27 +53,27 @@ pub const NativeHasher = struct {
                 self.allocator.free(std.mem.span(p));
             }
         }
-        
+
         // Allocate results array
         const results = try self.allocator.alloc([*c]u8, paths.len);
         defer self.allocator.free(results);
-        
+
         // Call native batch hash
         const ret = c.fh_hash_batch(self.ctx, c_paths.ptr, @intCast(paths.len), results.ptr);
         if (ret != 0) return error.HashFailed;
-        
+
         // Convert results to Zig strings
         var hashes = try self.allocator.alloc([]const u8, paths.len);
         errdefer {
             for (hashes) |h| self.allocator.free(h);
             self.allocator.free(hashes);
         }
-        
+
         for (results, 0..) |r, i| {
             hashes[i] = try self.allocator.dupe(u8, std.mem.span(r));
             c.fh_free_string(r);
         }
-        
+
         return hashes;
     }
 
@@ -81,11 +81,11 @@ pub const NativeHasher = struct {
     pub fn hashDirectory(self: *NativeHasher, dir_path: []const u8) ![]const u8 {
         const c_path = try self.allocator.dupeZ(u8, dir_path);
         defer self.allocator.free(c_path);
-        
+
         const result = c.fh_hash_directory(self.ctx, c_path.ptr);
         if (result == null) return error.HashFailed;
         defer c.fh_free_string(result);
-        
+
         return try self.allocator.dupe(u8, std.mem.span(result));
     }
 
@@ -97,16 +97,16 @@ pub const NativeHasher = struct {
     ) !struct { hashes: [][]const u8, paths: [][]const u8, count: u32 } {
         const c_path = try self.allocator.dupeZ(u8, dir_path);
         defer self.allocator.free(c_path);
-        
+
         // Allocate output arrays
         const hashes = try self.allocator.alloc([*c]u8, max_results);
         defer self.allocator.free(hashes);
-        
+
         const paths = try self.allocator.alloc([*c]u8, max_results);
         defer self.allocator.free(paths);
-        
+
         var count: u32 = 0;
-        
+
         const ret = c.fh_hash_directory_batch(
             self.ctx,
             c_path.ptr,
@@ -116,28 +116,28 @@ pub const NativeHasher = struct {
             &count,
         );
         if (ret != 0) return error.HashFailed;
-        
+
         // Convert to Zig arrays
         var zig_hashes = try self.allocator.alloc([]const u8, count);
         errdefer {
             for (zig_hashes) |h| self.allocator.free(h);
             self.allocator.free(zig_hashes);
         }
-        
+
         var zig_paths = try self.allocator.alloc([]const u8, count);
         errdefer {
             for (zig_paths) |p| self.allocator.free(p);
             self.allocator.free(zig_paths);
         }
-        
+
         for (0..count) |i| {
             zig_hashes[i] = try self.allocator.dupe(u8, std.mem.span(hashes[i]));
             c.fh_free_string(hashes[i]);
-            
+
             zig_paths[i] = try self.allocator.dupe(u8, std.mem.span(paths[i]));
             c.fh_free_string(paths[i]);
         }
-        
+
         return .{
             .hashes = zig_hashes,
             .paths = zig_paths,
@@ -160,7 +160,7 @@ pub const NativeHasher = struct {
 
 /// Convenience function: hash directory using native library
 pub fn hashDirectoryNative(allocator: std.mem.Allocator, dir_path: []const u8) ![]const u8 {
-    var hasher = try NativeHasher.init(allocator, 0);  // Auto-detect threads
+    var hasher = try NativeHasher.init(allocator, 0); // Auto-detect threads
     defer hasher.deinit();
     return try hasher.hashDirectory(dir_path);
 }
@@ -177,7 +177,7 @@ pub fn hashFilesNative(
 
 test "NativeHasher basic operations" {
     const allocator = std.testing.allocator;
-    
+
     // Skip if native library not available
     var hasher = NativeHasher.init(allocator, 1) catch |err| {
         if (err == error.NativeInitFailed) {
@@ -187,7 +187,7 @@ test "NativeHasher basic operations" {
         return err;
     };
     defer hasher.deinit();
-    
+
     // Check SIMD availability
     const has_simd = hasher.hasSimd();
     const impl_name = hasher.getImplInfo();