feat(cli): add utility modules for local mode

- Add hash_cache.zig for efficient file hash caching
- Add ignore.zig for .gitignore-style pattern matching
- Add native_hash.zig for C dataset_hash library integration
This commit is contained in:
Jeremie Fraeys 2026-02-20 15:51:10 -05:00
parent 7ce0fd251e
commit 2258f60ade
No known key found for this signature in database
3 changed files with 71 additions and 71 deletions

View file

@ -45,16 +45,16 @@ pub const HashCache = struct {
const home = std.posix.getenv("HOME") orelse {
return error.NoHomeDirectory;
};
// Ensure cache directory exists
const cache_dir = try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache" });
defer allocator.free(cache_dir);
std.fs.cwd().makeDir(cache_dir) catch |err| switch (err) {
error.PathAlreadyExists => {},
else => return err,
};
return try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache", "hashes.json" });
}
@ -62,37 +62,37 @@ pub const HashCache = struct {
pub fn load(self: *HashCache) !void {
const cache_path = try getDefaultPath(self.allocator);
self.cache_path = cache_path;
const file = std.fs.cwd().openFile(cache_path, .{}) catch |err| switch (err) {
error.FileNotFound => return, // No cache yet is fine
error.FileNotFound => return, // No cache yet is fine
else => return err,
};
defer file.close();
const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
defer self.allocator.free(content);
// Parse JSON
const parsed = try std.json.parseFromSlice(std.json.Value, self.allocator, content, .{});
defer parsed.deinit();
const root = parsed.value.object;
const version = root.get("version") orelse return error.InvalidCacheFormat;
if (version.integer != 1) return error.UnsupportedCacheVersion;
const files = root.get("files") orelse return error.InvalidCacheFormat;
if (files.object.count() == 0) return;
var it = files.object.iterator();
while (it.next()) |entry| {
const path = try self.allocator.dupe(u8, entry.key_ptr.*);
const file_obj = entry.value_ptr.object;
const mtime = file_obj.get("mtime") orelse continue;
const hash_val = file_obj.get("hash") orelse continue;
const hash = try self.allocator.dupe(u8, hash_val.string);
try self.entries.put(path, .{
.mtime = mtime.integer,
.hash = hash,
@ -103,46 +103,46 @@ pub const HashCache = struct {
/// Save cache to disk.
/// Serializes every entry as JSON (version-1 schema:
/// {"version": 1, "files": {"<path>": {"mtime": <i64>, "hash": "<str>"}}})
/// and writes it atomically by writing a "<cache_path>.tmp" file and
/// renaming it over the real cache file, so a crash mid-write never
/// leaves a truncated cache on disk.
/// No-op when nothing changed since the last load()/save().
/// NOTE(review): assumes self.cache_path was populated (load() sets it) —
/// confirm callers always call load() before save().
pub fn save(self: *HashCache) !void {
if (!self.dirty) return;
// Build the whole JSON document in memory first, then write it in one shot.
var json_str = std.ArrayList(u8).init(self.allocator);
defer json_str.deinit();
var writer = json_str.writer();
// Write header
try writer.print("{{\n \"version\": 1,\n \"files\": {{\n", .{});
// Write entries
var it = self.entries.iterator();
var first = true;
while (it.next()) |entry| {
// Comma-separate entries; no trailing comma after the last one.
if (!first) try writer.print(",\n", .{});
first = false;
// Escape path for JSON (paths may contain quotes/backslashes).
// The escaped copy is freed at the end of each loop iteration
// (defer runs on loop-body scope exit).
const escaped_path = try json.escapeString(self.allocator, entry.key_ptr.*);
defer self.allocator.free(escaped_path);
// Hash is written unescaped — presumably hex digits only; TODO confirm.
try writer.print(" \"{s}\": {{\"mtime\": {d}, \"hash\": \"{s}\"}}", .{
escaped_path,
entry.value_ptr.mtime,
entry.value_ptr.hash,
});
}
// Write footer
try writer.print("\n }}\n}}\n", .{});
// Write atomically: dump to a sibling .tmp file, then rename into place.
const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}.tmp", .{self.cache_path});
defer self.allocator.free(tmp_path);
{
// Inner block scopes the file handle so it is closed before the rename.
const file = try std.fs.cwd().createFile(tmp_path, .{});
defer file.close();
try file.writeAll(json_str.items);
}
try std.fs.cwd().rename(tmp_path, self.cache_path);
self.dirty = false;
}
@ -163,20 +163,20 @@ pub const HashCache = struct {
/// Store hash for file.
/// Takes owned copies of `path` and `hash`; any previous entry for the
/// same path is removed and its memory released. Marks the cache dirty
/// so the next save() persists the change.
/// Errors: allocation failure (error.OutOfMemory) from the dupes or the
/// map insert; on error no memory is leaked and the cache is unchanged
/// except that a stale entry for `path` may already have been evicted.
pub fn putHash(self: *HashCache, path: []const u8, mtime: i64, hash: []const u8) !void {
    const path_copy = try self.allocator.dupe(u8, path);
    // Free the new key if a later allocation/insert fails (was leaked before).
    errdefer self.allocator.free(path_copy);
    // Remove old entry if exists, releasing its key and value.
    if (self.entries.fetchRemove(path_copy)) |old| {
        self.allocator.free(old.key);
        old.value.deinit(self.allocator);
    }
    const hash_copy = try self.allocator.dupe(u8, hash);
    // Free the hash copy if the map insert itself fails.
    errdefer self.allocator.free(hash_copy);
    try self.entries.put(path_copy, .{
        .mtime = mtime,
        .hash = hash_copy,
    });
    self.dirty = true;
}
@ -214,7 +214,7 @@ pub fn hashDirectoryWithCache(
// Load .gitignore patterns
var gitignore = @import("ignore.zig").GitIgnore.init(allocator);
defer gitignore.deinit();
try gitignore.loadFromDir(dir_path, ".gitignore");
try gitignore.loadFromDir(dir_path, ".mlignore");
@ -232,21 +232,21 @@ pub fn hashDirectoryWithCache(
if (entry.kind == .file) {
// Skip files matching default ignores
if (@import("ignore.zig").matchesDefaultIgnore(entry.path)) continue;
// Skip files matching .gitignore/.mlignore patterns
if (gitignore.isIgnored(entry.path, false)) continue;
const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, entry.path });
defer allocator.free(full_path);
const stat = dir.statFile(entry.path) catch |err| switch (err) {
error.FileNotFound => continue,
else => return err,
};
const mtime = @as(i64, @intCast(stat.mtime));
const use_cache = !cache.needsHash(entry.path, mtime);
try paths.append(.{
.path = try allocator.dupe(u8, entry.path),
.mtime = mtime,
@ -277,7 +277,7 @@ pub fn hashDirectoryWithCache(
else blk: {
const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, item.path });
defer allocator.free(full_path);
const hash = try crypto.hashFile(allocator, full_path);
try cache.putHash(item.path, item.mtime, hash);
break :blk hash;
@ -301,15 +301,15 @@ test "HashCache basic operations" {
// Put and get
try cache.putHash("src/main.py", 1708369200, "abc123");
const hash = cache.getHash("src/main.py", 1708369200);
try std.testing.expect(hash != null);
try std.testing.expectEqualStrings("abc123", hash.?);
// Wrong mtime should return null
const stale = cache.getHash("src/main.py", 1708369201);
try std.testing.expect(stale == null);
// needsHash should detect stale entries
try std.testing.expect(cache.needsHash("src/main.py", 1708369201));
try std.testing.expect(!cache.needsHash("src/main.py", 1708369200));
@ -323,11 +323,11 @@ test "HashCache clear" {
try cache.putHash("file1.py", 123, "hash1");
try cache.putHash("file2.py", 456, "hash2");
try std.testing.expectEqual(@as(usize, 2), cache.getStats().entries);
cache.clear();
try std.testing.expectEqual(@as(usize, 0), cache.getStats().entries);
try std.testing.expect(cache.getStats().dirty);
}

View file

@ -3,9 +3,9 @@ const std = @import("std");
/// Pattern type for ignore rules.
/// One parsed line of a .gitignore-style file; the bool flags record
/// syntax markers so matching does not have to re-scan the text.
const Pattern = struct {
    pattern: []const u8,
    is_negation: bool, // true if pattern starts with !
    is_dir_only: bool, // true if pattern ends with /
    anchored: bool, // true if pattern contains / (not at start)
};
/// GitIgnore matcher for filtering files during directory traversal
@ -33,12 +33,12 @@ pub const GitIgnore = struct {
defer self.allocator.free(path);
const file = std.fs.cwd().openFile(path, .{}) catch |err| switch (err) {
error.FileNotFound => return, // No ignore file is fine
error.FileNotFound => return, // No ignore file is fine
else => return err,
};
defer file.close();
const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
defer self.allocator.free(content);
try self.parse(content);
@ -49,7 +49,7 @@ pub const GitIgnore = struct {
var lines = std.mem.split(u8, content, "\n");
while (lines.next()) |line| {
const trimmed = std.mem.trim(u8, line, " \t\r");
// Skip empty lines and comments
if (trimmed.len == 0 or std.mem.startsWith(u8, trimmed, "#")) continue;
@ -205,7 +205,7 @@ pub fn matchesDefaultIgnore(path: []const u8) bool {
const basename = path[idx + 1 ..];
for (DEFAULT_IGNORES) |pattern| {
if (std.mem.startsWith(u8, pattern, "*.")) {
const ext = pattern[1..]; // Get extension including dot
const ext = pattern[1..]; // Get extension including dot
if (std.mem.endsWith(u8, basename, ext)) return true;
}
}

View file

@ -13,7 +13,7 @@ pub const NativeHasher = struct {
pub fn init(allocator: std.mem.Allocator, num_threads: u32) !NativeHasher {
const ctx = c.fh_init(num_threads);
if (ctx == null) return error.NativeInitFailed;
return .{
.ctx = ctx,
.allocator = allocator,
@ -29,11 +29,11 @@ pub const NativeHasher = struct {
/// Hash one file through the native C library.
/// Returns an allocator-owned copy of the hash string; caller frees.
/// Errors: allocation failure, or error.HashFailed if the C call
/// returns null.
pub fn hashFile(self: *NativeHasher, path: []const u8) ![]const u8 {
    // The C API requires a NUL-terminated path.
    const path_z = try self.allocator.dupeZ(u8, path);
    defer self.allocator.free(path_z);

    const raw = c.fh_hash_file(self.ctx, path_z.ptr);
    if (raw == null) return error.HashFailed;
    defer c.fh_free_string(raw);

    // Copy out of C-owned memory before it is freed by the defer above.
    return try self.allocator.dupe(u8, std.mem.span(raw));
}
@ -42,7 +42,7 @@ pub const NativeHasher = struct {
// Convert paths to C string array
const c_paths = try self.allocator.alloc([*c]const u8, paths.len);
defer self.allocator.free(c_paths);
for (paths, 0..) |path, i| {
const c_path = try self.allocator.dupeZ(u8, path);
c_paths[i] = c_path.ptr;
@ -53,27 +53,27 @@ pub const NativeHasher = struct {
self.allocator.free(std.mem.span(p));
}
}
// Allocate results array
const results = try self.allocator.alloc([*c]u8, paths.len);
defer self.allocator.free(results);
// Call native batch hash
const ret = c.fh_hash_batch(self.ctx, c_paths.ptr, @intCast(paths.len), results.ptr);
if (ret != 0) return error.HashFailed;
// Convert results to Zig strings
var hashes = try self.allocator.alloc([]const u8, paths.len);
errdefer {
for (hashes) |h| self.allocator.free(h);
self.allocator.free(hashes);
}
for (results, 0..) |r, i| {
hashes[i] = try self.allocator.dupe(u8, std.mem.span(r));
c.fh_free_string(r);
}
return hashes;
}
@ -81,11 +81,11 @@ pub const NativeHasher = struct {
/// Hash an entire directory through the native C library.
/// Returns an allocator-owned copy of the hash string; caller frees.
/// Errors: allocation failure, or error.HashFailed if the C call
/// returns null.
pub fn hashDirectory(self: *NativeHasher, dir_path: []const u8) ![]const u8 {
    // The C API requires a NUL-terminated path.
    const dir_z = try self.allocator.dupeZ(u8, dir_path);
    defer self.allocator.free(dir_z);

    const raw = c.fh_hash_directory(self.ctx, dir_z.ptr);
    if (raw == null) return error.HashFailed;
    defer c.fh_free_string(raw);

    // Copy out of C-owned memory before it is freed by the defer above.
    return try self.allocator.dupe(u8, std.mem.span(raw));
}
@ -97,16 +97,16 @@ pub const NativeHasher = struct {
) !struct { hashes: [][]const u8, paths: [][]const u8, count: u32 } {
const c_path = try self.allocator.dupeZ(u8, dir_path);
defer self.allocator.free(c_path);
// Allocate output arrays
const hashes = try self.allocator.alloc([*c]u8, max_results);
defer self.allocator.free(hashes);
const paths = try self.allocator.alloc([*c]u8, max_results);
defer self.allocator.free(paths);
var count: u32 = 0;
const ret = c.fh_hash_directory_batch(
self.ctx,
c_path.ptr,
@ -116,28 +116,28 @@ pub const NativeHasher = struct {
&count,
);
if (ret != 0) return error.HashFailed;
// Convert to Zig arrays
var zig_hashes = try self.allocator.alloc([]const u8, count);
errdefer {
for (zig_hashes) |h| self.allocator.free(h);
self.allocator.free(zig_hashes);
}
var zig_paths = try self.allocator.alloc([]const u8, count);
errdefer {
for (zig_paths) |p| self.allocator.free(p);
self.allocator.free(zig_paths);
}
for (0..count) |i| {
zig_hashes[i] = try self.allocator.dupe(u8, std.mem.span(hashes[i]));
c.fh_free_string(hashes[i]);
zig_paths[i] = try self.allocator.dupe(u8, std.mem.span(paths[i]));
c.fh_free_string(paths[i]);
}
return .{
.hashes = zig_hashes,
.paths = zig_paths,
@ -160,7 +160,7 @@ pub const NativeHasher = struct {
/// Convenience function: hash directory using native library.
/// Creates a throwaway NativeHasher, hashes `dir_path`, and tears the
/// hasher down. Caller owns the returned slice.
/// Fix: the source contained two adjacent declarations of `hasher`
/// (diff residue) — a redeclaration error; keep exactly one.
pub fn hashDirectoryNative(allocator: std.mem.Allocator, dir_path: []const u8) ![]const u8 {
    var hasher = try NativeHasher.init(allocator, 0); // 0 = auto-detect threads
    defer hasher.deinit();
    return try hasher.hashDirectory(dir_path);
}
@ -177,7 +177,7 @@ pub fn hashFilesNative(
test "NativeHasher basic operations" {
const allocator = std.testing.allocator;
// Skip if native library not available
var hasher = NativeHasher.init(allocator, 1) catch |err| {
if (err == error.NativeInitFailed) {
@ -187,7 +187,7 @@ test "NativeHasher basic operations" {
return err;
};
defer hasher.deinit();
// Check SIMD availability
const has_simd = hasher.hasSimd();
const impl_name = hasher.getImplInfo();