feat(cli): add utility modules for local mode

- Add hash_cache.zig for efficient file hash caching
- Add ignore.zig for .gitignore-style pattern matching
- Add native_hash.zig for C dataset_hash library integration
This commit is contained in:
Jeremie Fraeys 2026-02-20 15:51:10 -05:00
parent 7ce0fd251e
commit 2258f60ade
No known key found for this signature in database
3 changed files with 71 additions and 71 deletions

View file

@ -45,16 +45,16 @@ pub const HashCache = struct {
const home = std.posix.getenv("HOME") orelse {
return error.NoHomeDirectory;
};
// Ensure cache directory exists
const cache_dir = try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache" });
defer allocator.free(cache_dir);
std.fs.cwd().makeDir(cache_dir) catch |err| switch (err) {
error.PathAlreadyExists => {},
else => return err,
};
return try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache", "hashes.json" });
}
@ -62,37 +62,37 @@ pub const HashCache = struct {
pub fn load(self: *HashCache) !void {
const cache_path = try getDefaultPath(self.allocator);
self.cache_path = cache_path;
const file = std.fs.cwd().openFile(cache_path, .{}) catch |err| switch (err) {
error.FileNotFound => return, // No cache yet is fine
error.FileNotFound => return, // No cache yet is fine
else => return err,
};
defer file.close();
const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
defer self.allocator.free(content);
// Parse JSON
const parsed = try std.json.parseFromSlice(std.json.Value, self.allocator, content, .{});
defer parsed.deinit();
const root = parsed.value.object;
const version = root.get("version") orelse return error.InvalidCacheFormat;
if (version.integer != 1) return error.UnsupportedCacheVersion;
const files = root.get("files") orelse return error.InvalidCacheFormat;
if (files.object.count() == 0) return;
var it = files.object.iterator();
while (it.next()) |entry| {
const path = try self.allocator.dupe(u8, entry.key_ptr.*);
const file_obj = entry.value_ptr.object;
const mtime = file_obj.get("mtime") orelse continue;
const hash_val = file_obj.get("hash") orelse continue;
const hash = try self.allocator.dupe(u8, hash_val.string);
try self.entries.put(path, .{
.mtime = mtime.integer,
.hash = hash,
@ -103,46 +103,46 @@ pub const HashCache = struct {
/// Save cache to disk.
/// Serializes every entry as JSON (version-1 schema:
/// {"version": 1, "files": {"<path>": {"mtime": <i64>, "hash": "<str>"}}})
/// and writes it atomically by writing a "<cache_path>.tmp" file and
/// renaming it over the real cache file, so a crash mid-write never
/// leaves a truncated cache on disk.
/// No-op when nothing changed since the last load()/save().
/// NOTE(review): assumes self.cache_path was populated (load() sets it) —
/// confirm callers always call load() before save().
pub fn save(self: *HashCache) !void {
if (!self.dirty) return;
// Build the whole JSON document in memory first, then write it in one shot.
var json_str = std.ArrayList(u8).init(self.allocator);
defer json_str.deinit();
var writer = json_str.writer();
// Write header
try writer.print("{{\n \"version\": 1,\n \"files\": {{\n", .{});
// Write entries
var it = self.entries.iterator();
var first = true;
while (it.next()) |entry| {
// Comma-separate entries; no trailing comma after the last one.
if (!first) try writer.print(",\n", .{});
first = false;
// Escape path for JSON (paths may contain quotes/backslashes).
// The escaped copy is freed at the end of each loop iteration
// (defer runs on loop-body scope exit).
const escaped_path = try json.escapeString(self.allocator, entry.key_ptr.*);
defer self.allocator.free(escaped_path);
// Hash is written unescaped — presumably hex digits only; TODO confirm.
try writer.print(" \"{s}\": {{\"mtime\": {d}, \"hash\": \"{s}\"}}", .{
escaped_path,
entry.value_ptr.mtime,
entry.value_ptr.hash,
});
}
// Write footer
try writer.print("\n }}\n}}\n", .{});
// Write atomically: dump to a sibling .tmp file, then rename into place.
const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}.tmp", .{self.cache_path});
defer self.allocator.free(tmp_path);
{
// Inner block scopes the file handle so it is closed before the rename.
const file = try std.fs.cwd().createFile(tmp_path, .{});
defer file.close();
try file.writeAll(json_str.items);
}
try std.fs.cwd().rename(tmp_path, self.cache_path);
self.dirty = false;
}
@ -163,20 +163,20 @@ pub const HashCache = struct {
/// Store hash for file.
/// Takes owned copies of `path` and `hash`; any previous entry for the
/// same path is removed and its memory released. Marks the cache dirty
/// so the next save() persists the change.
/// Errors: allocation failure (error.OutOfMemory) from the dupes or the
/// map insert; on error no memory is leaked and the cache is unchanged
/// except that a stale entry for `path` may already have been evicted.
pub fn putHash(self: *HashCache, path: []const u8, mtime: i64, hash: []const u8) !void {
    const path_copy = try self.allocator.dupe(u8, path);
    // Free the new key if a later allocation/insert fails (was leaked before).
    errdefer self.allocator.free(path_copy);
    // Remove old entry if exists, releasing its key and value.
    if (self.entries.fetchRemove(path_copy)) |old| {
        self.allocator.free(old.key);
        old.value.deinit(self.allocator);
    }
    const hash_copy = try self.allocator.dupe(u8, hash);
    // Free the hash copy if the map insert itself fails.
    errdefer self.allocator.free(hash_copy);
    try self.entries.put(path_copy, .{
        .mtime = mtime,
        .hash = hash_copy,
    });
    self.dirty = true;
}
@ -214,7 +214,7 @@ pub fn hashDirectoryWithCache(
// Load .gitignore patterns
var gitignore = @import("ignore.zig").GitIgnore.init(allocator);
defer gitignore.deinit();
try gitignore.loadFromDir(dir_path, ".gitignore");
try gitignore.loadFromDir(dir_path, ".mlignore");
@ -232,21 +232,21 @@ pub fn hashDirectoryWithCache(
if (entry.kind == .file) {
// Skip files matching default ignores
if (@import("ignore.zig").matchesDefaultIgnore(entry.path)) continue;
// Skip files matching .gitignore/.mlignore patterns
if (gitignore.isIgnored(entry.path, false)) continue;
const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, entry.path });
defer allocator.free(full_path);
const stat = dir.statFile(entry.path) catch |err| switch (err) {
error.FileNotFound => continue,
else => return err,
};
const mtime = @as(i64, @intCast(stat.mtime));
const use_cache = !cache.needsHash(entry.path, mtime);
try paths.append(.{
.path = try allocator.dupe(u8, entry.path),
.mtime = mtime,
@ -277,7 +277,7 @@ pub fn hashDirectoryWithCache(
else blk: {
const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, item.path });
defer allocator.free(full_path);
const hash = try crypto.hashFile(allocator, full_path);
try cache.putHash(item.path, item.mtime, hash);
break :blk hash;
@ -301,15 +301,15 @@ test "HashCache basic operations" {
// Put and get
try cache.putHash("src/main.py", 1708369200, "abc123");
const hash = cache.getHash("src/main.py", 1708369200);
try std.testing.expect(hash != null);
try std.testing.expectEqualStrings("abc123", hash.?);
// Wrong mtime should return null
const stale = cache.getHash("src/main.py", 1708369201);
try std.testing.expect(stale == null);
// needsHash should detect stale entries
try std.testing.expect(cache.needsHash("src/main.py", 1708369201));
try std.testing.expect(!cache.needsHash("src/main.py", 1708369200));
@ -323,11 +323,11 @@ test "HashCache clear" {
try cache.putHash("file1.py", 123, "hash1");
try cache.putHash("file2.py", 456, "hash2");
try std.testing.expectEqual(@as(usize, 2), cache.getStats().entries);
cache.clear();
try std.testing.expectEqual(@as(usize, 0), cache.getStats().entries);
try std.testing.expect(cache.getStats().dirty);
}

View file

@ -3,9 +3,9 @@ const std = @import("std");
/// Pattern type for ignore rules.
/// One parsed line of a .gitignore-style file; the bool flags record
/// syntax markers so matching does not have to re-scan the text.
const Pattern = struct {
    pattern: []const u8,
    is_negation: bool, // true if pattern starts with !
    is_dir_only: bool, // true if pattern ends with /
    anchored: bool, // true if pattern contains / (not at start)
};
/// GitIgnore matcher for filtering files during directory traversal
@ -33,12 +33,12 @@ pub const GitIgnore = struct {
defer self.allocator.free(path);
const file = std.fs.cwd().openFile(path, .{}) catch |err| switch (err) {
error.FileNotFound => return, // No ignore file is fine
error.FileNotFound => return, // No ignore file is fine
else => return err,
};
defer file.close();
const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
defer self.allocator.free(content);
try self.parse(content);
@ -49,7 +49,7 @@ pub const GitIgnore = struct {
var lines = std.mem.split(u8, content, "\n");
while (lines.next()) |line| {
const trimmed = std.mem.trim(u8, line, " \t\r");
// Skip empty lines and comments
if (trimmed.len == 0 or std.mem.startsWith(u8, trimmed, "#")) continue;
@ -205,7 +205,7 @@ pub fn matchesDefaultIgnore(path: []const u8) bool {
const basename = path[idx + 1 ..];
for (DEFAULT_IGNORES) |pattern| {
if (std.mem.startsWith(u8, pattern, "*.")) {
const ext = pattern[1..]; // Get extension including dot
const ext = pattern[1..]; // Get extension including dot
if (std.mem.endsWith(u8, basename, ext)) return true;
}
}

View file

@ -13,7 +13,7 @@ pub const NativeHasher = struct {
pub fn init(allocator: std.mem.Allocator, num_threads: u32) !NativeHasher {
const ctx = c.fh_init(num_threads);
if (ctx == null) return error.NativeInitFailed;
return .{
.ctx = ctx,
.allocator = allocator,
@ -29,11 +29,11 @@ pub const NativeHasher = struct {
/// Hash one file through the native C library.
/// Returns an allocator-owned copy of the hash string; caller frees.
/// Errors: allocation failure, or error.HashFailed if the C call
/// returns null.
pub fn hashFile(self: *NativeHasher, path: []const u8) ![]const u8 {
    // The C API requires a NUL-terminated path.
    const path_z = try self.allocator.dupeZ(u8, path);
    defer self.allocator.free(path_z);

    const raw = c.fh_hash_file(self.ctx, path_z.ptr);
    if (raw == null) return error.HashFailed;
    defer c.fh_free_string(raw);

    // Copy out of C-owned memory before it is freed by the defer above.
    return try self.allocator.dupe(u8, std.mem.span(raw));
}
@ -42,7 +42,7 @@ pub const NativeHasher = struct {
// Convert paths to C string array
const c_paths = try self.allocator.alloc([*c]const u8, paths.len);
defer self.allocator.free(c_paths);
for (paths, 0..) |path, i| {
const c_path = try self.allocator.dupeZ(u8, path);
c_paths[i] = c_path.ptr;
@ -53,27 +53,27 @@ pub const NativeHasher = struct {
self.allocator.free(std.mem.span(p));
}
}
// Allocate results array
const results = try self.allocator.alloc([*c]u8, paths.len);
defer self.allocator.free(results);
// Call native batch hash
const ret = c.fh_hash_batch(self.ctx, c_paths.ptr, @intCast(paths.len), results.ptr);
if (ret != 0) return error.HashFailed;
// Convert results to Zig strings
var hashes = try self.allocator.alloc([]const u8, paths.len);
errdefer {
for (hashes) |h| self.allocator.free(h);
self.allocator.free(hashes);
}
for (results, 0..) |r, i| {
hashes[i] = try self.allocator.dupe(u8, std.mem.span(r));
c.fh_free_string(r);
}
return hashes;
}
@ -81,11 +81,11 @@ pub const NativeHasher = struct {
/// Hash an entire directory through the native C library.
/// Returns an allocator-owned copy of the hash string; caller frees.
/// Errors: allocation failure, or error.HashFailed if the C call
/// returns null.
pub fn hashDirectory(self: *NativeHasher, dir_path: []const u8) ![]const u8 {
    // The C API requires a NUL-terminated path.
    const dir_z = try self.allocator.dupeZ(u8, dir_path);
    defer self.allocator.free(dir_z);

    const raw = c.fh_hash_directory(self.ctx, dir_z.ptr);
    if (raw == null) return error.HashFailed;
    defer c.fh_free_string(raw);

    // Copy out of C-owned memory before it is freed by the defer above.
    return try self.allocator.dupe(u8, std.mem.span(raw));
}
@ -97,16 +97,16 @@ pub const NativeHasher = struct {
) !struct { hashes: [][]const u8, paths: [][]const u8, count: u32 } {
const c_path = try self.allocator.dupeZ(u8, dir_path);
defer self.allocator.free(c_path);
// Allocate output arrays
const hashes = try self.allocator.alloc([*c]u8, max_results);
defer self.allocator.free(hashes);
const paths = try self.allocator.alloc([*c]u8, max_results);
defer self.allocator.free(paths);
var count: u32 = 0;
const ret = c.fh_hash_directory_batch(
self.ctx,
c_path.ptr,
@ -116,28 +116,28 @@ pub const NativeHasher = struct {
&count,
);
if (ret != 0) return error.HashFailed;
// Convert to Zig arrays
var zig_hashes = try self.allocator.alloc([]const u8, count);
errdefer {
for (zig_hashes) |h| self.allocator.free(h);
self.allocator.free(zig_hashes);
}
var zig_paths = try self.allocator.alloc([]const u8, count);
errdefer {
for (zig_paths) |p| self.allocator.free(p);
self.allocator.free(zig_paths);
}
for (0..count) |i| {
zig_hashes[i] = try self.allocator.dupe(u8, std.mem.span(hashes[i]));
c.fh_free_string(hashes[i]);
zig_paths[i] = try self.allocator.dupe(u8, std.mem.span(paths[i]));
c.fh_free_string(paths[i]);
}
return .{
.hashes = zig_hashes,
.paths = zig_paths,
@ -160,7 +160,7 @@ pub const NativeHasher = struct {
/// Convenience function: hash directory using native library.
/// Creates a throwaway NativeHasher, hashes `dir_path`, and tears the
/// hasher down. Caller owns the returned slice.
/// Fix: the source contained two adjacent declarations of `hasher`
/// (diff residue) — a redeclaration error; keep exactly one.
pub fn hashDirectoryNative(allocator: std.mem.Allocator, dir_path: []const u8) ![]const u8 {
    var hasher = try NativeHasher.init(allocator, 0); // 0 = auto-detect threads
    defer hasher.deinit();
    return try hasher.hashDirectory(dir_path);
}
@ -177,7 +177,7 @@ pub fn hashFilesNative(
test "NativeHasher basic operations" {
const allocator = std.testing.allocator;
// Skip if native library not available
var hasher = NativeHasher.init(allocator, 1) catch |err| {
if (err == error.NativeInitFailed) {
@ -187,7 +187,7 @@ test "NativeHasher basic operations" {
return err;
};
defer hasher.deinit();
// Check SIMD availability
const has_simd = hasher.hasSimd();
const impl_name = hasher.getImplInfo();