feat(cli): add utility modules for local mode
- Add hash_cache.zig for efficient file hash caching
- Add ignore.zig for .gitignore-style pattern matching
- Add native_hash.zig for C dataset_hash library integration
This commit is contained in:
parent
7ce0fd251e
commit
2258f60ade
3 changed files with 71 additions and 71 deletions
|
|
@ -45,16 +45,16 @@ pub const HashCache = struct {
|
|||
const home = std.posix.getenv("HOME") orelse {
|
||||
return error.NoHomeDirectory;
|
||||
};
|
||||
|
||||
|
||||
// Ensure cache directory exists
|
||||
const cache_dir = try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache" });
|
||||
defer allocator.free(cache_dir);
|
||||
|
||||
|
||||
std.fs.cwd().makeDir(cache_dir) catch |err| switch (err) {
|
||||
error.PathAlreadyExists => {},
|
||||
else => return err,
|
||||
};
|
||||
|
||||
|
||||
return try std.fs.path.join(allocator, &[_][]const u8{ home, ".ml", "cache", "hashes.json" });
|
||||
}
|
||||
|
||||
|
|
@ -62,37 +62,37 @@ pub const HashCache = struct {
|
|||
pub fn load(self: *HashCache) !void {
|
||||
const cache_path = try getDefaultPath(self.allocator);
|
||||
self.cache_path = cache_path;
|
||||
|
||||
|
||||
const file = std.fs.cwd().openFile(cache_path, .{}) catch |err| switch (err) {
|
||||
error.FileNotFound => return, // No cache yet is fine
|
||||
error.FileNotFound => return, // No cache yet is fine
|
||||
else => return err,
|
||||
};
|
||||
defer file.close();
|
||||
|
||||
const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
|
||||
|
||||
const content = try file.readToEndAlloc(self.allocator, 10 * 1024 * 1024); // Max 10MB
|
||||
defer self.allocator.free(content);
|
||||
|
||||
|
||||
// Parse JSON
|
||||
const parsed = try std.json.parseFromSlice(std.json.Value, self.allocator, content, .{});
|
||||
defer parsed.deinit();
|
||||
|
||||
|
||||
const root = parsed.value.object;
|
||||
const version = root.get("version") orelse return error.InvalidCacheFormat;
|
||||
if (version.integer != 1) return error.UnsupportedCacheVersion;
|
||||
|
||||
|
||||
const files = root.get("files") orelse return error.InvalidCacheFormat;
|
||||
if (files.object.count() == 0) return;
|
||||
|
||||
|
||||
var it = files.object.iterator();
|
||||
while (it.next()) |entry| {
|
||||
const path = try self.allocator.dupe(u8, entry.key_ptr.*);
|
||||
|
||||
|
||||
const file_obj = entry.value_ptr.object;
|
||||
const mtime = file_obj.get("mtime") orelse continue;
|
||||
const hash_val = file_obj.get("hash") orelse continue;
|
||||
|
||||
|
||||
const hash = try self.allocator.dupe(u8, hash_val.string);
|
||||
|
||||
|
||||
try self.entries.put(path, .{
|
||||
.mtime = mtime.integer,
|
||||
.hash = hash,
|
||||
|
|
@ -103,46 +103,46 @@ pub const HashCache = struct {
|
|||
/// Save cache to disk.
///
/// Does nothing unless `self.dirty` is set. Otherwise every entry is
/// serialized into a JSON document with a `"version": 1` field and a
/// `"files"` object keyed by path, each value holding `mtime` and `hash`.
/// The document is written to `<cache_path>.tmp` and then renamed over
/// `self.cache_path`; `self.dirty` is cleared only after the rename succeeds.
///
/// Errors: allocation, file-creation, write and rename failures are returned
/// to the caller, and `self.dirty` stays set in that case.
pub fn save(self: *HashCache) !void {
    if (!self.dirty) return;

    var json_str = std.ArrayList(u8).init(self.allocator);
    defer json_str.deinit();

    // The writer is a by-value handle onto `json_str`; it is never reassigned.
    const writer = json_str.writer();

    // Write header
    try writer.writeAll("{\n \"version\": 1,\n \"files\": {\n");

    // Write entries, comma-separated
    var it = self.entries.iterator();
    var first = true;
    while (it.next()) |entry| {
        if (!first) try writer.writeAll(",\n");
        first = false;

        // Escape path for JSON
        const escaped_path = try json.escapeString(self.allocator, entry.key_ptr.*);
        defer self.allocator.free(escaped_path);

        // NOTE(review): the hash is written without escaping — presumably it
        // is always a plain hex digest; confirm against the hash producers.
        try writer.print(" \"{s}\": {{\"mtime\": {d}, \"hash\": \"{s}\"}}", .{
            escaped_path,
            entry.value_ptr.mtime,
            entry.value_ptr.hash,
        });
    }

    // Write footer
    try writer.writeAll("\n }\n}\n");

    // Write atomically: fill a sibling temp file, then rename it into place.
    const tmp_path = try std.fmt.allocPrint(self.allocator, "{s}.tmp", .{self.cache_path});
    defer self.allocator.free(tmp_path);

    const cwd = std.fs.cwd();
    const file = try cwd.createFile(tmp_path, .{});
    // From here on a failure must not leave the temp file behind. Removal is
    // best-effort: the original error is the one worth reporting.
    errdefer cwd.deleteFile(tmp_path) catch {};
    {
        // Close before renaming (and before the errdefer above can run).
        defer file.close();
        try file.writeAll(json_str.items);
    }

    try cwd.rename(tmp_path, self.cache_path);
    self.dirty = false;
}
|
||||
|
|
@ -163,20 +163,20 @@ pub const HashCache = struct {
|
|||
/// Store the hash for `path`, replacing any entry already cached for it.
///
/// `path` and `hash` are both copied with `self.allocator`, so the caller
/// keeps ownership of the slices it passes in. On success the cache is
/// marked dirty.
///
/// Errors: returns the allocator error if either copy or the insertion into
/// `self.entries` fails; the copies made so far are released in that case.
pub fn putHash(self: *HashCache, path: []const u8, mtime: i64, hash: []const u8) !void {
    // Make both copies before touching the existing entry, so a failed
    // allocation leaves the cache exactly as it was.
    const path_copy = try self.allocator.dupe(u8, path);
    errdefer self.allocator.free(path_copy);

    const hash_copy = try self.allocator.dupe(u8, hash);
    errdefer self.allocator.free(hash_copy);

    // Remove old entry if exists; its key and value are owned by the cache.
    if (self.entries.fetchRemove(path)) |old| {
        self.allocator.free(old.key);
        old.value.deinit(self.allocator);
    }

    // If this fails the errdefers above free both copies.
    try self.entries.put(path_copy, .{
        .mtime = mtime,
        .hash = hash_copy,
    });

    self.dirty = true;
}
|
||||
|
||||
|
|
@ -214,7 +214,7 @@ pub fn hashDirectoryWithCache(
|
|||
// Load .gitignore patterns
|
||||
var gitignore = @import("ignore.zig").GitIgnore.init(allocator);
|
||||
defer gitignore.deinit();
|
||||
|
||||
|
||||
try gitignore.loadFromDir(dir_path, ".gitignore");
|
||||
try gitignore.loadFromDir(dir_path, ".mlignore");
|
||||
|
||||
|
|
@ -232,21 +232,21 @@ pub fn hashDirectoryWithCache(
|
|||
if (entry.kind == .file) {
|
||||
// Skip files matching default ignores
|
||||
if (@import("ignore.zig").matchesDefaultIgnore(entry.path)) continue;
|
||||
|
||||
|
||||
// Skip files matching .gitignore/.mlignore patterns
|
||||
if (gitignore.isIgnored(entry.path, false)) continue;
|
||||
|
||||
|
||||
const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, entry.path });
|
||||
defer allocator.free(full_path);
|
||||
|
||||
|
||||
const stat = dir.statFile(entry.path) catch |err| switch (err) {
|
||||
error.FileNotFound => continue,
|
||||
else => return err,
|
||||
};
|
||||
|
||||
|
||||
const mtime = @as(i64, @intCast(stat.mtime));
|
||||
const use_cache = !cache.needsHash(entry.path, mtime);
|
||||
|
||||
|
||||
try paths.append(.{
|
||||
.path = try allocator.dupe(u8, entry.path),
|
||||
.mtime = mtime,
|
||||
|
|
@ -277,7 +277,7 @@ pub fn hashDirectoryWithCache(
|
|||
else blk: {
|
||||
const full_path = try std.fs.path.join(allocator, &[_][]const u8{ dir_path, item.path });
|
||||
defer allocator.free(full_path);
|
||||
|
||||
|
||||
const hash = try crypto.hashFile(allocator, full_path);
|
||||
try cache.putHash(item.path, item.mtime, hash);
|
||||
break :blk hash;
|
||||
|
|
@ -301,15 +301,15 @@ test "HashCache basic operations" {
|
|||
|
||||
// Put and get
|
||||
try cache.putHash("src/main.py", 1708369200, "abc123");
|
||||
|
||||
|
||||
const hash = cache.getHash("src/main.py", 1708369200);
|
||||
try std.testing.expect(hash != null);
|
||||
try std.testing.expectEqualStrings("abc123", hash.?);
|
||||
|
||||
|
||||
// Wrong mtime should return null
|
||||
const stale = cache.getHash("src/main.py", 1708369201);
|
||||
try std.testing.expect(stale == null);
|
||||
|
||||
|
||||
// needsHash should detect stale entries
|
||||
try std.testing.expect(cache.needsHash("src/main.py", 1708369201));
|
||||
try std.testing.expect(!cache.needsHash("src/main.py", 1708369200));
|
||||
|
|
@ -323,11 +323,11 @@ test "HashCache clear" {
|
|||
|
||||
try cache.putHash("file1.py", 123, "hash1");
|
||||
try cache.putHash("file2.py", 456, "hash2");
|
||||
|
||||
|
||||
try std.testing.expectEqual(@as(usize, 2), cache.getStats().entries);
|
||||
|
||||
|
||||
cache.clear();
|
||||
|
||||
|
||||
try std.testing.expectEqual(@as(usize, 0), cache.getStats().entries);
|
||||
try std.testing.expect(cache.getStats().dirty);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,9 +3,9 @@ const std = @import("std");
|
|||
/// Pattern type for ignore rules: one parsed rule plus the flags that
/// describe how it was written.
const Pattern = struct {
    /// The pattern text of the rule.
    pattern: []const u8,
    /// true if pattern starts with !
    is_negation: bool,
    /// true if pattern ends with /
    is_dir_only: bool,
    /// true if pattern contains / (not at start)
    anchored: bool,
};
|
||||
|
||||
/// GitIgnore matcher for filtering files during directory traversal
|
||||
|
|
@ -33,12 +33,12 @@ pub const GitIgnore = struct {
|
|||
defer self.allocator.free(path);
|
||||
|
||||
const file = std.fs.cwd().openFile(path, .{}) catch |err| switch (err) {
|
||||
error.FileNotFound => return, // No ignore file is fine
|
||||
error.FileNotFound => return, // No ignore file is fine
|
||||
else => return err,
|
||||
};
|
||||
defer file.close();
|
||||
|
||||
const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
|
||||
const content = try file.readToEndAlloc(self.allocator, 1024 * 1024); // Max 1MB
|
||||
defer self.allocator.free(content);
|
||||
|
||||
try self.parse(content);
|
||||
|
|
@ -49,7 +49,7 @@ pub const GitIgnore = struct {
|
|||
var lines = std.mem.split(u8, content, "\n");
|
||||
while (lines.next()) |line| {
|
||||
const trimmed = std.mem.trim(u8, line, " \t\r");
|
||||
|
||||
|
||||
// Skip empty lines and comments
|
||||
if (trimmed.len == 0 or std.mem.startsWith(u8, trimmed, "#")) continue;
|
||||
|
||||
|
|
@ -205,7 +205,7 @@ pub fn matchesDefaultIgnore(path: []const u8) bool {
|
|||
const basename = path[idx + 1 ..];
|
||||
for (DEFAULT_IGNORES) |pattern| {
|
||||
if (std.mem.startsWith(u8, pattern, "*.")) {
|
||||
const ext = pattern[1..]; // Get extension including dot
|
||||
const ext = pattern[1..]; // Get extension including dot
|
||||
if (std.mem.endsWith(u8, basename, ext)) return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ pub const NativeHasher = struct {
|
|||
pub fn init(allocator: std.mem.Allocator, num_threads: u32) !NativeHasher {
|
||||
const ctx = c.fh_init(num_threads);
|
||||
if (ctx == null) return error.NativeInitFailed;
|
||||
|
||||
|
||||
return .{
|
||||
.ctx = ctx,
|
||||
.allocator = allocator,
|
||||
|
|
@ -29,11 +29,11 @@ pub const NativeHasher = struct {
|
|||
/// Hash a single file through the native library.
///
/// `path` is copied into a NUL-terminated buffer for the C call. The string
/// returned by `fh_hash_file` is duplicated with `self.allocator` and then
/// released with `fh_free_string`, so the caller owns the returned slice.
///
/// Errors: `error.HashFailed` when the native call returns null, or an
/// allocator error from either copy.
pub fn hashFile(self: *NativeHasher, path: []const u8) ![]const u8 {
    const gpa = self.allocator;

    const path_z = try gpa.dupeZ(u8, path);
    defer gpa.free(path_z);

    const digest = c.fh_hash_file(self.ctx, path_z.ptr);
    if (digest == null) {
        return error.HashFailed;
    }
    // The native string is released once the Zig-owned copy has been made.
    defer c.fh_free_string(digest);

    const digest_bytes = std.mem.span(digest);
    return try gpa.dupe(u8, digest_bytes);
}
|
||||
|
||||
|
|
@ -42,7 +42,7 @@ pub const NativeHasher = struct {
|
|||
// Convert paths to C string array
|
||||
const c_paths = try self.allocator.alloc([*c]const u8, paths.len);
|
||||
defer self.allocator.free(c_paths);
|
||||
|
||||
|
||||
for (paths, 0..) |path, i| {
|
||||
const c_path = try self.allocator.dupeZ(u8, path);
|
||||
c_paths[i] = c_path.ptr;
|
||||
|
|
@ -53,27 +53,27 @@ pub const NativeHasher = struct {
|
|||
self.allocator.free(std.mem.span(p));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Allocate results array
|
||||
const results = try self.allocator.alloc([*c]u8, paths.len);
|
||||
defer self.allocator.free(results);
|
||||
|
||||
|
||||
// Call native batch hash
|
||||
const ret = c.fh_hash_batch(self.ctx, c_paths.ptr, @intCast(paths.len), results.ptr);
|
||||
if (ret != 0) return error.HashFailed;
|
||||
|
||||
|
||||
// Convert results to Zig strings
|
||||
var hashes = try self.allocator.alloc([]const u8, paths.len);
|
||||
errdefer {
|
||||
for (hashes) |h| self.allocator.free(h);
|
||||
self.allocator.free(hashes);
|
||||
}
|
||||
|
||||
|
||||
for (results, 0..) |r, i| {
|
||||
hashes[i] = try self.allocator.dupe(u8, std.mem.span(r));
|
||||
c.fh_free_string(r);
|
||||
}
|
||||
|
||||
|
||||
return hashes;
|
||||
}
|
||||
|
||||
|
|
@ -81,11 +81,11 @@ pub const NativeHasher = struct {
|
|||
/// Hash a whole directory through the native library.
///
/// `dir_path` is copied into a NUL-terminated buffer for the C call. The
/// string returned by `fh_hash_directory` is duplicated with
/// `self.allocator` and then released with `fh_free_string`, so the caller
/// owns the returned slice.
///
/// Errors: `error.HashFailed` when the native call returns null, or an
/// allocator error from either copy.
pub fn hashDirectory(self: *NativeHasher, dir_path: []const u8) ![]const u8 {
    const gpa = self.allocator;

    const dir_z = try gpa.dupeZ(u8, dir_path);
    defer gpa.free(dir_z);

    const digest = c.fh_hash_directory(self.ctx, dir_z.ptr);
    if (digest == null) {
        return error.HashFailed;
    }
    // The native string is released once the Zig-owned copy has been made.
    defer c.fh_free_string(digest);

    const digest_bytes = std.mem.span(digest);
    return try gpa.dupe(u8, digest_bytes);
}
|
||||
|
||||
|
|
@ -97,16 +97,16 @@ pub const NativeHasher = struct {
|
|||
) !struct { hashes: [][]const u8, paths: [][]const u8, count: u32 } {
|
||||
const c_path = try self.allocator.dupeZ(u8, dir_path);
|
||||
defer self.allocator.free(c_path);
|
||||
|
||||
|
||||
// Allocate output arrays
|
||||
const hashes = try self.allocator.alloc([*c]u8, max_results);
|
||||
defer self.allocator.free(hashes);
|
||||
|
||||
|
||||
const paths = try self.allocator.alloc([*c]u8, max_results);
|
||||
defer self.allocator.free(paths);
|
||||
|
||||
|
||||
var count: u32 = 0;
|
||||
|
||||
|
||||
const ret = c.fh_hash_directory_batch(
|
||||
self.ctx,
|
||||
c_path.ptr,
|
||||
|
|
@ -116,28 +116,28 @@ pub const NativeHasher = struct {
|
|||
&count,
|
||||
);
|
||||
if (ret != 0) return error.HashFailed;
|
||||
|
||||
|
||||
// Convert to Zig arrays
|
||||
var zig_hashes = try self.allocator.alloc([]const u8, count);
|
||||
errdefer {
|
||||
for (zig_hashes) |h| self.allocator.free(h);
|
||||
self.allocator.free(zig_hashes);
|
||||
}
|
||||
|
||||
|
||||
var zig_paths = try self.allocator.alloc([]const u8, count);
|
||||
errdefer {
|
||||
for (zig_paths) |p| self.allocator.free(p);
|
||||
self.allocator.free(zig_paths);
|
||||
}
|
||||
|
||||
|
||||
for (0..count) |i| {
|
||||
zig_hashes[i] = try self.allocator.dupe(u8, std.mem.span(hashes[i]));
|
||||
c.fh_free_string(hashes[i]);
|
||||
|
||||
|
||||
zig_paths[i] = try self.allocator.dupe(u8, std.mem.span(paths[i]));
|
||||
c.fh_free_string(paths[i]);
|
||||
}
|
||||
|
||||
|
||||
return .{
|
||||
.hashes = zig_hashes,
|
||||
.paths = zig_paths,
|
||||
|
|
@ -160,7 +160,7 @@ pub const NativeHasher = struct {
|
|||
|
||||
/// Convenience function: hash directory using native library.
///
/// Creates a temporary `NativeHasher`, hashes `dir_path` with it, and tears
/// the hasher down again before returning. The returned slice comes from
/// `NativeHasher.hashDirectory`, which allocates it with `allocator`, so the
/// caller is responsible for freeing it.
///
/// Errors: whatever `NativeHasher.init` or `hashDirectory` return.
pub fn hashDirectoryNative(allocator: std.mem.Allocator, dir_path: []const u8) ![]const u8 {
    var hasher = try NativeHasher.init(allocator, 0); // Auto-detect threads
    defer hasher.deinit();
    return try hasher.hashDirectory(dir_path);
}
|
||||
|
|
@ -177,7 +177,7 @@ pub fn hashFilesNative(
|
|||
|
||||
test "NativeHasher basic operations" {
|
||||
const allocator = std.testing.allocator;
|
||||
|
||||
|
||||
// Skip if native library not available
|
||||
var hasher = NativeHasher.init(allocator, 1) catch |err| {
|
||||
if (err == error.NativeInitFailed) {
|
||||
|
|
@ -187,7 +187,7 @@ test "NativeHasher basic operations" {
|
|||
return err;
|
||||
};
|
||||
defer hasher.deinit();
|
||||
|
||||
|
||||
// Check SIMD availability
|
||||
const has_simd = hasher.hasSimd();
|
||||
const impl_name = hasher.getImplInfo();
|
||||
|
|
|
|||
Loading…
Reference in a new issue