fetch_ml/cli/src/utils/suggest.zig
Jeremie Fraeys aaeef69bab
feat: Privacy and PII detection
Add privacy protection features to prevent accidental PII leakage:
- PII detection engine supporting emails, phone numbers, SSNs, credit cards
- CLI privacy command for scanning files and text
- Privacy middleware for API request/response filtering
- Suggestion utility for privacy-preserving alternatives

Integrates PII scanning into manifest validation for narrative fields.
2026-02-18 21:27:23 -05:00

264 lines
7.7 KiB
Zig

const std = @import("std");
/// Calculate the Levenshtein (edit) distance between two byte strings.
///
/// The distance is the minimum number of single-byte insertions, deletions
/// and substitutions needed to turn `s1` into `s2`. Comparison is byte-wise,
/// so multi-byte UTF-8 sequences count once per differing byte.
///
/// Only one rolling row of the dynamic-programming table is kept, sized by
/// the shorter input, so scratch memory is O(min(s1.len, s2.len)).
/// The scratch buffer is released before returning; the only possible error
/// is an allocation failure.
pub fn levenshteinDistance(allocator: std.mem.Allocator, s1: []const u8, s2: []const u8) !usize {
    // Against an empty string the distance is simply the other length;
    // no table (and no allocation) is needed.
    if (s1.len == 0) return s2.len;
    if (s2.len == 0) return s1.len;

    // The distance is symmetric, so lay the shorter string along the row to
    // keep the buffer as small as possible.
    const longer = if (s1.len >= s2.len) s1 else s2;
    const shorter = if (s1.len >= s2.len) s2 else s1;

    const row = try allocator.alloc(usize, shorter.len + 1);
    defer allocator.free(row);

    // Row 0: turning the empty prefix into shorter[0..j] costs j insertions.
    for (row, 0..) |*cell, j| cell.* = j;

    for (longer, 1..) |long_byte, i| {
        // `diagonal` always holds the previous row's value one column left
        // of the cell being computed (dp[i-1][j-1]).
        var diagonal = row[0];
        // Column 0: turning longer[0..i] into the empty prefix costs i deletions.
        row[0] = i;

        for (shorter, 1..) |short_byte, j| {
            const above = row[j]; // dp[i-1][j], about to be overwritten
            const cost: usize = if (long_byte == short_byte) 0 else 1;

            const deletion = above + 1;
            const insertion = row[j - 1] + 1;
            const substitution = diagonal + cost;

            row[j] = @min(@min(deletion, insertion), substitution);
            diagonal = above;
        }
    }

    return row[shorter.len];
}
/// Find suggestions for a typo from a list of candidates.
///
/// Every candidate whose Levenshtein distance to `input` is at most
/// `max_distance` is kept; the closest `max_suggestions` of those are
/// returned, nearest first. Candidates at the same distance keep the order
/// they had in `candidates`.
///
/// Caller owns the result: free every string in it and then the outer slice
/// with the same `allocator`. The only possible error is an allocation
/// failure, in which case everything allocated here is released first.
pub fn findSuggestions(
    allocator: std.mem.Allocator,
    input: []const u8,
    candidates: []const []const u8,
    max_distance: usize,
    max_suggestions: usize,
) ![][]const u8 {
    // Keep each candidate together with its distance so one sort orders both.
    const Scored = struct {
        text: []const u8,
        distance: usize,

        fn closerThan(_: void, lhs: @This(), rhs: @This()) bool {
            return lhs.distance < rhs.distance;
        }
    };

    var scored = std.ArrayList(Scored).empty;
    defer scored.deinit(allocator);

    for (candidates) |candidate| {
        const distance = try levenshteinDistance(allocator, input, candidate);
        if (distance <= max_distance) {
            try scored.append(allocator, .{ .text = candidate, .distance = distance });
        }
    }

    // std.mem.sort is stable, so ties stay in their original candidate order.
    std.mem.sort(Scored, scored.items, {}, Scored.closerThan);

    const count = @min(scored.items.len, max_suggestions);
    const result = try allocator.alloc([]const u8, count);
    errdefer allocator.free(result);

    // `filled` counts the strings already duplicated, so a failure part-way
    // through frees exactly those (this errdefer runs before the one above).
    var filled: usize = 0;
    errdefer {
        for (result[0..filled]) |owned| allocator.free(owned);
    }

    while (filled < count) : (filled += 1) {
        result[filled] = try allocator.dupe(u8, scored.items[filled].text);
    }
    return result;
}
/// Suggest commands based on prefix matching.
///
/// Returns `null` when `input` is already a known command, or when no known
/// command starts with `input`. Otherwise returns up to five commands, in
/// table order, that begin with `input` (an empty `input` matches every
/// command, so the first five are returned).
///
/// The returned slice points into static storage owned by this function:
/// the caller must not free it, and its contents are overwritten by the next
/// call to `suggestCommands`. The storage is not synchronized between threads.
pub fn suggestCommands(input: []const u8) ?[]const []const u8 {
    const all_commands = [_][]const u8{
        "init",       "sync",      "queue",    "requeue", "status",
        "monitor",    "cancel",    "prune",    "watch",   "dataset",
        "experiment", "narrative", "outcome",  "info",    "logs",
        "annotate",   "validate",  "compare",  "find",    "export",
    };

    // The result must outlive this call. A plain local array lives on the
    // stack and would leave the caller with a dangling slice, so the buffer
    // is a container-level variable with static lifetime instead.
    const Storage = struct {
        var matches: [5][]const u8 = undefined;
    };

    // Exact match - no suggestion needed.
    for (all_commands) |cmd| {
        if (std.mem.eql(u8, input, cmd)) return null;
    }

    // Collect prefix matches until the buffer is full.
    var match_count: usize = 0;
    for (all_commands) |cmd| {
        if (!std.mem.startsWith(u8, cmd, input)) continue;
        Storage.matches[match_count] = cmd;
        match_count += 1;
        if (match_count == Storage.matches.len) break;
    }

    if (match_count == 0) return null;
    return Storage.matches[0..match_count];
}
/// Suggest flags for a command.
///
/// Returns up to five flags that start with `input`: the common flags come
/// first, followed by the flags specific to `command` (flags already listed
/// are not repeated). Commands without a dedicated flag table, and unknown
/// commands, only get the common flags. Returns `null` when nothing matches.
///
/// The returned slice points into static storage owned by this function:
/// the caller must not free it, and its contents are overwritten by the next
/// call to `suggestFlags`. The storage is not synchronized between threads.
pub fn suggestFlags(command: []const u8, input: []const u8) ?[]const []const u8 {
    // Common flags for all commands
    const common_flags = [_][]const u8{ "--help", "--verbose", "--quiet", "--json" };

    // Command-specific flags
    const queue_flags = [_][]const u8{
        "--commit",           "--priority",   "--cpu",     "--memory",   "--gpu",
        "--gpu-memory",       "--hypothesis", "--context", "--intent",   "--expected-outcome",
        "--experiment-group", "--tags",       "--dry-run", "--validate", "--explain",
        "--force",
    };
    const find_flags = [_][]const u8{
        "--tag",    "--outcome", "--dataset", "--experiment-group",
        "--author", "--after",   "--before",  "--limit",
    };
    const compare_flags = [_][]const u8{
        "--json", "--all", "--fields",
    };
    const export_flags = [_][]const u8{
        "--bundle", "--anonymize", "--anonymize-level", "--base",
    };

    // The CLI spells the command "export", but the enum tag is `export_cmd`,
    // so stringToEnum alone would never select the export flag table.
    const selected: Command = if (std.mem.eql(u8, command, "export"))
        .export_cmd
    else
        (std.meta.stringToEnum(Command, command) orelse .unknown);

    const flags: []const []const u8 = switch (selected) {
        .queue => &queue_flags,
        .find => &find_flags,
        .compare => &compare_flags,
        .export_cmd => &export_flags,
        else => &common_flags,
    };

    // The result must outlive this call. A plain local array lives on the
    // stack and would leave the caller with a dangling slice, so the buffer
    // is a container-level variable with static lifetime instead.
    const Storage = struct {
        var matches: [5][]const u8 = undefined;
    };
    var match_count: usize = 0;

    // Common flags first.
    for (common_flags) |flag| {
        if (match_count == Storage.matches.len) break;
        if (!std.mem.startsWith(u8, flag, input)) continue;
        Storage.matches[match_count] = flag;
        match_count += 1;
    }

    // Then command-specific flags, skipping any already collected
    // (e.g. "--json" is both a common and a compare flag).
    next_flag: for (flags) |flag| {
        if (match_count == Storage.matches.len) break;
        if (!std.mem.startsWith(u8, flag, input)) continue;
        for (Storage.matches[0..match_count]) |seen| {
            if (std.mem.eql(u8, seen, flag)) continue :next_flag;
        }
        Storage.matches[match_count] = flag;
        match_count += 1;
    }

    if (match_count == 0) return null;
    return Storage.matches[0..match_count];
}

/// Commands that `suggestFlags` can dispatch on.
///
/// Tag names are matched against the command string with
/// `std.meta.stringToEnum`, except for `export_cmd`, whose name differs
/// from the "export" command string.
const Command = enum {
    init,
    sync,
    queue,
    requeue,
    status,
    monitor,
    cancel,
    prune,
    watch,
    dataset,
    experiment,
    narrative,
    outcome,
    info,
    logs,
    annotate,
    validate,
    compare,
    find,
    /// The "export" command.
    export_cmd,
    /// Fallback for command strings that match no tag.
    unknown,
};
/// Build a "did you mean" message listing `suggestions` for `input`.
///
/// Each suggestion is quoted; entries are separated by ", " except the last
/// pair, which is joined with " or ". The message ends with "?\n".
/// With no suggestions an empty string is returned.
///
/// Caller owns the returned slice and frees it with `allocator`.
pub fn formatSuggestionMessage(
    allocator: std.mem.Allocator,
    input: []const u8,
    suggestions: []const []const u8,
) ![]u8 {
    if (suggestions.len == 0) return allocator.dupe(u8, "");

    var message = std.ArrayList(u8).empty;
    // After a successful toOwnedSlice the list is empty, so this is a no-op
    // on the success path and a cleanup on the error path.
    defer message.deinit(allocator);
    const out = message.writer(allocator);

    try out.print("Did you mean for '{s}': ", .{input});

    const last_index = suggestions.len - 1;
    for (suggestions, 0..) |candidate, index| {
        // Nothing before the first entry, " or " before the final one,
        // ", " everywhere else.
        const separator: []const u8 = if (index == 0)
            ""
        else if (index == last_index)
            " or "
        else
            ", ";
        try out.writeAll(separator);
        try out.print("'{s}'", .{candidate});
    }

    try out.writeAll("?\n");
    return message.toOwnedSlice(allocator);
}
/// Smoke-test the suggestion helpers.
///
/// Checks two known Levenshtein distances and that `findSuggestions` ranks
/// "queue" first for the typo "quee". Uses `std.testing.allocator`, so it is
/// meant to be called from a test build. A failed check is reported as an
/// error from `std.testing` instead of via `std.debug.assert`, which is only
/// an optimizer hint in ReleaseFast and must not be relied on for checking.
pub fn testSuggestions() !void {
    const allocator = std.testing.allocator;

    // Levenshtein distance
    try std.testing.expectEqual(
        @as(usize, 1),
        try levenshteinDistance(allocator, "queue", "quee"),
    );
    try std.testing.expectEqual(
        @as(usize, 1),
        try levenshteinDistance(allocator, "status", "statis"),
    );

    // Ranked suggestions
    const candidates = [_][]const u8{ "queue", "query", "quiet", "quit" };
    const suggestions = try findSuggestions(allocator, "quee", &candidates, 2, 3);
    defer {
        for (suggestions) |s| allocator.free(s);
        allocator.free(suggestions);
    }

    try std.testing.expect(suggestions.len > 0);
    try std.testing.expectEqualStrings("queue", suggestions[0]);

    std.debug.print("Suggestion tests passed!\n", .{});
}