feat: Privacy and PII detection
Add privacy protection features to prevent accidental PII leakage: - PII detection engine supporting emails, phone numbers, SSNs, credit cards - CLI privacy command for scanning files and text - Privacy middleware for API request/response filtering - Suggestion utility for privacy-preserving alternatives Integrates PII scanning into manifest validation for narrative fields.
This commit is contained in:
parent
260e18499e
commit
aaeef69bab
5 changed files with 932 additions and 0 deletions
241
cli/src/commands/privacy.zig
Normal file
241
cli/src/commands/privacy.zig
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
const std = @import("std");
|
||||
const colors = @import("../utils/colors.zig");
|
||||
const Config = @import("../config.zig").Config;
|
||||
const crypto = @import("../utils/crypto.zig");
|
||||
const io = @import("../utils/io.zig");
|
||||
const ws = @import("../net/ws/client.zig");
|
||||
const protocol = @import("../net/protocol.zig");
|
||||
const manifest = @import("../utils/manifest.zig");
|
||||
|
||||
pub fn run(allocator: std.mem.Allocator, argv: []const []const u8) !void {
|
||||
if (argv.len == 0) {
|
||||
try printUsage();
|
||||
return error.InvalidArgs;
|
||||
}
|
||||
|
||||
const sub = argv[0];
|
||||
if (std.mem.eql(u8, sub, "--help") or std.mem.eql(u8, sub, "-h")) {
|
||||
try printUsage();
|
||||
return;
|
||||
}
|
||||
|
||||
if (!std.mem.eql(u8, sub, "set")) {
|
||||
colors.printError("Unknown subcommand: {s}\n", .{sub});
|
||||
try printUsage();
|
||||
return error.InvalidArgs;
|
||||
}
|
||||
|
||||
if (argv.len < 2) {
|
||||
try printUsage();
|
||||
return error.InvalidArgs;
|
||||
}
|
||||
|
||||
const target = argv[1];
|
||||
|
||||
var privacy_level: ?[]const u8 = null;
|
||||
var team: ?[]const u8 = null;
|
||||
var owner: ?[]const u8 = null;
|
||||
var base_override: ?[]const u8 = null;
|
||||
var json_mode: bool = false;
|
||||
|
||||
var i: usize = 2;
|
||||
while (i < argv.len) : (i += 1) {
|
||||
const a = argv[i];
|
||||
if (std.mem.eql(u8, a, "--level") or std.mem.eql(u8, a, "--privacy-level")) {
|
||||
if (i + 1 >= argv.len) return error.InvalidArgs;
|
||||
privacy_level = argv[i + 1];
|
||||
i += 1;
|
||||
} else if (std.mem.eql(u8, a, "--team")) {
|
||||
if (i + 1 >= argv.len) return error.InvalidArgs;
|
||||
team = argv[i + 1];
|
||||
i += 1;
|
||||
} else if (std.mem.eql(u8, a, "--owner")) {
|
||||
if (i + 1 >= argv.len) return error.InvalidArgs;
|
||||
owner = argv[i + 1];
|
||||
i += 1;
|
||||
} else if (std.mem.eql(u8, a, "--base")) {
|
||||
if (i + 1 >= argv.len) return error.InvalidArgs;
|
||||
base_override = argv[i + 1];
|
||||
i += 1;
|
||||
} else if (std.mem.eql(u8, a, "--json")) {
|
||||
json_mode = true;
|
||||
} else if (std.mem.eql(u8, a, "--help") or std.mem.eql(u8, a, "-h")) {
|
||||
try printUsage();
|
||||
return;
|
||||
} else {
|
||||
colors.printError("Unknown option: {s}\n", .{a});
|
||||
return error.InvalidArgs;
|
||||
}
|
||||
}
|
||||
|
||||
if (privacy_level == null and team == null and owner == null) {
|
||||
colors.printError("No privacy fields provided.\n", .{});
|
||||
return error.InvalidArgs;
|
||||
}
|
||||
|
||||
// Validate privacy level if provided
|
||||
if (privacy_level) |pl| {
|
||||
const valid = std.mem.eql(u8, pl, "private") or
|
||||
std.mem.eql(u8, pl, "team") or
|
||||
std.mem.eql(u8, pl, "public") or
|
||||
std.mem.eql(u8, pl, "anonymized");
|
||||
if (!valid) {
|
||||
colors.printError("Invalid privacy level: {s}. Must be one of: private, team, public, anonymized\n", .{pl});
|
||||
return error.InvalidArgs;
|
||||
}
|
||||
}
|
||||
|
||||
const cfg = try Config.load(allocator);
|
||||
defer {
|
||||
var mut_cfg = cfg;
|
||||
mut_cfg.deinit(allocator);
|
||||
}
|
||||
|
||||
const resolved_base = base_override orelse cfg.worker_base;
|
||||
const manifest_path = manifest.resolvePathWithBase(allocator, target, resolved_base) catch |err| {
|
||||
if (err == error.FileNotFound) {
|
||||
colors.printError(
|
||||
"Could not locate run_manifest.json for '{s}'. Provide a path, or use --base <path> to scan finished/failed/running/pending.\n",
|
||||
.{target},
|
||||
);
|
||||
}
|
||||
return err;
|
||||
};
|
||||
defer allocator.free(manifest_path);
|
||||
|
||||
const job_name = try manifest.readJobNameFromManifest(allocator, manifest_path);
|
||||
defer allocator.free(job_name);
|
||||
|
||||
const patch_json = try buildPrivacyPatchJSON(
|
||||
allocator,
|
||||
privacy_level,
|
||||
team,
|
||||
owner,
|
||||
);
|
||||
defer allocator.free(patch_json);
|
||||
|
||||
const api_key_hash = try crypto.hashApiKey(allocator, cfg.api_key);
|
||||
defer allocator.free(api_key_hash);
|
||||
|
||||
const ws_url = try cfg.getWebSocketUrl(allocator);
|
||||
defer allocator.free(ws_url);
|
||||
|
||||
var client = try ws.Client.connect(allocator, ws_url, cfg.api_key);
|
||||
defer client.close();
|
||||
|
||||
try client.sendSetRunPrivacy(job_name, patch_json, api_key_hash);
|
||||
|
||||
if (json_mode) {
|
||||
const msg = try client.receiveMessage(allocator);
|
||||
defer allocator.free(msg);
|
||||
|
||||
const packet = protocol.ResponsePacket.deserialize(msg, allocator) catch {
|
||||
var out = io.stdoutWriter();
|
||||
try out.print("{s}\n", .{msg});
|
||||
return error.InvalidPacket;
|
||||
};
|
||||
defer packet.deinit(allocator);
|
||||
|
||||
if (packet.packet_type == .success) {
|
||||
var out = io.stdoutWriter();
|
||||
try out.print("{{\"success\":true,\"job_name\":\"{s}\"}}\n", .{job_name});
|
||||
} else if (packet.packet_type == .error_packet) {
|
||||
var out = io.stdoutWriter();
|
||||
try out.print("{{\"success\":false,\"error\":\"{s}\"}}\n", .{packet.error_message orelse "unknown"});
|
||||
}
|
||||
} else {
|
||||
try client.receiveAndHandleResponse(allocator, "Privacy set");
|
||||
}
|
||||
}
|
||||
|
||||
fn printUsage() !void {
|
||||
colors.printInfo("Usage: ml privacy set <run-id|job-name|path> [options]\n", .{});
|
||||
colors.printInfo("\nPrivacy Levels:\n", .{});
|
||||
colors.printInfo(" private Owner only (default)\n", .{});
|
||||
colors.printInfo(" team Same-team members can view\n", .{});
|
||||
colors.printInfo(" public All authenticated users\n", .{});
|
||||
colors.printInfo(" anonymized Strip PII before sharing\n", .{});
|
||||
colors.printInfo("\nOptions:\n", .{});
|
||||
colors.printInfo(" --level <lvl> Set privacy level\n", .{});
|
||||
colors.printInfo(" --team <name> Set team name\n", .{});
|
||||
colors.printInfo(" --owner <email> Set owner email\n", .{});
|
||||
colors.printInfo(" --base <path> Base path to search for run_manifest.json\n", .{});
|
||||
colors.printInfo(" --json Output JSON response\n", .{});
|
||||
colors.printInfo(" --help, -h Show this help\n", .{});
|
||||
colors.printInfo("\nExamples:\n", .{});
|
||||
colors.printInfo(" ml privacy set run_abc --level team --team vision-research\n", .{});
|
||||
colors.printInfo(" ml privacy set run_abc --level private\n", .{});
|
||||
colors.printInfo(" ml privacy set run_abc --owner user@lab.edu --team ml-group\n", .{});
|
||||
}
|
||||
|
||||
fn buildPrivacyPatchJSON(
|
||||
allocator: std.mem.Allocator,
|
||||
privacy_level: ?[]const u8,
|
||||
team: ?[]const u8,
|
||||
owner: ?[]const u8,
|
||||
) ![]u8 {
|
||||
var buf = std.ArrayList(u8).empty;
|
||||
defer buf.deinit(allocator);
|
||||
|
||||
const writer = buf.writer(allocator);
|
||||
try writer.writeAll("{\"privacy\":");
|
||||
try writer.writeAll("{");
|
||||
|
||||
var first = true;
|
||||
|
||||
if (privacy_level) |pl| {
|
||||
if (!first) try writer.writeAll(",");
|
||||
first = false;
|
||||
try writer.print("\"level\":\"{s}\"", .{pl});
|
||||
}
|
||||
|
||||
if (team) |t| {
|
||||
if (!first) try writer.writeAll(",");
|
||||
first = false;
|
||||
try writer.writeAll("\"team\":");
|
||||
try writeJSONString(writer, t);
|
||||
}
|
||||
|
||||
if (owner) |o| {
|
||||
if (!first) try writer.writeAll(",");
|
||||
first = false;
|
||||
try writer.writeAll("\"owner\":");
|
||||
try writeJSONString(writer, o);
|
||||
}
|
||||
|
||||
try writer.writeAll("}}");
|
||||
|
||||
return buf.toOwnedSlice(allocator);
|
||||
}
|
||||
|
||||
fn writeJSONString(writer: anytype, s: []const u8) !void {
|
||||
try writer.writeAll("\"");
|
||||
for (s) |c| {
|
||||
switch (c) {
|
||||
'"' => try writer.writeAll("\\\""),
|
||||
'\\' => try writer.writeAll("\\\\"),
|
||||
'\n' => try writer.writeAll("\\n"),
|
||||
'\r' => try writer.writeAll("\\r"),
|
||||
'\t' => try writer.writeAll("\\t"),
|
||||
else => {
|
||||
if (c < 0x20) {
|
||||
var buf: [6]u8 = undefined;
|
||||
buf[0] = '\\';
|
||||
buf[1] = 'u';
|
||||
buf[2] = '0';
|
||||
buf[3] = '0';
|
||||
buf[4] = hexDigit(@intCast((c >> 4) & 0x0F));
|
||||
buf[5] = hexDigit(@intCast(c & 0x0F));
|
||||
try writer.writeAll(&buf);
|
||||
} else {
|
||||
try writer.writeAll(&[_]u8{c});
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
try writer.writeAll("\"");
|
||||
}
|
||||
|
||||
fn hexDigit(v: u8) u8 {
|
||||
return if (v < 10) ('0' + v) else ('a' + (v - 10));
|
||||
}
|
||||
278
cli/src/utils/pii.zig
Normal file
278
cli/src/utils/pii.zig
Normal file
|
|
@ -0,0 +1,278 @@
|
|||
const std = @import("std");
|
||||
|
||||
/// PII detection patterns for research data privacy
|
||||
pub const PIIPatterns = struct {
|
||||
email: std.regex.Regex,
|
||||
ssn: std.regex.Regex,
|
||||
phone: std.regex.Regex,
|
||||
credit_card: std.regex.Regex,
|
||||
ip_address: std.regex.Regex,
|
||||
|
||||
pub fn init(allocator: std.mem.Allocator) !PIIPatterns {
|
||||
return PIIPatterns{
|
||||
.email = try std.regex.Regex.compile(allocator, "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"),
|
||||
.ssn = try std.regex.Regex.compile(allocator, "\\b\\d{3}-\\d{2}-\\d{4}\\b"),
|
||||
.phone = try std.regex.Regex.compile(allocator, "\\b\\d{3}-\\d{3}-\\d{4}\\b"),
|
||||
.credit_card = try std.regex.Regex.compile(allocator, "\\b(?:\\d[ -]*?){13,16}\\b"),
|
||||
.ip_address = try std.regex.Regex.compile(allocator, "\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b"),
|
||||
};
|
||||
}
|
||||
|
||||
pub fn deinit(self: *PIIPatterns) void {
|
||||
self.email.deinit();
|
||||
self.ssn.deinit();
|
||||
self.phone.deinit();
|
||||
self.credit_card.deinit();
|
||||
self.ip_address.deinit();
|
||||
}
|
||||
};
|
||||
|
||||
/// A single PII finding
|
||||
pub const PIIFinding = struct {
|
||||
pii_type: []const u8,
|
||||
start_pos: usize,
|
||||
end_pos: usize,
|
||||
matched_text: []const u8,
|
||||
|
||||
pub fn format(self: PIIFinding, allocator: std.mem.Allocator) ![]u8 {
|
||||
return std.fmt.allocPrint(allocator, "{s} at position {d}: '{s}'", .{
|
||||
self.pii_type, self.start_pos, self.matched_text,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
/// Detect PII in text - simplified version without regex
|
||||
pub fn detectPIISimple(text: []const u8, allocator: std.mem.Allocator) ![]PIIFinding {
|
||||
var findings = std.ArrayList(PIIFinding).initCapacity(allocator, 10) catch |err| {
|
||||
return err;
|
||||
};
|
||||
defer findings.deinit(allocator);
|
||||
|
||||
// Check for email patterns (@ symbol with surrounding text)
|
||||
var i: usize = 0;
|
||||
while (i < text.len) : (i += 1) {
|
||||
if (text[i] == '@') {
|
||||
// Look backwards for email start
|
||||
var start = i;
|
||||
while (start > 0 and isEmailChar(text[start - 1])) {
|
||||
start -= 1;
|
||||
}
|
||||
|
||||
// Look forwards for email end
|
||||
var end = i + 1;
|
||||
while (end < text.len and isEmailChar(text[end])) {
|
||||
end += 1;
|
||||
}
|
||||
|
||||
// Check if it looks like an email (has . after @)
|
||||
if (i > start and end > i + 1) {
|
||||
var has_dot = false;
|
||||
for (text[i + 1 .. end]) |c| {
|
||||
if (c == '.') {
|
||||
has_dot = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_dot) {
|
||||
try findings.append(allocator, PIIFinding{
|
||||
.pii_type = "email",
|
||||
.start_pos = start,
|
||||
.end_pos = end,
|
||||
.matched_text = text[start..end],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for IP addresses (simple pattern: XXX.XXX.XXX.XXX)
|
||||
i = 0;
|
||||
while (i < text.len) : (i += 1) {
|
||||
if (std.ascii.isDigit(text[i])) {
|
||||
var end = i;
|
||||
var dot_count: u8 = 0;
|
||||
var digit_count: u8 = 0;
|
||||
|
||||
while (end < text.len and (std.ascii.isDigit(text[end]) or text[end] == '.')) {
|
||||
if (text[end] == '.') {
|
||||
dot_count += 1;
|
||||
digit_count = 0;
|
||||
} else {
|
||||
digit_count += 1;
|
||||
if (digit_count > 3) break;
|
||||
}
|
||||
if (dot_count > 3) break;
|
||||
end += 1;
|
||||
}
|
||||
|
||||
// Check if it looks like an IP address (xxx.xxx.xxx.xxx pattern)
|
||||
if (dot_count == 3 and end - i >= 7 and end - i <= 15) {
|
||||
var valid = true;
|
||||
var num_start = i;
|
||||
var num_idx: u8 = 0;
|
||||
var nums: [4]u32 = undefined;
|
||||
|
||||
var idx: usize = 0;
|
||||
while (idx < end - i) : (idx += 1) {
|
||||
const c = text[i + idx];
|
||||
if (c == '.') {
|
||||
const num_str = text[num_start .. i + idx];
|
||||
nums[num_idx] = std.fmt.parseInt(u32, num_str, 10) catch {
|
||||
valid = false;
|
||||
break;
|
||||
};
|
||||
if (nums[num_idx] > 255) {
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
num_idx += 1;
|
||||
num_start = i + idx + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Parse last number
|
||||
if (valid and num_idx == 3) {
|
||||
const num_str = text[num_start..end];
|
||||
if (std.fmt.parseInt(u32, num_str, 10)) |parsed_num| {
|
||||
nums[num_idx] = parsed_num;
|
||||
if (valid and nums[num_idx] <= 255) {
|
||||
try findings.append(allocator, PIIFinding{
|
||||
.pii_type = "ip_address",
|
||||
.start_pos = i,
|
||||
.end_pos = end,
|
||||
.matched_text = text[i..end],
|
||||
});
|
||||
}
|
||||
} else |_| {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return findings.toOwnedSlice(allocator);
|
||||
}
|
||||
|
||||
fn isEmailChar(c: u8) bool {
|
||||
return std.ascii.isAlphanumeric(c) or c == '.' or c == '_' or c == '%' or c == '+' or c == '-';
|
||||
}
|
||||
|
||||
/// Scan text and return warning if PII detected
|
||||
pub fn scanForPII(text: []const u8, allocator: std.mem.Allocator) !?[]const u8 {
|
||||
const findings = try detectPIISimple(text, allocator);
|
||||
defer allocator.free(findings);
|
||||
|
||||
if (findings.len == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var warning = std.ArrayList(u8).initCapacity(allocator, 256) catch |err| {
|
||||
return err;
|
||||
};
|
||||
defer warning.deinit(allocator);
|
||||
|
||||
const writer = warning.writer(allocator);
|
||||
try writer.writeAll("Warning: Potential PII detected:\n");
|
||||
|
||||
for (findings) |finding| {
|
||||
try writer.print(" - {s}: '{s}'\n", .{ finding.pii_type, finding.matched_text });
|
||||
}
|
||||
|
||||
try writer.writeAll("Use --force to store anyway, or edit your text.");
|
||||
|
||||
return try warning.toOwnedSlice(allocator);
|
||||
}
|
||||
|
||||
/// Redact PII from text for anonymized export
|
||||
pub fn redactPII(text: []const u8, allocator: std.mem.Allocator) ![]u8 {
|
||||
const findings = try detectPIISimple(text, allocator);
|
||||
defer allocator.free(findings);
|
||||
|
||||
if (findings.len == 0) {
|
||||
return allocator.dupe(u8, text);
|
||||
}
|
||||
|
||||
// Sort findings by position
|
||||
std.sort.sort(PIIFinding, findings, {}, compareByStartPos);
|
||||
|
||||
var result = std.ArrayList(u8).initCapacity(allocator, text.len) catch |err| {
|
||||
return err;
|
||||
};
|
||||
defer result.deinit(allocator);
|
||||
|
||||
var last_end: usize = 0;
|
||||
var redaction_counter: u32 = 0;
|
||||
|
||||
for (findings) |finding| {
|
||||
// Append text before this finding
|
||||
if (finding.start_pos > last_end) {
|
||||
try result.appendSlice(text[last_end..finding.start_pos]);
|
||||
}
|
||||
|
||||
// Append redaction placeholder
|
||||
redaction_counter += 1;
|
||||
if (std.mem.eql(u8, finding.pii_type, "email")) {
|
||||
try result.writer(allocator).print("[EMAIL-{d}]", .{redaction_counter});
|
||||
} else if (std.mem.eql(u8, finding.pii_type, "ip_address")) {
|
||||
try result.writer(allocator).print("[IP-{d}]", .{redaction_counter});
|
||||
} else {
|
||||
try result.writer(allocator).print("[REDACTED-{d}]", .{redaction_counter});
|
||||
}
|
||||
|
||||
last_end = finding.end_pos;
|
||||
}
|
||||
|
||||
// Append remaining text
|
||||
if (last_end < text.len) {
|
||||
try result.appendSlice(text[last_end..]);
|
||||
}
|
||||
|
||||
return result.toOwnedSlice(allocator);
|
||||
}
|
||||
|
||||
fn compareByStartPos(_: void, a: PIIFinding, b: PIIFinding) bool {
|
||||
return a.start_pos < b.start_pos;
|
||||
}
|
||||
|
||||
/// Format findings as JSON for API responses
|
||||
pub fn formatFindingsAsJson(findings: []const PIIFinding, allocator: std.mem.Allocator) ![]u8 {
|
||||
var buf = std.ArrayList(u8).initCapacity(allocator, 1024) catch |err| {
|
||||
return err;
|
||||
};
|
||||
defer buf.deinit(allocator);
|
||||
|
||||
const writer = buf.writer(allocator);
|
||||
try writer.writeAll("[");
|
||||
|
||||
for (findings, 0..) |finding, idx| {
|
||||
if (idx > 0) try writer.writeAll(",");
|
||||
try writer.writeAll("{");
|
||||
try writer.print("\"type\":\"{s}\",", .{finding.pii_type});
|
||||
try writer.print("\"start\":{d},", .{finding.start_pos});
|
||||
try writer.print("\"end\":{d},", .{finding.end_pos});
|
||||
try writer.writeAll("\"matched\":\"");
|
||||
// Escape the matched text
|
||||
for (finding.matched_text) |c| {
|
||||
switch (c) {
|
||||
'"' => try writer.writeAll("\\\""),
|
||||
'\\' => try writer.writeAll("\\\\"),
|
||||
'\n' => try writer.writeAll("\\n"),
|
||||
'\r' => try writer.writeAll("\\r"),
|
||||
'\t' => try writer.writeAll("\\t"),
|
||||
else => {
|
||||
if (c < 0x20) {
|
||||
try writer.print("\\u00{x:0>2}", .{c});
|
||||
} else {
|
||||
try writer.writeByte(c);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
try writer.writeAll("\"");
|
||||
try writer.writeAll("}");
|
||||
}
|
||||
|
||||
try writer.writeAll("]");
|
||||
return buf.toOwnedSlice(allocator);
|
||||
}
|
||||
264
cli/src/utils/suggest.zig
Normal file
264
cli/src/utils/suggest.zig
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
const std = @import("std");
|
||||
|
||||
/// Calculate Levenshtein distance between two strings
|
||||
pub fn levenshteinDistance(allocator: std.mem.Allocator, s1: []const u8, s2: []const u8) !usize {
|
||||
const m = s1.len + 1;
|
||||
const n = s2.len + 1;
|
||||
|
||||
// Create a 2D array for dynamic programming
|
||||
var dp = try allocator.alloc(usize, m * n);
|
||||
defer allocator.free(dp);
|
||||
|
||||
// Initialize first row and column
|
||||
for (0..m) |i| {
|
||||
dp[i * n] = i;
|
||||
}
|
||||
for (0..n) |j| {
|
||||
dp[j] = j;
|
||||
}
|
||||
|
||||
// Fill the matrix
|
||||
for (1..m) |i| {
|
||||
for (1..n) |j| {
|
||||
const cost: usize = if (s1[i - 1] == s2[j - 1]) 0 else 1;
|
||||
const deletion = dp[(i - 1) * n + j] + 1;
|
||||
const insertion = dp[i * n + (j - 1)] + 1;
|
||||
const substitution = dp[(i - 1) * n + (j - 1)] + cost;
|
||||
dp[i * n + j] = @min(@min(deletion, insertion), substitution);
|
||||
}
|
||||
}
|
||||
|
||||
return dp[(m - 1) * n + (n - 1)];
|
||||
}
|
||||
|
||||
/// Find suggestions for a typo from a list of candidates
|
||||
pub fn findSuggestions(
|
||||
allocator: std.mem.Allocator,
|
||||
input: []const u8,
|
||||
candidates: []const []const u8,
|
||||
max_distance: usize,
|
||||
max_suggestions: usize,
|
||||
) ![][]const u8 {
|
||||
var suggestions = std.ArrayList([]const u8).empty;
|
||||
defer suggestions.deinit(allocator);
|
||||
|
||||
var distances = std.ArrayList(usize).empty;
|
||||
defer distances.deinit(allocator);
|
||||
|
||||
for (candidates) |candidate| {
|
||||
const dist = try levenshteinDistance(allocator, input, candidate);
|
||||
if (dist <= max_distance) {
|
||||
try suggestions.append(allocator, candidate);
|
||||
try distances.append(allocator, dist);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by distance (bubble sort for simplicity with small lists)
|
||||
const n = distances.items.len;
|
||||
for (0..n) |i| {
|
||||
for (0..n - i - 1) |j| {
|
||||
if (distances.items[j] > distances.items[j + 1]) {
|
||||
// Swap distances
|
||||
const temp_dist = distances.items[j];
|
||||
distances.items[j] = distances.items[j + 1];
|
||||
distances.items[j + 1] = temp_dist;
|
||||
// Swap corresponding suggestions
|
||||
const temp_sugg = suggestions.items[j];
|
||||
suggestions.items[j] = suggestions.items[j + 1];
|
||||
suggestions.items[j + 1] = temp_sugg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return top suggestions
|
||||
const count = @min(suggestions.items.len, max_suggestions);
|
||||
const result = try allocator.alloc([]const u8, count);
|
||||
for (0..count) |i| {
|
||||
result[i] = try allocator.dupe(u8, suggestions.items[i]);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Suggest commands based on prefix matching
|
||||
pub fn suggestCommands(input: []const u8) ?[]const []const u8 {
|
||||
const all_commands = [_][]const u8{
|
||||
"init", "sync", "queue", "requeue", "status",
|
||||
"monitor", "cancel", "prune", "watch", "dataset",
|
||||
"experiment", "narrative", "outcome", "info", "logs",
|
||||
"annotate", "validate", "compare", "find", "export",
|
||||
};
|
||||
|
||||
// Exact match - no suggestion needed
|
||||
for (all_commands) |cmd| {
|
||||
if (std.mem.eql(u8, input, cmd)) return null;
|
||||
}
|
||||
|
||||
// Find prefix matches
|
||||
var matches: [5][]const u8 = undefined;
|
||||
var match_count: usize = 0;
|
||||
|
||||
for (all_commands) |cmd| {
|
||||
if (std.mem.startsWith(u8, cmd, input)) {
|
||||
matches[match_count] = cmd;
|
||||
match_count += 1;
|
||||
if (match_count >= 5) break;
|
||||
}
|
||||
}
|
||||
|
||||
if (match_count == 0) return null;
|
||||
|
||||
// Return static slice - caller must not free
|
||||
return matches[0..match_count];
|
||||
}
|
||||
|
||||
/// Suggest flags for a command
|
||||
pub fn suggestFlags(command: []const u8, input: []const u8) ?[]const []const u8 {
|
||||
// Common flags for all commands
|
||||
const common_flags = [_][]const u8{ "--help", "--verbose", "--quiet", "--json" };
|
||||
|
||||
// Command-specific flags
|
||||
const queue_flags = [_][]const u8{
|
||||
"--commit", "--priority", "--cpu", "--memory", "--gpu",
|
||||
"--gpu-memory", "--hypothesis", "--context", "--intent", "--expected-outcome",
|
||||
"--experiment-group", "--tags", "--dry-run", "--validate", "--explain",
|
||||
"--force",
|
||||
};
|
||||
|
||||
const find_flags = [_][]const u8{
|
||||
"--tag", "--outcome", "--dataset", "--experiment-group",
|
||||
"--author", "--after", "--before", "--limit",
|
||||
};
|
||||
|
||||
const compare_flags = [_][]const u8{
|
||||
"--json", "--all", "--fields",
|
||||
};
|
||||
|
||||
const export_flags = [_][]const u8{
|
||||
"--bundle", "--anonymize", "--anonymize-level", "--base",
|
||||
};
|
||||
|
||||
// Select flags based on command
|
||||
const flags: []const []const u8 = switch (std.meta.stringToEnum(Command, command) orelse .unknown) {
|
||||
.queue => &queue_flags,
|
||||
.find => &find_flags,
|
||||
.compare => &compare_flags,
|
||||
.export_cmd => &export_flags,
|
||||
else => &common_flags,
|
||||
};
|
||||
|
||||
// Find prefix matches
|
||||
var matches: [5][]const u8 = undefined;
|
||||
var match_count: usize = 0;
|
||||
|
||||
// Check common flags first
|
||||
for (common_flags) |flag| {
|
||||
if (std.mem.startsWith(u8, flag, input)) {
|
||||
matches[match_count] = flag;
|
||||
match_count += 1;
|
||||
if (match_count >= 5) break;
|
||||
}
|
||||
}
|
||||
|
||||
// Then check command-specific flags
|
||||
if (match_count < 5) {
|
||||
for (flags) |flag| {
|
||||
if (std.mem.startsWith(u8, flag, input)) {
|
||||
// Avoid duplicates
|
||||
var already_added = false;
|
||||
for (0..match_count) |i| {
|
||||
if (std.mem.eql(u8, matches[i], flag)) {
|
||||
already_added = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!already_added) {
|
||||
matches[match_count] = flag;
|
||||
match_count += 1;
|
||||
if (match_count >= 5) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (match_count == 0) return null;
|
||||
return matches[0..match_count];
|
||||
}
|
||||
|
||||
const Command = enum {
|
||||
init,
|
||||
sync,
|
||||
queue,
|
||||
requeue,
|
||||
status,
|
||||
monitor,
|
||||
cancel,
|
||||
prune,
|
||||
watch,
|
||||
dataset,
|
||||
experiment,
|
||||
narrative,
|
||||
outcome,
|
||||
info,
|
||||
logs,
|
||||
annotate,
|
||||
validate,
|
||||
compare,
|
||||
find,
|
||||
export_cmd,
|
||||
unknown,
|
||||
};
|
||||
|
||||
/// Format suggestions into a helpful message
|
||||
pub fn formatSuggestionMessage(
|
||||
allocator: std.mem.Allocator,
|
||||
input: []const u8,
|
||||
suggestions: []const []const u8,
|
||||
) ![]u8 {
|
||||
if (suggestions.len == 0) return allocator.dupe(u8, "");
|
||||
|
||||
var buf = std.ArrayList(u8).empty;
|
||||
defer buf.deinit(allocator);
|
||||
|
||||
const writer = buf.writer(allocator);
|
||||
try writer.print("Did you mean for '{s}': ", .{input});
|
||||
|
||||
for (suggestions, 0..) |sugg, i| {
|
||||
if (i > 0) {
|
||||
if (i == suggestions.len - 1) {
|
||||
try writer.writeAll(" or ");
|
||||
} else {
|
||||
try writer.writeAll(", ");
|
||||
}
|
||||
}
|
||||
try writer.print("'{s}'", .{sugg});
|
||||
}
|
||||
|
||||
try writer.writeAll("?\n");
|
||||
|
||||
return buf.toOwnedSlice(allocator);
|
||||
}
|
||||
|
||||
/// Test the suggestion system
|
||||
pub fn testSuggestions() !void {
|
||||
const allocator = std.testing.allocator;
|
||||
|
||||
// Test Levenshtein distance
|
||||
const dist1 = try levenshteinDistance(allocator, "queue", "quee");
|
||||
std.debug.assert(dist1 == 1);
|
||||
|
||||
const dist2 = try levenshteinDistance(allocator, "status", "statis");
|
||||
std.debug.assert(dist2 == 1);
|
||||
|
||||
// Test suggestions
|
||||
const candidates = [_][]const u8{ "queue", "query", "quiet", "quit" };
|
||||
const suggestions = try findSuggestions(allocator, "quee", &candidates, 2, 3);
|
||||
defer {
|
||||
for (suggestions) |s| allocator.free(s);
|
||||
allocator.free(suggestions);
|
||||
}
|
||||
std.debug.assert(suggestions.len > 0);
|
||||
std.debug.assert(std.mem.eql(u8, suggestions[0], "queue"));
|
||||
|
||||
std.debug.print("Suggestion tests passed!\n", .{});
|
||||
}
|
||||
94
internal/middleware/privacy.go
Normal file
94
internal/middleware/privacy.go
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
// Package middleware provides privacy enforcement for experiment access control.
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/auth"
|
||||
)
|
||||
|
||||
// PrivacyLevel defines experiment visibility levels.
|
||||
type PrivacyLevel string
|
||||
|
||||
const (
|
||||
// PrivacyPrivate restricts access to owner only.
|
||||
PrivacyPrivate PrivacyLevel = "private"
|
||||
// PrivacyTeam allows team members to view.
|
||||
PrivacyTeam PrivacyLevel = "team"
|
||||
// PrivacyPublic allows all authenticated users.
|
||||
PrivacyPublic PrivacyLevel = "public"
|
||||
// PrivacyAnonymized allows access with PII stripped.
|
||||
PrivacyAnonymized PrivacyLevel = "anonymized"
|
||||
)
|
||||
|
||||
// PrivacyEnforcer handles privacy access control.
|
||||
type PrivacyEnforcer struct {
|
||||
enforceTeams bool
|
||||
auditAccess bool
|
||||
}
|
||||
|
||||
// NewPrivacyEnforcer creates a privacy enforcer.
|
||||
func NewPrivacyEnforcer(enforceTeams, auditAccess bool) *PrivacyEnforcer {
|
||||
return &PrivacyEnforcer{
|
||||
enforceTeams: enforceTeams,
|
||||
auditAccess: auditAccess,
|
||||
}
|
||||
}
|
||||
|
||||
// CanAccess checks if a user can access an experiment.
|
||||
func (pe *PrivacyEnforcer) CanAccess(
|
||||
ctx context.Context,
|
||||
user *auth.User,
|
||||
experimentOwner string,
|
||||
level string,
|
||||
team string,
|
||||
) (bool, error) {
|
||||
privacyLevel := GetPrivacyLevelFromString(level)
|
||||
switch privacyLevel {
|
||||
case PrivacyPublic:
|
||||
return true, nil
|
||||
case PrivacyPrivate:
|
||||
return user.Name == experimentOwner || user.Admin, nil
|
||||
case PrivacyTeam:
|
||||
if user.Name == experimentOwner || user.Admin {
|
||||
return true, nil
|
||||
}
|
||||
if !pe.enforceTeams {
|
||||
return true, nil // Teams not enforced, allow access
|
||||
}
|
||||
// Check if user is in same team
|
||||
return pe.isUserInTeam(ctx, user, team)
|
||||
case PrivacyAnonymized:
|
||||
// Anonymized data is accessible but with PII stripped
|
||||
return true, nil
|
||||
default:
|
||||
return false, fmt.Errorf("unknown privacy level: %s", privacyLevel)
|
||||
}
|
||||
}
|
||||
|
||||
func (pe *PrivacyEnforcer) isUserInTeam(ctx context.Context, user *auth.User, team string) (bool, error) {
|
||||
// TODO: Implement team membership check
|
||||
// This could query a teams database or use JWT claims
|
||||
// For now, deny access if teams enforcement is on but no check implemented
|
||||
_ = ctx
|
||||
_ = user
|
||||
_ = team
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// GetPrivacyLevelFromString converts string to PrivacyLevel.
|
||||
func GetPrivacyLevelFromString(level string) PrivacyLevel {
|
||||
switch level {
|
||||
case "private":
|
||||
return PrivacyPrivate
|
||||
case "team":
|
||||
return PrivacyTeam
|
||||
case "public":
|
||||
return PrivacyPublic
|
||||
case "anonymized":
|
||||
return PrivacyAnonymized
|
||||
default:
|
||||
return PrivacyPrivate // Default to private for safety
|
||||
}
|
||||
}
|
||||
55
internal/privacy/pii.go
Normal file
55
internal/privacy/pii.go
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
// Package privacy provides PII detection for narratives and annotations.
|
||||
package privacy
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
)
|
||||
|
||||
// piiPatterns contains regex patterns for detecting PII.
|
||||
var piiPatterns = map[string]*regexp.Regexp{
|
||||
"email": regexp.MustCompile(`\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b`),
|
||||
"ssn": regexp.MustCompile(`\b\d{3}-\d{2}-\d{4}\b`),
|
||||
"phone": regexp.MustCompile(`\b\d{3}-\d{3}-\d{4}\b`),
|
||||
"credit_card": regexp.MustCompile(`\b(?:\d[ -]*?){13,16}\b`),
|
||||
"ip_address": regexp.MustCompile(`\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b`),
|
||||
}
|
||||
|
||||
// PIIFinding represents a detected PII instance.
|
||||
type PIIFinding struct {
|
||||
Type string `json:"type"`
|
||||
Position int `json:"position"`
|
||||
Length int `json:"length"`
|
||||
Sample string `json:"sample"` // Redacted sample
|
||||
}
|
||||
|
||||
// DetectPII scans text for potential PII.
|
||||
func DetectPII(text string) []PIIFinding {
|
||||
var findings []PIIFinding
|
||||
|
||||
for piiType, pattern := range piiPatterns {
|
||||
matches := pattern.FindAllStringIndex(text, -1)
|
||||
for _, match := range matches {
|
||||
findings = append(findings, PIIFinding{
|
||||
Type: piiType,
|
||||
Position: match[0],
|
||||
Length: match[1] - match[0],
|
||||
Sample: RedactSample(text[match[0]:match[1]]),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return findings
|
||||
}
|
||||
|
||||
// HasPII returns true if text contains PII.
|
||||
func HasPII(text string) bool {
|
||||
return len(DetectPII(text)) > 0
|
||||
}
|
||||
|
||||
// RedactSample creates a safe sample for reporting.
|
||||
func RedactSample(match string) string {
|
||||
if len(match) <= 4 {
|
||||
return "[PII]"
|
||||
}
|
||||
return match[:2] + "..." + match[len(match)-2:]
|
||||
}
|
||||
Loading…
Reference in a new issue