fetch_ml/internal/storage/schema_sqlite.sql
Jeremie Fraeys fbcf4d38e5
feat(storage): add groups, tasks, tokens, and audit database schemas
Add comprehensive database storage layer for new features:

- db_groups.go: Lab group management with members, roles (admin/member/viewer),
  and group-based task visibility queries

- db_tasks.go: Task visibility system (private/lab/institution/open),
  task sharing with expiry, public clone tokens, and optimized
  ListTasksForUser() for access control

- db_tokens.go: Secure token management for public task access and cloning,
  with SHA-256 hashed token storage and automatic cleanup

- db_audit.go: Audit log persistence with checkpoint chains, tamper
  detection, and log rotation support

- schema_sqlite.sql: Updated schema with:
  - groups, group_members tables
  - tasks.visibility enum, task_shares with expiry
  - access_tokens table with hashed tokens
  - audit_logs, audit_checkpoints tables
  - indexes for all foreign keys and query patterns

- db_experiments.go: Add CascadeVisibilityToTasks() for propagating
  visibility changes from experiments to associated tasks
2026-03-08 12:48:42 -04:00

257 lines
9.6 KiB
SQL

-- Fetch ML job persistence schema (SQLite).
-- Redis is used for task queuing; this database complements it by persisting jobs.

-- jobs: one row per job, keyed by a text id.
-- status defaults to 'pending' and visibility to 'lab'; neither column is
-- constrained to a fixed value set at the schema level.
CREATE TABLE IF NOT EXISTS jobs (
    id            TEXT     PRIMARY KEY,
    job_name      TEXT     NOT NULL,
    args          TEXT,
    status        TEXT     NOT NULL DEFAULT 'pending',
    priority      INTEGER  DEFAULT 0,
    created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
    started_at    DATETIME,
    ended_at      DATETIME,
    worker_id     TEXT,
    user_id       TEXT,
    error         TEXT,
    datasets      TEXT,                               -- JSON array
    metadata      TEXT,                               -- JSON object
    updated_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
    visibility    TEXT     NOT NULL DEFAULT 'lab',
    experiment_id TEXT     REFERENCES experiments(id) -- optional link to an experiment
);
-- job_metrics: named metric values recorded against a job over time.
-- Rows are declared to cascade-delete with their job; in SQLite that only takes
-- effect when foreign key enforcement is enabled on the connection.
-- NOTE(review): the primary key includes `timestamp`, and the CURRENT_TIMESTAMP
-- default has one-second resolution, so two samples of the same metric for the
-- same job in the same second would conflict unless the writer supplies its own
-- timestamp — confirm against the insert path.
CREATE TABLE IF NOT EXISTS job_metrics (
    job_id       TEXT,
    metric_name  TEXT,
    metric_value TEXT,
    timestamp    DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (job_id, metric_name, timestamp),
    FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE
);
-- workers: registry of worker processes with their last heartbeat and job slots.
-- A new row defaults to status 'active', zero current jobs and one job slot.
CREATE TABLE IF NOT EXISTS workers (
    id             TEXT     PRIMARY KEY,
    hostname       TEXT,
    last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP,
    status         TEXT     DEFAULT 'active',
    current_jobs   INTEGER  DEFAULT 0,
    max_jobs       INTEGER  DEFAULT 1,
    metadata       TEXT                                -- JSON object
);
-- system_metrics: named metric values keyed by (metric_name, timestamp).
-- NOTE(review): with the one-second resolution of the CURRENT_TIMESTAMP default,
-- two samples of the same metric within one second would conflict on the key
-- unless the writer supplies its own timestamp — confirm against the insert path.
CREATE TABLE IF NOT EXISTS system_metrics (
    metric_name  TEXT,
    metric_value TEXT,
    timestamp    DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (metric_name, timestamp)
);
-- experiments: top-level experiment records; status defaults to 'pending'.
-- Other tables in this schema reference experiments(id).
CREATE TABLE IF NOT EXISTS experiments (
    id           TEXT     PRIMARY KEY,
    name         TEXT     NOT NULL,
    description  TEXT,
    status       TEXT     DEFAULT 'pending',
    created_at   DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at   DATETIME DEFAULT CURRENT_TIMESTAMP,
    user_id      TEXT,
    workspace_id TEXT
);
-- experiment_environments: at most one environment record per experiment
-- (experiment_id is the primary key), declared to cascade-delete with it.
CREATE TABLE IF NOT EXISTS experiment_environments (
    experiment_id     TEXT     PRIMARY KEY,
    python_version    TEXT,
    cuda_version      TEXT,
    system_os         TEXT,
    system_arch       TEXT,
    hostname          TEXT,
    requirements_hash TEXT,
    conda_env_hash    TEXT,
    dependencies      TEXT,
    created_at        DATETIME DEFAULT CURRENT_TIMESTAMP,

    FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE
);
-- experiment_git_info: at most one git record per experiment
-- (experiment_id is the primary key), declared to cascade-delete with it.
CREATE TABLE IF NOT EXISTS experiment_git_info (
    experiment_id TEXT     PRIMARY KEY,
    commit_sha    TEXT,
    branch        TEXT,
    remote_url    TEXT,
    is_dirty      INTEGER  DEFAULT 0,   -- integer flag, 0 by default
    diff_patch    TEXT,
    created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,

    FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE
);
-- experiment_seeds: at most one row of integer seeds per experiment
-- (experiment_id is the primary key), declared to cascade-delete with it.
CREATE TABLE IF NOT EXISTS experiment_seeds (
    experiment_id   TEXT     PRIMARY KEY,
    numpy_seed      INTEGER,
    torch_seed      INTEGER,
    tensorflow_seed INTEGER,
    random_seed     INTEGER,
    created_at      DATETIME DEFAULT CURRENT_TIMESTAMP,

    FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE
);
-- datasets: dataset name -> URL registry; the name itself is the primary key.
CREATE TABLE IF NOT EXISTS datasets (
    name       TEXT     PRIMARY KEY,
    url        TEXT     NOT NULL,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
-- Secondary indexes for the core tables, grouped by table.

-- jobs
CREATE INDEX IF NOT EXISTS idx_jobs_status
    ON jobs(status);
CREATE INDEX IF NOT EXISTS idx_jobs_created_at
    ON jobs(created_at);
CREATE INDEX IF NOT EXISTS idx_jobs_worker_id
    ON jobs(worker_id);

-- job_metrics
CREATE INDEX IF NOT EXISTS idx_job_metrics_job_id
    ON job_metrics(job_id);
CREATE INDEX IF NOT EXISTS idx_job_metrics_timestamp
    ON job_metrics(timestamp);

-- workers
CREATE INDEX IF NOT EXISTS idx_workers_heartbeat
    ON workers(last_heartbeat);

-- system_metrics
CREATE INDEX IF NOT EXISTS idx_system_metrics_timestamp
    ON system_metrics(timestamp);

-- experiments
CREATE INDEX IF NOT EXISTS idx_experiments_created_at
    ON experiments(created_at);
CREATE INDEX IF NOT EXISTS idx_experiments_status
    ON experiments(status);
CREATE INDEX IF NOT EXISTS idx_experiments_user_id
    ON experiments(user_id);

-- datasets
-- NOTE(review): datasets.name is already the table's PRIMARY KEY, which SQLite
-- backs with its own unique index, so this index looks redundant — confirm
-- nothing depends on it by name before dropping it.
CREATE INDEX IF NOT EXISTS idx_datasets_name
    ON datasets(name);
-- Timestamp maintenance triggers.
-- update_jobs_timestamp: after a row in jobs is updated, set that row's
-- updated_at to the current time.
-- NOTE(review): the trigger body itself updates jobs; this relies on recursive
-- trigger firing being disabled (SQLite's default) — confirm no connection
-- enables PRAGMA recursive_triggers.
CREATE TRIGGER IF NOT EXISTS update_jobs_timestamp
    AFTER UPDATE ON jobs
    FOR EACH ROW
BEGIN
    UPDATE jobs
       SET updated_at = CURRENT_TIMESTAMP
     WHERE id = NEW.id;
END;
-- update_experiments_timestamp: after a row in experiments is updated, set that
-- row's updated_at to the current time.
-- NOTE(review): the trigger body itself updates experiments; this relies on
-- recursive trigger firing being disabled (SQLite's default) — confirm no
-- connection enables PRAGMA recursive_triggers.
CREATE TRIGGER IF NOT EXISTS update_experiments_timestamp
    AFTER UPDATE ON experiments
    FOR EACH ROW
BEGIN
    UPDATE experiments
       SET updated_at = CURRENT_TIMESTAMP
     WHERE id = NEW.id;
END;
-- update_datasets_timestamp: after a row in datasets is updated, set that row's
-- updated_at to the current time. Rows are matched on name, the table's key.
-- NOTE(review): the trigger body itself updates datasets; this relies on
-- recursive trigger firing being disabled (SQLite's default) — confirm no
-- connection enables PRAGMA recursive_triggers.
CREATE TRIGGER IF NOT EXISTS update_datasets_timestamp
    AFTER UPDATE ON datasets
    FOR EACH ROW
BEGIN
    UPDATE datasets
       SET updated_at = CURRENT_TIMESTAMP
     WHERE name = NEW.name;
END;
-- websocket_metrics: real-time WebSocket metric samples.
-- Each sample gets an auto-incrementing integer id and a numeric (REAL) value.
CREATE TABLE IF NOT EXISTS websocket_metrics (
    id           INTEGER  PRIMARY KEY AUTOINCREMENT,
    metric_name  TEXT     NOT NULL,
    metric_value REAL     NOT NULL,
    user         TEXT,
    recorded_at  DATETIME DEFAULT CURRENT_TIMESTAMP
);

-- Composite index on (metric_name, recorded_at) for per-metric time queries.
CREATE INDEX IF NOT EXISTS idx_websocket_metrics_name_time
    ON websocket_metrics(metric_name, recorded_at);
-- Groups and membership for lab-based task sharing.
-- groups: one row per group; names are unique and the creator is mandatory.
CREATE TABLE IF NOT EXISTS groups (
    id          TEXT     PRIMARY KEY,
    name        TEXT     NOT NULL UNIQUE,
    description TEXT,
    created_at  DATETIME DEFAULT CURRENT_TIMESTAMP,
    created_by  TEXT     NOT NULL
);
-- group_members: membership of a user in a group, one row per (group, user).
-- Memberships are declared to cascade-delete with their group.
CREATE TABLE IF NOT EXISTS group_members (
    group_id  TEXT     NOT NULL,
    user_id   TEXT     NOT NULL,
    role      TEXT     DEFAULT 'member',            -- 'admin', 'member', 'viewer'
    joined_at DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (group_id, user_id),
    FOREIGN KEY (group_id) REFERENCES groups(id) ON DELETE CASCADE
);
-- Seed the built-in 'all-users' system group, used for institution visibility
-- (all authenticated users). OR IGNORE makes the statement a no-op when a
-- conflicting row is already present, so it can be re-run.
INSERT OR IGNORE INTO groups (
    id,
    name,
    description,
    created_by
)
VALUES (
    'all-users',
    'all-users',
    'System group: all authenticated users',
    'system'
);
-- group_invitations: invite-and-accept flow. Group admins invite; the invited
-- user accepts or declines. Invitations are declared to cascade-delete with
-- their group.
CREATE TABLE IF NOT EXISTS group_invitations (
    id              TEXT     PRIMARY KEY,
    group_id        TEXT     NOT NULL,
    invited_user_id TEXT     NOT NULL,
    invited_by      TEXT     NOT NULL,
    status          TEXT     DEFAULT 'pending',          -- 'pending', 'accepted', 'declined'
    created_at      DATETIME DEFAULT CURRENT_TIMESTAMP,
    expires_at      DATETIME,                            -- NULL = 7d default enforced in app layer

    FOREIGN KEY (group_id) REFERENCES groups(id) ON DELETE CASCADE
);
-- Experiment/project grouping: share a whole experiment rather than individual
-- tasks. The experiments table is created earlier in this file; this statement
-- adds the group_id column that links an experiment to the sharing system.
-- NOTE(review): unlike the IF NOT EXISTS statements around it, this ALTER is not
-- re-runnable — SQLite raises a duplicate-column error if group_id already
-- exists. Confirm the schema loader runs it only once or tolerates that error.
ALTER TABLE experiments
    ADD COLUMN group_id TEXT REFERENCES groups(id);
-- experiment_tasks: links tasks (rows in jobs) to experiments, one row per
-- (experiment, task) pair. Links are declared to cascade-delete with either side.
CREATE TABLE IF NOT EXISTS experiment_tasks (
    experiment_id TEXT     NOT NULL,
    task_id       TEXT     NOT NULL,
    added_at      DATETIME DEFAULT CURRENT_TIMESTAMP,

    PRIMARY KEY (experiment_id, task_id),
    FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE,
    FOREIGN KEY (task_id)       REFERENCES jobs(id)        ON DELETE CASCADE
);
-- task_shares: explicit per-user shares of a task, with optional expiry.
-- One row per (task, user); shares are declared to cascade-delete with the task.
CREATE TABLE IF NOT EXISTS task_shares (
    task_id    TEXT     NOT NULL,
    user_id    TEXT     NOT NULL,
    granted_by TEXT     NOT NULL,
    granted_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    expires_at DATETIME,                            -- NULL = no expiry; checked at access time

    PRIMARY KEY (task_id, user_id),
    FOREIGN KEY (task_id) REFERENCES jobs(id) ON DELETE CASCADE
);
-- task_group_access: group-level task association.
-- Stores the group a task was associated with at submit time; it does not
-- store members. Actual membership is always resolved live from group_members.
CREATE TABLE IF NOT EXISTS task_group_access (
    task_id  TEXT NOT NULL,
    group_id TEXT NOT NULL,

    PRIMARY KEY (task_id, group_id),
    FOREIGN KEY (task_id)  REFERENCES jobs(id)   ON DELETE CASCADE,
    FOREIGN KEY (group_id) REFERENCES groups(id) ON DELETE CASCADE
);
-- share_tokens: signed share tokens for unauthenticated open access
-- (paper reproducibility links). A token targets either a task or an experiment
-- and is declared to cascade-delete with its target.
-- NOTE(review): nothing here enforces that exactly one of task_id /
-- experiment_id is set — presumably the application guarantees it; confirm.
-- NOTE(review): confirm whether `token` holds the raw token or a hash of it;
-- the column comment below describes the raw random value.
CREATE TABLE IF NOT EXISTS share_tokens (
    token         TEXT     PRIMARY KEY,                 -- cryptographically random (32 bytes, base64url)
    task_id       TEXT,                                 -- NULL if experiment-level
    experiment_id TEXT,                                 -- NULL if task-level
    created_by    TEXT     NOT NULL,
    created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
    expires_at    DATETIME,                             -- NULL = never expires
    access_count  INTEGER  DEFAULT 0,
    max_accesses  INTEGER,                              -- NULL = unlimited

    FOREIGN KEY (task_id)       REFERENCES jobs(id)        ON DELETE CASCADE,
    FOREIGN KEY (experiment_id) REFERENCES experiments(id) ON DELETE CASCADE
);
-- task_access_log: audit log of task accesses, one auto-numbered row per event.
-- An event carries a user_id or a token depending on how access was obtained.
-- Log rows are declared to cascade-delete with the task they refer to.
CREATE TABLE IF NOT EXISTS task_access_log (
    id          INTEGER  PRIMARY KEY AUTOINCREMENT,
    task_id     TEXT     NOT NULL,
    user_id     TEXT,                                -- NULL for token-based access
    token       TEXT,                                -- NULL for session-based access
    action      TEXT     NOT NULL,                   -- 'view', 'clone', 'execute', 'modify'
    accessed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ip_address  TEXT,

    FOREIGN KEY (task_id) REFERENCES jobs(id) ON DELETE CASCADE
);
-- Indexes for task sharing performance, grouped by table.
-- Every statement is IF NOT EXISTS, so the block can be re-run safely.

-- jobs: visibility filtering, owner lookups, and experiment membership.
CREATE INDEX IF NOT EXISTS idx_jobs_visibility ON jobs(visibility);
CREATE INDEX IF NOT EXISTS idx_jobs_user_id ON jobs(user_id);
CREATE INDEX IF NOT EXISTS idx_jobs_visibility_owner ON jobs(visibility, user_id);
CREATE INDEX IF NOT EXISTS idx_jobs_experiment ON jobs(experiment_id);

-- experiments: group_id references groups(id); index the child key so foreign
-- key checks on a group delete do not scan the whole table.
CREATE INDEX IF NOT EXISTS idx_experiments_group ON experiments(group_id);

-- experiment_tasks: the primary key leads with experiment_id, so lookups and
-- cascading deletes by task_id need their own index.
CREATE INDEX IF NOT EXISTS idx_experiment_tasks_task ON experiment_tasks(task_id);

-- task_shares: shares by recipient, and by expiry time.
CREATE INDEX IF NOT EXISTS idx_task_shares_user ON task_shares(user_id);
CREATE INDEX IF NOT EXISTS idx_task_shares_expires ON task_shares(expires_at);

-- task_group_access: the primary key leads with task_id; this covers group_id.
CREATE INDEX IF NOT EXISTS idx_tga_group ON task_group_access(group_id);

-- group_members: the primary key leads with group_id, so resolving the groups
-- a given user belongs to needs an index on user_id.
CREATE INDEX IF NOT EXISTS idx_group_members_user ON group_members(user_id);

-- share_tokens: both nullable targets are foreign keys with ON DELETE CASCADE.
CREATE INDEX IF NOT EXISTS idx_share_tokens_task ON share_tokens(task_id);
CREATE INDEX IF NOT EXISTS idx_share_tokens_experiment ON share_tokens(experiment_id);

-- task_access_log: by task, by user, and by token. The token index is partial
-- and skips rows where token is NULL.
CREATE INDEX IF NOT EXISTS idx_task_access_task ON task_access_log(task_id);
CREATE INDEX IF NOT EXISTS idx_task_access_user ON task_access_log(user_id);
CREATE INDEX IF NOT EXISTS idx_task_access_token ON task_access_log(token) WHERE token IS NOT NULL;

-- group_invitations: invitations by invited user, and by group (the cascading
-- foreign key to groups).
CREATE INDEX IF NOT EXISTS idx_invitations_user ON group_invitations(invited_user_id);
CREATE INDEX IF NOT EXISTS idx_invitations_group ON group_invitations(group_id);