From 9b2d5986a311b45653ef63cc5de75b0c2f8d7ffa Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Thu, 26 Feb 2026 12:04:33 -0500 Subject: [PATCH] docs(architecture): add technical documentation for scheduler and security Add comprehensive architecture documentation: - scheduler-architecture.md - Design of distributed job scheduler - Hub coordination model - Gang scheduling algorithm - Service discovery mechanisms - Failure recovery strategies - multi-tenant-security.md - Security isolation patterns - Tenant boundary enforcement - Resource quota management - Cross-tenant data protection - runtime-security.md - Operational security guidelines - Container security configurations - Network policy enforcement - Audit logging requirements --- docs/src/multi-tenant-security.md | 224 +++++++++++++++++++++++++++++ docs/src/runtime-security.md | 142 ++++++++++++++++++ docs/src/scheduler-architecture.md | 219 ++++++++++++++++++++++++++++ 3 files changed, 585 insertions(+) create mode 100644 docs/src/multi-tenant-security.md create mode 100644 docs/src/runtime-security.md create mode 100644 docs/src/scheduler-architecture.md diff --git a/docs/src/multi-tenant-security.md b/docs/src/multi-tenant-security.md new file mode 100644 index 0000000..e816c4f --- /dev/null +++ b/docs/src/multi-tenant-security.md @@ -0,0 +1,224 @@ +# Phase 10 Multi-Tenant Security Implementation Summary + +## Overview + +This document summarizes the Phase 10 Multi-Tenant Server Security features implemented for tenant isolation, cross-tenant access prevention, resource quotas, worker sanitization, and per-tenant audit logging. 
+ +--- + +## Phase 10.1: Tenant Isolation + +### Tenant Manager (`internal/worker/tenant/manager.go`) + +**Core Types:** +- `Tenant` - Represents an isolated tenant with metadata, config, and lifecycle state +- `TenantConfig` - Holds tenant-specific configuration including quotas and security policies +- `IsolationLevel` - Defines isolation degree: `soft`, `hard`, or `dedicated` + +**Key Methods:** +- `CreateTenant()` - Creates isolated tenant workspace with subdirectories (artifacts, snapshots, logs, cache) +- `GetTenant()` - Retrieves active tenant by ID +- `DeactivateTenant()` - Soft-delete tenant +- `GetTenantWorkspace()` - Returns isolated workspace path for tenant +- `ListTenants()` - Returns all active tenants + +**Workspace Isolation:** +``` +/tenants/ + ├── {tenant-id}/ + │ ├── artifacts/ + │ ├── snapshots/ + │ ├── logs/ + │ └── cache/ +``` + +**Security Defaults (`DefaultTenantConfig`):** +- IsolationLevel: `hard` (container-level) +- RequireEncryption: true +- RequireAuditLogging: true +- RequireSandbox: true +- NetworkPolicy: "restricted" + +--- + +## Phase 10.2: Cross-Tenant Access Prevention + +### Middleware (`internal/worker/tenant/middleware.go`) + +**HTTP Middleware:** +- `Middleware.Handler()` - Validates tenant ID from headers/query params/context +- `ExtractTenantID()` - Extracts tenant ID from request (header: `X-Tenant-ID`, query param, or context) +- Automatic audit logging of all tenant requests + +**Resource Access Control:** +- `ResourceAccessChecker` - Validates cross-tenant resource access +- `CheckAccess()` - Denies all cross-tenant access by default +- `CheckResourceOwnership()` - Validates resource belongs to requesting tenant +- `ValidateResourcePath()` - Ensures path within tenant workspace + +**Cross-Tenant Denial:** +```go +// All cross-tenant access denied by default +if requestingTenantID != resourceTenantID { + return fmt.Errorf("cross-tenant access denied") +} +``` + +--- + +## Phase 10.3: Resource Quotas per Tenant + +### 
Quota Manager (`internal/worker/tenant/quota.go`) + +**ResourceQuota Structure:** +- MaxConcurrentJobs - Job concurrency limit +- MaxGPUs - GPU allocation limit +- MaxMemoryGB - Memory usage limit +- MaxStorageGB - Storage quota +- MaxCPUCores - CPU core limit +- MaxRuntimeHours - Maximum job runtime +- MaxArtifactsPerHour - Artifact creation rate limit + +**QuotaManager Features:** +- `CheckQuota()` - Validates resource request against tenant limits +- `Allocate()` - Reserves resources for tenant +- `Release()` - Frees resources when done +- `RecordArtifact()` - Tracks artifact creation rate +- Automatic hourly counter reset + +**Default Quotas:** +```go +MaxConcurrentJobs: 5 +MaxGPUs: 1 +MaxMemoryGB: 32 +MaxStorageGB: 100 +MaxCPUCores: 8 +MaxRuntimeHours: 24 +MaxArtifactsPerHour: 10 +``` + +--- + +## Phase 10.4: Worker Sanitization Between Tenants + +### Sanitization (`internal/worker/tenant/manager.go`) + +**SanitizeForTenant():** +- Clears tenant-specific caches +- Logs tenant transition for audit +- Prepares worker environment for different tenant + +**Called When:** +- Worker switches between tenant tasks +- New tenant session begins + +**Audit Event:** `AuditWorkerSanitized` + +--- + +## Phase 10.5: Per-Tenant Audit Logging + +### Audit Logger (`internal/worker/tenant/quota.go`) + +**AuditEvent Types:** +- `AuditTenantCreated` - Tenant provisioned +- `AuditTenantDeactivated` - Tenant deactivated +- `AuditTenantUpdated` - Configuration changed +- `AuditResourceAccess` - Resource accessed +- `AuditResourceCreated` - Resource created +- `AuditResourceDeleted` - Resource deleted +- `AuditJobSubmitted` - Job queued +- `AuditJobCompleted` - Job finished +- `AuditJobFailed` - Job error +- `AuditCrossTenantDeny` - Cross-tenant blocked +- `AuditQuotaExceeded` - Quota violation +- `AuditWorkerSanitized` - Worker cleaned +- `AuditEncryptionOp` - Encryption operation +- `AuditDecryptionOp` - Decryption operation + +**Audit Log Structure:** +``` +/tenants/ + └── 
{tenant-id}/ + └── audit.log (JSON format) +``` + +**Features:** +- Per-tenant isolated log files +- Structured JSON format +- IP address tracking +- Success/failure status +- Detailed context in `Details` field + +--- + +## Files Created + +### Phase 10 Core Implementation +1. `internal/worker/tenant/manager.go` - Tenant lifecycle and isolation +2. `internal/worker/tenant/quota.go` - Resource quotas and audit logging +3. `internal/worker/tenant/middleware.go` - HTTP middleware and access control + +### Worker Integration +4. `internal/worker/worker.go` - Added `TenantManager` field to Worker struct + +--- + +## Testing + +Build verification: +```bash +make dev # Successful +``` + +All Go packages compile on: +- macOS (Darwin) +- Linux +- Windows + +--- + +## Security Impact + +| Feature | Threat Mitigated | Implementation | +|---------|------------------|----------------| +| Tenant Isolation | Data leakage between tenants | Hard isolation with dedicated workspaces | +| Cross-Tenant Access | Unauthorized data access | Deny-by-default with audit logging | +| Resource Quotas | Resource exhaustion / DoS | Per-tenant limits with enforcement | +| Worker Sanitization | Cross-contamination | State clearing between tenant switches | +| Per-Tenant Audit | Compliance gaps | Isolated audit logs per tenant | + +--- + +## HIPAA Compliance + +All Phase 10 features support HIPAA compliance: +- Tenant isolation ensures data separation +- Cross-tenant access prevention blocks unauthorized access +- Per-tenant audit logs enable compliance tracking +- Resource quotas prevent resource-based DoS + +--- + +## Integration Points + +**Worker Usage:** +```go +// Initialize tenant manager +w.TenantManager, _ = tenant.NewManager("/tenants", w.Logger) + +// Create tenant +tenant, _ := w.TenantManager.CreateTenant(ctx, "tenant-1", "Acme Corp", config) + +// Validate resource access +err := w.TenantManager.ValidateTenantAccess(ctx, requestingTenant, resourceTenant) + +// Sanitize between 
tenants
+w.TenantManager.SanitizeForTenant(ctx, newTenantID)
+```
+
+**HTTP Middleware Usage:**
+```go
+middleware := tenant.NewMiddleware(tenantManager, logger)
+http.Handle("/api/", middleware.Handler(apiHandler))
+```
diff --git a/docs/src/runtime-security.md b/docs/src/runtime-security.md
new file mode 100644
index 0000000..72d2eae
--- /dev/null
+++ b/docs/src/runtime-security.md
@@ -0,0 +1,142 @@
+# Phase 9 Runtime Security Implementation Summary
+
+## Overview
+
+This document summarizes the Phase 9 Runtime Security features implemented for worker process isolation, network micro-segmentation, and hardened seccomp profiles.
+
+## Phase 9.2: Worker Process Isolation
+
+### Configuration Fields (internal/worker/config.go)
+
+Added to `SandboxConfig`:
+- `MaxProcesses` - Maximum number of processes (fork bomb protection)
+- `MaxOpenFiles` - Maximum open file descriptors per task
+- `DisableSwap` - Whether to disable swap via mlockall
+- `OOMScoreAdj` - OOM killer priority adjustment
+- `TaskUID` - Task user ID for privilege separation
+- `TaskGID` - Task group ID for privilege separation
+
+### Security Defaults (SecurityDefaults)
+
+```go
+MaxProcesses: 100 // Fork bomb protection
+MaxOpenFiles: 1024 // FD limit
+DisableSwap: true // Swap disabled by default
+OOMScoreAdj: 100 // Positive value: task is OOM-killed first, protecting the worker
+TaskUID: 1000 // Non-privileged UID
+TaskGID: 1000 // Non-privileged GID
+```
+
+### Process Isolation Module (internal/worker/process/)
+
+**isolation.go** - Core isolation logic:
+- `ApplyIsolation()` - Applies all resource limits
+- `IsolationConfig` struct for configuration
+- `IsolatedExec()` - Helper for running commands with isolation
+- `GetCurrentLimits()` - Diagnostic function
+
+**isolation_unix.go** - Unix/Linux-specific:
+- `applyResourceLimits()` - Sets RLIMIT_NPROC and RLIMIT_NOFILE
+- `disableSwap()` - Uses mlockall(MCL_CURRENT|MCL_FUTURE)
+- `setOOMScoreAdj()` - Writes to /proc/self/oom_score_adj
+
+**isolation_windows.go** - Windows stubs:
+- 
Graceful degradation with no-op implementations +- Platform-specific error messages + +### Container Integration (internal/container/podman.go) + +Updated `PodmanSecurityConfig` with process isolation fields. + +Updated `BuildSecurityArgs()` to add: +- `--pids-limit` for fork bomb protection +- `--ulimit nofile` for FD limits +- `--oom-score-adj` for OOM priority +- `--memory-swap=0` to disable swap + +### Container Executor (internal/worker/executor/container.go) + +Updated `SandboxConfig` interface with process isolation getters. +Updated security config conversion to pass process isolation fields. + +## Phase 9.3: Network Micro-Segmentation + +### Network Policy Module (internal/worker/process/) + +**network_policy.go** (Linux): +- `NetworkPolicy` struct for network rules +- `DefaultNetworkPolicy()` - Blocks all by default +- `HIPAACompliantPolicy()` - Restricted allowlist mode +- `ApplyNetworkPolicy()` - Adds podman network arguments +- `SetupExternalFirewall()` - iptables/nsenter integration + +**network_policy_windows.go** (Windows): +- Windows stub implementations +- Validates network mode restrictions + +## Phase 9.6: Seccomp Hardened Profile + +### Seccomp Profile (configs/seccomp/default-hardened.json) + +Already exists with hardened default syscalls. + +### Integration (internal/container/podman.go) + +`BuildSecurityArgs()` already applies seccomp profiles: +```go +if sandbox.SeccompProfile != "" && sandbox.SeccompProfile != "unconfined" { + profilePath := GetSeccompProfilePath(sandbox.SeccompProfile) + if profilePath != "" { + args = append(args, "--security-opt", fmt.Sprintf("seccomp=%s", profilePath)) + } +} +``` + +## Files Modified + +### Phase 9.2 Process Isolation +1. `internal/worker/config.go` - Added config fields and getter methods +2. `internal/worker/process/isolation.go` - Core isolation logic +3. `internal/worker/process/isolation_unix.go` - Unix-specific syscalls +4. `internal/worker/process/isolation_windows.go` - Windows stubs +5. 
`internal/container/podman.go` - PodmanSecurityConfig and BuildSecurityArgs
+6. `internal/worker/executor/container.go` - SandboxConfig interface and integration
+
+### Phase 9.3 Network Segmentation
+7. `internal/worker/process/network_policy.go` - Linux network policy
+8. `internal/worker/process/network_policy_windows.go` - Windows stub
+
+### Phase 9.6 Seccomp
+- Used existing `configs/seccomp/default-hardened.json`
+- Already integrated via existing `GetSeccompProfilePath()`
+
+## Testing
+
+Build verification:
+```bash
+make dev # Successful
+```
+
+All Go packages compile on:
+- macOS (Darwin)
+- Linux
+- Windows
+
+## Security Impact
+
+| Feature | Threat Mitigated | Default Value |
+|---------|------------------|---------------|
+| MaxProcesses | Fork bombs | 100 processes |
+| MaxOpenFiles | FD exhaustion | 1024 FDs |
+| DisableSwap | Memory swapping | Enabled |
+| OOMScoreAdj | OOM-kill of critical worker processes | 100 (task killed first) |
+| NetworkMode | Data exfiltration | "none" |
+| Seccomp | Kernel attack surface | Hardened profile |
+
+## HIPAA Compliance
+
+All Phase 9 features support HIPAA compliance mode:
+- Network mode "none" enforced
+- Seccomp profile required
+- Process isolation enforced by default
+- Resource limits prevent DoS
diff --git a/docs/src/scheduler-architecture.md b/docs/src/scheduler-architecture.md
new file mode 100644
index 0000000..d6532d8
--- /dev/null
+++ b/docs/src/scheduler-architecture.md
@@ -0,0 +1,219 @@
+# Scheduler Architecture
+
+The FetchML Scheduler manages distributed job scheduling across workers via WebSocket connections. 
+ +## Overview + +The scheduler consists of: +- **SchedulerHub**: Core scheduling engine (`internal/scheduler/hub.go`) +- **PriorityQueue**: Heap-based job queues for batch and service jobs +- **WorkerConn**: WebSocket connection handling per worker +- **StateStore**: Persistent state for crash recovery +- **ServiceManager**: Long-running service lifecycle management + +## Key Components + +### SchedulerHub + +```go +type SchedulerHub struct { + workers map[string]*WorkerConn // Active worker connections + readyWorkers map[string]*WorkerConn // Workers ready for jobs + batchQueue *PriorityQueue // Batch job queue + serviceQueue *PriorityQueue // Service job queue + reservations map[string]*Reservation // Job reservations + multiNodePending map[string]*MultiNodeJob // Multi-node gang allocations + pendingAcceptance map[string]*JobAssignment // Jobs awaiting acceptance + state *StateStore // Persistent state +} +``` + +### Job Types + +| Type | Description | Scheduling | +|------|-------------|------------| +| **Batch** | Finite training jobs | FIFO with priority aging | +| **Service** | Long-running inference | Dedicated slots, health checks | +| **Multi-node** | Distributed training | Gang allocation across workers | + +## Protocol + +### Unified WSS Protocol + +All communication uses a single WebSocket Secure (WSS) endpoint: +- Workers connect to `wss://scheduler:port/ws/worker` +- Metrics clients connect with `metrics-` prefixed token + +### Message Types + +```go +const ( + // Worker → Scheduler + MsgRegister = "register" + MsgHeartbeat = "heartbeat" + MsgReadyForWork = "ready_for_work" + MsgJobAccepted = "job_accepted" + MsgJobResult = "job_result" + MsgServiceHealth = "service_health" + MsgMetricsRequest = "metrics_request" // Metrics over WSS + + // Scheduler → Worker + MsgJobAssign = "job_assign" + MsgNoWork = "no_work" + MsgJobCancel = "job_cancel" + MsgPrewarmHint = "prewarm_hint" + MsgAck = "ack" + MsgMetricsResponse = "metrics_response" // Metrics over 
WSS +) +``` + +### Metrics Over WSS + +Metrics are retrieved via WSS using a special client token: + +```go +// Connect with metrics token +conn, err := scheduler.DialWSS("scheduler:8443", "ca.crt", "metrics-scraper-1") + +// Request metrics +conn.WriteJSON(scheduler.Message{ + Type: scheduler.MsgMetricsRequest, +}) + +// Receive metrics +var msg scheduler.Message +conn.ReadJSON(&msg) +// msg.Type == MsgMetricsResponse +// msg.Payload contains metrics map +``` + +**Metrics payload:** +```json +{ + "workers_connected": 5, + "queue_depth_batch": 12, + "queue_depth_service": 3, + "jobs_completed": 142, + "jobs_failed": 2, + "jobs_cancelled": 0, + "worker_slots": { + "worker-1": {"batch_total": 4, "batch_in_use": 2, ...} + } +} +``` + +## Features + +### Priority Aging + +Prevents starvation by increasing priority of long-waiting jobs: +```go +effective_priority = base_priority + (wait_time * aging_rate) +``` + +### Gang Allocation + +Multi-node jobs are allocated atomically across workers: +1. Job submitted with `NodeCount > 1` +2. Scheduler waits for required workers +3. All nodes assigned simultaneously +4. 
Timeout handling for partial allocations + +### Starvation Prevention + +Tracks job wait times and triggers priority boosts: +```go +if wait_time > starvation_threshold { + effective_priority += boost_amount +} +``` + +### Worker Mode Switching + +Workers can switch between batch and service modes: +- Batch mode: processes training jobs +- Service mode: runs long-lived inference services + +## Testing + +### Test Infrastructure + +All tests use shared fixtures in `tests/fixtures/`: +- `SchedulerTestFixture`: Common setup/teardown +- `MockWorker`: Simulated worker connections + +### Test Categories + +| Category | Count | Files | +|----------|-------|-------| +| Unit | 17+ | `tests/unit/scheduler/` | +| Integration | 6 | `tests/integration/scheduler/` | +| E2E | 6 | `tests/e2e/scheduler/` | + +### Running Tests + +```bash +make test # All tests +make test-unit # Unit tests only +make test-integration # Integration tests only +go test ./tests/e2e/... # E2E tests +``` + +## State Persistence + +The scheduler persists state for crash recovery: +- Job queue state +- Task assignments +- Worker registrations +- Lease timestamps + +State is replayed on startup via `StateStore.Replay()`. 
+ +## API Methods + +```go +// SubmitJob submits a job to the scheduler +func (h *SchedulerHub) SubmitJob(spec JobSpec) error + +// GetTask retrieves a task by ID +func (h *SchedulerHub) GetTask(taskID string) *Task + +// Addr returns the scheduler's listen address +func (h *SchedulerHub) Addr() string + +// Start begins the scheduler +func (h *SchedulerHub) Start() error + +// Stop shuts down the scheduler +func (h *SchedulerHub) Stop() +``` + +## Configuration + +```go +type HubConfig struct { + BindAddr string // Listen address + CertFile string // TLS certificate + KeyFile string // TLS key + StateDir string // State persistence dir + DefaultBatchSlots int // Default batch slots per worker + DefaultServiceSlots int // Default service slots per worker + StarvationThresholdMins float64 // Starvation detection threshold + PriorityAgingRate float64 // Priority increase rate + GangAllocTimeoutSecs int // Multi-node allocation timeout + AcceptanceTimeoutSecs int // Job acceptance timeout + WorkerTokens map[string]string // Authentication tokens +} +``` + +## Cross-Platform Support + +Process management is abstracted for Unix/Windows: +- `service_manager_unix.go`: POSIX process groups +- `service_manager_windows.go`: Windows job objects + +## See Also + +- `internal/scheduler/hub.go` - Core implementation +- `tests/fixtures/scheduler_fixture.go` - Test infrastructure +- `docs/src/native-libraries.md` - Native C++ performance libraries