feat: add new API handlers, build scripts, and ADRs
Some checks failed
Build Pipeline / Build Binaries (push) Failing after 3m39s
Build Pipeline / Build Docker Images (push) Has been skipped
Build Pipeline / Sign HIPAA Config (push) Has been skipped
Build Pipeline / Generate SLSA Provenance (push) Has been skipped
Checkout test / test (push) Successful in 6s
CI Pipeline / Test (ubuntu-latest on self-hosted) (push) Failing after 1s
CI Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI Pipeline / Security Scan (push) Has been skipped
CI Pipeline / Test Scripts (push) Has been skipped
CI Pipeline / Test Native Libraries (push) Has been skipped
CI Pipeline / Native Library Build Matrix (push) Has been skipped
Contract Tests / Spec Drift Detection (push) Failing after 11s
Contract Tests / API Contract Tests (push) Has been skipped
Deploy API Docs / Build API Documentation (push) Failing after 5s
Deploy API Docs / Deploy to GitHub Pages (push) Has been skipped
Documentation / build-and-publish (push) Failing after 40s
Test Matrix / test-native-vs-pure (cgo) (push) Failing after 14s
Test Matrix / test-native-vs-pure (native) (push) Failing after 35s
Test Matrix / test-native-vs-pure (pure) (push) Failing after 18s
CI Pipeline / Trigger Build Workflow (push) Failing after 1s
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Has been cancelled
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Has been cancelled
Build CLI with Embedded SQLite / build-macos (arm64) (push) Has been cancelled
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Has been cancelled
Security Scan / Security Analysis (push) Has been cancelled
Security Scan / Native Library Security (push) Has been cancelled
Verification & Maintenance / V.1 - Schema Drift Detection (push) Has been cancelled
Verification & Maintenance / V.4 - Custom Go Vet Analyzers (push) Has been cancelled
Verification & Maintenance / V.7 - Audit Chain Integrity (push) Has been cancelled
Verification & Maintenance / V.6 - Extended Security Scanning (push) Has been cancelled
Verification & Maintenance / V.10 - OpenSSF Scorecard (push) Has been cancelled
Verification & Maintenance / Verification Summary (push) Has been cancelled

- Introduce audit, plugin, and scheduler API handlers
- Add spec_embed.go for OpenAPI spec embedding
- Create modular build scripts (cli, go, native, cross-platform)
- Add deployment cleanup and health-check utilities
- New ADRs: hot reload, audit store, SSE updates, RBAC, caching, offline mode, KMS regions, tenant offboarding
- Add KMS configuration schema and worker variants
- Include KMS benchmark tests
This commit is contained in:
Jeremie Fraeys 2026-03-04 13:24:27 -05:00
parent 5f53104fcd
commit 7cd86fb88a
No known key found for this signature in database
23 changed files with 2432 additions and 0 deletions

View file

@@ -0,0 +1,50 @@
name: Test Matrix
on:
push:
branches: [main]
pull_request:
env:
GO_VERSION: '1.25.0'
jobs:
test-native-vs-pure:
strategy:
matrix:
build_type: [pure, cgo, native]
runs-on: self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup dependencies
run: |
sudo apt-get update
sudo apt-get install -y build-essential cmake
- name: Build with ${{ matrix.build_type }}
run: |
if [ "${{ matrix.build_type }}" = "native" ]; then
scripts/build/build-native.sh
fi
scripts/build/build-go.sh ${{ matrix.build_type }} linux amd64 cmd/api-server/main.go
- name: Run unit tests
run: |
export FETCHML_NATIVE_LIBS=$([ "${{ matrix.build_type }}" = "native" ] && echo "1" || echo "0")
go test -v ./tests/unit/...
- name: Run integration tests
run: |
export FETCHML_NATIVE_LIBS=$([ "${{ matrix.build_type }}" = "native" ] && echo "1" || echo "0")
go test -v ./tests/integration/...
- name: Run benchmark comparison
if: matrix.build_type == 'native'
run: make benchmark-compare
- name: Upload test results
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.build_type }}
path: test-results/

View file

@@ -0,0 +1,104 @@
# KMS Configuration Schema
# Defines the structure for KMS (Key Management System) configuration
# per ADR-012 through ADR-015.
$schema: http://json-schema.org/draft-07/schema#
type: object
description: KMS configuration for external key management (Vault, AWS KMS, etc.)
properties:
provider:
type: string
enum: [vault, aws, memory]
description: KMS provider type
vault:
type: object
description: HashiCorp Vault configuration
properties:
address:
type: string
format: uri
description: Vault server URL (e.g., https://vault.internal:8200)
auth_method:
type: string
enum: [approle, kubernetes, token]
description: Authentication method
role_id:
type: string
description: AppRole role ID (for approle auth)
secret_id:
type: string
description: AppRole secret ID (for approle auth)
token:
type: string
description: Vault token (for token auth, development only)
transit_mount:
type: string
default: transit
description: Transit engine mount path
key_prefix:
type: string
default: fetchml-tenant
description: Prefix for tenant key names
region:
type: string
description: Region identifier for per-region keys (per ADR-014)
timeout:
type: integer
default: 30
description: HTTP client timeout in seconds
aws:
type: object
description: AWS KMS configuration
properties:
region:
type: string
description: AWS region (e.g., us-east-1)
key_alias_prefix:
type: string
default: alias/fetchml
description: Prefix for KMS key aliases
role_arn:
type: string
description: IAM role ARN to assume (optional)
endpoint:
type: string
format: uri
description: Custom endpoint for testing (e.g., LocalStack)
cache:
type: object
description: DEK cache configuration per ADR-012
properties:
ttl_minutes:
type: integer
default: 15
description: DEK cache TTL in minutes
max_entries:
type: integer
default: 1000
description: Maximum cached DEKs (LRU eviction)
grace_window_minutes:
type: integer
default: 60
description: Extended grace period during KMS unavailability (per ADR-013)
required:
- provider
# Conditional validation
allOf:
- if:
properties:
provider:
const: vault
then:
required: [vault]
- if:
properties:
provider:
const: aws
then:
required: [aws]

View file

@@ -0,0 +1,70 @@
# Staging environment worker configuration
# Pre-production validation with production-like settings
host: localhost
port: 22
user: worker-user
base_path: /var/lib/fetchml
entrypoint: train.py
# Redis configuration
redis_url: redis://redis:6379
# Standard mode for staging (production-like but not strict)
compliance_mode: standard
max_workers: 2
# Sandbox settings (standard isolation)
sandbox:
network_mode: none
seccomp_profile: default
no_new_privileges: true
allowed_secrets:
- HF_TOKEN
- WANDB_API_KEY
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
# GPU configuration
gpu_vendor: nvidia
# Artifact handling (production limits)
max_artifact_files: 1000
max_artifact_total_bytes: 536870912 # 512MB
# Provenance (enabled for audit trail)
provenance_best_effort: true
# MinIO configuration for staging
minio:
endpoint: minio:9000
bucket: fetchml-snapshots-staging
secure: false
# Plugin Configuration
plugins:
# Jupyter Notebook/Lab Service
jupyter:
enabled: true
image: "quay.io/jupyter/base-notebook:latest"
default_port: 8888
mode: "lab"
security:
trusted_channels:
- "conda-forge"
- "defaults"
blocked_packages:
- "requests"
- "urllib3"
require_password: true
max_gpu_per_instance: 1
max_memory_per_instance: "8Gi"
# vLLM Inference Service
vllm:
enabled: true
image: "vllm/vllm-openai:latest"
default_port: 8000
model_cache: "/models"
default_quantization: ""
max_gpu_per_instance: 1
max_model_len: 4096

View file

@@ -0,0 +1,64 @@
# Standard security mode worker configuration
# Normal sandbox, network isolation
host: localhost
port: 22
user: worker-user
base_path: /var/lib/fetchml
entrypoint: train.py
# Redis configuration
redis_url: redis://redis:6379
# Standard mode - normal security
compliance_mode: standard
max_workers: 2
# Sandbox settings (standard isolation)
sandbox:
network_mode: none
seccomp_profile: default
no_new_privileges: true
allowed_secrets:
- HF_TOKEN
- WANDB_API_KEY
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
# GPU configuration
gpu_vendor: none
# Artifact handling (reasonable limits)
max_artifact_files: 1000
max_artifact_total_bytes: 536870912 # 512MB
# Provenance (enabled)
provenance_best_effort: true
# Plugin Configuration
plugins:
# Jupyter Notebook/Lab Service
jupyter:
enabled: true
image: "quay.io/jupyter/base-notebook:latest"
default_port: 8888
mode: "lab"
security:
trusted_channels:
- "conda-forge"
- "defaults"
blocked_packages:
- "requests"
- "urllib3"
require_password: true
max_gpu_per_instance: 1
max_memory_per_instance: "8Gi"
# vLLM Inference Service
vllm:
enabled: true
image: "vllm/vllm-openai:latest"
default_port: 8000
model_cache: "/models"
default_quantization: ""
max_gpu_per_instance: 1
max_model_len: 4096

View file

@@ -0,0 +1,60 @@
# ADR-009: Plugin Configuration Supports Hot-Reload with Per-Plugin Restart Flag
## Status
Accepted
## Context
The plugin system (`internal/tracking/`) manages MLflow, TensorBoard, and Weights & Biases integrations as sidecar processes. The REST API will expose plugin configuration updates via `PUT /v1/plugins/{pluginName}/config`.
When a configuration update is applied, we needed to decide whether the new config takes effect immediately without restarting the plugin process (hot-reload), or whether it requires the worker to restart the plugin sidecar.
Plugins vary significantly in what their configuration touches. Some config fields (e.g., a remote tracking URI) can be applied to a running process safely. Others (e.g., a listening port or a DB connection string) are consumed only at initialization time and cannot be changed without a restart.
## Decision
**Hot-reload is the default behavior.** Plugins that cannot safely reload declare `requiresRestart: true` in their manifest. The plugin host calls `plugin.Reload(newConfig)` on a config update; if the plugin returns an error or declares the restart flag, the host falls back to the previous config and surfaces a `409 Conflict` response (indicating a restart is required) to the caller.
## Consequences
### Positive
- Operators can update tracking URIs, credentials, and feature flags without downtime or job interruption
- Reduces pressure to batch config changes or schedule maintenance windows for minor updates
- The restart flag gives plugin authors a clear, documented escape hatch for initialization-bound resources
- Config rollback on reload failure prevents a bad update from leaving a plugin in a broken state
### Negative
- Plugin authors must implement a correct `Reload()` lifecycle method — leaked goroutines or global state not cleaned up on reload will cause subtle bugs
- The reload contract must be clearly documented and enforced via integration tests; it is easy to accidentally write a plugin that claims to support hot-reload but does not
- Two code paths to maintain in the plugin host (reload vs. restart)
## Options Considered
### Require Restart for All Config Changes
**Pros:**
- Simple and safe — no lifecycle contract required of plugin authors
- No risk of partial state from a failed reload
**Cons:**
- Poor operator experience; a URI change should not require killing running sidecars
- Creates pressure to accumulate config changes and apply them together, which is a riskier pattern
- Kills observability continuity (TensorBoard logs, MLflow runs) unnecessarily for trivial updates
### Hot-Reload All Config Changes Without Escape Hatch
**Pros:**
- Consistent behavior across all plugins
- Simpler API contract
**Cons:**
- Not all plugins can safely reload all fields; forcing hot-reload on init-bound resources will silently fail or corrupt state
- No mechanism for plugin authors to signal that a restart is required
### Hot-Reload with Per-Plugin Restart Flag (Selected)
**Pros:**
- Best operator experience by default
- Plugin authors retain control over what they can safely support
- Clear error signaling when a restart is actually required
**Cons:**
- Slightly more complex host logic and plugin authoring contract
## Rationale
Hot-reload provides the right default experience and covers the majority of real-world config changes (remote URIs, credentials, log levels). The `requiresRestart` flag is a minimal escape hatch that does not compromise the default without requiring plugin authors to do something fundamentally unsafe. The reload contract — teardown, reinit, rollback on error — is well-understood and testable.

View file

@@ -0,0 +1,64 @@
# ADR-008: Audit Queries Served from Separate Append-Only Store
## Status
Accepted
## Context
The audit subsystem (`internal/audit/`) writes tamper-evident, hash-chained log entries to flat files. As part of exposing audit capabilities via the REST API (`/v1/audit/events`), we needed to decide where the query layer reads from:
- The raw audit log files written by the audit subsystem, or
- A separate store populated from those files
Audit queries must support filtering by time range, event type, and user ID, as well as pagination. The audit chain itself must remain tamper-evident and verifiable independently of whatever query mechanism is chosen.
## Decision
Audit queries will be served from a dedicated **append-only store** (a database table). The raw log files remain the authoritative source for hash-chain verification. A lightweight pipeline process tails the log files and inserts new entries into the store, keeping query lag minimal without coupling the query layer to the file layer.
## Consequences
### Positive
- Query performance is fast and predictable at scale — filtering, pagination, and indexed lookups are all native DB operations
- The query layer is decoupled from the file write path, eliminating read/write contention on hot log files
- The store survives log rotation and file archival without data loss for queries
- Compliance export (`ml audit export`) becomes a straightforward SELECT rather than a file parse
### Negative
- Dual-write complexity: the pipeline process must be monitored and must handle backpressure and restarts safely
- A lag window exists between a log entry being written and it being queryable (acceptable for audit use cases; not a real-time feed)
- Additional infrastructure component to operate and back up
- Inconsistency risk if the pipeline falls behind or fails — operators must be alerted on pipeline lag
## Options Considered
### Query Directly from Log Files
**Pros:**
- Single source of truth, zero sync lag
- No additional infrastructure
**Cons:**
- Scanning flat files for filtered queries is slow and does not scale
- Concurrent reads on a hot log file risk contention with the write path
- Pagination is not naturally supported by line-based files
- Log rotation and archival break query continuity
### Write to Store Directly from Audit Subsystem (Dual-Write at Source)
**Pros:**
- Zero lag between write and query availability
- No pipeline process to operate
**Cons:**
- Couples the audit subsystem to a DB dependency, increasing blast radius of DB failures
- The tamper-evident hash chain must remain file-based for verifiability; adding a second write path at the source risks divergence
### Stream via Message Bus (e.g., Kafka/NATS) into Store
**Pros:**
- Decoupled and durable
- Enables future consumers beyond the query store
**Cons:**
- Significant infrastructure overhead for the current scale
- Introduces another system to operate and monitor
- Overkill relative to the query volume expected from an audit endpoint
## Rationale
Log files are an operational safety net and the root of trust for the hash chain, not a query surface. A separate store provides the indexing and filtering capabilities the API requires without compromising the write path or the chain's integrity. The pipeline approach (file tail → DB insert) keeps the audit subsystem itself simple and dependency-free, while giving the query layer everything it needs. The lag introduced by the pipeline is acceptable for audit queries, which are not latency-sensitive.

View file

@@ -0,0 +1,71 @@
# ADR-010: Scheduler Real-Time Updates Use SSE for Status Streams, REST for Control
## Status
Accepted
## Context
The scheduler API (`/v1/scheduler/`) exposes queue depth, worker state, and job transitions. Operators and the CLI (`ml scheduler status`) need to observe changes in near real-time without polling. We needed to choose a transport for this streaming:
- **WebSocket:** Full-duplex, persistent connection, client can send messages mid-stream
- **Server-Sent Events (SSE):** Server-push only, unidirectional, native HTTP
- **Long polling:** Repeated HTTP requests held open until data is available
The use case is observability: showing live queue depth, worker connect/disconnect events, and job state transitions. Control operations (drain a worker, update job priority, cancel a job) are discrete, low-frequency actions.
## Decision
Use **SSE for status and progress streams**. Keep all control operations on **REST endpoints**. Do not use WebSocket.
Two new streaming endpoints will be added:
- `GET /v1/scheduler/status/stream` — emits queue depth changes, worker events, scheduler state transitions
- `GET /v1/scheduler/jobs/{jobId}/stream` — emits state transitions and priority changes for a specific job
## Consequences
### Positive
- SSE is unidirectional server-push, which matches the use case exactly — clients observe, they do not send mid-stream commands
- SSE works over standard HTTP/1.1 and HTTP/2, proxies and load balancers handle it without special configuration
- `EventSource` is a browser-native API; the CLI can consume SSE streams with standard HTTP tooling (`curl -N`)
- No upgrade handshake, no ping/pong, no framing protocol to implement or debug
- Control operations on REST are stateless, independently rate-limitable, and trivially auditable
### Negative
- SSE is unidirectional — if a future requirement emerges for clients to send commands mid-stream (e.g., pause a job from within a live status feed), the transport will need to change
- SSE connections held through HTTP/1.1 proxies with aggressive timeouts may require reconnect logic on the client
- Per-connection server resources are still consumed for each SSE subscriber; high subscriber counts need backpressure consideration
## Options Considered
### WebSocket
**Pros:**
- Full-duplex: client can send commands mid-stream if needed
- Low overhead per message once connection is established
- Well-supported in all clients
**Cons:**
- Upgrade handshake adds complexity; many proxies and load balancers require explicit configuration to support it
- Full-duplex is unnecessary — the scheduler status stream is read-only from the client's perspective
- Mixes control and observation into a single connection, which complicates auditing and access control
- More complex to implement correctly (ping/pong, close handshake, reconnect logic)
### Server-Sent Events (Selected)
**Pros:**
- Matches the server-push, read-only nature of the use case
- Standard HTTP — no proxy configuration required
- Simpler implementation and client consumption
- Native browser support via `EventSource`
**Cons:**
- Unidirectional only
- Slightly higher per-event overhead than WebSocket for high-frequency streams (acceptable at scheduler event rates)
### Long Polling
**Pros:**
- Works everywhere, no streaming support required
**Cons:**
- Higher latency per event
- More server load from repeated connection setup/teardown
- More complex client logic to manage polling intervals and state continuity
## Rationale
The scheduler status stream is fundamentally a server-push feed. SSE is the right tool for server-push over HTTP — it is simpler to implement, simpler to consume, and requires no special infrastructure support. Mixing control operations into the same stream would complicate access control and auditing unnecessarily; keeping control on REST preserves a clean separation between observation and mutation.

View file

@@ -0,0 +1,76 @@
# ADR-011: RBAC Implemented as Permission-Based Roles
## Status
Accepted
## Context
Three new API subsystems (plugins, scheduler, audit) expose operations with meaningfully different sensitivity levels. Read-only status queries are appropriate for regular users; operations like draining a worker, updating job priority, or reading the audit log should be restricted to administrators or operators.
We needed to choose an authorization model that enforces least privilege, is extensible without code changes, and is simple enough to implement correctly in the near term.
## Decision
Implement **permission-based RBAC**. Roles are named bundles of atomic permissions. Ship two built-in roles (`admin` and `user`). The permission set for each role is defined in configuration so operators can create custom roles without code changes. Permissions are enforced at the middleware layer; JWT claims or session tokens carry the resolved permission set so enforcement is stateless.
### Permissions Matrix
| Permission | `admin` | `user` |
|---------------------|:-------:|:------:|
| `plugins:read` | ✓ | ✓ |
| `plugins:write` | ✓ | — |
| `scheduler:read` | ✓ | ✓ |
| `scheduler:write` | ✓ | — |
| `scheduler:drain` | ✓ | — |
| `audit:read` | ✓ | — |
| `audit:verify` | ✓ | — |
| `tasks:read` | ✓ | ✓ |
| `tasks:write` | ✓ | ✓ |
| `tasks:priority` | ✓ | — |
## Consequences
### Positive
- Atomic permissions allow fine-grained custom roles without schema changes (e.g., an `operator` role with `scheduler:write + scheduler:drain` but no `audit:read`)
- Enforcement at the middleware layer is stateless — no per-request DB lookup for permission checks
- Audit log entries can record the specific permission used for each action, not just the role
- Easy to reason about and test — each handler has a declared required permission
### Negative
- More initial setup than flat roles (`admin` / `user` strings)
- Token size increases slightly as the resolved permission set is embedded in claims
- Permission sprawl risk over time if new permissions are added without pruning obsolete ones
- No row- or resource-level restrictions (e.g., "user can only see their own jobs") — this requires ABAC if needed in future
## Options Considered
### Flat Roles (admin / user string check)
**Pros:**
- Trivially simple to implement and reason about
- No configuration required
**Cons:**
- Breaks down immediately when partial delegation is needed (e.g., on-call engineer who can drain workers but cannot read audit logs)
- Adding a third role requires code changes
- Cannot express nuanced operational roles without becoming an ad hoc permission system anyway
### Permission-Based RBAC (Selected)
**Pros:**
- Roles are composable without code changes
- Clean enforcement boundary at middleware
- Extensible to new operations by adding a permission constant
**Cons:**
- Slightly more upfront design work
- Permission set in token must be kept in sync when roles change (requires token reissue or short TTLs)
### Attribute-Based Access Control (ABAC)
**Pros:**
- Maximum flexibility — can express policies like "user can only modify jobs they submitted"
- Handles resource-level restrictions natively
**Cons:**
- Significant implementation complexity; requires a policy engine (e.g., OPA, Casbin)
- Harder to reason about and audit
- Overkill for the current access control requirements
## Rationale
Permission-based RBAC sits at the right point on the complexity curve for this system's current needs. Flat roles are too rigid for real operational workflows (on-call access, read-only auditors, etc.). ABAC would be the right choice if resource-level restrictions were required, but they are not today. The configuration-driven role definition means the system can evolve toward more granular roles over time without touching application code, and the middleware enforcement pattern keeps permission checks consistent and auditable across all three new subsystems.

View file

@@ -0,0 +1,73 @@
# ADR-012: DEK Caching in Memory with Bounded TTL
## Status
Accepted
## Context
The encryption subsystem wraps Data Encryption Keys (DEKs) using a KMS (e.g., AWS KMS). Every encrypt or decrypt operation requires the DEK to be unwrapped by the KMS before use. Without caching, this means a KMS API call on every operation — introducing network latency, increasing KMS costs, and creating a hard dependency on KMS availability for every read and write.
We needed to decide whether unwrapped DEKs should be held in memory for reuse, and if so, under what constraints to prevent the cache from becoming a security liability.
## Decision
Unwrapped DEKs are cached **in-process memory only**, subject to the following constraints:
- **TTL:** 15 minutes per entry. After expiry the DEK is evicted and the next operation fetches a fresh unwrapped key from KMS.
- **Max size:** 1,000 entries (LRU eviction). Prevents unbounded memory growth across large tenant populations.
- **Scope:** In-process only. DEKs are never serialized to disk, written to a shared cache (e.g., Redis), or logged.
- **Explicit invalidation:** The cache exposes a `Flush(tenantID)` method called on key rotation events and tenant offboarding.
## Consequences
### Positive
- Eliminates a KMS round-trip on every decrypt for hot keys, reducing p99 latency significantly on read-heavy workloads
- Reduces KMS API call volume and associated cost
- Provides a buffer against transient KMS unavailability for in-flight operations (see ADR-013)
- TTL and size bounds keep the memory footprint predictable
### Negative
- A process memory dump during the TTL window exposes plaintext DEKs — this is the core security tradeoff and must be documented in the threat model
- Cache invalidation on key rotation requires a reliable signal path; a missed flush leaves stale DEKs in use until TTL expiry
- LRU eviction means infrequently accessed tenant keys are silently dropped and re-fetched, which can cause latency spikes for cold tenants
## Options Considered
### No Caching (KMS Call on Every Operation)
**Pros:**
- No plaintext key material held in memory beyond the immediate operation
- Simplest implementation
**Cons:**
- KMS latency (typically 15ms) on every encrypt/decrypt; unacceptable on high-throughput paths
- KMS becomes a hard availability dependency for every operation — any blip causes immediate failures
- KMS API costs scale linearly with operation volume
### Cache with No TTL (Evict Only on Explicit Flush)
**Pros:**
- Maximum cache hit rate
- Lowest KMS call volume
**Cons:**
- Plaintext DEKs held in memory indefinitely unless explicitly flushed — a missed rotation event leaves old key material live forever
- Unacceptable security posture for a multi-tenant encryption system
### Cache with Bounded TTL and Size (Selected)
**Pros:**
- Balances performance and security exposure window
- TTL provides automatic backstop against missed invalidation signals
- Size bound prevents memory abuse
**Cons:**
- TTL window represents a residual exposure period that must be accepted and documented
### Shared External Cache (e.g., Redis)
**Pros:**
- Survives process restart
- Shared across multiple instances
**Cons:**
- Serializing plaintext DEKs to an external store dramatically widens the attack surface
- Adds a network hop, partially negating the latency benefit over going directly to KMS
- External cache compromise exposes all cached DEKs across all tenants simultaneously
## Rationale
In-memory DEK caching with a bounded TTL is a well-established pattern used by AWS Encryption SDK, HashiCorp Vault, and Google Tink. The residual risk — memory dump during the TTL window — is the accepted tradeoff for practical performance. It is mitigated by keeping the cache strictly in-process, enforcing a short TTL, and providing an explicit flush path for key lifecycle events. Refusing to cache at all produces a system that is impractical to operate at scale and paradoxically more fragile, since KMS availability becomes a prerequisite for every single operation.

View file

@@ -0,0 +1,68 @@
# ADR-013: KMS Offline Mode Fails Closed with a Narrow Grace Window
## Status
Accepted
## Context
The encryption subsystem depends on KMS availability to unwrap DEKs. KMS services experience occasional transient unavailability (network partitions, regional incidents, rate limiting). We needed a policy for what happens when a KMS call fails:
- **Fail-closed:** Deny the operation. No decryption without KMS reachability or a valid cached DEK.
- **Fail-open:** Allow the operation to proceed using cached material or a fallback mechanism, even beyond normal cache TTL.
This decision directly affects the security guarantee the system provides: if an attacker can cause KMS unavailability, does that unlock data or lock it?
## Decision
The system **fails closed by default**. A narrow grace window is permitted under the following conditions only:
- A valid DEK is already present in the in-process cache (see ADR-012)
- KMS has been confirmed unreachable (not returning errors indicating invalid credentials or revoked access — those are not treated as unavailability)
- The extended grace TTL has not expired (default: 1 hour beyond normal cache TTL)
Outside of the grace window, or when no cached DEK exists, operations that require KMS return a `503 KMS Unavailable` error. No new decryption is permitted without a reachable KMS.
## Consequences
### Positive
- Availability-based attacks (deliberately taking down KMS to force fail-open) do not succeed in exposing data
- The security guarantee — data is inaccessible without KMS — holds through infrastructure incidents
- Transient blips (seconds to low minutes) are absorbed by the existing DEK cache without user impact
- Clear operational signal: `503 KMS Unavailable` is unambiguous and alertable
### Negative
- Extended KMS outages (beyond cache TTL + grace window) cause user-visible failures — this is intentional but must be communicated in SLA documentation
- The grace window logic adds implementation complexity; the distinction between "KMS unreachable" and "KMS refusing due to revoked access" must be handled correctly or the grace window becomes a bypass
- Teams accustomed to fail-open systems will push back on this posture during incidents
## Options Considered
### Fail-Open (Allow Operations Through KMS Unavailability)
**Pros:**
- Maximum availability during infrastructure incidents
- Users are not impacted by KMS outages
**Cons:**
- If KMS unavailability is attacker-induced, fail-open directly enables the attack — data becomes accessible precisely when the key management system is compromised
- "Temporary" fail-open states have a history of becoming permanent through operational pressure
- Undermines the core security model: the KMS is the control plane for data access; bypassing it on failure means it never truly controlled access
### Fail-Closed with No Grace Window
**Pros:**
- Strictest security posture
- Simplest implementation — no grace window logic
**Cons:**
- Any transient KMS blip (even a 200ms network hiccup) causes a user-visible error
- Operations in-flight at the moment of a KMS timeout fail immediately, even though a valid cached DEK exists in memory
### Fail-Closed with Grace Window on Cached DEKs (Selected)
**Pros:**
- Transient unavailability absorbed silently for tenants with warm caches
- Security guarantee maintained — no new decryption without KMS reachability
- Grace window is bounded and expires
**Cons:**
- Additional logic to distinguish unavailability from access denial
- Grace window duration is a judgment call that must be documented and reviewed
## Rationale
The correct question for a fail-open/fail-closed decision is: *what is the worst-case outcome of each posture when the unavailability is not accidental?* Fail-open means an attacker who can disrupt KMS also gets access to encrypted data — the security control collapses exactly when it is most needed. Fail-closed means an attacker who disrupts KMS causes a denial of service, not a data breach. A denial of service is recoverable; a data breach is not. The grace window is a pragmatic concession to operational reality that does not compromise this core principle, because it only applies to key material that was already authorized and cached.

View file

@ -0,0 +1,65 @@
# ADR-014: KMS Keys Are Per-Region with Explicit Cross-Region Replication for DR
## Status
Accepted
## Context
The encryption subsystem manages one KMS key per tenant. As the system scales to multi-region deployments, we needed to decide whether tenant keys should be:
- **Per-region:** Each region holds its own KMS key for each tenant. Data encrypted in a region is decrypted by that region's key.
- **Multi-region:** A single logical key is replicated across regions (e.g., AWS KMS multi-region keys), allowing the same key material to decrypt data regardless of which region the request is served from.
This decision affects blast radius on key compromise, data residency compliance, disaster recovery posture, and operational complexity.
## Decision
Tenant KMS keys are **per-region**. Cross-region key replication is supported as an **explicit opt-in** for tenants with documented cross-region read requirements (e.g., active-active deployments, cross-region DR reads). Replication uses the KMS provider's native multi-region key mechanism (e.g., AWS KMS multi-region keys) rather than manual key export/import.
Keys are not replicated preemptively. Replication is provisioned per-tenant on request, with the tenant's data residency requirements confirmed before a replica is created in any new region.
## Consequences
### Positive
- Blast radius of a key compromise is contained to a single region by default — a compromised key in `us-east-1` does not affect `eu-west-1`
- Per-region keys naturally satisfy data residency requirements (GDPR, data sovereignty) without additional policy configuration
- Key policies and audit trails are simpler to reason about per-region
- Default posture does not require cross-region KMS API calls, keeping latency and cost predictable
### Negative
- Cross-region DR requires explicit replication setup per tenant — recovery from a full regional failure requires the replica to be pre-provisioned, not created on demand during an incident
- Tenants with active-active multi-region deployments require replication, adding operational overhead
- Key inventory management becomes more complex as tenant count and region count grow
## Options Considered
### Multi-Region Keys for All Tenants by Default
**Pros:**
- Simplifies DR — key material available in all regions without per-tenant provisioning
- Enables cross-region reads without tenant-specific configuration
**Cons:**
- Key compromise in any region exposes all regions simultaneously
- Multi-region key policies are harder to reason about and audit
- Data residency compliance becomes harder — key material is by definition present in multiple jurisdictions
- Creates a broader attack surface than the majority of tenants actually require
### Per-Region Keys, No Cross-Region Support
**Pros:**
- Maximum blast radius containment
- Simplest key inventory
**Cons:**
- Blocks legitimate cross-region DR scenarios entirely
- Forces tenants with active-active requirements to work around the system
### Per-Region Keys with Opt-In Cross-Region Replication (Selected)
**Pros:**
- Secure by default, flexible on demand
- Blast radius contained for the majority of tenants who don't need cross-region reads
- Native KMS multi-region key mechanism handles key material sync without manual export
**Cons:**
- Replication must be provisioned ahead of a regional failure, not during it
- Slightly more complex provisioning workflow for multi-region tenants
## Rationale
Per-region keys are the secure default because they limit the consequences of a key policy misconfiguration or credential compromise to a single region. The instinct to use multi-region keys everywhere is driven by DR convenience, but it solves that problem by widening the attack surface for every tenant, including those who will never need cross-region reads. The opt-in replication model preserves the security default while accommodating legitimate operational requirements. Data residency compliance — increasingly a hard requirement for enterprise tenants — also strongly favors per-region as the default posture.

View file

@ -0,0 +1,72 @@
# ADR-015: Tenant Offboarding Disables KMS Key Immediately, Schedules Hard Deletion
## Status
Accepted
## Context
When a tenant is offboarded, the KMS key used to wrap their DEKs must be handled. The options range from immediate hard deletion (maximum data destruction assurance) to indefinite retention (maximum recoverability). We needed a policy that satisfies security requirements, compliance obligations, and operational safety.
Key considerations:
- KMS key deletion is **irreversible** — there is no recovery path once a key is deleted
- Encrypted backups, audit logs, and compliance exports may need to be decrypted after the tenant relationship ends
- Regulatory frameworks commonly require the ability to produce records for a defined retention period post-offboarding
- Mistaken offboardings (billing errors, wrong tenant ID) do occur
## Decision
On tenant offboarding:
1. **Immediately disable the KMS key.** This blocks all new encrypt and decrypt operations instantly. The tenant's data is inaccessible from this point.
2. **Schedule hard deletion** after a retention window of **90 days** (configurable per tenant tier and jurisdiction). The window begins at the moment the key is disabled.
3. During the retention window, key re-enablement requires an explicit approval workflow (not a self-serve action) to prevent casual reversal.
4. Hard deletion is executed automatically at the end of the retention window unless a hold has been placed (e.g., for legal hold or active dispute).
The 90-day default satisfies the minimum pending deletion window enforced by AWS KMS (7 days) with significant margin, and aligns with common contractual data retention obligations.
## Consequences
### Positive
- Immediate disable gives the tenant (and the platform) a strong assurance that data access is revoked the moment offboarding completes
- Retention window provides recovery from mistaken offboardings without permanently weakening the deletion guarantee
- Scheduled hard deletion is auditable and automatable — no manual step required to complete the key lifecycle
- Compliance teams can request decryption of specific records during the retention window for legal hold, audits, or disputes
### Negative
- Data is not immediately and permanently destroyed on offboarding — tenants who require cryptographic erasure on termination must be given a shorter configurable window
- Retention window must be actively managed — holds and overrides need a governance process to prevent accumulation of keys that should have been deleted
- Re-enablement approval workflow adds operational overhead for the rare legitimate reversal case
## Options Considered
### Immediate Hard Deletion
**Pros:**
- Strongest data destruction guarantee — key is gone, data is cryptographically unrecoverable immediately
- Simplest post-offboarding state
**Cons:**
- Irreversible — a mistaken offboarding permanently destroys the tenant's data
- Blocks legitimate post-offboarding decryption for compliance, legal hold, or support purposes
- AWS KMS enforces a minimum 7-day pending deletion window regardless, so true immediate deletion is not possible on that platform
### Revoke Access Only, Retain Key Indefinitely
**Pros:**
- Maximum recoverability
- No risk of accidental permanent loss
**Cons:**
- KMS key inventory grows without bound
- Retaining key material indefinitely after a tenant relationship ends is difficult to justify under data minimization principles (GDPR Article 5)
- Creates ongoing cost and key management overhead for tenants who have no active relationship with the platform
### Disable Immediately, Schedule Deletion with Retention Window (Selected)
**Pros:**
- Immediate revocation of access satisfies security and tenant expectations
- Retention window handles mistaken offboarding, legal hold, and compliance decryption needs
- Hard deletion is guaranteed at end of window unless a hold is active
- Aligns with KMS provider behavior (AWS pending deletion model)
**Cons:**
- Retention window requires governance process for holds and overrides
- Configurable window adds complexity to the offboarding workflow
## Rationale
Immediate hard deletion conflates two separate concerns: revoking access (which should happen immediately) and destroying key material (which benefits from a deliberate delay). Disabling the key achieves the former instantly. The retention window serves the latter without creating unnecessary operational risk. The irreversibility of KMS key deletion demands a margin of safety — the cost of a 90-day delay before permanent destruction is negligible compared to the cost of discovering a mistaken deletion after the fact. Indefinite retention is the opposite failure mode and should be explicitly rejected by automating the deletion at the end of the window.

View file

@ -0,0 +1,201 @@
// Package audit provides HTTP handlers for audit log management
package audit
import (
"encoding/json"
"net/http"
"strconv"
"time"
"github.com/jfraeys/fetch_ml/internal/auth"
"github.com/jfraeys/fetch_ml/internal/logging"
)
// Handler provides audit-related HTTP API handlers.
type Handler struct {
	logger *logging.Logger
	store  AuditStore // Optional: separate store for querying
}

// AuditStore interface for querying audit events.
// QueryEvents returns one page of matching events plus the total match
// count for pagination. Presumably zero-value times and empty strings
// mean "no filter" — confirm against the concrete store implementation.
type AuditStore interface {
	QueryEvents(from, to time.Time, eventType, userID string, limit, offset int) ([]AuditEvent, int, error)
}

// AuditEvent represents an audit event for API responses.
// PrevHash, EventHash and SequenceNum expose the store's hash-chain
// fields when present, so clients can perform tamper-evidence checks.
type AuditEvent struct {
	Timestamp   time.Time       `json:"timestamp"`
	EventType   string          `json:"event_type"`
	UserID      string          `json:"user_id,omitempty"`
	Resource    string          `json:"resource,omitempty"`
	Action      string          `json:"action,omitempty"`
	Success     bool            `json:"success"`
	IPAddress   string          `json:"ip_address,omitempty"`
	Error       string          `json:"error,omitempty"`
	PrevHash    string          `json:"prev_hash,omitempty"`
	EventHash   string          `json:"event_hash,omitempty"`
	SequenceNum int             `json:"sequence_num,omitempty"`
	Metadata    json.RawMessage `json:"metadata,omitempty"` // deferred-decode payload, passed through verbatim
}

// AuditEventList represents a list of audit events.
// Total is the full match count; Limit and Offset echo the pagination
// parameters that produced this page.
type AuditEventList struct {
	Events []AuditEvent `json:"events"`
	Total  int          `json:"total"`
	Limit  int          `json:"limit"`
	Offset int          `json:"offset"`
}

// VerificationResult represents the result of audit chain verification.
// FirstTampered is presumably only meaningful when Valid is false —
// confirm against the verifier implementation.
type VerificationResult struct {
	Valid         bool      `json:"valid"`
	TotalEvents   int       `json:"total_events"`
	FirstTampered int       `json:"first_tampered,omitempty"`
	ChainRootHash string    `json:"chain_root_hash,omitempty"`
	VerifiedAt    time.Time `json:"verified_at"`
}

// ChainRootResponse represents the chain root hash response.
type ChainRootResponse struct {
	RootHash    string    `json:"root_hash"`
	Timestamp   time.Time `json:"timestamp"`
	TotalEvents int       `json:"total_events"`
}
// NewHandler constructs an audit API handler backed by the given logger
// and an optional (possibly nil) query store.
func NewHandler(logger *logging.Logger, store AuditStore) *Handler {
	h := &Handler{logger: logger, store: store}
	return h
}
// GetV1AuditEvents handles GET /v1/audit/events.
//
// Query parameters:
//   - from, to: RFC3339 timestamps bounding the search window (optional)
//   - event_type, user_id: exact-match filters (optional)
//   - limit (default 100, clamped to [1, 1000]) and offset (default 0,
//     clamped to >= 0): pagination
//
// Requires the "audit:read" permission. Responds with an AuditEventList;
// when no store is configured the list is empty.
//
// Fix: malformed timestamps previously had their parse errors discarded,
// silently substituting the zero time and widening the query window —
// they now return 400. Negative limit/offset values are also clamped.
func (h *Handler) GetV1AuditEvents(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "audit:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	// Parse filter parameters.
	eventType := r.URL.Query().Get("event_type")
	userID := r.URL.Query().Get("user_id")
	limit := parseIntQueryParam(r, "limit", 100)
	offset := parseIntQueryParam(r, "offset", 0)
	// Clamp pagination to sane bounds.
	if limit < 1 {
		limit = 1
	}
	if limit > 1000 {
		limit = 1000
	}
	if offset < 0 {
		offset = 0
	}
	// Parse the time window, rejecting malformed values outright.
	var from, to time.Time
	var err error
	if fromStr := r.URL.Query().Get("from"); fromStr != "" {
		if from, err = time.Parse(time.RFC3339, fromStr); err != nil {
			http.Error(w, `{"error":"Invalid 'from' timestamp, expected RFC3339","code":"BAD_REQUEST"}`, http.StatusBadRequest)
			return
		}
	}
	if toStr := r.URL.Query().Get("to"); toStr != "" {
		if to, err = time.Parse(time.RFC3339, toStr); err != nil {
			http.Error(w, `{"error":"Invalid 'to' timestamp, expected RFC3339","code":"BAD_REQUEST"}`, http.StatusBadRequest)
			return
		}
	}
	response := AuditEventList{
		Events: []AuditEvent{},
		Limit:  limit,
		Offset: offset,
	}
	// Query the store when one is configured; otherwise serve an empty page.
	if h.store != nil {
		events, total, err := h.store.QueryEvents(from, to, eventType, userID, limit, offset)
		if err != nil {
			http.Error(w, `{"error":"Failed to query events","code":"INTERNAL_ERROR"}`, http.StatusInternalServerError)
			return
		}
		response.Events = events
		response.Total = total
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(response)
}
// PostV1AuditVerify handles POST /v1/audit/verify.
// Requires the "audit:verify" permission. Verification is currently a
// placeholder: it always reports a valid chain with zero events.
func (h *Handler) PostV1AuditVerify(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "audit:verify") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	h.logger.Info("verifying audit chain", "user", user.Name)
	// Placeholder: a real implementation would walk the hash chain and
	// fill in TotalEvents / FirstTampered from the verification pass.
	result := VerificationResult{
		Valid:      true,
		VerifiedAt: time.Now().UTC(),
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}
// GetV1AuditChainRoot handles GET /v1/audit/chain-root.
// Requires the "audit:read" permission. The root hash is currently a
// placeholder (all-zero SHA-256) with a zero event count.
func (h *Handler) GetV1AuditChainRoot(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "audit:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// Placeholder payload until chain-root computation is wired up.
	json.NewEncoder(w).Encode(ChainRootResponse{
		RootHash:  "sha256:0000000000000000000000000000000000000000000000000000000000000000",
		Timestamp: time.Now().UTC(),
	})
}
// checkPermission reports whether user is non-nil and either an admin
// (admins implicitly hold every permission) or explicitly granted the
// named permission.
func (h *Handler) checkPermission(user *auth.User, permission string) bool {
	switch {
	case user == nil:
		return false
	case user.Admin:
		// Admins bypass per-permission checks.
		return true
	default:
		return user.HasPermission(permission)
	}
}
// parseIntQueryParam parses an integer query parameter
func parseIntQueryParam(r *http.Request, name string, defaultVal int) int {
str := r.URL.Query().Get(name)
if str == "" {
return defaultVal
}
val, err := strconv.Atoi(str)
if err != nil {
return defaultVal
}
return val
}

View file

@ -0,0 +1,289 @@
// Package plugins provides HTTP handlers for plugin management
package plugins
import (
"encoding/json"
"net/http"
"time"
"github.com/jfraeys/fetch_ml/internal/auth"
"github.com/jfraeys/fetch_ml/internal/logging"
"github.com/jfraeys/fetch_ml/internal/tracking"
)
// Handler provides plugin-related HTTP handlers.
type Handler struct {
	logger   *logging.Logger
	registry *tracking.Registry
	// config maps plugin name to its configuration.
	// NOTE(review): read and written by concurrent HTTP handlers with no
	// synchronization visible here — confirm callers serialize access or
	// add a mutex.
	config map[string]PluginConfig // Plugin configurations
}

// PluginConfig represents the configuration for a plugin.
type PluginConfig struct {
	Enabled      bool                   `json:"enabled"`
	Mode         string                 `json:"mode"` // sidecar, remote, disabled
	Image        string                 `json:"image,omitempty"`
	Settings     map[string]interface{} `json:"settings,omitempty"`
	LogBasePath  string                 `json:"log_base_path,omitempty"`
	ArtifactPath string                 `json:"artifact_path,omitempty"`
}

// PluginInfo represents plugin information returned by the API.
type PluginInfo struct {
	Name            string       `json:"name"`
	Enabled         bool         `json:"enabled"`
	Mode            string       `json:"mode"`
	Status          string       `json:"status"` // healthy, unhealthy, starting, stopped
	Config          PluginConfig `json:"config"`
	RequiresRestart bool         `json:"requires_restart"`
	Version         string       `json:"version,omitempty"`
}

// PluginStatus represents the status of a plugin instance.
// Not referenced by the handlers visible in this file; presumably
// consumed elsewhere — confirm before removing.
type PluginStatus struct {
	Name      string    `json:"name"`
	Status    string    `json:"status"`
	URL       string    `json:"url,omitempty"`
	LastCheck time.Time `json:"last_check,omitempty"`
}
// NewHandler constructs a plugins handler from its dependencies.
// registry may be nil (plugin lookups are then skipped); config maps
// plugin name to configuration.
func NewHandler(
	logger *logging.Logger,
	registry *tracking.Registry,
	config map[string]PluginConfig,
) *Handler {
	h := &Handler{logger: logger, registry: registry, config: config}
	return h
}
// GetV1Plugins handles GET /v1/plugins.
// Requires the "plugins:read" permission. Returns one PluginInfo per
// configured plugin; when no registry is wired in, the list is empty.
// Note: map iteration order is random, so response ordering is not
// stable across calls.
//
// Fix: the loop previously ranged over keys and re-read h.config[name],
// doing a second map lookup per entry; it now binds the value directly.
func (h *Handler) GetV1Plugins(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	plugins := []PluginInfo{}
	// Plugins are only reported when the registry is available.
	if h.registry != nil {
		for name, cfg := range h.config {
			info := PluginInfo{
				Name:            name,
				Enabled:         cfg.Enabled,
				Mode:            cfg.Mode,
				Status:          "unknown",
				Config:          cfg,
				RequiresRestart: false,   // Default: plugins support hot-reload
				Version:         "1.0.0", // Placeholder
			}
			// Registered, enabled plugins are reported healthy by default;
			// a real health probe would replace this.
			if plugin, ok := h.registry.Get(name); ok {
				if cfg.Enabled && cfg.Mode != "disabled" {
					info.Status = "healthy"
					_ = plugin // placeholder until an actual health check uses it
				}
			}
			plugins = append(plugins, info)
		}
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(plugins)
}
// GetV1PluginsPluginName handles GET /v1/plugins/{pluginName}.
// Requires "plugins:read". Responds 404 when the plugin is not
// configured. Status is a placeholder: enabled plugins are reported
// healthy without a live probe.
func (h *Handler) GetV1PluginsPluginName(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	cfg, found := h.config[name]
	if !found {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	status := "unknown"
	if cfg.Enabled && cfg.Mode != "disabled" {
		status = "healthy"
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(PluginInfo{
		Name:            name,
		Enabled:         cfg.Enabled,
		Mode:            cfg.Mode,
		Status:          status,
		Config:          cfg,
		RequiresRestart: false,
		Version:         "1.0.0",
	})
}
// GetV1PluginsPluginNameConfig handles GET /v1/plugins/{pluginName}/config.
// Requires "plugins:read"; responds 404 when the plugin is not configured.
func (h *Handler) GetV1PluginsPluginNameConfig(w http.ResponseWriter, r *http.Request) {
	if !h.checkPermission(auth.GetUserFromContext(r.Context()), "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	if cfg, found := h.config[name]; found {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(cfg)
		return
	}
	http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
}
// PutV1PluginsPluginNameConfig handles PUT /v1/plugins/{pluginName}/config.
// Requires "plugins:write". Replaces the stored configuration for an
// existing plugin and echoes the resulting PluginInfo (Status is a
// placeholder until real health checks exist).
//
// Fix: a PUT for an unknown plugin previously created a brand-new config
// entry silently; it now returns 404, consistent with the GET and DELETE
// config handlers.
// NOTE(review): h.config is mutated here with no synchronization visible
// in this file — confirm concurrent access is serialized or add a mutex.
func (h *Handler) PutV1PluginsPluginNameConfig(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:write") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	pluginName := r.PathValue("pluginName")
	if pluginName == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	// Only known plugins may be reconfigured.
	if _, ok := h.config[pluginName]; !ok {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	var newConfig PluginConfig
	if err := json.NewDecoder(r.Body).Decode(&newConfig); err != nil {
		http.Error(w, `{"error":"Invalid request body","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	h.config[pluginName] = newConfig
	h.logger.Info("updated plugin config", "plugin", pluginName, "user", user.Name)
	info := PluginInfo{
		Name:            pluginName,
		Enabled:         newConfig.Enabled,
		Mode:            newConfig.Mode,
		Status:          "healthy",
		Config:          newConfig,
		RequiresRestart: false,
		Version:         "1.0.0",
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(info)
}
// DeleteV1PluginsPluginNameConfig handles DELETE /v1/plugins/{pluginName}/config.
// Requires "plugins:write". The plugin is soft-deleted — disabled but kept
// in the config map — and 204 is returned; unknown plugins yield 404.
func (h *Handler) DeleteV1PluginsPluginNameConfig(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:write") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	cfg, found := h.config[name]
	if !found {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	// Flip Enabled off but preserve the rest of the configuration.
	cfg.Enabled = false
	h.config[name] = cfg
	h.logger.Info("disabled plugin", "plugin", name, "user", user.Name)
	w.WriteHeader(http.StatusNoContent)
}
// GetV1PluginsPluginNameHealth handles GET /v1/plugins/{pluginName}/health.
// Requires "plugins:read". Disabled plugins report "stopped"; otherwise
// a placeholder "healthy" is returned (no live probe yet).
func (h *Handler) GetV1PluginsPluginNameHealth(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	cfg, found := h.config[name]
	if !found {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	// Equivalent to the original condition with De Morgan applied:
	// healthy iff enabled and not in "disabled" mode.
	status := "stopped"
	if cfg.Enabled && cfg.Mode != "disabled" {
		status = "healthy"
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"status":    status,
		"version":   "1.0.0",
		"timestamp": time.Now().UTC(),
	})
}
// checkPermission reports whether user holds the named permission.
// A nil user never passes; any user with the "admin" role passes every
// check; otherwise the user's permission map is consulted directly.
//
// Fix: the permission map was previously scanned with a full range loop;
// a direct map lookup is the single-step equivalent (missing keys yield
// false, matching the old loop's behavior).
func (h *Handler) checkPermission(user *auth.User, permission string) bool {
	if user == nil {
		return false
	}
	// Admin role grants all permissions.
	for _, role := range user.Roles {
		if role == "admin" {
			return true
		}
	}
	return user.Permissions[permission]
}

View file

@ -0,0 +1,480 @@
// Package scheduler provides HTTP handlers for scheduler management
package scheduler
import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"time"
"github.com/jfraeys/fetch_ml/internal/auth"
"github.com/jfraeys/fetch_ml/internal/logging"
sch "github.com/jfraeys/fetch_ml/internal/scheduler"
)
// APIHandler provides scheduler-related HTTP API handlers.
type APIHandler struct {
	logger *logging.Logger
	// hub is the scheduler core; handlers respond 503 when it is nil.
	hub *sch.SchedulerHub
	// streaming maps SSE client IDs to their event channels.
	// NOTE(review): accessed from concurrent handlers with no lock visible
	// in this file — confirm synchronization before relying on multiple
	// simultaneous SSE clients.
	streaming map[string][]chan StreamingEvent
}
// NewHandler creates a new scheduler API handler.
//
// authConfig is currently unused: it is accepted only to keep the
// constructor signature stable for existing callers, while permission
// checks read the user from the request context instead. Apart from that
// parameter this duplicates NewAPIHandler — consider consolidating.
func NewHandler(hub *sch.SchedulerHub, logger *logging.Logger, authConfig *auth.Config) *APIHandler {
	return &APIHandler{
		logger:    logger,
		hub:       hub,
		streaming: make(map[string][]chan StreamingEvent),
	}
}
// StreamingEvent represents an event for SSE streaming.
type StreamingEvent struct {
	Type    string          `json:"type"`
	Payload json.RawMessage `json:"payload"` // deferred-decode event body
}

// WorkerInfo represents worker information for API responses.
// The handlers in this file currently populate only ID, Slots, Status
// ("active"), and an empty ActiveTasks list; the remaining fields are
// placeholders for a fuller implementation.
type WorkerInfo struct {
	ID            string                 `json:"id"`
	ConnectedAt   time.Time              `json:"connected_at"`
	LastHeartbeat time.Time              `json:"last_heartbeat,omitempty"`
	Capabilities  sch.WorkerCapabilities `json:"capabilities"`
	Slots         sch.SlotStatus         `json:"slots"`
	ActiveTasks   []string               `json:"active_tasks"`
	Status        string                 `json:"status"` // active, draining, offline
}

// SchedulerStatus represents scheduler status for API responses.
// Fields the hub's metrics payload does not provide are left at zero.
type SchedulerStatus struct {
	WorkersTotal       int       `json:"workers_total"`
	WorkersActive      int       `json:"workers_active"`
	WorkersDraining    int       `json:"workers_draining"`
	BatchQueueDepth    int       `json:"batch_queue_depth"`
	ServiceQueueDepth  int       `json:"service_queue_depth"`
	TasksRunning       int       `json:"tasks_running"`
	TasksCompleted24h  int       `json:"tasks_completed_24h"`
	ReservationsActive int       `json:"reservations_active"`
	Timestamp          time.Time `json:"timestamp"`
}

// ReservationInfo represents reservation information for API responses.
type ReservationInfo struct {
	ID        string    `json:"id"`
	UserID    string    `json:"user_id"`
	GPUCount  int       `json:"gpu_count"`
	GPUType   string    `json:"gpu_type,omitempty"`
	NodeCount int       `json:"node_count"`
	ExpiresAt time.Time `json:"expires_at"`
	Status    string    `json:"status"` // active, claimed, expired
}

// CreateReservationRequest represents a request to create a reservation.
// GPUCount must be positive; NodeCount and ExpiresMinutes default to
// 1 and 30 respectively when omitted or non-positive.
type CreateReservationRequest struct {
	GPUCount       int    `json:"gpu_count"`
	GPUType        string `json:"gpu_type,omitempty"`
	NodeCount      int    `json:"node_count,omitempty"`
	ExpiresMinutes int    `json:"expires_minutes,omitempty"`
}
// NewAPIHandler creates a new scheduler API handler with an empty SSE
// client registry.
func NewAPIHandler(logger *logging.Logger, hub *sch.SchedulerHub) *APIHandler {
	return &APIHandler{
		logger:    logger,
		hub:       hub,
		streaming: make(map[string][]chan StreamingEvent),
	}
}
// GetV1SchedulerStatus handles GET /v1/scheduler/status.
// Requires "scheduler:read"; responds 503 when the scheduler hub is not
// wired in. Counters absent from the hub's metrics payload stay zero.
func (h *APIHandler) GetV1SchedulerStatus(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	metrics := h.hub.GetMetricsPayload()
	// Pull each known counter out of the loosely-typed metrics map.
	readInt := func(key string) (int, bool) {
		v, ok := metrics[key].(int)
		return v, ok
	}
	status := SchedulerStatus{Timestamp: time.Now().UTC()}
	if v, ok := readInt("workers_connected"); ok {
		status.WorkersTotal = v
		// Simplified - all connected workers are treated as active.
		status.WorkersActive = v
	}
	if v, ok := readInt("queue_depth_batch"); ok {
		status.BatchQueueDepth = v
	}
	if v, ok := readInt("queue_depth_service"); ok {
		status.ServiceQueueDepth = v
	}
	if v, ok := readInt("jobs_completed"); ok {
		status.TasksCompleted24h = v
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(status)
}
// GetV1SchedulerWorkers handles GET /v1/scheduler/workers.
// Requires "scheduler:read"; responds 503 without a hub. The worker list
// is derived from the hub's "worker_slots" metric; capability and task
// details are not populated yet.
func (h *APIHandler) GetV1SchedulerWorkers(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	slots, _ := h.hub.GetMetricsPayload()["worker_slots"].(map[string]sch.SlotStatus)
	workers := make([]WorkerInfo, 0, len(slots))
	for id, slot := range slots {
		workers = append(workers, WorkerInfo{
			ID:           id,
			Slots:        slot,
			Capabilities: sch.WorkerCapabilities{},
			Status:       "active",
			ActiveTasks:  []string{},
		})
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(workers)
}
// GetV1SchedulerWorkersWorkerID handles GET /v1/scheduler/workers/{workerId}.
// Requires "scheduler:read"; responds 503 without a hub and 404 when the
// worker has no slot entry in the hub metrics.
func (h *APIHandler) GetV1SchedulerWorkersWorkerID(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	id := r.PathValue("workerId")
	if id == "" {
		http.Error(w, `{"error":"Missing worker ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	slots, _ := h.hub.GetMetricsPayload()["worker_slots"].(map[string]sch.SlotStatus)
	slot, found := slots[id]
	if !found {
		http.Error(w, `{"error":"Worker not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	// Capabilities and active-task details are not populated yet.
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(WorkerInfo{
		ID:           id,
		Slots:        slot,
		Capabilities: sch.WorkerCapabilities{},
		Status:       "active",
		ActiveTasks:  []string{},
	})
}
// DeleteV1SchedulerWorkersWorkerID handles DELETE /v1/scheduler/workers/{workerId}.
// Requires "scheduler:drain"; responds 503 without a hub. The drain
// request is currently only logged — the worker is not actually signaled.
func (h *APIHandler) DeleteV1SchedulerWorkersWorkerID(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:drain") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	id := r.PathValue("workerId")
	if id == "" {
		http.Error(w, `{"error":"Missing worker ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	h.logger.Info("draining worker", "worker_id", id, "user", user.Name)
	// Simplified: a full implementation would signal the worker and wait
	// for its in-flight tasks to complete before acknowledging.
	w.WriteHeader(http.StatusNoContent)
}
// GetV1SchedulerReservations handles GET /v1/scheduler/reservations.
// Requires "scheduler:read"; responds 503 without a hub. Reservations are
// not yet tracked in the hub, so the list is always empty for now.
func (h *APIHandler) GetV1SchedulerReservations(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode([]ReservationInfo{})
}
// PostV1SchedulerReservations handles POST /v1/scheduler/reservations.
// Requires "scheduler:write"; responds 503 without a hub.
//
// Request body: CreateReservationRequest. GPUCount must be positive
// (422 otherwise); NodeCount defaults to 1 and ExpiresMinutes to 30 when
// omitted or non-positive. Responds 201 with the created reservation.
//
// Fix: ExpiresAt is now computed in UTC, consistent with every other
// timestamp this API emits.
// NOTE(review): the reservation is not registered with the hub and the
// UnixNano-derived ID could collide under concurrent requests — confirm
// before this leaves placeholder status.
func (h *APIHandler) PostV1SchedulerReservations(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:write") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	var req CreateReservationRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, `{"error":"Invalid request body","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	if req.GPUCount <= 0 {
		http.Error(w, `{"error":"GPU count must be positive","code":"VALIDATION_ERROR"}`, http.StatusUnprocessableEntity)
		return
	}
	// Apply defaults for optional fields.
	if req.NodeCount <= 0 {
		req.NodeCount = 1
	}
	if req.ExpiresMinutes <= 0 {
		req.ExpiresMinutes = 30
	}
	reservation := ReservationInfo{
		ID:        fmt.Sprintf("res-%d", time.Now().UnixNano()),
		UserID:    user.Name,
		GPUCount:  req.GPUCount,
		GPUType:   req.GPUType,
		NodeCount: req.NodeCount,
		ExpiresAt: time.Now().UTC().Add(time.Duration(req.ExpiresMinutes) * time.Minute),
		Status:    "active",
	}
	h.logger.Info("created reservation", "reservation_id", reservation.ID, "user", user.Name)
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusCreated)
	json.NewEncoder(w).Encode(reservation)
}
// PatchV1SchedulerJobsJobIDPriority handles PATCH /v1/scheduler/jobs/{jobId}/priority.
// Requires "tasks:priority"; responds 503 without a hub. Priority must be
// in [1, 10]. The queue itself is not modified yet (simplified version);
// the handler logs and echoes the requested change.
func (h *APIHandler) PatchV1SchedulerJobsJobIDPriority(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "tasks:priority") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	jobID := r.PathValue("jobId")
	if jobID == "" {
		http.Error(w, `{"error":"Missing job ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	var body struct {
		Priority int `json:"priority"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		http.Error(w, `{"error":"Invalid request body","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	if body.Priority < 1 || body.Priority > 10 {
		http.Error(w, `{"error":"Priority must be between 1 and 10","code":"VALIDATION_ERROR"}`, http.StatusUnprocessableEntity)
		return
	}
	h.logger.Info("updating job priority", "job_id", jobID, "priority", body.Priority, "user", user.Name)
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"id":       jobID,
		"priority": body.Priority,
		"status":   "queued",
	})
}
// GetV1SchedulerStatusStream handles GET /v1/scheduler/status/stream (SSE).
// It registers an event channel for the client, sends an initial "connected"
// event, then multiplexes heartbeats (every 5s) and pushed events until the
// client disconnects.
func (h *APIHandler) GetV1SchedulerStatusStream(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}

	// BUG FIX: the previous version asserted w.(http.Flusher) unchecked on
	// every write, which panics when the ResponseWriter is wrapped by a
	// middleware that does not implement http.Flusher. Fail fast instead.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, `{"error":"Streaming not supported","code":"NOT_IMPLEMENTED"}`, http.StatusNotImplemented)
		return
	}

	// Set SSE headers
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// Create event channel for this client
	eventChan := make(chan StreamingEvent, 10)
	clientID := fmt.Sprintf("%s-%d", user.Name, time.Now().UnixNano())
	// NOTE(review): h.streaming is mutated here and in the deferred cleanup
	// without visible synchronization; concurrent SSE clients would race on
	// the map. Confirm whether APIHandler guards it elsewhere, else add a mutex.
	h.streaming[clientID] = append(h.streaming[clientID], eventChan)

	// Clean up on disconnect
	defer func() {
		delete(h.streaming, clientID)
		// NOTE(review): closing here panics any publisher still sending on
		// eventChan; conventionally only the sender closes — confirm publishers.
		close(eventChan)
	}()

	// sendEvent serializes v as one SSE "data:" frame and flushes it.
	// Unserializable values are dropped rather than corrupting the stream.
	sendEvent := func(v interface{}) {
		data, err := json.Marshal(v)
		if err != nil {
			return
		}
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
	}

	// Send initial status
	sendEvent(map[string]interface{}{
		"type":      "connected",
		"timestamp": time.Now().UTC(),
	})

	// Keep connection alive and send periodic updates
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-r.Context().Done():
			return
		case <-ticker.C:
			sendEvent(map[string]interface{}{
				"type":      "heartbeat",
				"timestamp": time.Now().UTC(),
			})
		case event := <-eventChan:
			sendEvent(event)
		}
	}
}
// GetV1SchedulerJobsJobIDStream handles GET /v1/scheduler/jobs/{jobId}/stream (SSE).
// It sends an initial "connected" event for the job, then heartbeats every 5s
// until the client disconnects.
func (h *APIHandler) GetV1SchedulerJobsJobIDStream(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	jobID := r.PathValue("jobId")
	if jobID == "" {
		http.Error(w, `{"error":"Missing job ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}

	// BUG FIX: the previous version asserted w.(http.Flusher) unchecked on
	// every write, which panics when the ResponseWriter is wrapped by a
	// middleware that does not implement http.Flusher. Fail fast instead.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, `{"error":"Streaming not supported","code":"NOT_IMPLEMENTED"}`, http.StatusNotImplemented)
		return
	}

	// Set SSE headers
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// sendEvent serializes v as one SSE "data:" frame and flushes it.
	sendEvent := func(v interface{}) {
		data, err := json.Marshal(v)
		if err != nil {
			return // drop unserializable events rather than corrupt the stream
		}
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
	}

	// Send initial status
	sendEvent(map[string]interface{}{
		"type":      "connected",
		"job_id":    jobID,
		"timestamp": time.Now().UTC(),
	})

	// Keep connection alive
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-r.Context().Done():
			return
		case <-ticker.C:
			sendEvent(map[string]interface{}{
				"type":      "heartbeat",
				"timestamp": time.Now().UTC(),
			})
		}
	}
}
// checkPermission reports whether user may perform the action guarded by
// permission. A nil user is always denied; admins are always allowed.
func (h *APIHandler) checkPermission(user *auth.User, permission string) bool {
	switch {
	case user == nil:
		return false
	case user.Admin:
		// Admins implicitly hold every permission.
		return true
	default:
		return user.HasPermission(permission)
	}
}
// parseIntQueryParam parses an integer query parameter
func parseIntQueryParam(r *http.Request, name string, defaultVal int) int {
str := r.URL.Query().Get(name)
if str == "" {
return defaultVal
}
val, err := strconv.Atoi(str)
if err != nil {
return defaultVal
}
return val
}

View file

@ -0,0 +1,30 @@
package api
import (
"net/http"
"os"
"path/filepath"
"runtime"
)
// openAPISpecPath returns the path to the OpenAPI spec file
//
// NOTE(review): runtime.Caller(0) reports this source file's path as recorded
// at COMPILE time. The returned path is therefore only valid when the binary
// runs on a machine with the original source checkout (dev/test). In a
// deployed binary the file will not exist and callers will get a read error.
// Consider embedding the spec with go:embed — TODO confirm deployment model.
func openAPISpecPath() string {
	// Get the directory of this source file
	_, filename, _, _ := runtime.Caller(0)
	dir := filepath.Dir(filename)
	// Navigate to repo root and then to api/
	return filepath.Join(dir, "..", "..", "api", "openapi.yaml")
}
// ServeOpenAPISpec serves the OpenAPI specification as YAML.
// It responds 500 if the spec file cannot be read from disk.
func ServeOpenAPISpec(w http.ResponseWriter, _ *http.Request) {
	data, err := os.ReadFile(openAPISpecPath())
	if err != nil {
		http.Error(w, "Failed to read OpenAPI spec", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/yaml")
	w.WriteHeader(http.StatusOK)
	// Write errors are ignored: the status line has already been sent.
	_, _ = w.Write(data)
}

18
scripts/build/build-cli.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# Builds CLI binaries using Zig
# Usage: build-cli.sh
set -euo pipefail

echo "Building CLI with Zig..."

# Build the release-optimized CLI inside a subshell so the caller's working
# directory is untouched even if the build fails.
(
    cd cli
    zig build --release=fast
)

# Copy to dist
mkdir -p dist
cp cli/zig-out/bin/ml dist/ml_linux_x86_64

echo "✓ CLI built: dist/ml_linux_x86_64"

44
scripts/build/build-go.sh Executable file
View file

@ -0,0 +1,44 @@
#!/bin/bash
# Builds Go binaries with configurable CGO/native options
# Usage: build-go.sh <build_type> <os> <arch> <source_path>
#   build_type: pure (CGO off) | cgo (CGO on) | native (CGO on + native_libs tag)
#   os/arch:    Go GOOS/GOARCH values (default linux/amd64)
set -euo pipefail

BUILD_TYPE=${1:-native}
OS=${2:-linux}
ARCH=${3:-amd64}
SOURCE_PATH=${4:-cmd/api-server/main.go}

# Strip symbols and stamp build metadata into the binary.
LDFLAGS="-s -w -X main.BuildHash=$(git rev-parse --short HEAD) -X main.BuildTime=$(date -u +%Y%m%d.%H%M%S)"

case $BUILD_TYPE in
    pure)
        export CGO_ENABLED=0
        TAGS=""
        SUFFIX="_${OS}_${ARCH}_pure"
        ;;
    cgo)
        export CGO_ENABLED=1
        TAGS=""
        SUFFIX="_${OS}_${ARCH}_cgo"
        ;;
    native)
        export CGO_ENABLED=1
        TAGS="native_libs"
        SUFFIX="_${OS}_${ARCH}_native"
        ;;
    *)
        echo "Unknown build type: $BUILD_TYPE"
        echo "Usage: $0 <pure|cgo|native> <os> <arch> <source_path>"
        exit 1
        ;;
esac

# BUG FIX: previously GOOS/GOARCH were never set, so the <os>/<arch> arguments
# only affected the output file name and every binary targeted the host.
export GOOS="$OS"
export GOARCH="$ARCH"

BINARY_NAME=$(basename "$SOURCE_PATH" .go)
# BUG FIX: SUFFIX already contains "_<os>_<arch>_<type>"; the old OUTPUT also
# inserted "_${OS}_${ARCH}", producing names like ..._linux_amd64_linux_amd64_pure.
OUTPUT="dist/fetch_ml_${BINARY_NAME}${SUFFIX}"
mkdir -p dist

echo "Building ${BINARY_NAME} (${BUILD_TYPE}) for ${OS}/${ARCH}..."
go build -tags="$TAGS" -ldflags="$LDFLAGS" -o "$OUTPUT" "$SOURCE_PATH"
echo "✓ Built: $OUTPUT"

18
scripts/build/build-native.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# Builds native C++ libraries for Linux x86_64
# Run on Ubuntu self-hosted runner
set -euo pipefail

BUILD_DIR="native/build/linux_amd64"
OUT_DIR="bin/native/linux_amd64"

mkdir -p "$BUILD_DIR"

# Configure and build with CMake (Release optimizations, parallel jobs).
cmake -S native -B "$BUILD_DIR" \
    -DCMAKE_BUILD_TYPE=Release
cmake --build "$BUILD_DIR" --parallel

# Package libs; tolerate a build that produced no shared objects.
mkdir -p "$OUT_DIR"
cp "$BUILD_DIR"/lib*.so "$OUT_DIR"/ 2>/dev/null || true

echo "✓ Native libraries built for Linux x86_64"

30
scripts/build/cross-platform.sh Executable file
View file

@ -0,0 +1,30 @@
#!/bin/bash
# Orchestrates full build for Linux x86_64 only
set -euo pipefail

echo "=== Building for Linux x86_64 ==="

# Build native C++ libraries
scripts/build/build-native.sh

# Build Go backends for all build types
for build_type in pure cgo native; do
    echo "=== Building ${build_type} binaries ==="
    for binary in api-server worker data_manager user_manager tui; do
        # Map each logical binary name to its entry-point source file.
        case $binary in
            api-server)   source_path="cmd/api-server/main.go" ;;
            worker)       source_path="cmd/worker/worker_server.go" ;;
            data_manager) source_path="cmd/data_manager/main.go" ;;
            user_manager) source_path="cmd/user_manager/main.go" ;;
            tui)          source_path="cmd/tui/main.go" ;;
        esac
        scripts/build/build-go.sh "$build_type" linux amd64 "$source_path"
    done
done

# Build CLI binaries
scripts/build/build-cli.sh

echo "=== Build complete ==="
ls -lh dist/

133
scripts/deploy/cleanup.sh Executable file
View file

@ -0,0 +1,133 @@
#!/bin/bash
# Cleanup script for fetch_ml deployments
# Removes orphaned containers, volumes, and networks
set -e

# Absolute path to the repository root, resolved from this script's location.
REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

print_success() { echo -e "${GREEN}[OK]${NC} $1"; }
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
print_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
print_info() { echo "[INFO] $1"; }

# show_usage: print CLI help for this script.
show_usage() {
    echo "Usage: $0 [ENVIRONMENT|all]"
    echo ""
    echo "Environments:"
    echo " dev Clean up development environment"
    echo " staging Clean up staging environment"
    echo " prod Clean up production environment"
    echo " homelab Clean up homelab environment"
    echo " all Clean up all environments (default)"
    echo ""
    echo "Examples:"
    echo " $0 dev # Clean dev only"
    echo " $0 all # Clean everything"
}

# cleanup_environment ENV
# Tears down the compose stack for one environment (containers, networks,
# volumes), then force-removes any leftover containers matching the project
# name. Unknown environments abort with usage; a missing compose file is a
# warning, not an error.
cleanup_environment() {
    local env=$1
    local compose_file
    case $env in
        dev)
            compose_file="${REPO_ROOT}/deployments/docker-compose.dev.yml"
            ;;
        staging)
            compose_file="${REPO_ROOT}/deployments/docker-compose.staging.yml"
            ;;
        prod)
            compose_file="${REPO_ROOT}/deployments/docker-compose.prod.yml"
            ;;
        homelab)
            compose_file="${REPO_ROOT}/deployments/docker-compose.homelab-secure.yml"
            ;;
        *)
            print_error "Unknown environment: $env"
            show_usage
            exit 1
            ;;
    esac
    if [ ! -f "$compose_file" ]; then
        print_warn "Compose file not found: $compose_file"
        return 0
    fi
    print_info "Cleaning up $env environment..."
    # Stop and remove containers, networks, volumes
    # NOTE(review): uses the legacy docker-compose v1 binary — confirm it is
    # installed on target hosts, or switch to "docker compose".
    docker-compose -f "$compose_file" down -v --remove-orphans 2>/dev/null || true
    # Remove any leftover containers with project name
    local project_name
    project_name=$(basename "$compose_file" .yml | sed 's/docker-compose\.//')
    local containers
    containers=$(docker ps -aq --filter "name=ml-${project_name}" --filter "name=ml-experiments" 2>/dev/null || true)
    if [ -n "$containers" ]; then
        print_info "Removing leftover containers..."
        # $containers is intentionally unquoted: it holds whitespace-separated IDs.
        docker rm -f $containers 2>/dev/null || true
    fi
    print_success "${env} environment cleaned"
}

# cleanup_all: tear down every known environment, then sweep any remaining
# "ml-"-prefixed containers and networks. Non-FetchML Docker data is preserved.
cleanup_all() {
    print_info "Cleaning up all FetchML environments..."
    # Clean each environment
    for env in dev staging prod homelab; do
        cleanup_environment "$env" || true
    done
    # Remove any remaining fetchml containers by name pattern
    local all_containers
    all_containers=$(docker ps -aq --filter "name=^ml-" 2>/dev/null || true)
    if [ -n "$all_containers" ]; then
        print_info "Removing remaining FetchML containers..."
        # Intentionally unquoted: whitespace-separated container IDs.
        docker rm -f $all_containers 2>/dev/null || true
    fi
    # Only prune FetchML-specific networks (not all unused)
    local networks
    networks=$(docker network ls -q --filter "name=^ml-" 2>/dev/null || true)
    if [ -n "$networks" ]; then
        print_info "Removing FetchML networks..."
        # Intentionally unquoted: whitespace-separated network IDs.
        docker network rm $networks 2>/dev/null || true
    fi
    print_success "All FetchML environments cleaned (other Docker data preserved)"
}

# main [ENVIRONMENT|all]: dispatch to the requested cleanup mode (default: all).
main() {
    local env=${1:-all}
    case $env in
        dev|staging|prod|homelab)
            cleanup_environment "$env"
            ;;
        all)
            cleanup_all
            ;;
        -h|--help)
            show_usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $env"
            show_usage
            exit 1
            ;;
    esac
}

main "$@"

73
scripts/deploy/health-check.sh Executable file
View file

@ -0,0 +1,73 @@
#!/bin/bash
# Health check utilities for fetch_ml deployments
set -e

# Absolute path to the repository root, resolved from this script's location.
REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

print_success() { echo -e "${GREEN}[OK]${NC} $1"; }
print_error() { echo -e "${RED}[FAIL]${NC} $1"; }
print_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }

# check_environment ENV
# Probes http://localhost:<port>/health for the environment and reports
# compliance mode and native-library status from the JSON body.
# Returns non-zero when the API does not respond.
check_environment() {
    local env=$1
    local port
    # NOTE(review): dev and prod both resolve to 9101 — confirm prod's real port.
    case $env in
        dev) port=9101 ;;
        staging) port=9102 ;;
        prod) port=9101 ;;
        *) port=9101 ;;
    esac
    echo "=== ${env} Environment (port ${port}) ==="
    # BUG FIX: fetch the health payload once. Previously the endpoint was hit
    # twice (a reachability probe, then a second request for the body), which
    # doubled load and could report a state inconsistent with the probe.
    local health
    if ! health=$(curl -fsS "http://localhost:${port}/health" 2>/dev/null); then
        print_error "API health check failed"
        return 1
    fi
    print_success "API is responding"
    # Check compliance mode (grep/cut keep this jq-free).
    local compliance
    compliance=$(echo "$health" | grep -o '"compliance_mode":"[^"]*"' | cut -d'"' -f4 || echo "unknown")
    echo " Compliance mode: $compliance"
    # Check native libs status
    if echo "$health" | grep -q '"native_libs":true'; then
        print_success "Native libraries enabled"
    else
        print_warn "Native libraries not enabled"
    fi
    echo ""
}

# Main: check one environment, or all of them (failures in "all" mode are
# reported but do not stop the remaining checks).
main() {
    local env=${1:-all}
    echo "=== FetchML Health Check ==="
    echo ""
    if [ "$env" = "all" ]; then
        check_environment dev || true
        check_environment staging || true
        check_environment prod || true
    else
        check_environment "$env"
    fi
}

main "$@"

View file

@ -0,0 +1,279 @@
package benchmarks
import (
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/crypto"
"github.com/jfraeys/fetch_ml/internal/crypto/kms"
)
// BenchmarkEncryptArtifact measures the full encryption pipeline performance.
// Per ADR-012: Total overhead should be <10ms for MemoryProvider.
func BenchmarkEncryptArtifact(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)

	// Provision a tenant whose key hierarchy the loop encrypts under.
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	// 1KB deterministic payload (typical model weights chunk).
	payload := make([]byte, 1024)
	for i := range payload {
		payload[i] = byte(i) // byte() truncates mod 256
	}

	b.ResetTimer()
	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.EncryptArtifact("bench-tenant", "artifact-1", keys.KMSKeyID, payload); err != nil {
			b.Fatalf("Encrypt failed: %v", err)
		}
	}
}
// BenchmarkDecryptArtifact measures the full decryption pipeline performance.
// Per ADR-012: Total overhead should be <10ms for MemoryProvider.
func BenchmarkDecryptArtifact(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	// 1KB deterministic payload.
	payload := make([]byte, 1024)
	for i := range payload {
		payload[i] = byte(i)
	}

	// Encrypt once up front; the timed loop exercises decryption only.
	ciphertext, err := mgr.EncryptArtifact("bench-tenant", "artifact-1", keys.KMSKeyID, payload)
	if err != nil {
		b.Fatalf("Pre-encryption failed: %v", err)
	}

	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.DecryptArtifact(ciphertext, keys.KMSKeyID); err != nil {
			b.Fatalf("Decrypt failed: %v", err)
		}
	}
}
// BenchmarkMemoryProvider_Encrypt measures baseline encryption without network overhead.
// This establishes the theoretical minimum for KMS operations.
func BenchmarkMemoryProvider_Encrypt(b *testing.B) {
	provider := kms.NewMemoryProvider()
	defer provider.Close()

	dekCache := kms.NewDEKCache(kms.DefaultCacheConfig())
	defer dekCache.Clear()

	// Wire the key manager directly to the in-memory provider.
	mgr := crypto.NewTenantKeyManager(provider, dekCache, kms.Config{
		Provider: kms.ProviderTypeMemory,
		Cache:    kms.DefaultCacheConfig(),
	}, nil)

	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)
	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.EncryptArtifact("bench-tenant", "artifact-1", keys.KMSKeyID, payload); err != nil {
			b.Fatalf("Encrypt failed: %v", err)
		}
	}
}
// BenchmarkCacheHit verifies cached DEKs provide <10ms overhead.
func BenchmarkCacheHit(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)

	// Encrypt once, then decrypt once so the DEK cache is warm before timing.
	ciphertext, err := mgr.EncryptArtifact("bench-tenant", "cached-artifact", keys.KMSKeyID, payload)
	if err != nil {
		b.Fatalf("Pre-encryption failed: %v", err)
	}
	if _, err = mgr.DecryptArtifact(ciphertext, keys.KMSKeyID); err != nil {
		b.Fatalf("First decrypt failed: %v", err)
	}

	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.DecryptArtifact(ciphertext, keys.KMSKeyID); err != nil {
			b.Fatalf("Decrypt failed: %v", err)
		}
	}
}
// BenchmarkKeyRotation measures key rotation overhead.
func BenchmarkKeyRotation(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	b.ReportAllocs()
	for b.Loop() {
		// Each iteration rotates from the hierarchy produced by the last.
		rotated, err := mgr.RotateTenantKey("bench-tenant", keys)
		if err != nil {
			b.Fatalf("Rotation failed: %v", err)
		}
		keys = rotated
	}
}
// BenchmarkEncryptArtifact_LargePayload measures encryption with larger payloads.
func BenchmarkEncryptArtifact_LargePayload(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	// 1MB zeroed payload.
	payload := make([]byte, 1<<20)

	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.EncryptArtifact("bench-tenant", "large-artifact", keys.KMSKeyID, payload); err != nil {
			b.Fatalf("Encrypt failed: %v", err)
		}
	}
}
// BenchmarkParallelEncrypt measures concurrent encryption performance.
func BenchmarkParallelEncrypt(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)

	b.ResetTimer()
	b.ReportAllocs()
	b.RunParallel(func(pb *testing.PB) {
		// Each worker cycles through ten artifact IDs ("...-0" .. "...-9").
		n := 0
		for pb.Next() {
			id := "parallel-artifact-" + string(rune('0'+n%10))
			if _, err := mgr.EncryptArtifact("bench-tenant", id, keys.KMSKeyID, payload); err != nil {
				b.Fatalf("Encrypt failed: %v", err)
			}
			n++
		}
	})
}
// TestEncryptPerformance_10msRequirement is a quick sanity check, not a
// benchmark: it verifies that typical encrypt operations complete within
// the <10ms requirement.
func TestEncryptPerformance_10msRequirement(t *testing.T) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("perf-test-tenant")
	if err != nil {
		t.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)

	// Warm up caches and code paths before timing.
	for range 10 {
		_, _ = mgr.EncryptArtifact("perf-test-tenant", "warmup", keys.KMSKeyID, payload)
	}

	// Average over a fixed number of operations.
	const ops = 100
	start := time.Now()
	for range ops {
		if _, err := mgr.EncryptArtifact("perf-test-tenant", "perf-test", keys.KMSKeyID, payload); err != nil {
			t.Fatalf("Encrypt failed: %v", err)
		}
	}
	avgPerOp := time.Since(start) / ops
	if avgPerOp > 10*time.Millisecond {
		t.Errorf("Average encrypt time %v exceeds 10ms requirement", avgPerOp)
	}
	t.Logf("Average encrypt time: %v (requirement: <10ms)", avgPerOp)
}
// TestDecryptPerformance_10msRequirement verifies decrypt completes within 10ms
// on average, with a warm DEK cache.
func TestDecryptPerformance_10msRequirement(t *testing.T) {
	tkm := crypto.NewTestTenantKeyManager(nil)
	hierarchy, err := tkm.ProvisionTenant("perf-test-tenant")
	if err != nil {
		t.Fatalf("Failed to provision tenant: %v", err)
	}
	plaintext := make([]byte, 1024)

	// Pre-encrypt so the loop below only measures decryption.
	encrypted, err := tkm.EncryptArtifact("perf-test-tenant", "perf-test", hierarchy.KMSKeyID, plaintext)
	if err != nil {
		t.Fatalf("Pre-encryption failed: %v", err)
	}

	// Warm up cache
	for range 10 {
		_, _ = tkm.DecryptArtifact(encrypted, hierarchy.KMSKeyID)
	}

	// Measure with the cache warm.
	// BUG FIX: the loop previously ran only 10 iterations while the elapsed
	// time was divided by 100, understating average latency by 10x and
	// masking regressions against the <10ms requirement.
	const ops = 100
	start := time.Now()
	for range ops {
		if _, err := tkm.DecryptArtifact(encrypted, hierarchy.KMSKeyID); err != nil {
			t.Fatalf("Decrypt failed: %v", err)
		}
	}
	avgPerOp := time.Since(start) / ops
	if avgPerOp > 10*time.Millisecond {
		t.Errorf("Average decrypt time %v exceeds 10ms requirement", avgPerOp)
	}
	t.Logf("Average decrypt time: %v (requirement: <10ms)", avgPerOp)
}