feat: add new API handlers, build scripts, and ADRs
Some checks failed
Build Pipeline / Build Binaries (push) Failing after 3m39s
Build Pipeline / Build Docker Images (push) Has been skipped
Build Pipeline / Sign HIPAA Config (push) Has been skipped
Build Pipeline / Generate SLSA Provenance (push) Has been skipped
Checkout test / test (push) Successful in 6s
CI Pipeline / Test (ubuntu-latest on self-hosted) (push) Failing after 1s
CI Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI Pipeline / Security Scan (push) Has been skipped
CI Pipeline / Test Scripts (push) Has been skipped
CI Pipeline / Test Native Libraries (push) Has been skipped
CI Pipeline / Native Library Build Matrix (push) Has been skipped
Contract Tests / Spec Drift Detection (push) Failing after 11s
Contract Tests / API Contract Tests (push) Has been skipped
Deploy API Docs / Build API Documentation (push) Failing after 5s
Deploy API Docs / Deploy to GitHub Pages (push) Has been skipped
Documentation / build-and-publish (push) Failing after 40s
Test Matrix / test-native-vs-pure (cgo) (push) Failing after 14s
Test Matrix / test-native-vs-pure (native) (push) Failing after 35s
Test Matrix / test-native-vs-pure (pure) (push) Failing after 18s
CI Pipeline / Trigger Build Workflow (push) Failing after 1s
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Has been cancelled
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Has been cancelled
Build CLI with Embedded SQLite / build-macos (arm64) (push) Has been cancelled
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Has been cancelled
Security Scan / Security Analysis (push) Has been cancelled
Security Scan / Native Library Security (push) Has been cancelled
Verification & Maintenance / V.1 - Schema Drift Detection (push) Has been cancelled
Verification & Maintenance / V.4 - Custom Go Vet Analyzers (push) Has been cancelled
Verification & Maintenance / V.7 - Audit Chain Integrity (push) Has been cancelled
Verification & Maintenance / V.6 - Extended Security Scanning (push) Has been cancelled
Verification & Maintenance / V.10 - OpenSSF Scorecard (push) Has been cancelled
Verification & Maintenance / Verification Summary (push) Has been cancelled

- Introduce audit, plugin, and scheduler API handlers
- Add spec_embed.go for OpenAPI spec embedding
- Create modular build scripts (cli, go, native, cross-platform)
- Add deployment cleanup and health-check utilities
- New ADRs: hot reload, audit store, SSE updates, RBAC, caching, offline mode, KMS regions, tenant offboarding
- Add KMS configuration schema and worker variants
- Include KMS benchmark tests
This commit is contained in:
Jeremie Fraeys 2026-03-04 13:24:27 -05:00
parent 5f53104fcd
commit 7cd86fb88a
No known key found for this signature in database
23 changed files with 2432 additions and 0 deletions

View file

@@ -0,0 +1,50 @@
name: Test Matrix
on:
push:
branches: [main]
pull_request:
env:
GO_VERSION: '1.25.0'
jobs:
test-native-vs-pure:
strategy:
matrix:
build_type: [pure, cgo, native]
runs-on: self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup dependencies
run: |
sudo apt-get update
sudo apt-get install -y build-essential cmake
- name: Build with ${{ matrix.build_type }}
run: |
if [ "${{ matrix.build_type }}" = "native" ]; then
scripts/build/build-native.sh
fi
scripts/build/build-go.sh ${{ matrix.build_type }} linux amd64 cmd/api-server/main.go
- name: Run unit tests
run: |
export FETCHML_NATIVE_LIBS=$([ "${{ matrix.build_type }}" = "native" ] && echo "1" || echo "0")
go test -v ./tests/unit/...
- name: Run integration tests
run: |
export FETCHML_NATIVE_LIBS=$([ "${{ matrix.build_type }}" = "native" ] && echo "1" || echo "0")
go test -v ./tests/integration/...
- name: Run benchmark comparison
if: matrix.build_type == 'native'
run: make benchmark-compare
- name: Upload test results
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.build_type }}
path: test-results/

View file

@@ -0,0 +1,104 @@
# KMS Configuration Schema
# Defines the structure for KMS (Key Management System) configuration
# per ADR-012 through ADR-015.
$schema: http://json-schema.org/draft-07/schema#
type: object
description: KMS configuration for external key management (Vault, AWS KMS, etc.)
properties:
provider:
type: string
enum: [vault, aws, memory]
description: KMS provider type
vault:
type: object
description: HashiCorp Vault configuration
properties:
address:
type: string
format: uri
description: Vault server URL (e.g., https://vault.internal:8200)
auth_method:
type: string
enum: [approle, kubernetes, token]
description: Authentication method
role_id:
type: string
description: AppRole role ID (for approle auth)
secret_id:
type: string
description: AppRole secret ID (for approle auth)
token:
type: string
description: Vault token (for token auth, development only)
transit_mount:
type: string
default: transit
description: Transit engine mount path
key_prefix:
type: string
default: fetchml-tenant
description: Prefix for tenant key names
region:
type: string
description: Region identifier for per-region keys (per ADR-014)
timeout:
type: integer
default: 30
description: HTTP client timeout in seconds
aws:
type: object
description: AWS KMS configuration
properties:
region:
type: string
description: AWS region (e.g., us-east-1)
key_alias_prefix:
type: string
default: alias/fetchml
description: Prefix for KMS key aliases
role_arn:
type: string
description: IAM role ARN to assume (optional)
endpoint:
type: string
format: uri
description: Custom endpoint for testing (e.g., LocalStack)
cache:
type: object
description: DEK cache configuration per ADR-012
properties:
ttl_minutes:
type: integer
default: 15
description: DEK cache TTL in minutes
max_entries:
type: integer
default: 1000
description: Maximum cached DEKs (LRU eviction)
grace_window_minutes:
type: integer
default: 60
description: Extended grace period during KMS unavailability (per ADR-013)
required:
- provider
# Conditional validation
allOf:
- if:
properties:
provider:
const: vault
then:
required: [vault]
- if:
properties:
provider:
const: aws
then:
required: [aws]

View file

@@ -0,0 +1,70 @@
# Staging environment worker configuration
# Pre-production validation with production-like settings
host: localhost
port: 22
user: worker-user
base_path: /var/lib/fetchml
entrypoint: train.py
# Redis configuration
redis_url: redis://redis:6379
# Standard mode for staging (production-like but not strict)
compliance_mode: standard
max_workers: 2
# Sandbox settings (standard isolation)
sandbox:
network_mode: none
seccomp_profile: default
no_new_privileges: true
allowed_secrets:
- HF_TOKEN
- WANDB_API_KEY
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
# GPU configuration
gpu_vendor: nvidia
# Artifact handling (production limits)
max_artifact_files: 1000
max_artifact_total_bytes: 536870912 # 512MB
# Provenance (enabled for audit trail)
provenance_best_effort: true
# MinIO configuration for staging
minio:
endpoint: minio:9000
bucket: fetchml-snapshots-staging
secure: false
# Plugin Configuration
plugins:
# Jupyter Notebook/Lab Service
jupyter:
enabled: true
image: "quay.io/jupyter/base-notebook:latest"
default_port: 8888
mode: "lab"
security:
trusted_channels:
- "conda-forge"
- "defaults"
blocked_packages:
- "requests"
- "urllib3"
require_password: true
max_gpu_per_instance: 1
max_memory_per_instance: "8Gi"
# vLLM Inference Service
vllm:
enabled: true
image: "vllm/vllm-openai:latest"
default_port: 8000
model_cache: "/models"
default_quantization: ""
max_gpu_per_instance: 1
max_model_len: 4096

View file

@@ -0,0 +1,64 @@
# Standard security mode worker configuration
# Normal sandbox, network isolation
host: localhost
port: 22
user: worker-user
base_path: /var/lib/fetchml
entrypoint: train.py
# Redis configuration
redis_url: redis://redis:6379
# Standard mode - normal security
compliance_mode: standard
max_workers: 2
# Sandbox settings (standard isolation)
sandbox:
network_mode: none
seccomp_profile: default
no_new_privileges: true
allowed_secrets:
- HF_TOKEN
- WANDB_API_KEY
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
# GPU configuration
gpu_vendor: none
# Artifact handling (reasonable limits)
max_artifact_files: 1000
max_artifact_total_bytes: 536870912 # 512MB
# Provenance (enabled)
provenance_best_effort: true
# Plugin Configuration
plugins:
# Jupyter Notebook/Lab Service
jupyter:
enabled: true
image: "quay.io/jupyter/base-notebook:latest"
default_port: 8888
mode: "lab"
security:
trusted_channels:
- "conda-forge"
- "defaults"
blocked_packages:
- "requests"
- "urllib3"
require_password: true
max_gpu_per_instance: 1
max_memory_per_instance: "8Gi"
# vLLM Inference Service
vllm:
enabled: true
image: "vllm/vllm-openai:latest"
default_port: 8000
model_cache: "/models"
default_quantization: ""
max_gpu_per_instance: 1
max_model_len: 4096

View file

@@ -0,0 +1,60 @@
# ADR-009: Plugin Configuration Supports Hot-Reload with Per-Plugin Restart Flag
## Status
Accepted
## Context
The plugin system (`internal/tracking/`) manages MLflow, TensorBoard, and Weights & Biases integrations as sidecar processes. The REST API will expose plugin configuration updates via `PUT /v1/plugins/{pluginName}/config`.
When a configuration update is applied, we needed to decide whether the new config takes effect immediately without restarting the plugin process (hot-reload), or whether it requires the worker to restart the plugin sidecar.
Plugins vary significantly in what their configuration touches. Some config fields (e.g., a remote tracking URI) can be applied to a running process safely. Others (e.g., a listening port or a DB connection string) are consumed only at initialization time and cannot be changed without a restart.
## Decision
**Hot-reload is the default behavior.** Plugins that cannot safely reload declare `requiresRestart: true` in their manifest. The plugin host calls `plugin.Reload(newConfig)` on a config update; if the plugin returns an error or declares the restart flag, the host falls back to the previous config and surfaces a `409 Conflict` response (indicating a restart is required) to the caller.
## Consequences
### Positive
- Operators can update tracking URIs, credentials, and feature flags without downtime or job interruption
- Reduces pressure to batch config changes or schedule maintenance windows for minor updates
- The restart flag gives plugin authors a clear, documented escape hatch for initialization-bound resources
- Config rollback on reload failure prevents a bad update from leaving a plugin in a broken state
### Negative
- Plugin authors must implement a correct `Reload()` lifecycle method — leaked goroutines or global state not cleaned up on reload will cause subtle bugs
- The reload contract must be clearly documented and enforced via integration tests; it is easy to accidentally write a plugin that claims to support hot-reload but does not
- Two code paths to maintain in the plugin host (reload vs. restart)
## Options Considered
### Require Restart for All Config Changes
**Pros:**
- Simple and safe — no lifecycle contract required of plugin authors
- No risk of partial state from a failed reload
**Cons:**
- Poor operator experience; a URI change should not require killing running sidecars
- Creates pressure to accumulate config changes and apply them together, which is a riskier pattern
- Kills observability continuity (TensorBoard logs, MLflow runs) unnecessarily for trivial updates
### Hot-Reload All Config Changes Without Escape Hatch
**Pros:**
- Consistent behavior across all plugins
- Simpler API contract
**Cons:**
- Not all plugins can safely reload all fields; forcing hot-reload on init-bound resources will silently fail or corrupt state
- No mechanism for plugin authors to signal that a restart is required
### Hot-Reload with Per-Plugin Restart Flag (Selected)
**Pros:**
- Best operator experience by default
- Plugin authors retain control over what they can safely support
- Clear error signaling when a restart is actually required
**Cons:**
- Slightly more complex host logic and plugin authoring contract
## Rationale
Hot-reload provides the right default experience and covers the majority of real-world config changes (remote URIs, credentials, log levels). The `requiresRestart` flag is a minimal escape hatch that does not compromise the default without requiring plugin authors to do something fundamentally unsafe. The reload contract — teardown, reinit, rollback on error — is well-understood and testable.

View file

@@ -0,0 +1,64 @@
# ADR-008: Audit Queries Served from Separate Append-Only Store
## Status
Accepted
## Context
The audit subsystem (`internal/audit/`) writes tamper-evident, hash-chained log entries to flat files. As part of exposing audit capabilities via the REST API (`/v1/audit/events`), we needed to decide where the query layer reads from:
- The raw audit log files written by the audit subsystem, or
- A separate store populated from those files
Audit queries must support filtering by time range, event type, and user ID, as well as pagination. The audit chain itself must remain tamper-evident and verifiable independently of whatever query mechanism is chosen.
## Decision
Audit queries will be served from a dedicated **append-only store** (a database table). The raw log files remain the authoritative source for hash-chain verification. A lightweight pipeline process tails the log files and inserts new entries into the store, keeping query lag minimal without coupling the query layer to the file layer.
## Consequences
### Positive
- Query performance is fast and predictable at scale — filtering, pagination, and indexed lookups are all native DB operations
- The query layer is decoupled from the file write path, eliminating read/write contention on hot log files
- The store survives log rotation and file archival without data loss for queries
- Compliance export (`ml audit export`) becomes a straightforward SELECT rather than a file parse
### Negative
- Dual-write complexity: the pipeline process must be monitored and must handle backpressure and restarts safely
- A lag window exists between a log entry being written and it being queryable (acceptable for audit use cases; not a real-time feed)
- Additional infrastructure component to operate and back up
- Inconsistency risk if the pipeline falls behind or fails — operators must be alerted on pipeline lag
## Options Considered
### Query Directly from Log Files
**Pros:**
- Single source of truth, zero sync lag
- No additional infrastructure
**Cons:**
- Scanning flat files for filtered queries is slow and does not scale
- Concurrent reads on a hot log file risk contention with the write path
- Pagination is not naturally supported by line-based files
- Log rotation and archival break query continuity
### Write to Store Directly from Audit Subsystem (Dual-Write at Source)
**Pros:**
- Zero lag between write and query availability
- No pipeline process to operate
**Cons:**
- Couples the audit subsystem to a DB dependency, increasing blast radius of DB failures
- The tamper-evident hash chain must remain file-based for verifiability; adding a second write path at the source risks divergence
### Stream via Message Bus (e.g., Kafka/NATS) into Store
**Pros:**
- Decoupled and durable
- Enables future consumers beyond the query store
**Cons:**
- Significant infrastructure overhead for the current scale
- Introduces another system to operate and monitor
- Overkill relative to the query volume expected from an audit endpoint
## Rationale
Log files are an operational safety net and the root of trust for the hash chain, not a query surface. A separate store provides the indexing and filtering capabilities the API requires without compromising the write path or the chain's integrity. The pipeline approach (file tail → DB insert) keeps the audit subsystem itself simple and dependency-free, while giving the query layer everything it needs. The lag introduced by the pipeline is acceptable for audit queries, which are not latency-sensitive.

View file

@@ -0,0 +1,71 @@
# ADR-010: Scheduler Real-Time Updates Use SSE for Status Streams, REST for Control
## Status
Accepted
## Context
The scheduler API (`/v1/scheduler/`) exposes queue depth, worker state, and job transitions. Operators and the CLI (`ml scheduler status`) need to observe changes in near real-time without polling. We needed to choose a transport for this streaming:
- **WebSocket:** Full-duplex, persistent connection, client can send messages mid-stream
- **Server-Sent Events (SSE):** Server-push only, unidirectional, native HTTP
- **Long polling:** Repeated HTTP requests held open until data is available
The use case is observability: showing live queue depth, worker connect/disconnect events, and job state transitions. Control operations (drain a worker, update job priority, cancel a job) are discrete, low-frequency actions.
## Decision
Use **SSE for status and progress streams**. Keep all control operations on **REST endpoints**. Do not use WebSocket.
Two new streaming endpoints will be added:
- `GET /v1/scheduler/status/stream` — emits queue depth changes, worker events, scheduler state transitions
- `GET /v1/scheduler/jobs/{jobId}/stream` — emits state transitions and priority changes for a specific job
## Consequences
### Positive
- SSE is unidirectional server-push, which matches the use case exactly — clients observe, they do not send mid-stream commands
- SSE works over standard HTTP/1.1 and HTTP/2, proxies and load balancers handle it without special configuration
- `EventSource` is a browser-native API; the CLI can consume SSE streams with standard HTTP tooling (`curl -N`)
- No upgrade handshake, no ping/pong, no framing protocol to implement or debug
- Control operations on REST are stateless, independently rate-limitable, and trivially auditable
### Negative
- SSE is unidirectional — if a future requirement emerges for clients to send commands mid-stream (e.g., pause a job from within a live status feed), the transport will need to change
- SSE connections held through HTTP/1.1 proxies with aggressive timeouts may require reconnect logic on the client
- Per-connection server resources are still consumed for each SSE subscriber; high subscriber counts need backpressure consideration
## Options Considered
### WebSocket
**Pros:**
- Full-duplex: client can send commands mid-stream if needed
- Low overhead per message once connection is established
- Well-supported in all clients
**Cons:**
- Upgrade handshake adds complexity; many proxies and load balancers require explicit configuration to support it
- Full-duplex is unnecessary — the scheduler status stream is read-only from the client's perspective
- Mixes control and observation into a single connection, which complicates auditing and access control
- More complex to implement correctly (ping/pong, close handshake, reconnect logic)
### Server-Sent Events (Selected)
**Pros:**
- Matches the server-push, read-only nature of the use case
- Standard HTTP — no proxy configuration required
- Simpler implementation and client consumption
- Native browser support via `EventSource`
**Cons:**
- Unidirectional only
- Slightly higher per-event overhead than WebSocket for high-frequency streams (acceptable at scheduler event rates)
### Long Polling
**Pros:**
- Works everywhere, no streaming support required
**Cons:**
- Higher latency per event
- More server load from repeated connection setup/teardown
- More complex client logic to manage polling intervals and state continuity
## Rationale
The scheduler status stream is fundamentally a server-push feed. SSE is the right tool for server-push over HTTP — it is simpler to implement, simpler to consume, and requires no special infrastructure support. Mixing control operations into the same stream would complicate access control and auditing unnecessarily; keeping control on REST preserves a clean separation between observation and mutation.

View file

@@ -0,0 +1,76 @@
# ADR-011: RBAC Implemented as Permission-Based Roles
## Status
Accepted
## Context
Three new API subsystems (plugins, scheduler, audit) expose operations with meaningfully different sensitivity levels. Read-only status queries are appropriate for regular users; operations like draining a worker, updating job priority, or reading the audit log should be restricted to administrators or operators.
We needed to choose an authorization model that enforces least privilege, is extensible without code changes, and is simple enough to implement correctly in the near term.
## Decision
Implement **permission-based RBAC**. Roles are named bundles of atomic permissions. Ship two built-in roles (`admin` and `user`). The permission set for each role is defined in configuration so operators can create custom roles without code changes. Permissions are enforced at the middleware layer; JWT claims or session tokens carry the resolved permission set so enforcement is stateless.
### Permissions Matrix
| Permission | `admin` | `user` |
|---------------------|:-------:|:------:|
| `plugins:read` | ✓ | ✓ |
| `plugins:write` | ✓ | — |
| `scheduler:read` | ✓ | ✓ |
| `scheduler:write` | ✓ | — |
| `scheduler:drain` | ✓ | — |
| `audit:read` | ✓ | — |
| `audit:verify` | ✓ | — |
| `tasks:read` | ✓ | ✓ |
| `tasks:write` | ✓ | ✓ |
| `tasks:priority` | ✓ | — |
## Consequences
### Positive
- Atomic permissions allow fine-grained custom roles without schema changes (e.g., an `operator` role with `scheduler:write + scheduler:drain` but no `audit:read`)
- Enforcement at the middleware layer is stateless — no per-request DB lookup for permission checks
- Audit log entries can record the specific permission used for each action, not just the role
- Easy to reason about and test — each handler has a declared required permission
### Negative
- More initial setup than flat roles (`admin` / `user` strings)
- Token size increases slightly as the resolved permission set is embedded in claims
- Permission sprawl risk over time if new permissions are added without pruning obsolete ones
- No row- or resource-level restrictions (e.g., "user can only see their own jobs") — this requires ABAC if needed in future
## Options Considered
### Flat Roles (admin / user string check)
**Pros:**
- Trivially simple to implement and reason about
- No configuration required
**Cons:**
- Breaks down immediately when partial delegation is needed (e.g., on-call engineer who can drain workers but cannot read audit logs)
- Adding a third role requires code changes
- Cannot express nuanced operational roles without becoming an ad hoc permission system anyway
### Permission-Based RBAC (Selected)
**Pros:**
- Roles are composable without code changes
- Clean enforcement boundary at middleware
- Extensible to new operations by adding a permission constant
**Cons:**
- Slightly more upfront design work
- Permission set in token must be kept in sync when roles change (requires token reissue or short TTLs)
### Attribute-Based Access Control (ABAC)
**Pros:**
- Maximum flexibility — can express policies like "user can only modify jobs they submitted"
- Handles resource-level restrictions natively
**Cons:**
- Significant implementation complexity; requires a policy engine (e.g., OPA, Casbin)
- Harder to reason about and audit
- Overkill for the current access control requirements
## Rationale
Permission-based RBAC sits at the right point on the complexity curve for this system's current needs. Flat roles are too rigid for real operational workflows (on-call access, read-only auditors, etc.). ABAC would be the right choice if resource-level restrictions were required, but they are not today. The configuration-driven role definition means the system can evolve toward more granular roles over time without touching application code, and the middleware enforcement pattern keeps permission checks consistent and auditable across all three new subsystems.

View file

@@ -0,0 +1,73 @@
# ADR-012: DEK Caching in Memory with Bounded TTL
## Status
Accepted
## Context
The encryption subsystem wraps Data Encryption Keys (DEKs) using a KMS (e.g., AWS KMS). Every encrypt or decrypt operation requires the DEK to be unwrapped by the KMS before use. Without caching, this means a KMS API call on every operation — introducing network latency, increasing KMS costs, and creating a hard dependency on KMS availability for every read and write.
We needed to decide whether unwrapped DEKs should be held in memory for reuse, and if so, under what constraints to prevent the cache from becoming a security liability.
## Decision
Unwrapped DEKs are cached **in-process memory only**, subject to the following constraints:
- **TTL:** 15 minutes per entry. After expiry the DEK is evicted and the next operation fetches a fresh unwrapped key from KMS.
- **Max size:** 1,000 entries (LRU eviction). Prevents unbounded memory growth across large tenant populations.
- **Scope:** In-process only. DEKs are never serialized to disk, written to a shared cache (e.g., Redis), or logged.
- **Explicit invalidation:** The cache exposes a `Flush(tenantID)` method called on key rotation events and tenant offboarding.
## Consequences
### Positive
- Eliminates a KMS round-trip on every decrypt for hot keys, reducing p99 latency significantly on read-heavy workloads
- Reduces KMS API call volume and associated cost
- Provides a buffer against transient KMS unavailability for in-flight operations (see ADR-013)
- TTL and size bounds keep the memory footprint predictable
### Negative
- A process memory dump during the TTL window exposes plaintext DEKs — this is the core security tradeoff and must be documented in the threat model
- Cache invalidation on key rotation requires a reliable signal path; a missed flush leaves stale DEKs in use until TTL expiry
- LRU eviction means infrequently accessed tenant keys are silently dropped and re-fetched, which can cause latency spikes for cold tenants
## Options Considered
### No Caching (KMS Call on Every Operation)
**Pros:**
- No plaintext key material held in memory beyond the immediate operation
- Simplest implementation
**Cons:**
- KMS latency (typically 15ms) on every encrypt/decrypt; unacceptable on high-throughput paths
- KMS becomes a hard availability dependency for every operation — any blip causes immediate failures
- KMS API costs scale linearly with operation volume
### Cache with No TTL (Evict Only on Explicit Flush)
**Pros:**
- Maximum cache hit rate
- Lowest KMS call volume
**Cons:**
- Plaintext DEKs held in memory indefinitely unless explicitly flushed — a missed rotation event leaves old key material live forever
- Unacceptable security posture for a multi-tenant encryption system
### Cache with Bounded TTL and Size (Selected)
**Pros:**
- Balances performance and security exposure window
- TTL provides automatic backstop against missed invalidation signals
- Size bound prevents memory abuse
**Cons:**
- TTL window represents a residual exposure period that must be accepted and documented
### Shared External Cache (e.g., Redis)
**Pros:**
- Survives process restart
- Shared across multiple instances
**Cons:**
- Serializing plaintext DEKs to an external store dramatically widens the attack surface
- Adds a network hop, partially negating the latency benefit over going directly to KMS
- External cache compromise exposes all cached DEKs across all tenants simultaneously
## Rationale
In-memory DEK caching with a bounded TTL is a well-established pattern used by AWS Encryption SDK, HashiCorp Vault, and Google Tink. The residual risk — memory dump during the TTL window — is the accepted tradeoff for practical performance. It is mitigated by keeping the cache strictly in-process, enforcing a short TTL, and providing an explicit flush path for key lifecycle events. Refusing to cache at all produces a system that is impractical to operate at scale and paradoxically more fragile, since KMS availability becomes a prerequisite for every single operation.

View file

@@ -0,0 +1,68 @@
# ADR-013: KMS Offline Mode Fails Closed with a Narrow Grace Window
## Status
Accepted
## Context
The encryption subsystem depends on KMS availability to unwrap DEKs. KMS services experience occasional transient unavailability (network partitions, regional incidents, rate limiting). We needed a policy for what happens when a KMS call fails:
- **Fail-closed:** Deny the operation. No decryption without KMS reachability or a valid cached DEK.
- **Fail-open:** Allow the operation to proceed using cached material or a fallback mechanism, even beyond normal cache TTL.
This decision directly affects the security guarantee the system provides: if an attacker can cause KMS unavailability, does that unlock data or lock it?
## Decision
The system **fails closed by default**. A narrow grace window is permitted under the following conditions only:
- A valid DEK is already present in the in-process cache (see ADR-012)
- KMS has been confirmed unreachable (not returning errors indicating invalid credentials or revoked access — those are not treated as unavailability)
- The extended grace TTL has not expired (default: 1 hour beyond normal cache TTL)
Outside of the grace window, or when no cached DEK exists, operations that require KMS return a `503 KMS Unavailable` error. No new decryption is permitted without a reachable KMS.
## Consequences
### Positive
- Availability-based attacks (deliberately taking down KMS to force fail-open) do not succeed in exposing data
- The security guarantee — data is inaccessible without KMS — holds through infrastructure incidents
- Transient blips (seconds to low minutes) are absorbed by the existing DEK cache without user impact
- Clear operational signal: `503 KMS Unavailable` is unambiguous and alertable
### Negative
- Extended KMS outages (beyond cache TTL + grace window) cause user-visible failures — this is intentional but must be communicated in SLA documentation
- The grace window logic adds implementation complexity; the distinction between "KMS unreachable" and "KMS refusing due to revoked access" must be handled correctly or the grace window becomes a bypass
- Teams accustomed to fail-open systems will push back on this posture during incidents
## Options Considered
### Fail-Open (Allow Operations Through KMS Unavailability)
**Pros:**
- Maximum availability during infrastructure incidents
- Users are not impacted by KMS outages
**Cons:**
- If KMS unavailability is attacker-induced, fail-open directly enables the attack — data becomes accessible precisely when the key management system is compromised
- "Temporary" fail-open states have a history of becoming permanent through operational pressure
- Undermines the core security model: the KMS is the control plane for data access; bypassing it on failure means it never truly controlled access
### Fail-Closed with No Grace Window
**Pros:**
- Strictest security posture
- Simplest implementation — no grace window logic
**Cons:**
- Any transient KMS blip (even a 200ms network hiccup) causes a user-visible error
- Operations in-flight at the moment of a KMS timeout fail immediately, even though a valid cached DEK exists in memory
### Fail-Closed with Grace Window on Cached DEKs (Selected)
**Pros:**
- Transient unavailability absorbed silently for tenants with warm caches
- Security guarantee maintained — no new decryption without KMS reachability
- Grace window is bounded and expires
**Cons:**
- Additional logic to distinguish unavailability from access denial
- Grace window duration is a judgment call that must be documented and reviewed
## Rationale
The correct question for a fail-open/fail-closed decision is: *what is the worst-case outcome of each posture when the unavailability is not accidental?* Fail-open means an attacker who can disrupt KMS also gets access to encrypted data — the security control collapses exactly when it is most needed. Fail-closed means an attacker who disrupts KMS causes a denial of service, not a data breach. A denial of service is recoverable; a data breach is not. The grace window is a pragmatic concession to operational reality that does not compromise this core principle, because it only applies to key material that was already authorized and cached.

View file

@ -0,0 +1,65 @@
# ADR-014: KMS Keys Are Per-Region with Explicit Cross-Region Replication for DR
## Status
Accepted
## Context
The encryption subsystem manages one KMS key per tenant. As the system scales to multi-region deployments, we needed to decide whether tenant keys should be:
- **Per-region:** Each region holds its own KMS key for each tenant. Data encrypted in a region is decrypted by that region's key.
- **Multi-region:** A single logical key is replicated across regions (e.g., AWS KMS multi-region keys), allowing the same key material to decrypt data regardless of which region the request is served from.
This decision affects blast radius on key compromise, data residency compliance, disaster recovery posture, and operational complexity.
## Decision
Tenant KMS keys are **per-region**. Cross-region key replication is supported as an **explicit opt-in** for tenants with documented cross-region read requirements (e.g., active-active deployments, cross-region DR reads). Replication uses the KMS provider's native multi-region key mechanism (e.g., AWS KMS multi-region keys) rather than manual key export/import.
Keys are not replicated preemptively. Replication is provisioned per-tenant on request, with the tenant's data residency requirements confirmed before a replica is created in any new region.
## Consequences
### Positive
- Blast radius of a key compromise is contained to a single region by default — a compromised key in `us-east-1` does not affect `eu-west-1`
- Per-region keys naturally satisfy data residency requirements (GDPR, data sovereignty) without additional policy configuration
- Key policies and audit trails are simpler to reason about per-region
- Default posture does not require cross-region KMS API calls, keeping latency and cost predictable
### Negative
- Cross-region DR requires explicit replication setup per tenant — recovery from a full regional failure requires the replica to be pre-provisioned, not created on demand during an incident
- Tenants with active-active multi-region deployments require replication, adding operational overhead
- Key inventory management becomes more complex as tenant count and region count grow
## Options Considered
### Multi-Region Keys for All Tenants by Default
**Pros:**
- Simplifies DR — key material available in all regions without per-tenant provisioning
- Enables cross-region reads without tenant-specific configuration
**Cons:**
- Key compromise in any region exposes all regions simultaneously
- Multi-region key policies are harder to reason about and audit
- Data residency compliance becomes harder — key material is by definition present in multiple jurisdictions
- Creates a broader attack surface than the majority of tenants actually require
### Per-Region Keys, No Cross-Region Support
**Pros:**
- Maximum blast radius containment
- Simplest key inventory
**Cons:**
- Blocks legitimate cross-region DR scenarios entirely
- Forces tenants with active-active requirements to work around the system
### Per-Region Keys with Opt-In Cross-Region Replication (Selected)
**Pros:**
- Secure by default, flexible on demand
- Blast radius contained for the majority of tenants who don't need cross-region reads
- Native KMS multi-region key mechanism handles key material sync without manual export
**Cons:**
- Replication must be provisioned ahead of a regional failure, not during it
- Slightly more complex provisioning workflow for multi-region tenants
## Rationale
Per-region keys are the secure default because they limit the consequences of a key policy misconfiguration or credential compromise to a single region. The instinct to use multi-region keys everywhere is driven by DR convenience, but it solves that problem by widening the attack surface for every tenant, including those who will never need cross-region reads. The opt-in replication model preserves the security default while accommodating legitimate operational requirements. Data residency compliance — increasingly a hard requirement for enterprise tenants — also strongly favors per-region as the default posture.

View file

@ -0,0 +1,72 @@
# ADR-015: Tenant Offboarding Disables KMS Key Immediately, Schedules Hard Deletion
## Status
Accepted
## Context
When a tenant is offboarded, the KMS key used to wrap their DEKs must be handled. The options range from immediate hard deletion (maximum data destruction assurance) to indefinite retention (maximum recoverability). We needed a policy that satisfies security requirements, compliance obligations, and operational safety.
Key considerations:
- KMS key deletion is **irreversible** — there is no recovery path once a key is deleted
- Encrypted backups, audit logs, and compliance exports may need to be decrypted after the tenant relationship ends
- Regulatory frameworks commonly require the ability to produce records for a defined retention period post-offboarding
- Mistaken offboardings (billing errors, wrong tenant ID) do occur
## Decision
On tenant offboarding:
1. **Immediately disable the KMS key.** This blocks all new encrypt and decrypt operations instantly. The tenant's data is inaccessible from this point.
2. **Schedule hard deletion** after a retention window of **90 days** (configurable per tenant tier and jurisdiction). The window begins at the moment the key is disabled.
3. During the retention window, key re-enablement requires an explicit approval workflow (not a self-serve action) to prevent casual reversal.
4. Hard deletion is executed automatically at the end of the retention window unless a hold has been placed (e.g., for legal hold or active dispute).
The 90-day default satisfies the minimum pending deletion window enforced by AWS KMS (7 days) with significant margin, and aligns with common contractual data retention obligations.
## Consequences
### Positive
- Immediate disable gives the tenant (and the platform) a strong assurance that data access is revoked the moment offboarding completes
- Retention window provides recovery from mistaken offboardings without permanently weakening the deletion guarantee
- Scheduled hard deletion is auditable and automatable — no manual step required to complete the key lifecycle
- Compliance teams can request decryption of specific records during the retention window for legal hold, audits, or disputes
### Negative
- Data is not immediately and permanently destroyed on offboarding — tenants who require cryptographic erasure on termination must be given a shorter configurable window
- Retention window must be actively managed — holds and overrides need a governance process to prevent accumulation of keys that should have been deleted
- Re-enablement approval workflow adds operational overhead for the rare legitimate reversal case
## Options Considered
### Immediate Hard Deletion
**Pros:**
- Strongest data destruction guarantee — key is gone, data is cryptographically unrecoverable immediately
- Simplest post-offboarding state
**Cons:**
- Irreversible — a mistaken offboarding permanently destroys the tenant's data
- Blocks legitimate post-offboarding decryption for compliance, legal hold, or support purposes
- AWS KMS enforces a minimum 7-day pending deletion window regardless, so true immediate deletion is not possible on that platform
### Revoke Access Only, Retain Key Indefinitely
**Pros:**
- Maximum recoverability
- No risk of accidental permanent loss
**Cons:**
- KMS key inventory grows without bound
- Retaining key material indefinitely after a tenant relationship ends is difficult to justify under data minimization principles (GDPR Article 5)
- Creates ongoing cost and key management overhead for tenants who have no active relationship with the platform
### Disable Immediately, Schedule Deletion with Retention Window (Selected)
**Pros:**
- Immediate revocation of access satisfies security and tenant expectations
- Retention window handles mistaken offboarding, legal hold, and compliance decryption needs
- Hard deletion is guaranteed at end of window unless a hold is active
- Aligns with KMS provider behavior (AWS pending deletion model)
**Cons:**
- Retention window requires governance process for holds and overrides
- Configurable window adds complexity to the offboarding workflow
## Rationale
Immediate hard deletion conflates two separate concerns: revoking access (which should happen immediately) and destroying key material (which benefits from a deliberate delay). Disabling the key achieves the former instantly. The retention window serves the latter without creating unnecessary operational risk. The irreversibility of KMS key deletion demands a margin of safety — the cost of a 90-day delay before permanent destruction is negligible compared to the cost of discovering a mistaken deletion after the fact. Indefinite retention is the opposite failure mode and should be explicitly rejected by automating the deletion at the end of the window.

View file

@ -0,0 +1,201 @@
// Package audit provides HTTP handlers for audit log management
package audit
import (
"encoding/json"
"net/http"
"strconv"
"time"
"github.com/jfraeys/fetch_ml/internal/auth"
"github.com/jfraeys/fetch_ml/internal/logging"
)
// Handler provides audit-related HTTP API handlers.
type Handler struct {
	logger *logging.Logger
	store  AuditStore // Optional: separate store for querying
}

// AuditStore interface for querying audit events.
// QueryEvents returns one page of matching events plus the total match
// count for pagination. Presumably zero-value times and empty strings
// mean "no filter" — confirm against the concrete store implementation.
type AuditStore interface {
	QueryEvents(from, to time.Time, eventType, userID string, limit, offset int) ([]AuditEvent, int, error)
}

// AuditEvent represents an audit event for API responses.
// PrevHash, EventHash and SequenceNum expose the store's hash-chain
// fields when present, so clients can perform tamper-evidence checks.
type AuditEvent struct {
	Timestamp   time.Time       `json:"timestamp"`
	EventType   string          `json:"event_type"`
	UserID      string          `json:"user_id,omitempty"`
	Resource    string          `json:"resource,omitempty"`
	Action      string          `json:"action,omitempty"`
	Success     bool            `json:"success"`
	IPAddress   string          `json:"ip_address,omitempty"`
	Error       string          `json:"error,omitempty"`
	PrevHash    string          `json:"prev_hash,omitempty"`
	EventHash   string          `json:"event_hash,omitempty"`
	SequenceNum int             `json:"sequence_num,omitempty"`
	Metadata    json.RawMessage `json:"metadata,omitempty"` // deferred-decode payload, passed through verbatim
}

// AuditEventList represents a list of audit events.
// Total is the full match count; Limit and Offset echo the pagination
// parameters that produced this page.
type AuditEventList struct {
	Events []AuditEvent `json:"events"`
	Total  int          `json:"total"`
	Limit  int          `json:"limit"`
	Offset int          `json:"offset"`
}

// VerificationResult represents the result of audit chain verification.
// FirstTampered is presumably only meaningful when Valid is false —
// confirm against the verifier implementation.
type VerificationResult struct {
	Valid         bool      `json:"valid"`
	TotalEvents   int       `json:"total_events"`
	FirstTampered int       `json:"first_tampered,omitempty"`
	ChainRootHash string    `json:"chain_root_hash,omitempty"`
	VerifiedAt    time.Time `json:"verified_at"`
}

// ChainRootResponse represents the chain root hash response.
type ChainRootResponse struct {
	RootHash    string    `json:"root_hash"`
	Timestamp   time.Time `json:"timestamp"`
	TotalEvents int       `json:"total_events"`
}
// NewHandler constructs an audit API handler backed by the given logger
// and an optional (possibly nil) query store.
func NewHandler(logger *logging.Logger, store AuditStore) *Handler {
	h := &Handler{logger: logger, store: store}
	return h
}
// GetV1AuditEvents handles GET /v1/audit/events.
//
// Query parameters:
//   - from, to: RFC3339 timestamps bounding the search window (optional)
//   - event_type, user_id: exact-match filters (optional)
//   - limit (default 100, clamped to [1, 1000]) and offset (default 0,
//     clamped to >= 0): pagination
//
// Requires the "audit:read" permission. Responds with an AuditEventList;
// when no store is configured the list is empty.
//
// Fix: malformed timestamps previously had their parse errors discarded,
// silently substituting the zero time and widening the query window —
// they now return 400. Negative limit/offset values are also clamped.
func (h *Handler) GetV1AuditEvents(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "audit:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	// Parse filter parameters.
	eventType := r.URL.Query().Get("event_type")
	userID := r.URL.Query().Get("user_id")
	limit := parseIntQueryParam(r, "limit", 100)
	offset := parseIntQueryParam(r, "offset", 0)
	// Clamp pagination to sane bounds.
	if limit < 1 {
		limit = 1
	}
	if limit > 1000 {
		limit = 1000
	}
	if offset < 0 {
		offset = 0
	}
	// Parse the time window, rejecting malformed values outright.
	var from, to time.Time
	var err error
	if fromStr := r.URL.Query().Get("from"); fromStr != "" {
		if from, err = time.Parse(time.RFC3339, fromStr); err != nil {
			http.Error(w, `{"error":"Invalid 'from' timestamp, expected RFC3339","code":"BAD_REQUEST"}`, http.StatusBadRequest)
			return
		}
	}
	if toStr := r.URL.Query().Get("to"); toStr != "" {
		if to, err = time.Parse(time.RFC3339, toStr); err != nil {
			http.Error(w, `{"error":"Invalid 'to' timestamp, expected RFC3339","code":"BAD_REQUEST"}`, http.StatusBadRequest)
			return
		}
	}
	response := AuditEventList{
		Events: []AuditEvent{},
		Limit:  limit,
		Offset: offset,
	}
	// Query the store when one is configured; otherwise serve an empty page.
	if h.store != nil {
		events, total, err := h.store.QueryEvents(from, to, eventType, userID, limit, offset)
		if err != nil {
			http.Error(w, `{"error":"Failed to query events","code":"INTERNAL_ERROR"}`, http.StatusInternalServerError)
			return
		}
		response.Events = events
		response.Total = total
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(response)
}
// PostV1AuditVerify handles POST /v1/audit/verify.
// Requires the "audit:verify" permission. Verification is currently a
// placeholder: it always reports a valid chain with zero events.
func (h *Handler) PostV1AuditVerify(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "audit:verify") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	h.logger.Info("verifying audit chain", "user", user.Name)
	// Placeholder: a real implementation would walk the hash chain and
	// fill in TotalEvents / FirstTampered from the verification pass.
	result := VerificationResult{
		Valid:      true,
		VerifiedAt: time.Now().UTC(),
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(result)
}
// GetV1AuditChainRoot handles GET /v1/audit/chain-root.
// Requires the "audit:read" permission. The root hash is currently a
// placeholder (all-zero SHA-256) with a zero event count.
func (h *Handler) GetV1AuditChainRoot(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "audit:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// Placeholder payload until chain-root computation is wired up.
	json.NewEncoder(w).Encode(ChainRootResponse{
		RootHash:  "sha256:0000000000000000000000000000000000000000000000000000000000000000",
		Timestamp: time.Now().UTC(),
	})
}
// checkPermission reports whether user is non-nil and either an admin
// (admins implicitly hold every permission) or explicitly granted the
// named permission.
func (h *Handler) checkPermission(user *auth.User, permission string) bool {
	switch {
	case user == nil:
		return false
	case user.Admin:
		// Admins bypass per-permission checks.
		return true
	default:
		return user.HasPermission(permission)
	}
}
// parseIntQueryParam parses an integer query parameter
func parseIntQueryParam(r *http.Request, name string, defaultVal int) int {
str := r.URL.Query().Get(name)
if str == "" {
return defaultVal
}
val, err := strconv.Atoi(str)
if err != nil {
return defaultVal
}
return val
}

View file

@ -0,0 +1,289 @@
// Package plugins provides HTTP handlers for plugin management
package plugins
import (
"encoding/json"
"net/http"
"time"
"github.com/jfraeys/fetch_ml/internal/auth"
"github.com/jfraeys/fetch_ml/internal/logging"
"github.com/jfraeys/fetch_ml/internal/tracking"
)
// Handler provides plugin-related HTTP handlers.
type Handler struct {
	logger   *logging.Logger
	registry *tracking.Registry
	// config maps plugin name to its configuration.
	// NOTE(review): read and written by concurrent HTTP handlers with no
	// synchronization visible here — confirm callers serialize access or
	// add a mutex.
	config map[string]PluginConfig // Plugin configurations
}

// PluginConfig represents the configuration for a plugin.
type PluginConfig struct {
	Enabled      bool                   `json:"enabled"`
	Mode         string                 `json:"mode"` // sidecar, remote, disabled
	Image        string                 `json:"image,omitempty"`
	Settings     map[string]interface{} `json:"settings,omitempty"`
	LogBasePath  string                 `json:"log_base_path,omitempty"`
	ArtifactPath string                 `json:"artifact_path,omitempty"`
}

// PluginInfo represents plugin information returned by the API.
type PluginInfo struct {
	Name            string       `json:"name"`
	Enabled         bool         `json:"enabled"`
	Mode            string       `json:"mode"`
	Status          string       `json:"status"` // healthy, unhealthy, starting, stopped
	Config          PluginConfig `json:"config"`
	RequiresRestart bool         `json:"requires_restart"`
	Version         string       `json:"version,omitempty"`
}

// PluginStatus represents the status of a plugin instance.
// Not referenced by the handlers visible in this file; presumably
// consumed elsewhere — confirm before removing.
type PluginStatus struct {
	Name      string    `json:"name"`
	Status    string    `json:"status"`
	URL       string    `json:"url,omitempty"`
	LastCheck time.Time `json:"last_check,omitempty"`
}
// NewHandler constructs a plugins handler from its dependencies.
// registry may be nil (plugin lookups are then skipped); config maps
// plugin name to configuration.
func NewHandler(
	logger *logging.Logger,
	registry *tracking.Registry,
	config map[string]PluginConfig,
) *Handler {
	h := &Handler{logger: logger, registry: registry, config: config}
	return h
}
// GetV1Plugins handles GET /v1/plugins.
// Requires the "plugins:read" permission. Returns one PluginInfo per
// configured plugin; when no registry is wired in, the list is empty.
// Note: map iteration order is random, so response ordering is not
// stable across calls.
//
// Fix: the loop previously ranged over keys and re-read h.config[name],
// doing a second map lookup per entry; it now binds the value directly.
func (h *Handler) GetV1Plugins(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	plugins := []PluginInfo{}
	// Plugins are only reported when the registry is available.
	if h.registry != nil {
		for name, cfg := range h.config {
			info := PluginInfo{
				Name:            name,
				Enabled:         cfg.Enabled,
				Mode:            cfg.Mode,
				Status:          "unknown",
				Config:          cfg,
				RequiresRestart: false,   // Default: plugins support hot-reload
				Version:         "1.0.0", // Placeholder
			}
			// Registered, enabled plugins are reported healthy by default;
			// a real health probe would replace this.
			if plugin, ok := h.registry.Get(name); ok {
				if cfg.Enabled && cfg.Mode != "disabled" {
					info.Status = "healthy"
					_ = plugin // placeholder until an actual health check uses it
				}
			}
			plugins = append(plugins, info)
		}
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(plugins)
}
// GetV1PluginsPluginName handles GET /v1/plugins/{pluginName}.
// Requires "plugins:read". Responds 404 when the plugin is not
// configured. Status is a placeholder: enabled plugins are reported
// healthy without a live probe.
func (h *Handler) GetV1PluginsPluginName(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	cfg, found := h.config[name]
	if !found {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	status := "unknown"
	if cfg.Enabled && cfg.Mode != "disabled" {
		status = "healthy"
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(PluginInfo{
		Name:            name,
		Enabled:         cfg.Enabled,
		Mode:            cfg.Mode,
		Status:          status,
		Config:          cfg,
		RequiresRestart: false,
		Version:         "1.0.0",
	})
}
// GetV1PluginsPluginNameConfig handles GET /v1/plugins/{pluginName}/config.
// Requires "plugins:read"; responds 404 when the plugin is not configured.
func (h *Handler) GetV1PluginsPluginNameConfig(w http.ResponseWriter, r *http.Request) {
	if !h.checkPermission(auth.GetUserFromContext(r.Context()), "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	if cfg, found := h.config[name]; found {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(cfg)
		return
	}
	http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
}
// PutV1PluginsPluginNameConfig handles PUT /v1/plugins/{pluginName}/config.
// Requires "plugins:write". Replaces the stored configuration for an
// existing plugin and echoes the resulting PluginInfo (Status is a
// placeholder until real health checks exist).
//
// Fix: a PUT for an unknown plugin previously created a brand-new config
// entry silently; it now returns 404, consistent with the GET and DELETE
// config handlers.
// NOTE(review): h.config is mutated here with no synchronization visible
// in this file — confirm concurrent access is serialized or add a mutex.
func (h *Handler) PutV1PluginsPluginNameConfig(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:write") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	pluginName := r.PathValue("pluginName")
	if pluginName == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	// Only known plugins may be reconfigured.
	if _, ok := h.config[pluginName]; !ok {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	var newConfig PluginConfig
	if err := json.NewDecoder(r.Body).Decode(&newConfig); err != nil {
		http.Error(w, `{"error":"Invalid request body","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	h.config[pluginName] = newConfig
	h.logger.Info("updated plugin config", "plugin", pluginName, "user", user.Name)
	info := PluginInfo{
		Name:            pluginName,
		Enabled:         newConfig.Enabled,
		Mode:            newConfig.Mode,
		Status:          "healthy",
		Config:          newConfig,
		RequiresRestart: false,
		Version:         "1.0.0",
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(info)
}
// DeleteV1PluginsPluginNameConfig handles DELETE /v1/plugins/{pluginName}/config.
// Requires "plugins:write". The plugin is soft-deleted — disabled but kept
// in the config map — and 204 is returned; unknown plugins yield 404.
func (h *Handler) DeleteV1PluginsPluginNameConfig(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:write") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	cfg, found := h.config[name]
	if !found {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	// Flip Enabled off but preserve the rest of the configuration.
	cfg.Enabled = false
	h.config[name] = cfg
	h.logger.Info("disabled plugin", "plugin", name, "user", user.Name)
	w.WriteHeader(http.StatusNoContent)
}
// GetV1PluginsPluginNameHealth handles GET /v1/plugins/{pluginName}/health.
// Requires "plugins:read". Disabled plugins report "stopped"; otherwise
// a placeholder "healthy" is returned (no live probe yet).
func (h *Handler) GetV1PluginsPluginNameHealth(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "plugins:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	name := r.PathValue("pluginName")
	if name == "" {
		http.Error(w, `{"error":"Missing plugin name","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	cfg, found := h.config[name]
	if !found {
		http.Error(w, `{"error":"Plugin not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	// Equivalent to the original condition with De Morgan applied:
	// healthy iff enabled and not in "disabled" mode.
	status := "stopped"
	if cfg.Enabled && cfg.Mode != "disabled" {
		status = "healthy"
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"status":    status,
		"version":   "1.0.0",
		"timestamp": time.Now().UTC(),
	})
}
// checkPermission reports whether user holds the named permission.
// A nil user never passes; any user with the "admin" role passes every
// check; otherwise the user's permission map is consulted directly.
//
// Fix: the permission map was previously scanned with a full range loop;
// a direct map lookup is the single-step equivalent (missing keys yield
// false, matching the old loop's behavior).
func (h *Handler) checkPermission(user *auth.User, permission string) bool {
	if user == nil {
		return false
	}
	// Admin role grants all permissions.
	for _, role := range user.Roles {
		if role == "admin" {
			return true
		}
	}
	return user.Permissions[permission]
}

View file

@ -0,0 +1,480 @@
// Package scheduler provides HTTP handlers for scheduler management
package scheduler
import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"time"
"github.com/jfraeys/fetch_ml/internal/auth"
"github.com/jfraeys/fetch_ml/internal/logging"
sch "github.com/jfraeys/fetch_ml/internal/scheduler"
)
// APIHandler provides scheduler-related HTTP API handlers.
type APIHandler struct {
	logger *logging.Logger
	// hub is the scheduler core; handlers respond 503 when it is nil.
	hub *sch.SchedulerHub
	// streaming maps SSE client IDs to their event channels.
	// NOTE(review): accessed from concurrent handlers with no lock visible
	// in this file — confirm synchronization before relying on multiple
	// simultaneous SSE clients.
	streaming map[string][]chan StreamingEvent
}
// NewHandler creates a new scheduler API handler.
//
// authConfig is currently unused: it is accepted only to keep the
// constructor signature stable for existing callers, while permission
// checks read the user from the request context instead. Apart from that
// parameter this duplicates NewAPIHandler — consider consolidating.
func NewHandler(hub *sch.SchedulerHub, logger *logging.Logger, authConfig *auth.Config) *APIHandler {
	return &APIHandler{
		logger:    logger,
		hub:       hub,
		streaming: make(map[string][]chan StreamingEvent),
	}
}
// StreamingEvent represents an event for SSE streaming.
type StreamingEvent struct {
	Type    string          `json:"type"`
	Payload json.RawMessage `json:"payload"` // deferred-decode event body
}

// WorkerInfo represents worker information for API responses.
// The handlers in this file currently populate only ID, Slots, Status
// ("active"), and an empty ActiveTasks list; the remaining fields are
// placeholders for a fuller implementation.
type WorkerInfo struct {
	ID            string                 `json:"id"`
	ConnectedAt   time.Time              `json:"connected_at"`
	LastHeartbeat time.Time              `json:"last_heartbeat,omitempty"`
	Capabilities  sch.WorkerCapabilities `json:"capabilities"`
	Slots         sch.SlotStatus         `json:"slots"`
	ActiveTasks   []string               `json:"active_tasks"`
	Status        string                 `json:"status"` // active, draining, offline
}

// SchedulerStatus represents scheduler status for API responses.
// Fields the hub's metrics payload does not provide are left at zero.
type SchedulerStatus struct {
	WorkersTotal       int       `json:"workers_total"`
	WorkersActive      int       `json:"workers_active"`
	WorkersDraining    int       `json:"workers_draining"`
	BatchQueueDepth    int       `json:"batch_queue_depth"`
	ServiceQueueDepth  int       `json:"service_queue_depth"`
	TasksRunning       int       `json:"tasks_running"`
	TasksCompleted24h  int       `json:"tasks_completed_24h"`
	ReservationsActive int       `json:"reservations_active"`
	Timestamp          time.Time `json:"timestamp"`
}

// ReservationInfo represents reservation information for API responses.
type ReservationInfo struct {
	ID        string    `json:"id"`
	UserID    string    `json:"user_id"`
	GPUCount  int       `json:"gpu_count"`
	GPUType   string    `json:"gpu_type,omitempty"`
	NodeCount int       `json:"node_count"`
	ExpiresAt time.Time `json:"expires_at"`
	Status    string    `json:"status"` // active, claimed, expired
}

// CreateReservationRequest represents a request to create a reservation.
// GPUCount must be positive; NodeCount and ExpiresMinutes default to
// 1 and 30 respectively when omitted or non-positive.
type CreateReservationRequest struct {
	GPUCount       int    `json:"gpu_count"`
	GPUType        string `json:"gpu_type,omitempty"`
	NodeCount      int    `json:"node_count,omitempty"`
	ExpiresMinutes int    `json:"expires_minutes,omitempty"`
}
// NewAPIHandler creates a new scheduler API handler with an empty SSE
// client registry.
func NewAPIHandler(logger *logging.Logger, hub *sch.SchedulerHub) *APIHandler {
	return &APIHandler{
		logger:    logger,
		hub:       hub,
		streaming: make(map[string][]chan StreamingEvent),
	}
}
// GetV1SchedulerStatus handles GET /v1/scheduler/status.
// Requires "scheduler:read"; responds 503 when the scheduler hub is not
// wired in. Counters absent from the hub's metrics payload stay zero.
func (h *APIHandler) GetV1SchedulerStatus(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	metrics := h.hub.GetMetricsPayload()
	// Pull each known counter out of the loosely-typed metrics map.
	readInt := func(key string) (int, bool) {
		v, ok := metrics[key].(int)
		return v, ok
	}
	status := SchedulerStatus{Timestamp: time.Now().UTC()}
	if v, ok := readInt("workers_connected"); ok {
		status.WorkersTotal = v
		// Simplified - all connected workers are treated as active.
		status.WorkersActive = v
	}
	if v, ok := readInt("queue_depth_batch"); ok {
		status.BatchQueueDepth = v
	}
	if v, ok := readInt("queue_depth_service"); ok {
		status.ServiceQueueDepth = v
	}
	if v, ok := readInt("jobs_completed"); ok {
		status.TasksCompleted24h = v
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(status)
}
// GetV1SchedulerWorkers handles GET /v1/scheduler/workers.
// Requires "scheduler:read"; responds 503 without a hub. The worker list
// is derived from the hub's "worker_slots" metric; capability and task
// details are not populated yet.
func (h *APIHandler) GetV1SchedulerWorkers(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	slots, _ := h.hub.GetMetricsPayload()["worker_slots"].(map[string]sch.SlotStatus)
	workers := make([]WorkerInfo, 0, len(slots))
	for id, slot := range slots {
		workers = append(workers, WorkerInfo{
			ID:           id,
			Slots:        slot,
			Capabilities: sch.WorkerCapabilities{},
			Status:       "active",
			ActiveTasks:  []string{},
		})
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(workers)
}
// GetV1SchedulerWorkersWorkerID handles GET /v1/scheduler/workers/{workerId}.
// Requires "scheduler:read"; responds 503 without a hub and 404 when the
// worker has no slot entry in the hub metrics.
func (h *APIHandler) GetV1SchedulerWorkersWorkerID(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	id := r.PathValue("workerId")
	if id == "" {
		http.Error(w, `{"error":"Missing worker ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	slots, _ := h.hub.GetMetricsPayload()["worker_slots"].(map[string]sch.SlotStatus)
	slot, found := slots[id]
	if !found {
		http.Error(w, `{"error":"Worker not found","code":"NOT_FOUND"}`, http.StatusNotFound)
		return
	}
	// Capabilities and active-task details are not populated yet.
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(WorkerInfo{
		ID:           id,
		Slots:        slot,
		Capabilities: sch.WorkerCapabilities{},
		Status:       "active",
		ActiveTasks:  []string{},
	})
}
// DeleteV1SchedulerWorkersWorkerID handles DELETE /v1/scheduler/workers/{workerId}.
// Requires "scheduler:drain"; responds 503 without a hub. The drain
// request is currently only logged — the worker is not actually signaled.
func (h *APIHandler) DeleteV1SchedulerWorkersWorkerID(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:drain") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	id := r.PathValue("workerId")
	if id == "" {
		http.Error(w, `{"error":"Missing worker ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	h.logger.Info("draining worker", "worker_id", id, "user", user.Name)
	// Simplified: a full implementation would signal the worker and wait
	// for its in-flight tasks to complete before acknowledging.
	w.WriteHeader(http.StatusNoContent)
}
// GetV1SchedulerReservations handles GET /v1/scheduler/reservations.
// Requires "scheduler:read"; responds 503 without a hub. Reservations are
// not yet tracked in the hub, so the list is always empty for now.
func (h *APIHandler) GetV1SchedulerReservations(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode([]ReservationInfo{})
}
// PostV1SchedulerReservations handles POST /v1/scheduler/reservations.
// Requires "scheduler:write"; responds 503 without a hub.
//
// Request body: CreateReservationRequest. GPUCount must be positive
// (422 otherwise); NodeCount defaults to 1 and ExpiresMinutes to 30 when
// omitted or non-positive. Responds 201 with the created reservation.
//
// Fix: ExpiresAt is now computed in UTC, consistent with every other
// timestamp this API emits.
// NOTE(review): the reservation is not registered with the hub and the
// UnixNano-derived ID could collide under concurrent requests — confirm
// before this leaves placeholder status.
func (h *APIHandler) PostV1SchedulerReservations(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:write") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	var req CreateReservationRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, `{"error":"Invalid request body","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	if req.GPUCount <= 0 {
		http.Error(w, `{"error":"GPU count must be positive","code":"VALIDATION_ERROR"}`, http.StatusUnprocessableEntity)
		return
	}
	// Apply defaults for optional fields.
	if req.NodeCount <= 0 {
		req.NodeCount = 1
	}
	if req.ExpiresMinutes <= 0 {
		req.ExpiresMinutes = 30
	}
	reservation := ReservationInfo{
		ID:        fmt.Sprintf("res-%d", time.Now().UnixNano()),
		UserID:    user.Name,
		GPUCount:  req.GPUCount,
		GPUType:   req.GPUType,
		NodeCount: req.NodeCount,
		ExpiresAt: time.Now().UTC().Add(time.Duration(req.ExpiresMinutes) * time.Minute),
		Status:    "active",
	}
	h.logger.Info("created reservation", "reservation_id", reservation.ID, "user", user.Name)
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusCreated)
	json.NewEncoder(w).Encode(reservation)
}
// PatchV1SchedulerJobsJobIDPriority handles PATCH /v1/scheduler/jobs/{jobId}/priority.
// Requires "tasks:priority"; responds 503 without a hub. Priority must be
// in [1, 10]. The queue itself is not modified yet (simplified version);
// the handler logs and echoes the requested change.
func (h *APIHandler) PatchV1SchedulerJobsJobIDPriority(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "tasks:priority") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	if h.hub == nil {
		http.Error(w, `{"error":"Scheduler not available","code":"SERVICE_UNAVAILABLE"}`, http.StatusServiceUnavailable)
		return
	}
	jobID := r.PathValue("jobId")
	if jobID == "" {
		http.Error(w, `{"error":"Missing job ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	var body struct {
		Priority int `json:"priority"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		http.Error(w, `{"error":"Invalid request body","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}
	if body.Priority < 1 || body.Priority > 10 {
		http.Error(w, `{"error":"Priority must be between 1 and 10","code":"VALIDATION_ERROR"}`, http.StatusUnprocessableEntity)
		return
	}
	h.logger.Info("updating job priority", "job_id", jobID, "priority", body.Priority, "user", user.Name)
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"id":       jobID,
		"priority": body.Priority,
		"status":   "queued",
	})
}
// GetV1SchedulerStatusStream handles GET /v1/scheduler/status/stream (SSE).
// It registers an event channel for the client, sends an initial "connected"
// event, then multiplexes heartbeats (every 5s) and pushed events until the
// client disconnects.
func (h *APIHandler) GetV1SchedulerStatusStream(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}

	// BUG FIX: the previous version asserted w.(http.Flusher) unchecked on
	// every write, which panics when the ResponseWriter is wrapped by a
	// middleware that does not implement http.Flusher. Fail fast instead.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, `{"error":"Streaming not supported","code":"NOT_IMPLEMENTED"}`, http.StatusNotImplemented)
		return
	}

	// Set SSE headers
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// Create event channel for this client
	eventChan := make(chan StreamingEvent, 10)
	clientID := fmt.Sprintf("%s-%d", user.Name, time.Now().UnixNano())
	// NOTE(review): h.streaming is mutated here and in the deferred cleanup
	// without visible synchronization; concurrent SSE clients would race on
	// the map. Confirm whether APIHandler guards it elsewhere, else add a mutex.
	h.streaming[clientID] = append(h.streaming[clientID], eventChan)

	// Clean up on disconnect
	defer func() {
		delete(h.streaming, clientID)
		// NOTE(review): closing here panics any publisher still sending on
		// eventChan; conventionally only the sender closes — confirm publishers.
		close(eventChan)
	}()

	// sendEvent serializes v as one SSE "data:" frame and flushes it.
	// Unserializable values are dropped rather than corrupting the stream.
	sendEvent := func(v interface{}) {
		data, err := json.Marshal(v)
		if err != nil {
			return
		}
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
	}

	// Send initial status
	sendEvent(map[string]interface{}{
		"type":      "connected",
		"timestamp": time.Now().UTC(),
	})

	// Keep connection alive and send periodic updates
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-r.Context().Done():
			return
		case <-ticker.C:
			sendEvent(map[string]interface{}{
				"type":      "heartbeat",
				"timestamp": time.Now().UTC(),
			})
		case event := <-eventChan:
			sendEvent(event)
		}
	}
}
// GetV1SchedulerJobsJobIDStream handles GET /v1/scheduler/jobs/{jobId}/stream (SSE).
// It sends an initial "connected" event for the job, then heartbeats every 5s
// until the client disconnects.
func (h *APIHandler) GetV1SchedulerJobsJobIDStream(w http.ResponseWriter, r *http.Request) {
	user := auth.GetUserFromContext(r.Context())
	if !h.checkPermission(user, "scheduler:read") {
		http.Error(w, `{"error":"Insufficient permissions","code":"FORBIDDEN"}`, http.StatusForbidden)
		return
	}
	jobID := r.PathValue("jobId")
	if jobID == "" {
		http.Error(w, `{"error":"Missing job ID","code":"BAD_REQUEST"}`, http.StatusBadRequest)
		return
	}

	// BUG FIX: the previous version asserted w.(http.Flusher) unchecked on
	// every write, which panics when the ResponseWriter is wrapped by a
	// middleware that does not implement http.Flusher. Fail fast instead.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, `{"error":"Streaming not supported","code":"NOT_IMPLEMENTED"}`, http.StatusNotImplemented)
		return
	}

	// Set SSE headers
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// sendEvent serializes v as one SSE "data:" frame and flushes it.
	sendEvent := func(v interface{}) {
		data, err := json.Marshal(v)
		if err != nil {
			return // drop unserializable events rather than corrupt the stream
		}
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
	}

	// Send initial status
	sendEvent(map[string]interface{}{
		"type":      "connected",
		"job_id":    jobID,
		"timestamp": time.Now().UTC(),
	})

	// Keep connection alive
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-r.Context().Done():
			return
		case <-ticker.C:
			sendEvent(map[string]interface{}{
				"type":      "heartbeat",
				"timestamp": time.Now().UTC(),
			})
		}
	}
}
// checkPermission reports whether user may perform the action guarded by
// permission. A nil user is always denied; admins are always allowed.
func (h *APIHandler) checkPermission(user *auth.User, permission string) bool {
	switch {
	case user == nil:
		return false
	case user.Admin:
		// Admins implicitly hold every permission.
		return true
	default:
		return user.HasPermission(permission)
	}
}
// parseIntQueryParam parses an integer query parameter
func parseIntQueryParam(r *http.Request, name string, defaultVal int) int {
str := r.URL.Query().Get(name)
if str == "" {
return defaultVal
}
val, err := strconv.Atoi(str)
if err != nil {
return defaultVal
}
return val
}

View file

@ -0,0 +1,30 @@
package api
import (
"net/http"
"os"
"path/filepath"
"runtime"
)
// openAPISpecPath returns the path to the OpenAPI spec file
//
// NOTE(review): runtime.Caller(0) reports this source file's path as recorded
// at COMPILE time. The returned path is therefore only valid when the binary
// runs on a machine with the original source checkout (dev/test). In a
// deployed binary the file will not exist and callers will get a read error.
// Consider embedding the spec with go:embed — TODO confirm deployment model.
func openAPISpecPath() string {
	// Get the directory of this source file
	_, filename, _, _ := runtime.Caller(0)
	dir := filepath.Dir(filename)
	// Navigate to repo root and then to api/
	return filepath.Join(dir, "..", "..", "api", "openapi.yaml")
}
// ServeOpenAPISpec serves the OpenAPI specification as YAML.
// It responds 500 if the spec file cannot be read from disk.
func ServeOpenAPISpec(w http.ResponseWriter, _ *http.Request) {
	data, err := os.ReadFile(openAPISpecPath())
	if err != nil {
		http.Error(w, "Failed to read OpenAPI spec", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/yaml")
	w.WriteHeader(http.StatusOK)
	// Write errors are ignored: the status line has already been sent.
	_, _ = w.Write(data)
}

18
scripts/build/build-cli.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# Builds CLI binaries using Zig
# Usage: build-cli.sh
set -euo pipefail

echo "Building CLI with Zig..."

# Build the release-optimized CLI inside a subshell so the caller's working
# directory is untouched even if the build fails.
(
    cd cli
    zig build --release=fast
)

# Copy to dist
mkdir -p dist
cp cli/zig-out/bin/ml dist/ml_linux_x86_64

echo "✓ CLI built: dist/ml_linux_x86_64"

44
scripts/build/build-go.sh Executable file
View file

@ -0,0 +1,44 @@
#!/bin/bash
# Builds Go binaries with configurable CGO/native options
# Usage: build-go.sh <build_type> <os> <arch> <source_path>
#   build_type: pure (CGO off) | cgo (CGO on) | native (CGO on + native_libs tag)
#   os/arch:    Go GOOS/GOARCH values (default linux/amd64)
set -euo pipefail

BUILD_TYPE=${1:-native}
OS=${2:-linux}
ARCH=${3:-amd64}
SOURCE_PATH=${4:-cmd/api-server/main.go}

# Strip symbols and stamp build metadata into the binary.
LDFLAGS="-s -w -X main.BuildHash=$(git rev-parse --short HEAD) -X main.BuildTime=$(date -u +%Y%m%d.%H%M%S)"

case $BUILD_TYPE in
    pure)
        export CGO_ENABLED=0
        TAGS=""
        SUFFIX="_${OS}_${ARCH}_pure"
        ;;
    cgo)
        export CGO_ENABLED=1
        TAGS=""
        SUFFIX="_${OS}_${ARCH}_cgo"
        ;;
    native)
        export CGO_ENABLED=1
        TAGS="native_libs"
        SUFFIX="_${OS}_${ARCH}_native"
        ;;
    *)
        echo "Unknown build type: $BUILD_TYPE"
        echo "Usage: $0 <pure|cgo|native> <os> <arch> <source_path>"
        exit 1
        ;;
esac

# BUG FIX: previously GOOS/GOARCH were never set, so the <os>/<arch> arguments
# only affected the output file name and every binary targeted the host.
export GOOS="$OS"
export GOARCH="$ARCH"

BINARY_NAME=$(basename "$SOURCE_PATH" .go)
# BUG FIX: SUFFIX already contains "_<os>_<arch>_<type>"; the old OUTPUT also
# inserted "_${OS}_${ARCH}", producing names like ..._linux_amd64_linux_amd64_pure.
OUTPUT="dist/fetch_ml_${BINARY_NAME}${SUFFIX}"
mkdir -p dist

echo "Building ${BINARY_NAME} (${BUILD_TYPE}) for ${OS}/${ARCH}..."
go build -tags="$TAGS" -ldflags="$LDFLAGS" -o "$OUTPUT" "$SOURCE_PATH"
echo "✓ Built: $OUTPUT"

18
scripts/build/build-native.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# Builds native C++ libraries for Linux x86_64
# Run on Ubuntu self-hosted runner
set -euo pipefail

BUILD_DIR="native/build/linux_amd64"
OUT_DIR="bin/native/linux_amd64"

mkdir -p "$BUILD_DIR"

# Configure and build with CMake (Release optimizations, parallel jobs).
cmake -S native -B "$BUILD_DIR" \
    -DCMAKE_BUILD_TYPE=Release
cmake --build "$BUILD_DIR" --parallel

# Package libs; tolerate a build that produced no shared objects.
mkdir -p "$OUT_DIR"
cp "$BUILD_DIR"/lib*.so "$OUT_DIR"/ 2>/dev/null || true

echo "✓ Native libraries built for Linux x86_64"

30
scripts/build/cross-platform.sh Executable file
View file

@ -0,0 +1,30 @@
#!/bin/bash
# Orchestrates full build for Linux x86_64 only
set -euo pipefail

echo "=== Building for Linux x86_64 ==="

# Build native C++ libraries
scripts/build/build-native.sh

# Build Go backends for all build types
for build_type in pure cgo native; do
    echo "=== Building ${build_type} binaries ==="
    for binary in api-server worker data_manager user_manager tui; do
        # Map each logical binary name to its entry-point source file.
        case $binary in
            api-server)   source_path="cmd/api-server/main.go" ;;
            worker)       source_path="cmd/worker/worker_server.go" ;;
            data_manager) source_path="cmd/data_manager/main.go" ;;
            user_manager) source_path="cmd/user_manager/main.go" ;;
            tui)          source_path="cmd/tui/main.go" ;;
        esac
        scripts/build/build-go.sh "$build_type" linux amd64 "$source_path"
    done
done

# Build CLI binaries
scripts/build/build-cli.sh

echo "=== Build complete ==="
ls -lh dist/

133
scripts/deploy/cleanup.sh Executable file
View file

@ -0,0 +1,133 @@
#!/bin/bash
# Cleanup script for fetch_ml deployments
# Removes orphaned containers, volumes, and networks
set -e

# Absolute path to the repository root, resolved from this script's location.
REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

print_success() { echo -e "${GREEN}[OK]${NC} $1"; }
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }
print_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
print_info() { echo "[INFO] $1"; }

# show_usage: print CLI help for this script.
show_usage() {
    echo "Usage: $0 [ENVIRONMENT|all]"
    echo ""
    echo "Environments:"
    echo " dev Clean up development environment"
    echo " staging Clean up staging environment"
    echo " prod Clean up production environment"
    echo " homelab Clean up homelab environment"
    echo " all Clean up all environments (default)"
    echo ""
    echo "Examples:"
    echo " $0 dev # Clean dev only"
    echo " $0 all # Clean everything"
}

# cleanup_environment ENV
# Tears down the compose stack for one environment (containers, networks,
# volumes), then force-removes any leftover containers matching the project
# name. Unknown environments abort with usage; a missing compose file is a
# warning, not an error.
cleanup_environment() {
    local env=$1
    local compose_file
    case $env in
        dev)
            compose_file="${REPO_ROOT}/deployments/docker-compose.dev.yml"
            ;;
        staging)
            compose_file="${REPO_ROOT}/deployments/docker-compose.staging.yml"
            ;;
        prod)
            compose_file="${REPO_ROOT}/deployments/docker-compose.prod.yml"
            ;;
        homelab)
            compose_file="${REPO_ROOT}/deployments/docker-compose.homelab-secure.yml"
            ;;
        *)
            print_error "Unknown environment: $env"
            show_usage
            exit 1
            ;;
    esac
    if [ ! -f "$compose_file" ]; then
        print_warn "Compose file not found: $compose_file"
        return 0
    fi
    print_info "Cleaning up $env environment..."
    # Stop and remove containers, networks, volumes
    # NOTE(review): uses the legacy docker-compose v1 binary — confirm it is
    # installed on target hosts, or switch to "docker compose".
    docker-compose -f "$compose_file" down -v --remove-orphans 2>/dev/null || true
    # Remove any leftover containers with project name
    local project_name
    project_name=$(basename "$compose_file" .yml | sed 's/docker-compose\.//')
    local containers
    containers=$(docker ps -aq --filter "name=ml-${project_name}" --filter "name=ml-experiments" 2>/dev/null || true)
    if [ -n "$containers" ]; then
        print_info "Removing leftover containers..."
        # $containers is intentionally unquoted: it holds whitespace-separated IDs.
        docker rm -f $containers 2>/dev/null || true
    fi
    print_success "${env} environment cleaned"
}

# cleanup_all: tear down every known environment, then sweep any remaining
# "ml-"-prefixed containers and networks. Non-FetchML Docker data is preserved.
cleanup_all() {
    print_info "Cleaning up all FetchML environments..."
    # Clean each environment
    for env in dev staging prod homelab; do
        cleanup_environment "$env" || true
    done
    # Remove any remaining fetchml containers by name pattern
    local all_containers
    all_containers=$(docker ps -aq --filter "name=^ml-" 2>/dev/null || true)
    if [ -n "$all_containers" ]; then
        print_info "Removing remaining FetchML containers..."
        # Intentionally unquoted: whitespace-separated container IDs.
        docker rm -f $all_containers 2>/dev/null || true
    fi
    # Only prune FetchML-specific networks (not all unused)
    local networks
    networks=$(docker network ls -q --filter "name=^ml-" 2>/dev/null || true)
    if [ -n "$networks" ]; then
        print_info "Removing FetchML networks..."
        # Intentionally unquoted: whitespace-separated network IDs.
        docker network rm $networks 2>/dev/null || true
    fi
    print_success "All FetchML environments cleaned (other Docker data preserved)"
}

# main [ENVIRONMENT|all]: dispatch to the requested cleanup mode (default: all).
main() {
    local env=${1:-all}
    case $env in
        dev|staging|prod|homelab)
            cleanup_environment "$env"
            ;;
        all)
            cleanup_all
            ;;
        -h|--help)
            show_usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $env"
            show_usage
            exit 1
            ;;
    esac
}

main "$@"

73
scripts/deploy/health-check.sh Executable file
View file

@ -0,0 +1,73 @@
#!/bin/bash
# Health check utilities for fetch_ml deployments
set -e

# Absolute path to the repository root, resolved from this script's location.
REPO_ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

print_success() { echo -e "${GREEN}[OK]${NC} $1"; }
print_error() { echo -e "${RED}[FAIL]${NC} $1"; }
print_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }

# check_environment ENV
# Probes http://localhost:<port>/health for the environment and reports
# compliance mode and native-library status from the JSON body.
# Returns non-zero when the API does not respond.
check_environment() {
    local env=$1
    local port
    # NOTE(review): dev and prod both resolve to 9101 — confirm prod's real port.
    case $env in
        dev) port=9101 ;;
        staging) port=9102 ;;
        prod) port=9101 ;;
        *) port=9101 ;;
    esac
    echo "=== ${env} Environment (port ${port}) ==="
    # BUG FIX: fetch the health payload once. Previously the endpoint was hit
    # twice (a reachability probe, then a second request for the body), which
    # doubled load and could report a state inconsistent with the probe.
    local health
    if ! health=$(curl -fsS "http://localhost:${port}/health" 2>/dev/null); then
        print_error "API health check failed"
        return 1
    fi
    print_success "API is responding"
    # Check compliance mode (grep/cut keep this jq-free).
    local compliance
    compliance=$(echo "$health" | grep -o '"compliance_mode":"[^"]*"' | cut -d'"' -f4 || echo "unknown")
    echo " Compliance mode: $compliance"
    # Check native libs status
    if echo "$health" | grep -q '"native_libs":true'; then
        print_success "Native libraries enabled"
    else
        print_warn "Native libraries not enabled"
    fi
    echo ""
}

# Main: check one environment, or all of them (failures in "all" mode are
# reported but do not stop the remaining checks).
main() {
    local env=${1:-all}
    echo "=== FetchML Health Check ==="
    echo ""
    if [ "$env" = "all" ]; then
        check_environment dev || true
        check_environment staging || true
        check_environment prod || true
    else
        check_environment "$env"
    fi
}

main "$@"

View file

@ -0,0 +1,279 @@
package benchmarks
import (
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/crypto"
"github.com/jfraeys/fetch_ml/internal/crypto/kms"
)
// BenchmarkEncryptArtifact measures the full encryption pipeline performance.
// Per ADR-012: Total overhead should be <10ms for MemoryProvider.
func BenchmarkEncryptArtifact(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)

	// Provision a tenant whose key hierarchy the loop encrypts under.
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	// 1KB deterministic payload (typical model weights chunk).
	payload := make([]byte, 1024)
	for i := range payload {
		payload[i] = byte(i) // byte() truncates mod 256
	}

	b.ResetTimer()
	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.EncryptArtifact("bench-tenant", "artifact-1", keys.KMSKeyID, payload); err != nil {
			b.Fatalf("Encrypt failed: %v", err)
		}
	}
}
// BenchmarkDecryptArtifact measures the full decryption pipeline performance.
// Per ADR-012: Total overhead should be <10ms for MemoryProvider.
func BenchmarkDecryptArtifact(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	// 1KB deterministic payload.
	payload := make([]byte, 1024)
	for i := range payload {
		payload[i] = byte(i)
	}

	// Encrypt once up front; the timed loop exercises decryption only.
	ciphertext, err := mgr.EncryptArtifact("bench-tenant", "artifact-1", keys.KMSKeyID, payload)
	if err != nil {
		b.Fatalf("Pre-encryption failed: %v", err)
	}

	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.DecryptArtifact(ciphertext, keys.KMSKeyID); err != nil {
			b.Fatalf("Decrypt failed: %v", err)
		}
	}
}
// BenchmarkMemoryProvider_Encrypt measures baseline encryption without network overhead.
// This establishes the theoretical minimum for KMS operations.
func BenchmarkMemoryProvider_Encrypt(b *testing.B) {
	provider := kms.NewMemoryProvider()
	defer provider.Close()

	dekCache := kms.NewDEKCache(kms.DefaultCacheConfig())
	defer dekCache.Clear()

	// Wire the key manager directly to the in-memory provider.
	mgr := crypto.NewTenantKeyManager(provider, dekCache, kms.Config{
		Provider: kms.ProviderTypeMemory,
		Cache:    kms.DefaultCacheConfig(),
	}, nil)

	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)
	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.EncryptArtifact("bench-tenant", "artifact-1", keys.KMSKeyID, payload); err != nil {
			b.Fatalf("Encrypt failed: %v", err)
		}
	}
}
// BenchmarkCacheHit verifies cached DEKs provide <10ms overhead.
func BenchmarkCacheHit(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)

	// Encrypt once, then decrypt once so the DEK cache is warm before timing.
	ciphertext, err := mgr.EncryptArtifact("bench-tenant", "cached-artifact", keys.KMSKeyID, payload)
	if err != nil {
		b.Fatalf("Pre-encryption failed: %v", err)
	}
	if _, err = mgr.DecryptArtifact(ciphertext, keys.KMSKeyID); err != nil {
		b.Fatalf("First decrypt failed: %v", err)
	}

	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.DecryptArtifact(ciphertext, keys.KMSKeyID); err != nil {
			b.Fatalf("Decrypt failed: %v", err)
		}
	}
}
// BenchmarkKeyRotation measures key rotation overhead.
func BenchmarkKeyRotation(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	b.ReportAllocs()
	for b.Loop() {
		// Each iteration rotates from the hierarchy produced by the last.
		rotated, err := mgr.RotateTenantKey("bench-tenant", keys)
		if err != nil {
			b.Fatalf("Rotation failed: %v", err)
		}
		keys = rotated
	}
}
// BenchmarkEncryptArtifact_LargePayload measures encryption with larger payloads.
func BenchmarkEncryptArtifact_LargePayload(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	// 1MB zeroed payload.
	payload := make([]byte, 1<<20)

	b.ReportAllocs()
	for b.Loop() {
		if _, err := mgr.EncryptArtifact("bench-tenant", "large-artifact", keys.KMSKeyID, payload); err != nil {
			b.Fatalf("Encrypt failed: %v", err)
		}
	}
}
// BenchmarkParallelEncrypt measures concurrent encryption performance.
func BenchmarkParallelEncrypt(b *testing.B) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("bench-tenant")
	if err != nil {
		b.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)

	b.ResetTimer()
	b.ReportAllocs()
	b.RunParallel(func(pb *testing.PB) {
		// Each worker cycles through ten artifact IDs ("...-0" .. "...-9").
		n := 0
		for pb.Next() {
			id := "parallel-artifact-" + string(rune('0'+n%10))
			if _, err := mgr.EncryptArtifact("bench-tenant", id, keys.KMSKeyID, payload); err != nil {
				b.Fatalf("Encrypt failed: %v", err)
			}
			n++
		}
	})
}
// TestEncryptPerformance_10msRequirement is a quick sanity check, not a
// benchmark: it verifies that typical encrypt operations complete within
// the <10ms requirement.
func TestEncryptPerformance_10msRequirement(t *testing.T) {
	mgr := crypto.NewTestTenantKeyManager(nil)
	keys, err := mgr.ProvisionTenant("perf-test-tenant")
	if err != nil {
		t.Fatalf("Failed to provision tenant: %v", err)
	}

	payload := make([]byte, 1024)

	// Warm up caches and code paths before timing.
	for range 10 {
		_, _ = mgr.EncryptArtifact("perf-test-tenant", "warmup", keys.KMSKeyID, payload)
	}

	// Average over a fixed number of operations.
	const ops = 100
	start := time.Now()
	for range ops {
		if _, err := mgr.EncryptArtifact("perf-test-tenant", "perf-test", keys.KMSKeyID, payload); err != nil {
			t.Fatalf("Encrypt failed: %v", err)
		}
	}
	avgPerOp := time.Since(start) / ops
	if avgPerOp > 10*time.Millisecond {
		t.Errorf("Average encrypt time %v exceeds 10ms requirement", avgPerOp)
	}
	t.Logf("Average encrypt time: %v (requirement: <10ms)", avgPerOp)
}
// TestDecryptPerformance_10msRequirement verifies decrypt completes within 10ms
// on average, with a warm DEK cache.
func TestDecryptPerformance_10msRequirement(t *testing.T) {
	tkm := crypto.NewTestTenantKeyManager(nil)
	hierarchy, err := tkm.ProvisionTenant("perf-test-tenant")
	if err != nil {
		t.Fatalf("Failed to provision tenant: %v", err)
	}
	plaintext := make([]byte, 1024)

	// Pre-encrypt so the loop below only measures decryption.
	encrypted, err := tkm.EncryptArtifact("perf-test-tenant", "perf-test", hierarchy.KMSKeyID, plaintext)
	if err != nil {
		t.Fatalf("Pre-encryption failed: %v", err)
	}

	// Warm up cache
	for range 10 {
		_, _ = tkm.DecryptArtifact(encrypted, hierarchy.KMSKeyID)
	}

	// Measure with the cache warm.
	// BUG FIX: the loop previously ran only 10 iterations while the elapsed
	// time was divided by 100, understating average latency by 10x and
	// masking regressions against the <10ms requirement.
	const ops = 100
	start := time.Now()
	for range ops {
		if _, err := tkm.DecryptArtifact(encrypted, hierarchy.KMSKeyID); err != nil {
			t.Fatalf("Decrypt failed: %v", err)
		}
	}
	avgPerOp := time.Since(start) / ops
	if avgPerOp > 10*time.Millisecond {
		t.Errorf("Average decrypt time %v exceeds 10ms requirement", avgPerOp)
	}
	t.Logf("Average decrypt time: %v (requirement: <10ms)", avgPerOp)
}