chore(build): update build system, scripts, and additional tests

- Update Makefile with native build targets (preparing for C++)
- Add profiler and performance regression detector commands
- Update CI/testing scripts
- Add additional unit tests for API, jupyter, queue, manifest
This commit is contained in:
Jeremie Fraeys 2026-02-12 12:05:55 -05:00
parent 2209ae24c6
commit 1dcc1e11d5
No known key found for this signature in database
23 changed files with 1377 additions and 146 deletions

View file

@ -1,5 +1,9 @@
.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install configlint worker-configlint ci-local docs docs-setup docs-build benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status size load-test chaos-test profile-load profile-load-norate profile-ws-queue profile-tools detect-regressions tech-excellence docker-build dev-smoke self-cleanup test-full test-auth deploy-up deploy-down deploy-status deploy-clean dev-up dev-down dev-status dev-logs prod-up prod-down prod-status prod-logs
.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install configlint worker-configlint ci-local docs docs-setup docs-check-port docs-stop docs-build docs-build-prod benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status size load-test chaos-test profile-load profile-load-norate profile-ws-queue profile-tools detect-regressions tech-excellence docker-build dev-smoke self-cleanup test-full test-auth deploy-up deploy-down deploy-status deploy-clean dev-up dev-down dev-status dev-logs prod-up prod-down prod-status prod-logs
OK =
DOCS_PORT ?= 1313
DOCS_BIND ?= 127.0.0.1
DOCS_BASEURL ?= /
DOCS_PROD_BASEURL ?= $(DOCS_BASEURL)
# Default target
all: build
@ -7,16 +11,20 @@ all: build
build:
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go
go build -o bin/data_manager ./cmd/data_manager
go build -o bin/user_manager ./cmd/user_manager
go build -o bin/tui ./cmd/tui
cd cli && zig build --release=small
$(MAKE) -C cli all
@echo "${OK} All components built"
# Build production-optimized binaries
prod:
go build -ldflags="-s -w" -o bin/api-server cmd/api-server/main.go
go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go
go build -ldflags="-s -w" -o bin/data_manager ./cmd/data_manager
go build -ldflags="-s -w" -o bin/user_manager ./cmd/user_manager
go build -ldflags="-s -w" -o bin/tui ./cmd/tui
cd cli && zig build --release=small
$(MAKE) -C cli prod
@echo "${OK} Production binaries built"
cross-platform:
@ -32,9 +40,9 @@ cross-platform:
echo "Building $$goos/$$goarch..."; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_api-server_$${goos}_$${goarch}$${ext} cmd/api-server/main.go; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_worker_$${goos}_$${goarch}$${ext} cmd/worker/worker_server.go; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_tui_$${goos}_$${goarch}$${ext} ./cmd/tui; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_data_manager_$${goos}_$${goarch}$${ext} ./cmd/data_manager; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_user_manager_$${goos}_$${goarch}$${ext} ./cmd/user_manager; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_tui_$${goos}_$${goarch}$${ext} ./cmd/tui; \
done
@echo "${OK} Cross-platform binaries built in dist/"
@ -42,8 +50,10 @@ cross-platform:
dev:
go build -buildvcs=false -o bin/api-server cmd/api-server/main.go
go build -buildvcs=false -o bin/worker cmd/worker/worker_server.go
go build -buildvcs=false -o bin/data_manager ./cmd/data_manager
go build -buildvcs=false -o bin/user_manager ./cmd/user_manager
go build -buildvcs=false -o bin/tui ./cmd/tui
cd cli && zig build --release=fast
$(MAKE) -C cli dev
@echo "${OK} Development binaries built"
# Clean build artifacts (Go + Zig + test outputs)
@ -103,11 +113,8 @@ prod-smoke:
@echo "prod smoke: OK"
# Run a local approximation of the CI pipeline
ci-local:
make test
make lint
make configlint
make worker-configlint
ci-local: test lint configlint worker-configlint
@mkdir -p coverage
@echo "Running queue package tests with race detector..."
go test -v -race -coverprofile=coverage/queue-coverage.out ./internal/queue/...
@echo "Running coverage..."
@ -131,37 +138,35 @@ test-e2e:
go test -v ./tests/e2e/...
test-coverage:
@mkdir -p coverage
go test -coverprofile=coverage/coverage.out ./...
go tool cover -html=coverage/coverage.out -o coverage/coverage.html
@echo "${OK} Coverage report: coverage/coverage.html"
# Documentation setup
docs-setup:
@echo "Setting up MkDocs documentation..."
@if ! command -v mkdocs >/dev/null 2>&1; then \
echo "MkDocs not found. Please install it manually:"; \
echo " macOS: brew install mkdocs mkdocs-material"; \
echo " Linux: pip install mkdocs mkdocs-material"; \
echo "Or visit: https://www.mkdocs.org/user-guide/installation/"; \
exit 1; \
fi
@if ! python3 -c "import pymdownx.superfences" >/dev/null 2>&1; then \
echo "pymdown-extensions not found. Please install it manually:"; \
echo " pip3 install --user pymdown-extensions"; \
echo " Or use a virtual environment: python3 -m venv docs-env && source docs-env/bin/activate && pip install pymdown-extensions"; \
@echo "Setting up Hugo documentation..."
@if ! command -v hugo >/dev/null 2>&1; then \
echo "Hugo not found. Please install it manually:"; \
echo " macOS: brew install hugo"; \
echo " Linux: See https://gohugo.io/installation/"; \
exit 1; \
fi
@echo "Documentation setup complete!"
# Documentation
docs: docs-setup docs-build
@echo "Starting MkDocs development server..."
cd docs && mkdocs serve
docs: docs-setup docs-check-port
@echo "Starting Hugo development server..."
cd docs && hugo server --bind "$(DOCS_BIND)" --port "$(DOCS_PORT)" --baseURL "$(DOCS_BASEURL)"
# Build documentation
docs-build:
@echo "Building static documentation..."
cd docs && mkdocs build
cd docs && hugo
docs-build-prod:
@echo "Building production docs..."
cd docs && hugo --minify --baseURL "$(DOCS_PROD_BASEURL)"
# Performance benchmarking tools
benchmark:
@ -202,7 +207,6 @@ size:
@echo "Binary sizes:"
@ls -lh bin/* cli/zig-out/bin/ml 2>/dev/null || true
# Load testing
load-test:
@echo "Running load tests..."
@ -233,22 +237,27 @@ chaos-test:
# Performance profiling tools
profile-tools:
@echo "Building profiling tools..."
go build -o bin/performance-regression-detector ./tools/performance_regression_detector.go
go build -o bin/profiler ./tools/profiler.go
go build -o bin/performance-regression-detector ./cmd/performance-regression-detector
go build -o bin/profiler ./cmd/profiler
# Performance regression detection threshold (percentage)
REGRESSION_THRESHOLD ?= 10.0
# Performance regression detection
detect-regressions:
@echo "Detecting performance regressions..."
@if [ ! -f "tests/bin/baseline.json" ]; then \
@mkdir -p tests/bin
@if [ ! -f "tests/bin/baseline.bench.txt" ]; then \
echo "Creating baseline performance metrics..."; \
go test -bench=. -benchmem ./tests/benchmarks/... | tee tests/bin/baseline.json; \
go test -bench=. -benchmem ./tests/benchmarks/... | tee tests/bin/baseline.bench.txt; \
fi
@echo "Analyzing current performance against baseline..."
go test -bench=. -benchmem ./tests/benchmarks/... | tee tests/bin/current.json; \
echo "Use tools/performance_regression_detector to analyze results"; \
go test -bench=. -benchmem ./tests/benchmarks/... | tee tests/bin/current.bench.txt
@$(MAKE) profile-tools
@./bin/performance-regression-detector -baseline tests/bin/baseline.bench.txt -current tests/bin/current.bench.txt -threshold $(REGRESSION_THRESHOLD)
# Technical excellence suite (runs all performance tests)
tech-excellence: benchmark load-test chaos-test profile-tools
complete-suite: benchmark load-test chaos-test profile-tools
@echo "Technical excellence test suite completed"
@echo "Results summary:"
@echo " - Benchmarks: See test output above"
@ -298,12 +307,15 @@ help:
@echo " make chaos-test - Run chaos engineering tests"
@echo " make profile-tools - Build performance profiling tools"
@echo " make detect-regressions - Detect performance regressions"
@echo " make tech-excellence - Run complete technical excellence suite"
@echo " make complete-suite - Run complete technical suite"
@echo ""
@echo "Documentation:"
@echo " make docs-setup - Install MkDocs and dependencies"
@echo " make docs - Start MkDocs development server with live reload"
@echo " make docs-build - Build static documentation for deployment"
@echo " make docs-setup - Validate Hugo is installed"
@echo " make docs - Start Hugo dev server (checks DOCS_PORT is free)"
@echo " make docs-check-port - Check if DOCS_PORT is already in use"
@echo " make docs-stop - Stop process listening on DOCS_PORT (use with care)"
@echo " make docs-build - Build static documentation (local)"
@echo " make docs-build-prod - Build static documentation (prod flags; set DOCS_PROD_BASEURL)"
@echo ""
@echo "Utility:"
@echo " make size - Show binary sizes"

View file

@ -0,0 +1,46 @@
package main

import (
	"flag"
	"fmt"
	"os"

	"github.com/jfraeys/fetch_ml/tools"
)

// main compares two `go test -bench` output files and reports performance
// regressions between them.
//
// Exit codes: 2 for usage errors (missing -baseline/-current), 1 when a
// file fails to parse, analysis fails, or at least one regression is
// detected; 0 otherwise.
func main() {
	var (
		baselineFlag  = flag.String("baseline", "", "Path to baseline 'go test -bench' output")
		currentFlag   = flag.String("current", "", "Path to current 'go test -bench' output")
		thresholdFlag = flag.Float64("threshold", 10.0, "Regression threshold percentage")
	)
	flag.Parse()

	if *baselineFlag == "" || *currentFlag == "" {
		_, _ = fmt.Fprintln(os.Stderr, "usage: performance-regression-detector -baseline <file> -current <file> [-threshold <percent>]")
		os.Exit(2)
	}

	// Parse both benchmark result files; either failure is fatal.
	baseline, err := tools.ParseGoBenchFile(*baselineFlag)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "failed to parse baseline: %v\n", err)
		os.Exit(1)
	}
	current, err := tools.ParseGoBenchFile(*currentFlag)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "failed to parse current: %v\n", err)
		os.Exit(1)
	}

	analyzer := tools.NewPerformanceRegressionDetector("", *thresholdFlag)
	report, err := analyzer.AnalyzeParsedResults(baseline, current)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "failed to analyze results: %v\n", err)
		os.Exit(1)
	}
	analyzer.PrintReport(report)

	// A non-zero exit signals CI that a regression was found.
	if len(report.Regressions) > 0 {
		os.Exit(1)
	}
}

28
cmd/profiler/main.go Normal file
View file

@ -0,0 +1,28 @@
package main

import (
	"flag"
	"fmt"
	"os"

	"github.com/jfraeys/fetch_ml/tools"
)

// main analyzes a previously captured CPU profile file and prints the
// resulting analysis to stdout.
//
// Exit codes: 2 when -cpu-profile is missing, 1 when analysis fails,
// 0 on success.
func main() {
	profilePath := flag.String("cpu-profile", "", "Path to a CPU profile file to analyze")
	flag.Parse()

	if *profilePath == "" {
		_, _ = fmt.Fprintln(os.Stderr, "usage: profiler -cpu-profile <file>")
		os.Exit(2)
	}

	prof := tools.NewProfiler(tools.ProfileConfig{CPUProfile: *profilePath})
	result, err := prof.AnalyzeProfiles()
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "failed to analyze profiles: %v\n", err)
		os.Exit(1)
	}
	prof.PrintAnalysis(result)
}

View file

@ -28,16 +28,6 @@ make configlint
make worker-configlint
```
## Legacy Setup Scripts (Deprecated)
The following scripts are from earlier iterations and are **deprecated** in favor of `setup-prod.sh`:
- `setup_rocky.sh` - Use `setup-prod.sh` instead
- `setup_ubuntu.sh` - Ubuntu support (not primary target)
- `auto_setup.sh` - Old automated setup (superseded)
- `setup_common.sh` - Common functions (integrated into setup-prod.sh)
- `quick_start.sh` - Quick dev setup (use docker-compose on macOS instead)
### Cleanup Recommendation
These legacy scripts can be removed or archived. The current production setup only needs:

View file

@ -1,12 +1,12 @@
#!/bin/bash
#!/usr/bin/env bash
# Local Benchmark Runner
# Mimics the GitHub Actions workflow for local execution
set -e
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
PROJECT_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")"
LOCAL_ARTIFACTS_DIR="$PROJECT_ROOT/.local-artifacts"
ARCHIVE_DIR="$LOCAL_ARTIFACTS_DIR/archive"
TIMESTAMP=$(date -u +"%Y%m%d_%H%M%S")
@ -23,10 +23,18 @@ echo ""
# Step 1: Run benchmarks
echo "Step 1: Running benchmarks..."
cd "$PROJECT_ROOT"
go test -bench=. -benchmem ./tests/benchmarks/... > "$RUN_DIR/benchmark_results.txt" 2>&1
BENCHMARK_RESULTS_FILE="$RUN_DIR/benchmark_results.txt"
GO_TEST_EXIT_CODE=0
go test -bench=. -benchmem ./tests/benchmarks/... > "$BENCHMARK_RESULTS_FILE" 2>&1 || GO_TEST_EXIT_CODE=$?
if [ "$GO_TEST_EXIT_CODE" -ne 0 ]; then
echo "Benchmark run exited non-zero (exit code: $GO_TEST_EXIT_CODE)." >&2
echo "Continuing to generate metrics from available output: $BENCHMARK_RESULTS_FILE" >&2
echo "--- tail (last 50 lines) ---" >&2
tail -n 50 "$BENCHMARK_RESULTS_FILE" >&2 || true
fi
# Extract benchmark results
grep "Benchmark.*-[0-9].*" "$RUN_DIR/benchmark_results.txt" > "$RUN_DIR/clean_benchmarks.txt" || true
grep "Benchmark.*-[0-9].*" "$BENCHMARK_RESULTS_FILE" > "$RUN_DIR/clean_benchmarks.txt" || true
# Step 2: Convert to Prometheus metrics
echo "Step 2: Converting to Prometheus metrics..."
@ -44,18 +52,16 @@ while IFS= read -r line; do
if [[ -n "$line" ]]; then
BENCHMARK_NAME=$(echo "$line" | awk '{print $1}' | sed 's/-[0-9]*$//')
ITERATIONS=$(echo "$line" | awk '{print $2}')
TIME_PER_OP=$(echo "$line" | awk '{print $3}')
MEMORY_PER_OP=$(echo "$line" | awk '{print $4}')
ALLOCS_PER_OP=$(echo "$line" | awk '{print $5}')
# Go benchmark output can include optional columns (e.g. MB/s) and units are
# usually separate tokens: "123 ns/op 456 B/op 7 allocs/op".
TIME_VALUE=$(echo "$line" | awk '{for (i=1;i<=NF;i++) if ($i=="ns/op") {print $(i-1); exit}}')
MEMORY_VALUE=$(echo "$line" | awk '{for (i=1;i<=NF;i++) if ($i=="B/op") {print $(i-1); exit}}')
ALLOCS_VALUE=$(echo "$line" | awk '{for (i=1;i<=NF;i++) if ($i=="allocs/op") {print $(i-1); exit}}')
# Clean benchmark name for Prometheus
CLEAN_NAME=$(echo "$BENCHMARK_NAME" | sed 's/[^a-zA-Z0-9_]/_/g')
# Parse numeric values, stripping units
TIME_VALUE=$(echo "$TIME_PER_OP" | sed 's/ns\/op//')
MEMORY_VALUE=$(echo "$MEMORY_PER_OP" | sed 's/B\/op//')
ALLOCS_VALUE=$(echo "$ALLOCS_PER_OP" | sed 's/allocs\/op//')
# Only add metrics if we have valid numeric values
if [[ "$TIME_VALUE" =~ ^[0-9.]+$ ]]; then
echo "benchmark_time_per_op{benchmark=\"$CLEAN_NAME\"} ${TIME_VALUE}" >> "$RUN_DIR/prometheus_metrics.txt"
@ -89,6 +95,9 @@ echo ""
echo "=== Results Summary ==="
echo "Benchmark results saved to: $RUN_DIR/benchmark_results.txt"
echo "Prometheus metrics saved to: $RUN_DIR/prometheus_metrics.txt"
if [ "${GO_TEST_EXIT_CODE:-0}" -ne 0 ]; then
echo "WARNING: go test exited with code: $GO_TEST_EXIT_CODE"
fi
echo ""
# Show top 10 results
@ -126,9 +135,10 @@ cat > "$RUN_DIR/report.html" << EOF
$(cat "$RUN_DIR/clean_benchmarks.txt" | while IFS= read -r line; do
if [[ -n "$line" ]]; then
BENCHMARK_NAME=$(echo "$line" | awk '{print $1}')
TIME_PER_OP=$(echo "$line" | awk '{print $3}')
MEMORY_PER_OP=$(echo "$line" | awk '{print $4}')
ALLOCS_PER_OP=$(echo "$line" | awk '{print $5}')
TIME_PER_OP=$(echo "$line" | awk '{for (i=1;i<=NF;i++) if ($i=="ns/op") {print $(i-1)" " $i; exit}}')
MEMORY_PER_OP=$(echo "$line" | awk '{for (i=1;i<=NF;i++) if ($i=="B/op") {print $(i-1)" " $i; exit}}')
ALLOCS_PER_OP=$(echo "$line" | awk '{for (i=1;i<=NF;i++) if ($i=="allocs/op") {print $(i-1)" " $i; exit}}')
echo " <tr>"
echo " <td class=\"metric\">$BENCHMARK_NAME</td>"
echo " <td>$TIME_PER_OP</td>"

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# ci-test.sh: Local CI sanity check without pushing
# Run from repository root

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# Common shell functions for FetchML scripts
# Source this file in other scripts: source "$(dirname "$0")/lib/common.sh"

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# Comprehensive Benchmark Cleanup Script
# Cleans up all benchmark-related artifacts and temporary files

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# Check status of Docker resources and auto-cleanup service

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# Self-cleaning script for fetch_ml Docker resources
# Usage: ./scripts/cleanup.sh [--dry-run] [--force] [--all]

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# Artifact Management Script
# Manage local benchmark artifacts
@ -16,26 +16,31 @@ mkdir -p "$LOCAL_ARTIFACTS_DIR"
case "${1:-help}" in
"list")
echo "=== Benchmark Runs ==="
if [ -d "$LOCAL_ARTIFACTS_DIR" ]; then
ls -lt "$LOCAL_ARTIFACTS_DIR"/run_* 2>/dev/null | while read -r line; do
run_dir=$(echo "$line" | awk '{print $9}')
if [ -n "$run_dir" ]; then
timestamp=$(basename "$run_dir" | sed 's/run_//')
echo "Run: $timestamp"
echo " Path: $run_dir"
if [ -f "$run_dir/report.html" ]; then
echo " Report: $run_dir/report.html"
fi
if [ -f "$run_dir/prometheus_metrics.txt" ]; then
metrics_count=$(grep -c "benchmark_" "$run_dir/prometheus_metrics.txt" 2>/dev/null || echo "0")
echo " Metrics: $metrics_count benchmarks"
fi
echo ""
fi
done
else
echo "No artifacts found"
# Fast empty-state: no run directories.
if ! compgen -G "$LOCAL_ARTIFACTS_DIR/run_*" >/dev/null; then
echo "(no runs found)"
exit 0
fi
# List newest-first without parsing `ls -l` output.
for run_dir in $(ls -1dt "$LOCAL_ARTIFACTS_DIR"/run_* 2>/dev/null); do
if [ ! -d "$run_dir" ]; then
continue
fi
timestamp=$(basename "$run_dir" | sed 's/run_//')
echo "Run: $timestamp"
echo " Path: $run_dir"
if [ -f "$run_dir/report.html" ]; then
echo " Report: $run_dir/report.html"
fi
if [ -f "$run_dir/prometheus_metrics.txt" ]; then
metrics_count=$(grep -c "benchmark_" "$run_dir/prometheus_metrics.txt" 2>/dev/null || echo "0")
echo " Metrics: $metrics_count benchmarks"
fi
echo ""
done
;;
"clean")
@ -203,8 +208,8 @@ with open('$output_file', 'w') as f:
echo ""
echo "Commands:"
echo " list List all benchmark runs"
echo " clean [all|old|run] Clean artifacts"
echo " all Remove all artifacts"
echo " clean [all|old|run] Archive artifacts"
echo " all Archive all artifacts"
echo " old [count] Keep last N runs (default: 10)"
echo " run <run_id> Remove specific run"
echo " compare <run1> <run2> Compare two benchmark runs"

View file

@ -1,3 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail;
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
@ -29,6 +30,8 @@ compose_project_args=("--project-directory" "$repo_root")
api_base=""
prometheus_base=""
stack_name=""
api_wait_seconds=90
prometheus_wait_seconds=90
if [ "$env" = "dev" ]; then
mkdir -p \
@ -43,6 +46,8 @@ if [ "$env" = "dev" ]; then
"$repo_root/data/dev/workspaces"
stack_name="dev"
api_wait_seconds=180
prometheus_wait_seconds=180
compose_files=("-f" "$repo_root/deployments/docker-compose.dev.yml")
api_base="https://localhost:9101"
if ! curl -skf "$api_base/health" >/dev/null 2>&1; then
@ -82,7 +87,7 @@ echo "Starting $stack_name stack for smoke test...";
$compose_cmd "${compose_project_args[@]}" "${compose_files[@]}" up -d --build >/dev/null;
echo "Waiting for API to become healthy...";
deadline=$(($(date +%s) + 90));
deadline=$(($(date +%s) + $api_wait_seconds));
while true; do
if [ "$env" = "dev" ]; then
if curl -skf "$api_base/health" >/dev/null 2>&1; then break; fi;
@ -98,7 +103,7 @@ if [ "$env" = "dev" ]; then
curl -skf "$api_base/metrics" >/dev/null;
echo "Waiting for Prometheus target api-server to be up...";
deadline=$(($(date +%s) + 90));
deadline=$(($(date +%s) + $prometheus_wait_seconds));
query_url="$prometheus_base/api/v1/query?query=up%7Bjob%3D%22api-server%22%7D";
while true; do

View file

@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# Full Production Test Environment Script
set -e

View file

@ -1,53 +1,638 @@
#!/bin/bash
#!/usr/bin/env bash
# Simple performance tracking script
set -euo pipefail
RESULTS_DIR="test_results/performance"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RESULTS_FILE="$RESULTS_DIR/load_test_$TIMESTAMP.json"
RESULTS_TMP_FILE="$RESULTS_FILE.tmp"
RAW_LOG_FILE="$RESULTS_DIR/raw_$TIMESTAMP.log"
RESOURCES_DIR="$RESULTS_DIR/resources_$TIMESTAMP"
PROM_URL="${PROM_URL:-}"
PROM_INSTANCE="${PROM_INSTANCE:-}"
PROM_NODE_JOB="${PROM_NODE_JOB:-node}" # e.g. node-exporter job label
PROM_REDIS_JOB="${PROM_REDIS_JOB:-redis}" # e.g. redis-exporter job label
PROM_NET_IFACE="${PROM_NET_IFACE:-eth0}" # override for server interface
PROM_DISK_DEVICE="${PROM_DISK_DEVICE:-}" # e.g. nvme0n1, sda; empty => aggregate all
PROM_STEP_SECONDS="${PROM_STEP_SECONDS:-5}"
mkdir -p "$RESULTS_DIR"
mkdir -p "$RESOURCES_DIR"
echo "Running load test performance tracking..."
echo "Timestamp: $TIMESTAMP"
# json_string_or_null VALUE
# Print VALUE as a JSON-quoted string (escaping delegated to jq -Rs), or
# the literal `null` when VALUE is empty/unset.
json_string_or_null() {
    local value="${1:-}"
    if [ -n "$value" ]; then
        printf '%s' "$value" | jq -Rs '.'
    else
        echo "null"
    fi
}
# json_number_or_null VALUE
# Print VALUE unchanged when it is a plain non-negative decimal number
# (digits, optionally followed by a '.' and more digits); print the
# literal `null` when VALUE is empty or not such a number.
json_number_or_null() {
    local value="${1:-}"
    if [ -z "$value" ]; then
        echo "null"
        return
    fi
    if printf '%s' "$value" | grep -Eq '^[0-9]+(\.[0-9]+)?$'; then
        echo "$value"
    else
        echo "null"
    fi
}
# prom_urlencode VALUE
# Percent-encode VALUE for use in a Prometheus HTTP query string, using
# python3's urllib with safe='' so every reserved character (including
# '/') is encoded. On any failure (e.g. python3 missing) prints nothing
# and still exits 0, so callers treat empty output as "unavailable".
prom_urlencode() {
python3 - <<'PY' "$1" 2>/dev/null || true
import sys
from urllib.parse import quote
print(quote(sys.argv[1], safe=''))
PY
}
# prom_query_range QUERY START END STEP
# Fetch a Prometheus /api/v1/query_range response for QUERY between epoch
# timestamps START and END at STEP-second resolution, printing the raw
# JSON body to stdout. Prints an empty line and returns 0 when PROM_URL
# is unset or URL-encoding fails; a failed curl also degrades to empty
# output (downstream jq helpers treat empty input as "no data").
prom_query_range() {
local query="$1"
local start="$2"
local end="$3"
local step="$4"
if [ -z "${PROM_URL:-}" ]; then
echo ""
return 0
fi
local q
q=$(prom_urlencode "$query")
if [ -z "${q:-}" ]; then
echo ""
return 0
fi
# ${PROM_URL%/} strips any trailing slash so the URL never contains '//api'.
curl -fsS "${PROM_URL%/}/api/v1/query_range?query=${q}&start=${start}&end=${end}&step=${step}" 2>/dev/null || true
}
# prom_series_avg_max
# stdin filter: read a Prometheus query_range JSON response and print
# "<avg> <max>" computed over the sample values of the FIRST result
# series only. Prints nothing when the response status is not "success",
# the series is absent/empty, or jq fails; `|| true` keeps `set -e` and
# `pipefail` callers alive.
prom_series_avg_max() {
# Emits: "<avg> <max>" or empty
jq -r '
if .status != "success" then empty
else
(.data.result[0].values // [])
| map(.[1] | tonumber)
| if length == 0 then empty
else
{avg: (add/length), max: max}
| "\(.avg) \(.max)"
end
end
' 2>/dev/null || true
}
# prom_scalar_last
# stdin filter: read a Prometheus query_range JSON response and print the
# value of the LAST sample of the first result series (.[-1][1]).
# Prints nothing on a non-success response, an empty series, or jq
# failure; always exits 0.
prom_scalar_last() {
jq -r '
if .status != "success" then empty
else
(.data.result[0].values // [])
| if length == 0 then empty else .[-1][1] end
end
' 2>/dev/null || true
}
# bytes_or_empty VALUE
# Print VALUE when it is a non-empty string of ASCII digits (a byte
# count); print an empty line otherwise. Always returns 0.
bytes_or_empty() {
    local candidate="${1:-}"
    case "$candidate" in
        '' | *[!0-9]*)
            # Empty, or contains at least one non-digit character.
            echo ""
            ;;
        *)
            echo "$candidate"
            ;;
    esac
    return 0
}
# read_net_bytes IFNAME
# Print "<in_bytes> <out_bytes>" for network interface IFNAME using
# `netstat -ibn`, taking columns 7 and 10 of the first row whose second
# column is numeric. NOTE(review): those column positions assume the
# BSD/macOS netstat table layout — confirm before relying on this on
# Linux, where the command prints nothing (callers tolerate empty output).
read_net_bytes() {
local ifname="$1"
netstat -ibn 2>/dev/null \
| awk -v ifname="$ifname" '$1==ifname && $2 ~ /^[0-9]+$/ {print $7 " " $10; exit}' \
|| true
}
# sample_process PID OUT_FILE
# Poll the CPU%% and RSS of process PID once per second for as long as it
# is alive (kill -0), appending "epoch,cpu,rss" CSV rows to OUT_FILE
# (which is truncated first). cpu/rss fields may be empty when `ps`
# races with process exit; summarize_process_samples filters those out.
# Intended to run in the background alongside the measured process.
sample_process() {
local pid="$1"
local out_file="$2"
: > "$out_file"
while kill -0 "$pid" 2>/dev/null; do
local cpu rss
cpu=$(ps -p "$pid" -o %cpu= 2>/dev/null | tr -d ' ' || true)
rss=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d ' ' || true)
printf '%s,%s,%s\n' "$(date +%s)" "${cpu:-}" "${rss:-}" >> "$out_file" || true
sleep 1
done
}
# summarize_process_samples FILE
# Reduce a sample_process CSV file (epoch,cpu,rss rows) to one line:
#   "cpu_avg=<f> cpu_max=<f> rss_max_kb=<d>"
# Rows with non-numeric cpu/rss fields are skipped independently.
# Prints nothing when FILE is missing or no valid CPU samples exist.
summarize_process_samples() {
local file="$1"
if [ ! -f "$file" ]; then
echo ""
return 0
fi
awk -F',' '
$2 ~ /^[0-9]+(\.[0-9]+)?$/ {cpu_sum += $2; cpu_n += 1; if ($2 > cpu_max) cpu_max = $2}
$3 ~ /^[0-9]+$/ {if ($3 > rss_max) rss_max = $3}
END {
if (cpu_n > 0) printf "cpu_avg=%.4f cpu_max=%.4f rss_max_kb=%d", cpu_sum/cpu_n, cpu_max, rss_max
}
' "$file" 2>/dev/null || true
}
# summarize_iostat FILE
# Summarize captured `iostat -d 1` output as a single line:
#   "mbps_avg=<f> mbps_max=<f>"
# Each data row (first field numeric) is treated as groups of 3 columns
# per device, summing every 3rd column across devices for that sample.
# NOTE(review): this assumes the 3-columns-per-device layout where the
# 3rd column is a throughput figure (e.g. macOS KB/t, tps, MB/s) — TODO
# confirm on the target platform. Prints nothing when FILE is missing or
# no rows parse.
summarize_iostat() {
local file="$1"
if [ ! -f "$file" ]; then
echo ""
return 0
fi
awk '
$1 ~ /^[0-9]/ {
if (NF % 3 == 0 && NF >= 3) {
mb = 0
for (i = 3; i <= NF; i += 3) mb += $i
sum += mb
n += 1
if (mb > max) max = mb
}
}
END {
if (n > 0) printf "mbps_avg=%.4f mbps_max=%.4f", sum/n, max
}
' "$file" 2>/dev/null || true
}
# extract_kv FILE KEY
# Print the value of the first "KEY:value" line in FILE (redis-cli INFO
# style, split on ':'), with leading spaces stripped from the value.
# Prints nothing when FILE is missing or KEY is absent; always exits 0.
extract_kv() {
local file="$1"
local key="$2"
if [ ! -f "$file" ]; then
echo ""
return 0
fi
awk -F: -v k="$key" '$1==k {gsub(/^ +/, "", $2); print $2; exit}' "$file" 2>/dev/null || true
}
# extract_field LABEL GREP_PAT SED_EXPR
# Convenience wrapper for the current run's raw log
# ($RESULTS_DIR/raw_$TIMESTAMP.log): find the first line matching
# GREP_PAT inside the "Load test results for LABEL:" section (the
# section ends at the next "=== RUN" line) and print it transformed by
# the extended sed expression SED_EXPR. Prints nothing when the section
# or field is absent; always returns 0.
extract_field() {
    # Delegate to extract_field_from_log so the section-scanning awk
    # logic lives in exactly one place instead of being duplicated here.
    extract_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "$1" "$2" "$3"
}
# extract_criteria_field_from_log LOG_FILE LABEL GREP_PAT SED_EXPR
# Scan LOG_FILE for the "Load test criteria for LABEL:" section — which
# ends at the next "=== RUN" line or at the start of the config/results
# sections for the same LABEL — and print the first line in it matching
# GREP_PAT, transformed by the extended sed expression SED_EXPR with any
# carriage returns stripped. Prints nothing when the section or field is
# absent; always returns 0.
extract_criteria_field_from_log() {
local log_file="$1"
local label="$2"
local grep_pat="$3"
local sed_expr="$4"
local line
line=$(awk -v label="$label" -v pat="$grep_pat" '
$0 ~ "Load test criteria for " label ":" {inside=1; next}
inside && ($0 ~ /^=== RUN/ || $0 ~ "Load test config for " label ":" || $0 ~ "Load test results for " label ":") {inside=0}
inside && $0 ~ pat {print; exit}
' "$log_file" 2>/dev/null || true)
if [ -z "$line" ]; then
echo ""
return 0
fi
printf '%s\n' "$line" \
| sed -E "$sed_expr" \
| tr -d '\r' \
|| true
return 0
}
# extract_config_field_from_log LOG_FILE LABEL GREP_PAT SED_EXPR
# Scan LOG_FILE for the "Load test config for LABEL:" section — which
# ends at the next "=== RUN" line or at the start of the results section
# for the same LABEL — and print the first line in it matching GREP_PAT,
# transformed by the extended sed expression SED_EXPR with carriage
# returns stripped. Prints nothing when the section or field is absent;
# always returns 0.
extract_config_field_from_log() {
local log_file="$1"
local label="$2"
local grep_pat="$3"
local sed_expr="$4"
local line
line=$(awk -v label="$label" -v pat="$grep_pat" '
$0 ~ "Load test config for " label ":" {inside=1; next}
inside && ($0 ~ /^=== RUN/ || $0 ~ "Load test results for " label ":") {inside=0}
inside && $0 ~ pat {print; exit}
' "$log_file" 2>/dev/null || true)
if [ -z "$line" ]; then
echo ""
return 0
fi
printf '%s\n' "$line" \
| sed -E "$sed_expr" \
| tr -d '\r' \
|| true
return 0
}
# extract_field_from_log LOG_FILE LABEL GREP_PAT SED_EXPR
# Scan LOG_FILE for the section opened by "Load test results for LABEL:"
# and closed by the next "=== RUN" line, print the first line in it
# matching GREP_PAT after applying the extended sed expression SED_EXPR
# (carriage returns stripped). Prints an empty line and returns 0 when
# the section or field is missing.
extract_field_from_log() {
    local src="$1" section="$2" pattern="$3" transform="$4"
    local matched
    matched=$(awk -v label="$section" -v pat="$pattern" '
$0 ~ "Load test results for " label ":" {inside=1; next}
inside && $0 ~ /^=== RUN/ {inside=0}
inside && $0 ~ pat {print; exit}
' "$src" 2>/dev/null || true)
    if [ -z "$matched" ]; then
        echo ""
        return 0
    fi
    printf '%s\n' "$matched" | sed -E "$transform" | tr -d '\r' || true
    return 0
}
# get_light_rps RESULTS_FILE
# Print the LightLoad throughput in RPS for a results JSON file.
# Prefers the structured value (.tests[] with name "LightLoad", field
# .throughput_rps) when it is numeric; otherwise falls back to re-parsing
# the matching raw_<timestamp>.log under $RESULTS_DIR, taking the
# timestamp from .timestamp in the JSON or, failing that, from the
# load_test_<timestamp>.json file name. Prints nothing when neither
# source yields a value; always returns 0.
get_light_rps() {
local results_file="$1"
local v
v=$(jq -r '.tests[] | select(.name=="LightLoad") | .throughput_rps // empty' "$results_file" 2>/dev/null || true)
if printf '%s' "${v:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$'; then
echo "$v"
return 0
fi
local ts
ts=$(jq -r '.timestamp // empty' "$results_file" 2>/dev/null || true)
if [ -z "${ts:-}" ]; then
# Derive the timestamp from the file name: load_test_YYYYMMDD_HHMMSS.json
ts=$(basename "$results_file" | sed -E 's/^load_test_([0-9]{8}_[0-9]{6})\.json$/\1/')
fi
if [ -z "${ts:-}" ]; then
echo ""
return 0
fi
local raw_file="$RESULTS_DIR/raw_${ts}.log"
if [ ! -f "$raw_file" ]; then
echo ""
return 0
fi
extract_field_from_log "$raw_file" "LightLoad" "Throughput:" 's/.*Throughput: ([0-9]+(\.[0-9]+)?) RPS.*/\1/'
}
# get_test_criteria_summary RESULTS_FILE NAME
# Print a one-line human-readable summary of the pass/fail criteria
# recorded for test NAME in RESULTS_FILE, e.g.
#   ">= 100 RPS, <= 1% err, <= 250ms p99"
# Missing individual values render as '?'. Prints nothing when no
# criteria fields are present at all; always returns 0.
get_test_criteria_summary() {
local results_file="$1"
local name="$2"
local min_t max_e max_p
min_t=$(jq -r --arg n "$name" '.tests[] | select(.name==$n) | .criteria.min_throughput_rps // empty' "$results_file" 2>/dev/null || true)
max_e=$(jq -r --arg n "$name" '.tests[] | select(.name==$n) | .criteria.max_error_rate_percent // empty' "$results_file" 2>/dev/null || true)
max_p=$(jq -r --arg n "$name" '.tests[] | select(.name==$n) | .criteria.max_p99_latency_ms // empty' "$results_file" 2>/dev/null || true)
if [ -z "${min_t:-}" ] && [ -z "${max_e:-}" ] && [ -z "${max_p:-}" ]; then
echo ""
return 0
fi
echo ">= ${min_t:-?} RPS, <= ${max_e:-?}% err, <= ${max_p:-?}ms p99"
}
# Run tests and capture results
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RESULTS_DIR/raw_$TIMESTAMP.log"
GO_TEST_EXIT_CODE=0
test_start_epoch="$(date +%s)"
lo0_before=$(read_net_bytes lo0)
en0_before=$(read_net_bytes en0)
if command -v redis-cli >/dev/null 2>&1; then
redis-cli -n 7 INFO memory > "$RESOURCES_DIR/redis_memory_before.txt" 2>/dev/null || true
redis-cli -n 7 INFO clients > "$RESOURCES_DIR/redis_clients_before.txt" 2>/dev/null || true
fi
go test ./tests/load -run=TestLoadTestSuite -v -load-suite=medium -timeout=10m > "$RAW_LOG_FILE" 2>&1 &
GO_TEST_PID=$!
sample_process "$GO_TEST_PID" "$RESOURCES_DIR/process.csv" &
SAMPLE_PID=$!
iostat -d 1 > "$RESOURCES_DIR/iostat.txt" 2>/dev/null &
IOSTAT_PID=$!
wait "$GO_TEST_PID" || GO_TEST_EXIT_CODE=$?
test_end_epoch="$(date +%s)"
kill "$IOSTAT_PID" 2>/dev/null || true
wait "$IOSTAT_PID" 2>/dev/null || true
wait "$SAMPLE_PID" 2>/dev/null || true
lo0_after=$(read_net_bytes lo0)
en0_after=$(read_net_bytes en0)
if command -v redis-cli >/dev/null 2>&1; then
redis-cli -n 7 INFO memory > "$RESOURCES_DIR/redis_memory_after.txt" 2>/dev/null || true
redis-cli -n 7 INFO clients > "$RESOURCES_DIR/redis_clients_after.txt" 2>/dev/null || true
fi
if command -v podman >/dev/null 2>&1; then
podman stats --no-stream --format '{{.Name}} {{.CPUPerc}} {{.MemUsage}} {{.NetIO}} {{.BlockIO}}' > "$RESOURCES_DIR/podman_stats.txt" 2>/dev/null || true
fi
if [ "$GO_TEST_EXIT_CODE" -ne 0 ]; then
echo "Load test failed (exit code: $GO_TEST_EXIT_CODE). See raw log: $RAW_LOG_FILE" >&2
fi
proc_summary=$(summarize_process_samples "$RESOURCES_DIR/process.csv")
cpu_avg=$(printf '%s' "$proc_summary" | sed -nE 's/.*cpu_avg=([0-9]+(\.[0-9]+)?).*/\1/p')
cpu_max=$(printf '%s' "$proc_summary" | sed -nE 's/.*cpu_max=([0-9]+(\.[0-9]+)?).*/\1/p')
rss_max_kb=$(printf '%s' "$proc_summary" | sed -nE 's/.*rss_max_kb=([0-9]+).*/\1/p')
io_summary=$(summarize_iostat "$RESOURCES_DIR/iostat.txt")
disk_mbps_avg=$(printf '%s' "$io_summary" | sed -nE 's/.*mbps_avg=([0-9]+(\.[0-9]+)?).*/\1/p')
disk_mbps_max=$(printf '%s' "$io_summary" | sed -nE 's/.*mbps_max=([0-9]+(\.[0-9]+)?).*/\1/p')
lo0_in_before=$(bytes_or_empty "$(printf '%s' "$lo0_before" | awk '{print $1}')")
lo0_out_before=$(bytes_or_empty "$(printf '%s' "$lo0_before" | awk '{print $2}')")
lo0_in_after=$(bytes_or_empty "$(printf '%s' "$lo0_after" | awk '{print $1}')")
lo0_out_after=$(bytes_or_empty "$(printf '%s' "$lo0_after" | awk '{print $2}')")
en0_in_before=$(bytes_or_empty "$(printf '%s' "$en0_before" | awk '{print $1}')")
en0_out_before=$(bytes_or_empty "$(printf '%s' "$en0_before" | awk '{print $2}')")
en0_in_after=$(bytes_or_empty "$(printf '%s' "$en0_after" | awk '{print $1}')")
en0_out_after=$(bytes_or_empty "$(printf '%s' "$en0_after" | awk '{print $2}')")
lo0_in_delta=""
lo0_out_delta=""
if [ -n "${lo0_in_before:-}" ] && [ -n "${lo0_in_after:-}" ]; then
lo0_in_delta=$((lo0_in_after - lo0_in_before))
fi
if [ -n "${lo0_out_before:-}" ] && [ -n "${lo0_out_after:-}" ]; then
lo0_out_delta=$((lo0_out_after - lo0_out_before))
fi
en0_in_delta=""
en0_out_delta=""
if [ -n "${en0_in_before:-}" ] && [ -n "${en0_in_after:-}" ]; then
en0_in_delta=$((en0_in_after - en0_in_before))
fi
if [ -n "${en0_out_before:-}" ] && [ -n "${en0_out_after:-}" ]; then
en0_out_delta=$((en0_out_after - en0_out_before))
fi
redis_used_mem_before=$(extract_kv "$RESOURCES_DIR/redis_memory_before.txt" used_memory)
redis_used_mem_after=$(extract_kv "$RESOURCES_DIR/redis_memory_after.txt" used_memory)
redis_clients_before=$(extract_kv "$RESOURCES_DIR/redis_clients_before.txt" connected_clients)
redis_clients_after=$(extract_kv "$RESOURCES_DIR/redis_clients_after.txt" connected_clients)
run_mode="local"
metrics_scope="client_only"
if [ -n "${PROM_URL:-}" ]; then
run_mode="remote"
metrics_scope="both"
fi
server_cpu_avg=""; server_cpu_max=""
server_mem_avg=""; server_mem_max=""
server_disk_read_avg=""; server_disk_read_max=""
server_disk_write_avg=""; server_disk_write_max=""
server_net_rx_avg=""; server_net_rx_max=""
server_net_tx_avg=""; server_net_tx_max=""
server_redis_mem_last=""; server_redis_clients_last=""
if [ -n "${PROM_URL:-}" ] && [ -n "${PROM_INSTANCE:-}" ]; then
instance_sel="instance=\"${PROM_INSTANCE}\",job=\"${PROM_NODE_JOB}\""
cpu_q="(1 - avg(rate(node_cpu_seconds_total{${instance_sel},mode=\"idle\"}[1m])))*100"
mem_q="node_memory_MemAvailable_bytes{${instance_sel}}"
mem_total_q="node_memory_MemTotal_bytes{${instance_sel}}"
cpu_json=$(prom_query_range "$cpu_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
cpu_pair=$(printf '%s' "$cpu_json" | prom_series_avg_max)
server_cpu_avg=$(printf '%s' "$cpu_pair" | awk '{print $1}')
server_cpu_max=$(printf '%s' "$cpu_pair" | awk '{print $2}')
avail_json=$(prom_query_range "$mem_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
total_json=$(prom_query_range "$mem_total_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
avail_pair=$(printf '%s' "$avail_json" | prom_series_avg_max)
total_pair=$(printf '%s' "$total_json" | prom_series_avg_max)
avail_avg=$(printf '%s' "$avail_pair" | awk '{print $1}')
avail_min=$(printf '%s' "$avail_pair" | awk '{print $2}')
total_avg=$(printf '%s' "$total_pair" | awk '{print $1}')
if printf '%s' "${avail_avg:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$' && printf '%s' "${total_avg:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$'; then
server_mem_avg=$(awk -v a="$avail_avg" -v t="$total_avg" 'BEGIN{printf "%.0f", (t-a)}')
fi
if printf '%s' "${avail_min:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$' && printf '%s' "${total_avg:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$'; then
server_mem_max=$(awk -v a="$avail_min" -v t="$total_avg" 'BEGIN{printf "%.0f", (t-a)}')
fi
disk_dev_sel=""
if [ -n "${PROM_DISK_DEVICE:-}" ]; then
disk_dev_sel=",device=\"${PROM_DISK_DEVICE}\""
fi
read_q="sum(rate(node_disk_read_bytes_total{${instance_sel}${disk_dev_sel}}[1m]))"
write_q="sum(rate(node_disk_written_bytes_total{${instance_sel}${disk_dev_sel}}[1m]))"
read_json=$(prom_query_range "$read_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
write_json=$(prom_query_range "$write_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
read_pair=$(printf '%s' "$read_json" | prom_series_avg_max)
write_pair=$(printf '%s' "$write_json" | prom_series_avg_max)
server_disk_read_avg=$(printf '%s' "$read_pair" | awk '{print $1}')
server_disk_read_max=$(printf '%s' "$read_pair" | awk '{print $2}')
server_disk_write_avg=$(printf '%s' "$write_pair" | awk '{print $1}')
server_disk_write_max=$(printf '%s' "$write_pair" | awk '{print $2}')
rx_q="sum(rate(node_network_receive_bytes_total{${instance_sel},device=\"${PROM_NET_IFACE}\"}[1m]))"
tx_q="sum(rate(node_network_transmit_bytes_total{${instance_sel},device=\"${PROM_NET_IFACE}\"}[1m]))"
rx_json=$(prom_query_range "$rx_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
tx_json=$(prom_query_range "$tx_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
rx_pair=$(printf '%s' "$rx_json" | prom_series_avg_max)
tx_pair=$(printf '%s' "$tx_json" | prom_series_avg_max)
server_net_rx_avg=$(printf '%s' "$rx_pair" | awk '{print $1}')
server_net_rx_max=$(printf '%s' "$rx_pair" | awk '{print $2}')
server_net_tx_avg=$(printf '%s' "$tx_pair" | awk '{print $1}')
server_net_tx_max=$(printf '%s' "$tx_pair" | awk '{print $2}')
if [ -n "${PROM_REDIS_JOB:-}" ]; then
redis_sel="instance=\"${PROM_INSTANCE}\",job=\"${PROM_REDIS_JOB}\""
redis_mem_q="redis_memory_used_bytes{${redis_sel}}"
redis_clients_q="redis_connected_clients{${redis_sel}}"
redis_mem_json=$(prom_query_range "$redis_mem_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
redis_clients_json=$(prom_query_range "$redis_clients_q" "$test_start_epoch" "$test_end_epoch" "$PROM_STEP_SECONDS")
server_redis_mem_last=$(printf '%s' "$redis_mem_json" | prom_scalar_last)
server_redis_clients_last=$(printf '%s' "$redis_clients_json" | prom_scalar_last)
fi
fi
# Extract key metrics
{
echo "{"
echo " \"timestamp\": \"$TIMESTAMP\","
echo " \"go_test_exit_code\": $GO_TEST_EXIT_CODE,"
echo " \"run_context\": {"
echo " \"mode\": \"$run_mode\","
echo " \"metrics_scope\": \"$metrics_scope\","
echo " \"prom_url\": $(json_string_or_null "$PROM_URL"),"
echo " \"prom_instance\": $(json_string_or_null "$PROM_INSTANCE"),"
echo " \"prom_node_job\": $(json_string_or_null "$PROM_NODE_JOB"),"
echo " \"prom_redis_job\": $(json_string_or_null "$PROM_REDIS_JOB"),"
echo " \"prom_net_iface\": $(json_string_or_null "$PROM_NET_IFACE"),"
echo " \"prom_disk_device\": $(json_string_or_null "$PROM_DISK_DEVICE"),"
echo " \"test_start_epoch\": $(json_number_or_null "$test_start_epoch"),"
echo " \"test_end_epoch\": $(json_number_or_null "$test_end_epoch")"
echo " },"
echo " \"resources\": {"
echo " \"process\": {"
echo " \"cpu_percent_avg\": $(json_number_or_null "$cpu_avg"),"
echo " \"cpu_percent_max\": $(json_number_or_null "$cpu_max"),"
echo " \"rss_max_kb\": $(json_number_or_null "$rss_max_kb")"
echo " },"
echo " \"disk\": {"
echo " \"mbps_avg\": $(json_number_or_null "$disk_mbps_avg"),"
echo " \"mbps_max\": $(json_number_or_null "$disk_mbps_max")"
echo " },"
echo " \"network\": {"
echo " \"lo0_in_bytes\": $(json_number_or_null "$lo0_in_delta"),"
echo " \"lo0_out_bytes\": $(json_number_or_null "$lo0_out_delta"),"
echo " \"en0_in_bytes\": $(json_number_or_null "$en0_in_delta"),"
echo " \"en0_out_bytes\": $(json_number_or_null "$en0_out_delta")"
echo " },"
echo " \"redis\": {"
echo " \"used_memory_bytes_before\": $(json_number_or_null "$redis_used_mem_before"),"
echo " \"used_memory_bytes_after\": $(json_number_or_null "$redis_used_mem_after"),"
echo " \"connected_clients_before\": $(json_number_or_null "$redis_clients_before"),"
echo " \"connected_clients_after\": $(json_number_or_null "$redis_clients_after")"
echo " },"
echo " \"podman_stats\": $(json_string_or_null "$(cat \"$RESOURCES_DIR/podman_stats.txt\" 2>/dev/null || true)")"
echo " },"
echo " \"resources_server\": {"
echo " \"cpu_percent_avg\": $(json_number_or_null "$server_cpu_avg"),"
echo " \"cpu_percent_max\": $(json_number_or_null "$server_cpu_max"),"
echo " \"mem_used_bytes_avg\": $(json_number_or_null "$server_mem_avg"),"
echo " \"mem_used_bytes_max\": $(json_number_or_null "$server_mem_max"),"
echo " \"disk_read_bytes_per_sec_avg\": $(json_number_or_null "$server_disk_read_avg"),"
echo " \"disk_read_bytes_per_sec_max\": $(json_number_or_null "$server_disk_read_max"),"
echo " \"disk_write_bytes_per_sec_avg\": $(json_number_or_null "$server_disk_write_avg"),"
echo " \"disk_write_bytes_per_sec_max\": $(json_number_or_null "$server_disk_write_max"),"
echo " \"net_rx_bytes_per_sec_avg\": $(json_number_or_null "$server_net_rx_avg"),"
echo " \"net_rx_bytes_per_sec_max\": $(json_number_or_null "$server_net_rx_max"),"
echo " \"net_tx_bytes_per_sec_avg\": $(json_number_or_null "$server_net_tx_avg"),"
echo " \"net_tx_bytes_per_sec_max\": $(json_number_or_null "$server_net_tx_max"),"
echo " \"redis_used_memory_bytes_last\": $(json_number_or_null "$server_redis_mem_last"),"
echo " \"redis_connected_clients_last\": $(json_number_or_null "$server_redis_clients_last")"
echo " },"
echo " \"tests\": ["
# Parse light load
LIGHT_RPS=$(grep -A1 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
LIGHT_ERROR=$(grep -A2 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
LIGHT_P99=$(grep -A4 "LightLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
LIGHT_RPS=$(extract_field "LightLoad" "Throughput:" 's/.*Throughput: ([0-9]+(\.[0-9]+)?) RPS.*/\1/')
LIGHT_ERROR=$(extract_field "LightLoad" "Error rate:" 's/.*Error rate: ([0-9]+(\.[0-9]+)?)%.*/\1/')
LIGHT_P99=$(extract_field "LightLoad" "P99 latency:" 's/.*P99 latency: ([^ ]+).*/\1/')
LIGHT_CRIT_MIN_RPS=$(extract_criteria_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Min throughput:" 's/.*Min throughput: ([0-9]+(\.[0-9]+)?) RPS.*/\1/')
LIGHT_CRIT_MAX_ERR=$(extract_criteria_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Max error rate:" 's/.*Max error rate: ([0-9]+(\.[0-9]+)?)%.*/\1/')
LIGHT_CRIT_MAX_P99=$(extract_criteria_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Max P99 latency:" 's/.*Max P99 latency: ([0-9]+(\.[0-9]+)?)ms.*/\1/')
LIGHT_CFG_CONCURRENCY=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Concurrency:" 's/.*Concurrency: ([0-9]+).*/\1/')
LIGHT_CFG_DURATION=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Duration:" 's/.*Duration: (.*)$/\1/')
LIGHT_CFG_RAMP_UP=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Ramp up:" 's/.*Ramp up: (.*)$/\1/')
LIGHT_CFG_TARGET_RPS=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Target RPS:" 's/.*Target RPS: ([0-9]+).*/\1/')
LIGHT_CFG_PAYLOAD_SIZE=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Payload size:" 's/.*Payload size: ([0-9]+).*/\1/')
LIGHT_CFG_METHOD=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Method:" 's/.*Method: (.*)$/\1/')
LIGHT_CFG_ENDPOINT=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "LightLoad" "Endpoint:" 's/.*Endpoint: (.*)$/\1/')
echo " {"
echo " \"name\": \"LightLoad\","
echo " \"throughput_rps\": $LIGHT_RPS,"
echo " \"error_rate_percent\": $LIGHT_ERROR,"
echo " \"p99_latency_ms\": \"$LIGHT_P99\""
echo " \"config\": {"
echo " \"concurrency\": $(json_number_or_null "$LIGHT_CFG_CONCURRENCY"),"
echo " \"duration\": $(json_string_or_null "$LIGHT_CFG_DURATION"),"
echo " \"ramp_up\": $(json_string_or_null "$LIGHT_CFG_RAMP_UP"),"
echo " \"target_rps\": $(json_number_or_null "$LIGHT_CFG_TARGET_RPS"),"
echo " \"payload_size_bytes\": $(json_number_or_null "$LIGHT_CFG_PAYLOAD_SIZE"),"
echo " \"method\": $(json_string_or_null "$LIGHT_CFG_METHOD"),"
echo " \"endpoint\": $(json_string_or_null "$LIGHT_CFG_ENDPOINT")"
echo " },"
echo " \"criteria\": {"
echo " \"min_throughput_rps\": $(json_number_or_null "$LIGHT_CRIT_MIN_RPS"),"
echo " \"max_error_rate_percent\": $(json_number_or_null "$LIGHT_CRIT_MAX_ERR"),"
echo " \"max_p99_latency_ms\": $(json_number_or_null "$LIGHT_CRIT_MAX_P99")"
echo " },"
echo " \"throughput_rps\": $(json_number_or_null "$LIGHT_RPS"),"
echo " \"error_rate_percent\": $(json_number_or_null "$LIGHT_ERROR"),"
echo " \"p99_latency_ms\": $(json_string_or_null "$LIGHT_P99")"
echo " },"
# Parse medium load
MEDIUM_RPS=$(grep -A1 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Throughput" | awk '{print $2}')
MEDIUM_ERROR=$(grep -A2 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "Error rate" | awk '{print $3}')
MEDIUM_P99=$(grep -A4 "MediumLoad" "$RESULTS_DIR/raw_$TIMESTAMP.log" | grep "P99 latency" | awk '{print $3}')
MEDIUM_RPS=$(extract_field "MediumLoad" "Throughput:" 's/.*Throughput: ([0-9]+(\.[0-9]+)?) RPS.*/\1/')
MEDIUM_ERROR=$(extract_field "MediumLoad" "Error rate:" 's/.*Error rate: ([0-9]+(\.[0-9]+)?)%.*/\1/')
MEDIUM_P99=$(extract_field "MediumLoad" "P99 latency:" 's/.*P99 latency: ([^ ]+).*/\1/')
MEDIUM_CRIT_MIN_RPS=$(extract_criteria_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Min throughput:" 's/.*Min throughput: ([0-9]+(\.[0-9]+)?) RPS.*/\1/')
MEDIUM_CRIT_MAX_ERR=$(extract_criteria_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Max error rate:" 's/.*Max error rate: ([0-9]+(\.[0-9]+)?)%.*/\1/')
MEDIUM_CRIT_MAX_P99=$(extract_criteria_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Max P99 latency:" 's/.*Max P99 latency: ([0-9]+(\.[0-9]+)?)ms.*/\1/')
MEDIUM_CFG_CONCURRENCY=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Concurrency:" 's/.*Concurrency: ([0-9]+).*/\1/')
MEDIUM_CFG_DURATION=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Duration:" 's/.*Duration: (.*)$/\1/')
MEDIUM_CFG_RAMP_UP=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Ramp up:" 's/.*Ramp up: (.*)$/\1/')
MEDIUM_CFG_TARGET_RPS=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Target RPS:" 's/.*Target RPS: ([0-9]+).*/\1/')
MEDIUM_CFG_PAYLOAD_SIZE=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Payload size:" 's/.*Payload size: ([0-9]+).*/\1/')
MEDIUM_CFG_METHOD=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Method:" 's/.*Method: (.*)$/\1/')
MEDIUM_CFG_ENDPOINT=$(extract_config_field_from_log "$RESULTS_DIR/raw_$TIMESTAMP.log" "MediumLoad" "Endpoint:" 's/.*Endpoint: (.*)$/\1/')
echo " {"
echo " \"name\": \"MediumLoad\","
echo " \"throughput_rps\": $MEDIUM_RPS,"
echo " \"error_rate_percent\": $MEDIUM_ERROR,"
echo " \"p99_latency_ms\": \"$MEDIUM_P99\""
echo " \"config\": {"
echo " \"concurrency\": $(json_number_or_null "$MEDIUM_CFG_CONCURRENCY"),"
echo " \"duration\": $(json_string_or_null "$MEDIUM_CFG_DURATION"),"
echo " \"ramp_up\": $(json_string_or_null "$MEDIUM_CFG_RAMP_UP"),"
echo " \"target_rps\": $(json_number_or_null "$MEDIUM_CFG_TARGET_RPS"),"
echo " \"payload_size_bytes\": $(json_number_or_null "$MEDIUM_CFG_PAYLOAD_SIZE"),"
echo " \"method\": $(json_string_or_null "$MEDIUM_CFG_METHOD"),"
echo " \"endpoint\": $(json_string_or_null "$MEDIUM_CFG_ENDPOINT")"
echo " },"
echo " \"criteria\": {"
echo " \"min_throughput_rps\": $(json_number_or_null "$MEDIUM_CRIT_MIN_RPS"),"
echo " \"max_error_rate_percent\": $(json_number_or_null "$MEDIUM_CRIT_MAX_ERR"),"
echo " \"max_p99_latency_ms\": $(json_number_or_null "$MEDIUM_CRIT_MAX_P99")"
echo " },"
echo " \"throughput_rps\": $(json_number_or_null "$MEDIUM_RPS"),"
echo " \"error_rate_percent\": $(json_number_or_null "$MEDIUM_ERROR"),"
echo " \"p99_latency_ms\": $(json_string_or_null "$MEDIUM_P99")"
echo " }"
echo " ]"
echo "}"
} > "$RESULTS_FILE"
} > "$RESULTS_TMP_FILE"
# Write atomically so a partial file never becomes a "previous run".
mv "$RESULTS_TMP_FILE" "$RESULTS_FILE"
echo "Results saved to: $RESULTS_FILE"
echo "Raw logs: $RESULTS_DIR/raw_$TIMESTAMP.log"
echo "Raw logs: $RAW_LOG_FILE"
# Show comparison with previous run if exists
PREV_FILE=$(ls -t "$RESULTS_DIR"/load_test_*.json | sed -n '2p')
@ -58,7 +643,79 @@ if [ -n "$PREV_FILE" ]; then
echo "Current: $(basename $RESULTS_FILE)"
echo ""
echo "Light Load Throughput:"
echo " Previous: $(jq -r '.tests[0].throughput_rps' "$PREV_FILE") RPS"
echo " Current: $(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") RPS"
echo " Change: $(echo "$(jq -r '.tests[0].throughput_rps' "$RESULTS_FILE") - $(jq -r '.tests[0].throughput_rps' "$PREV_FILE")" | bc -l) RPS"
crit=$(get_test_criteria_summary "$RESULTS_FILE" "LightLoad")
if [ -n "${crit:-}" ]; then
echo " Criteria: $crit"
fi
if ! jq -e . "$PREV_FILE" >/dev/null 2>&1; then
echo " Previous: (invalid JSON; skipping comparison)"
exit 0
fi
if ! jq -e . "$RESULTS_FILE" >/dev/null 2>&1; then
echo " Current: (invalid JSON; skipping comparison)"
exit 0
fi
prev=$(get_light_rps "$PREV_FILE")
curr=$(get_light_rps "$RESULTS_FILE")
echo " Previous: ${prev:-N/A} RPS"
echo " Current: ${curr:-N/A} RPS"
if printf '%s' "${prev:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$' && printf '%s' "${curr:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$'; then
delta=$(awk -v a="$curr" -v b="$prev" 'BEGIN{printf "%.4f", (a-b)}')
echo " Change: $delta RPS"
else
echo " Change: N/A"
fi
cpu=$(jq -r '.resources.process.cpu_percent_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
rss=$(jq -r '.resources.process.rss_max_kb // empty' "$RESULTS_FILE" 2>/dev/null || true)
disk=$(jq -r '.resources.disk.mbps_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
net_in=$(jq -r '.resources.network.lo0_in_bytes // empty' "$RESULTS_FILE" 2>/dev/null || true)
redis_mem=$(jq -r '.resources.redis.used_memory_bytes_after // empty' "$RESULTS_FILE" 2>/dev/null || true)
redis_clients=$(jq -r '.resources.redis.connected_clients_after // empty' "$RESULTS_FILE" 2>/dev/null || true)
if [ -n "${cpu:-}" ] || [ -n "${rss:-}" ] || [ -n "${disk:-}" ] || [ -n "${net_in:-}" ] || [ -n "${redis_mem:-}" ] || [ -n "${redis_clients:-}" ]; then
echo " Resources (current run):"
echo " CPU avg%: ${cpu:-N/A}"
echo " RSS max KB: ${rss:-N/A}"
echo " Disk MB/s avg: ${disk:-N/A}"
echo " Network lo0 in bytes: ${net_in:-N/A}"
echo " Redis used_memory (after): ${redis_mem:-N/A}"
echo " Redis connected_clients (after): ${redis_clients:-N/A}"
fi
server_cpu=$(jq -r '.resources_server.cpu_percent_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
server_disk_r=$(jq -r '.resources_server.disk_read_bytes_per_sec_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
server_disk_w=$(jq -r '.resources_server.disk_write_bytes_per_sec_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
server_net_rx=$(jq -r '.resources_server.net_rx_bytes_per_sec_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
server_mem=$(jq -r '.resources_server.mem_used_bytes_avg // empty' "$RESULTS_FILE" 2>/dev/null || true)
if [ -n "${server_cpu:-}" ] || [ -n "${server_disk_r:-}" ] || [ -n "${server_disk_w:-}" ] || [ -n "${server_net_rx:-}" ] || [ -n "${server_mem:-}" ]; then
echo " Server metrics (Prometheus):"
echo " CPU avg%: ${server_cpu:-N/A}"
echo " Mem used avg bytes: ${server_mem:-N/A}"
echo " Disk read B/s avg: ${server_disk_r:-N/A}"
echo " Disk write B/s avg: ${server_disk_w:-N/A}"
echo " Net rx B/s avg: ${server_net_rx:-N/A}"
fi
echo ""
echo "Medium Load Throughput:"
crit=$(get_test_criteria_summary "$RESULTS_FILE" "MediumLoad")
if [ -n "${crit:-}" ]; then
echo " Criteria: $crit"
fi
prev=$(jq -r '.tests[] | select(.name=="MediumLoad") | .throughput_rps // empty' "$PREV_FILE" 2>/dev/null || true)
curr=$(jq -r '.tests[] | select(.name=="MediumLoad") | .throughput_rps // empty' "$RESULTS_FILE" 2>/dev/null || true)
echo " Previous: ${prev:-N/A} RPS"
echo " Current: ${curr:-N/A} RPS"
if printf '%s' "${prev:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$' && printf '%s' "${curr:-}" | grep -Eq '^[0-9]+(\.[0-9]+)?$'; then
delta=$(awk -v a="$curr" -v b="$prev" 'BEGIN{printf "%.4f", (a-b)}')
echo " Change: $delta RPS"
else
echo " Change: N/A"
fi
fi

View file

@ -86,16 +86,15 @@ verify_sigstore() {
fi
local identity
identity="^https://github.com/${repo}/\.github/workflows/release\.yml@refs/tags/v.*$"
identity="^https://github.com/${repo}/\.forgejo/workflows/release-mirror\.yml@refs/tags/v.*$"
echo "[verify] verifying signature (repo identity pin may not apply for mirrored releases; pass --repo for logging only)" >&2
COSIGN_YES=true cosign verify-blob \
--certificate checksums.txt.cert \
--signature checksums.txt.sig \
--certificate-identity-regexp "$identity" \
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
checksums.txt >/dev/null
echo "[ok] checksums.txt signature verified (pinned to ${repo} release workflow)"
echo "[ok] checksums.txt signature verified (un-pinned identity; expected identity regexp: ${identity})"
}
verify_hashes() {

View file

@ -0,0 +1,20 @@
//nolint:revive // Package name 'api' is appropriate for this test package
package api_test
import (
"testing"
"github.com/jfraeys/fetch_ml/internal/api"
)
// TestWSHandlerNewQueueOpcodeExists pins the wire values of the
// annotation-related websocket opcodes so any accidental renumbering
// of the protocol constants fails loudly.
func TestWSHandlerNewQueueOpcodeExists(t *testing.T) {
	queueWithNote := api.OpcodeQueueJobWithNote
	if queueWithNote != 0x1B {
		t.Fatalf("expected OpcodeQueueJobWithNote to be 0x1B, got %d", queueWithNote)
	}
	annotateRun := api.OpcodeAnnotateRun
	if annotateRun != 0x1C {
		t.Fatalf("expected OpcodeAnnotateRun to be 0x1C, got %d", annotateRun)
	}
	setNarrative := api.OpcodeSetRunNarrative
	if setNarrative != 0x1D {
		t.Fatalf("expected OpcodeSetRunNarrative to be 0x1D, got %d", setNarrative)
	}
}

View file

@ -1,5 +1,5 @@
//nolint:revive // Package name 'api' is appropriate for this test package
package api
package api_test
import (
"log/slog"

View file

@ -0,0 +1,119 @@
package deployments
import (
"os"
"path/filepath"
"runtime"
"strings"
"testing"
"gopkg.in/yaml.v3"
)
// repoRoot returns the repository root directory, resolved relative to
// this test file's on-disk location (three directories above it).
func repoRoot(t *testing.T) string {
	t.Helper()
	_, filename, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatalf("failed to resolve caller path")
	}
	testDir := filepath.Dir(filename)
	return filepath.Clean(filepath.Join(testDir, "..", "..", ".."))
}
// asMap asserts that v is a map[string]any and returns it,
// failing the test otherwise.
func asMap(t *testing.T, v any) map[string]any {
	t.Helper()
	if m, ok := v.(map[string]any); ok {
		return m
	}
	t.Fatalf("expected map[string]any, got %T", v)
	return nil // unreachable: Fatalf stops the test goroutine
}
// asSlice asserts that v is a []any and returns it,
// failing the test otherwise.
func asSlice(t *testing.T, v any) []any {
	t.Helper()
	if s, ok := v.([]any); ok {
		return s
	}
	t.Fatalf("expected []any, got %T", v)
	return nil // unreachable: Fatalf stops the test goroutine
}
// TestProdTraefikComposeHasExpectedConfig parses the production compose
// file and verifies the Traefik wiring for the api-server service:
// required router/service labels, attachment to the "traefik" network,
// and that the network is declared external.
func TestProdTraefikComposeHasExpectedConfig(t *testing.T) {
	t.Parallel()
	composePath := filepath.Join(repoRoot(t), "deployments", "docker-compose.prod.yml")
	raw, err := os.ReadFile(composePath)
	if err != nil {
		t.Fatalf("read compose file: %v", err)
	}
	var doc map[string]any
	if err := yaml.Unmarshal(raw, &doc); err != nil {
		t.Fatalf("parse compose yaml: %v", err)
	}
	services := asMap(t, doc["services"])
	apiSvc := asMap(t, services["api-server"])

	// Collect the service's labels as strings.
	rawLabels := asSlice(t, apiSvc["labels"])
	labelStrs := make([]string, 0, len(rawLabels))
	for _, rawLabel := range rawLabels {
		ls, ok := rawLabel.(string)
		if !ok {
			t.Fatalf("expected label string, got %T", rawLabel)
		}
		labelStrs = append(labelStrs, ls)
	}
	hasLabelPrefix := func(prefix string) bool {
		for _, l := range labelStrs {
			if strings.HasPrefix(l, prefix) {
				return true
			}
		}
		return false
	}

	// Prefix matching lets value-bearing labels (rule, entrypoints, …)
	// vary while still requiring the label to be present.
	wantLabelPrefixes := []string{
		"traefik.enable=true",
		"traefik.docker.network=",
		"traefik.http.services.fetchml.loadbalancer.server.port=9101",
		"traefik.http.routers.fetchml.rule=",
		"traefik.http.routers.fetchml.entrypoints=",
		"traefik.http.routers.fetchml.tls=true",
	}
	for _, wantPrefix := range wantLabelPrefixes {
		if !hasLabelPrefix(wantPrefix) {
			t.Fatalf("missing label with prefix %q; labels=%v", wantPrefix, labelStrs)
		}
	}

	// The service must be attached to the shared "traefik" network.
	nets := asSlice(t, apiSvc["networks"])
	foundTraefikNet := false
	for _, n := range nets {
		ns, ok := n.(string)
		if !ok {
			t.Fatalf("expected network name string, got %T", n)
		}
		if ns == "traefik" {
			foundTraefikNet = true
			break
		}
	}
	if !foundTraefikNet {
		t.Fatalf("api-server is not attached to traefik network; networks=%v", nets)
	}

	// The traefik network itself must be declared external (created
	// outside this compose file).
	networks := asMap(t, doc["networks"])
	traefikNet := asMap(t, networks["traefik"])
	external, ok := traefikNet["external"].(bool)
	if !ok {
		t.Fatalf("expected networks.traefik.external to be bool, got %T", traefikNet["external"])
	}
	if !external {
		t.Fatalf("expected networks.traefik.external to be true")
	}
}

View file

@ -13,30 +13,16 @@ func TestPackageBlacklistEnforcement(t *testing.T) {
cfg := jupyter.DefaultEnhancedSecurityConfigFromEnv()
blocked := cfg.BlockedPackages
foundRequests := false
foundUrllib3 := false
foundHttpx := false
foundAiohttp := false
for _, pkg := range blocked {
if pkg == "requests" {
foundRequests = true
}
if pkg == "urllib3" {
foundUrllib3 = true
}
if pkg == "httpx" {
foundHttpx = true
if pkg == "aiohttp" {
foundAiohttp = true
}
}
if !foundRequests {
t.Fatalf("expected requests to be blocked by default")
}
if !foundUrllib3 {
t.Fatalf("expected urllib3 to be blocked by default")
}
if !foundHttpx {
t.Fatalf("expected httpx to be blocked by default")
if !foundAiohttp {
t.Fatalf("expected aiohttp to be blocked by default")
}
}
@ -74,7 +60,7 @@ func TestPackageValidation(t *testing.T) {
sm := jupyter.NewSecurityManager(logger, cfg)
pkgReq := &jupyter.PackageRequest{
PackageName: "requests",
PackageName: "aiohttp",
RequestedBy: "test-user",
Channel: "pypi",
Version: "2.28.0",

View file

@ -0,0 +1,76 @@
package jupyter_test
import (
"testing"
"github.com/jfraeys/fetch_ml/internal/api"
"github.com/jfraeys/fetch_ml/internal/queue"
)
// TestJupyterTaskErrorCode table-drives api.JupyterTaskErrorCode over the
// task states it must classify: nil tasks, cancellation, resource
// exhaustion, timeouts, network failures, configuration errors, and the
// generic/unknown fallbacks.
func TestJupyterTaskErrorCode(t *testing.T) {
	cases := []struct {
		name     string
		task     *queue.Task
		expected byte
	}{
		{name: "nil task", task: nil, expected: api.ErrorCodeUnknownError},
		{name: "cancelled task", task: &queue.Task{Status: "cancelled", Error: "user cancelled"}, expected: api.ErrorCodeJobCancelled},
		{name: "oom", task: &queue.Task{Status: "failed", Error: "out of memory"}, expected: api.ErrorCodeOutOfMemory},
		{name: "disk full", task: &queue.Task{Status: "failed", Error: "no space left on device"}, expected: api.ErrorCodeDiskFull},
		{name: "rate limit", task: &queue.Task{Status: "failed", Error: "rate limit exceeded"}, expected: api.ErrorCodeServiceUnavailable},
		{name: "timeout", task: &queue.Task{Status: "failed", Error: "context deadline exceeded"}, expected: api.ErrorCodeTimeout},
		{name: "network error", task: &queue.Task{Status: "failed", Error: "connection refused"}, expected: api.ErrorCodeNetworkError},
		{name: "queue not configured", task: &queue.Task{Status: "failed", Error: "task queue not configured"}, expected: api.ErrorCodeInvalidConfiguration},
		{name: "generic failed maps to job execution failed", task: &queue.Task{Status: "failed", Error: "something went wrong"}, expected: api.ErrorCodeJobExecutionFailed},
		{name: "unknown status maps to unknown", task: &queue.Task{Status: "running", Error: ""}, expected: api.ErrorCodeUnknownError},
	}
	for _, tc := range cases {
		tc := tc // capture the range variable for the subtest closure
		t.Run(tc.name, func(t *testing.T) {
			if got := api.JupyterTaskErrorCode(tc.task); got != tc.expected {
				t.Fatalf("expected error code %d, got %d", tc.expected, got)
			}
		})
	}
}

View file

@ -0,0 +1,109 @@
package manifest_test
import (
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/manifest"
)
// TestRunManifestWriteLoadAndMarkFinished exercises the full manifest
// round-trip: build a manifest, annotate it, apply a narrative patch,
// write it to disk, load it back, mark it finished, and write/load again.
func TestRunManifestWriteLoadAndMarkFinished(t *testing.T) {
	dir := t.TempDir()
	created := time.Date(2025, 12, 16, 0, 0, 0, 0, time.UTC)
	m := manifest.NewRunManifest("run-test-20251216-000000-12345678", "12345678", "job", created)
	m.CommitID = "deadbeef"
	m.PodmanImage = "python:3.11"
	// Two annotations: one valid, one with empty author/note. The later
	// len==1 assertion implies the empty annotation is discarded —
	// presumably by AddAnnotation itself; verify against its implementation.
	m.AddAnnotation(created.Add(1*time.Second), "alice", "initial hypothesis")
	m.AddAnnotation(created.Add(1*time.Second), "", "")
	// Padded hypothesis and tag strings (plus an empty tag): the
	// assertions below imply ApplyNarrativePatch trims whitespace and
	// drops empty tags.
	hyp := " bigger batch improves throughput "
	tags := []string{" ablation ", "", "batch-size"}
	m.ApplyNarrativePatch(manifest.NarrativePatch{
		Hypothesis: &hyp,
		Tags: &tags,
	})
	start := created.Add(2 * time.Second)
	m.MarkStarted(start)
	// First write/load round-trip.
	if err := m.WriteToDir(dir); err != nil {
		t.Fatalf("WriteToDir failed: %v", err)
	}
	loaded, err := manifest.LoadFromDir(dir)
	if err != nil {
		t.Fatalf("LoadFromDir failed: %v", err)
	}
	if loaded.RunID != m.RunID {
		t.Fatalf("run id mismatch: got %q want %q", loaded.RunID, m.RunID)
	}
	if loaded.StartedAt.IsZero() {
		t.Fatalf("expected started_at to be set")
	}
	// Only the non-empty annotation should survive the round-trip.
	if len(loaded.Annotations) != 1 {
		t.Fatalf("expected 1 annotation, got %d", len(loaded.Annotations))
	}
	if loaded.Narrative == nil {
		t.Fatalf("expected narrative to be set")
	}
	// Hypothesis trimmed of the surrounding spaces.
	if loaded.Narrative.Hypothesis != "bigger batch improves throughput" {
		t.Fatalf("unexpected hypothesis: %q", loaded.Narrative.Hypothesis)
	}
	// Empty tag dropped; remaining tags trimmed, original order kept.
	if len(loaded.Narrative.Tags) != 2 {
		t.Fatalf("expected 2 tags, got %d", len(loaded.Narrative.Tags))
	}
	if loaded.Narrative.Tags[0] != "ablation" || loaded.Narrative.Tags[1] != "batch-size" {
		t.Fatalf("unexpected tags: %#v", loaded.Narrative.Tags)
	}
	if loaded.Annotations[0].Author != "alice" {
		t.Fatalf("expected annotation author %q, got %q", "alice", loaded.Annotations[0].Author)
	}
	if loaded.Annotations[0].Timestamp.IsZero() {
		t.Fatalf("expected annotation timestamp to be set")
	}
	if loaded.Annotations[0].Note != "initial hypothesis" {
		t.Fatalf("expected annotation note %q, got %q", "initial hypothesis", loaded.Annotations[0].Note)
	}
	// Finish the loaded manifest (exit code 0, no error) and round-trip again.
	end := start.Add(5 * time.Second)
	exit := 0
	loaded.MarkFinished(end, &exit, nil)
	if loaded.TotalDurationMS == 0 {
		t.Fatalf("expected total duration to be set")
	}
	if err := loaded.WriteToDir(dir); err != nil {
		t.Fatalf("final WriteToDir failed: %v", err)
	}
	loaded2, err := manifest.LoadFromDir(dir)
	if err != nil {
		t.Fatalf("LoadFromDir (2) failed: %v", err)
	}
	// Exit code and positive duration must survive the second round-trip.
	if loaded2.ExitCode == nil || *loaded2.ExitCode != 0 {
		t.Fatalf("expected exit_code 0, got %#v", loaded2.ExitCode)
	}
	if loaded2.TotalDurationMS <= 0 {
		t.Fatalf("expected total duration > 0")
	}
}
// TestRunManifestApplyNarrativePatchPartialUpdate checks that successive
// narrative patches merge: a later patch setting only Context must not
// clear a Hypothesis applied earlier.
func TestRunManifestApplyNarrativePatchPartialUpdate(t *testing.T) {
	createdAt := time.Date(2026, 1, 7, 0, 0, 0, 0, time.UTC)
	man := manifest.NewRunManifest("run-test-20260107-000000-12345678", "12345678", "job", createdAt)
	hypothesis := "hyp1"
	man.ApplyNarrativePatch(manifest.NarrativePatch{Hypothesis: &hypothesis})
	contextNote := "ctx"
	man.ApplyNarrativePatch(manifest.NarrativePatch{Context: &contextNote})
	if man.Narrative == nil {
		t.Fatalf("expected narrative to be non-nil")
	}
	if man.Narrative.Hypothesis != "hyp1" {
		t.Fatalf("expected hypothesis to remain set")
	}
	if man.Narrative.Context != "ctx" {
		t.Fatalf("expected context to be set")
	}
}

View file

@ -0,0 +1,66 @@
package queue_test
import (
"os"
"path/filepath"
"testing"
"time"
"github.com/google/uuid"
"github.com/jfraeys/fetch_ml/internal/queue"
)
// TestBackendFallbackToFilesystemWhenRedisUnavailable configures a Redis
// backend with an unreachable address (port 0) and FallbackToFilesystem
// enabled, then verifies that NewBackend returns a filesystem queue and
// that an added task materializes as a pending JSON file on disk.
func TestBackendFallbackToFilesystemWhenRedisUnavailable(t *testing.T) {
	queueDir := filepath.Join(t.TempDir(), "queue-fs")
	cfg := queue.BackendConfig{
		Backend:              queue.QueueBackendRedis,
		RedisAddr:            "127.0.0.1:0", // deliberately unreachable
		FallbackToFilesystem: true,
		FilesystemPath:       queueDir,
	}
	backend, err := queue.NewBackend(cfg)
	if err != nil {
		t.Fatalf("expected fallback backend, got error: %v", err)
	}
	if _, ok := backend.(*queue.FilesystemQueue); !ok {
		t.Fatalf("expected filesystem backend, got %T", backend)
	}
	task := &queue.Task{
		ID:        uuid.New().String(),
		JobName:   "job-1",
		Args:      "--epochs 1",
		Status:    "queued",
		Priority:  5,
		CreatedAt: time.Now().UTC(),
		UserID:    "local",
		Username:  "local",
		CreatedBy: "local",
		Metadata:  map[string]string{"commit_id": "deadbeef"},
	}
	if err := backend.AddTask(task); err != nil {
		t.Fatalf("add task: %v", err)
	}
	pendingFile := filepath.Join(queueDir, "pending", "entries", task.ID+".json")
	if _, err := os.Stat(pendingFile); err != nil {
		t.Fatalf("expected pending task file %s to exist: %v", pendingFile, err)
	}
}
// TestBackendFilesystemDirect verifies that explicitly selecting the
// filesystem backend yields a *queue.FilesystemQueue.
func TestBackendFilesystemDirect(t *testing.T) {
	queueDir := filepath.Join(t.TempDir(), "queue-fs")
	backend, err := queue.NewBackend(queue.BackendConfig{
		Backend:        queue.QueueBackendFS,
		FilesystemPath: queueDir,
	})
	if err != nil {
		t.Fatalf("expected filesystem backend, got error: %v", err)
	}
	if _, ok := backend.(*queue.FilesystemQueue); !ok {
		t.Fatalf("expected filesystem backend, got %T", backend)
	}
}

View file

@ -2,9 +2,14 @@
package tools
import (
"bufio"
"encoding/json"
"fmt"
"io"
"os"
"strconv"
"strings"
"text/tabwriter"
"time"
)
@ -14,6 +19,71 @@ type PerformanceRegressionDetector struct {
Threshold float64
}
// ParseGoBenchFile reads a file containing `go test -bench` output and
// returns the benchmark results parsed from it. The file handle is
// closed before returning.
func ParseGoBenchFile(path string) ([]BenchmarkResult, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open benchmark file: %w", err)
	}
	defer func() {
		_ = file.Close()
	}()
	return ParseGoBenchOutput(file)
}
// ParseGoBenchOutput parses `go test -bench` output.
//
// Each "Benchmark..." line may carry several value/unit pairs; the
// recognized units are ns/op, B/op and allocs/op. Every recognized
// pair becomes its own BenchmarkResult whose Name is the benchmark
// name suffixed with "/<unit>". Unparseable values and unknown units
// are skipped silently; all results share one capture timestamp.
func ParseGoBenchOutput(r io.Reader) ([]BenchmarkResult, error) {
	// Non-nil empty slice, so callers (and JSON encoding) see [] not null.
	results := make([]BenchmarkResult, 0)
	capturedAt := time.Now()
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if !strings.HasPrefix(line, "Benchmark") {
			continue
		}
		fields := strings.Fields(line)
		// Shortest useful line: name, iteration count, value, unit.
		if len(fields) < 4 {
			continue
		}
		benchName := fields[0]
		// Walk the unit columns; the value precedes its unit.
		for i := 3; i < len(fields); i++ {
			unit := fields[i]
			switch unit {
			case "ns/op", "B/op", "allocs/op":
			default:
				continue
			}
			value, parseErr := strconv.ParseFloat(fields[i-1], 64)
			if parseErr != nil {
				continue
			}
			results = append(results, BenchmarkResult{
				Name:      benchName + "/" + unit,
				Value:     value,
				Unit:      unit,
				Timestamp: capturedAt,
			})
		}
	}
	if err := sc.Err(); err != nil {
		return nil, fmt.Errorf("failed reading benchmark output: %w", err)
	}
	return results, nil
}
// BenchmarkResult represents a single benchmark result
type BenchmarkResult struct {
Name string `json:"name"`
@ -85,6 +155,15 @@ func (prd *PerformanceRegressionDetector) AnalyzeResults(
return nil, fmt.Errorf("failed to load baseline: %w", err)
}
return prd.AnalyzeParsedResults(baseline, current)
}
// AnalyzeParsedResults analyzes current results against a provided baseline.
func (prd *PerformanceRegressionDetector) AnalyzeParsedResults(
baseline []BenchmarkResult,
current []BenchmarkResult,
) (*RegressionReport, error) {
report := &RegressionReport{
Regressions: []Regression{},
Improvements: []Improvement{},
@ -101,6 +180,10 @@ func (prd *PerformanceRegressionDetector) AnalyzeResults(
continue // Skip new benchmarks without baseline
}
if baselineResult.Value == 0 {
continue
}
percentChange := ((currentResult.Value - baselineResult.Value) / baselineResult.Value) * 100
if percentChange > prd.Threshold {
@ -164,25 +247,45 @@ func (prd *PerformanceRegressionDetector) SaveBaseline(results []BenchmarkResult
func (prd *PerformanceRegressionDetector) PrintReport(report *RegressionReport) {
fmt.Printf("Performance Regression Analysis Report\n")
fmt.Printf("=====================================\n\n")
fmt.Printf("Summary: %s\n\n", report.Summary)
fmt.Printf("Summary:\t%s\n", report.Summary)
fmt.Printf("Threshold:\t%.1f%%\n\n", prd.Threshold)
if len(report.Regressions) > 0 {
fmt.Printf("Regressions (%d):\n", len(report.Regressions))
fmt.Printf("Regressions (%d)\n", len(report.Regressions))
w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
_, _ = fmt.Fprintln(w, "Severity\tBenchmark\tBaseline\tCurrent\tChange")
_, _ = fmt.Fprintln(w, "--------\t---------\t--------\t-------\t------")
for _, regression := range report.Regressions {
fmt.Printf(" [%s] %s: %.2f -> %.2f (%.1f%% worse)\n",
regression.Severity, regression.Benchmark,
regression.BaselineValue, regression.CurrentValue, regression.PercentChange)
_, _ = fmt.Fprintf(
w,
"%s\t%s\t%.2f\t%.2f\t%.1f%% worse\n",
regression.Severity,
regression.Benchmark,
regression.BaselineValue,
regression.CurrentValue,
regression.PercentChange,
)
}
_ = w.Flush()
fmt.Println()
}
if len(report.Improvements) > 0 {
fmt.Printf("Improvements (%d):\n", len(report.Improvements))
fmt.Printf("Improvements (%d)\n", len(report.Improvements))
w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
_, _ = fmt.Fprintln(w, "Benchmark\tBaseline\tCurrent\tChange")
_, _ = fmt.Fprintln(w, "---------\t--------\t-------\t------")
for _, improvement := range report.Improvements {
fmt.Printf(" [+] %s: %.2f -> %.2f (%.1f%% better)\n",
_, _ = fmt.Fprintf(
w,
"%s\t%.2f\t%.2f\t%.1f%% better\n",
improvement.Benchmark,
improvement.BaselineValue, improvement.CurrentValue, improvement.PercentChange)
improvement.BaselineValue,
improvement.CurrentValue,
-improvement.PercentChange,
)
}
_ = w.Flush()
fmt.Println()
}
}