From 5144d291cbb01a7f77e1e5549bc5539e19023e19 Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Thu, 12 Feb 2026 12:05:27 -0500 Subject: [PATCH] docs: comprehensive documentation updates - Add architecture, CI/CD, CLI reference documentation - Update installation, operations, and quick-start guides - Add Jupyter workflow and queue documentation - New landing page and research runner plan --- CHANGELOG.md | 1 + DEVELOPMENT.md | 79 +-- README.md | 12 +- docs/.hugo_build.lock | 0 docs/go.mod | 2 + docs/go.sum | 2 + docs/hugo.toml | 2 +- ...s_b807c86e8030af4cdc30edccea379f5f.content | 1 + ...scss_b807c86e8030af4cdc30edccea379f5f.json | 1 + docs/src/architecture.md | 5 +- docs/src/cicd.md | 20 +- docs/src/cli-reference.md | 39 +- docs/src/installation.md | 2 +- docs/src/jupyter-workflow.md | 25 +- docs/src/{index.md => landing.md} | 3 +- docs/src/operations.md | 5 +- docs/src/queue.md | 5 +- docs/src/quick-start.md | 7 +- docs/src/redis-ha.md | 5 +- docs/src/research-runner-plan.md | 667 ++++++++++++++++++ docs/src/user-permissions.md | 5 +- docs/src/validate.md | 3 +- docs/src/zig-cli.md | 9 +- 23 files changed, 790 insertions(+), 110 deletions(-) create mode 100644 docs/.hugo_build.lock create mode 100644 docs/go.sum create mode 100644 docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content create mode 100644 docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json rename docs/src/{index.md => landing.md} (98%) create mode 100644 docs/src/research-runner-plan.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c319c8..685a5c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Worker: stage verified `snapshot_id` into each task workspace and expose it to training code via `FETCH_ML_SNAPSHOT_DIR`. - Worker: provenance enforcement is trustworthiness-by-default (fail-closed) with `provenance_best_effort` opt-in. - CLI/API: add `ml validate` to fetch a validation report (commit/task) for provenance + integrity checks. 
+- Worker: persist discovered artifacts into `run_manifest.json` (`artifacts.discovery_time`, `artifacts.files[]`, `artifacts.total_size_bytes`) at task completion. - Worker: best-effort environment prewarm can build a warmed Podman image keyed by `deps_manifest_sha256` and reuse it for subsequent tasks. - Worker: export env prewarm hit/miss/built counters and total build time via the worker Prometheus metrics endpoint. - API/Worker: `ml prune` also triggers best-effort garbage collection of warmed env images. diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 20e0574..e03e3df 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -9,11 +9,8 @@ This guide helps developers set up their environment and contribute effectively git clone cd fetch_ml -# Install dependencies -make setup-dev - # Start development environment -make dev-start +make dev-up # Run tests make test @@ -24,11 +21,10 @@ make test ### Prerequisites - Go 1.25+ -- Zig 0.11+ +- Zig 0.15+ - Python 3.11+ - Docker & Docker Compose - Redis -- Node.js (for some tools) ### Local Development Setup @@ -40,15 +36,15 @@ make test 2. **Install Zig tools** ```bash - # Install Zig language server - zig build --install zls + # Zig is required for building the CLI and running CLI tests + zig version ``` 3. **Setup Python environment** ```bash python -m venv venv source venv/bin/activate # or venv\Scripts\activate on Windows - pip install -r requirements-dev.txt + # Python is optional (used for a few helper scripts) ``` 4. **Optional: Install pre-commit hooks** @@ -69,11 +65,8 @@ make test 2. 
Make your changes with live feedback: ```bash - # Go development with hot reload - make dev-go - - # Zig development with build on save - make dev-zig + # Build Go services + Zig CLI + make dev # Run specific tests make test-unit @@ -84,11 +77,10 @@ make test ```bash # Lint and format (if you have tools configured) make lint - make format # Full test suite - make test-all - + make test-full + # Optional: Pre-commit checks pre-commit run --all-files ``` @@ -105,13 +97,14 @@ make test make test-unit # Unit tests only make test-integration # Integration tests only make test-e2e # End-to-end tests only -make test-performance # Performance tests only - +make benchmark # Benchmarks +make load-test # Load tests + # Run with coverage make test-coverage - + # Watch mode for development -make test-watch +# (no watch mode target; run specific package tests with go test -run) ``` ## Code Quality @@ -145,50 +138,14 @@ test: add or update tests chore: maintenance tasks ``` -## Debugging - -### Go Debugging - -```bash -# Debug with delve -dlv debug cmd/api-server/main.go - -# Debug tests -dlv test ./internal/... 
- -# Profile with pprof -go tool pprof http://localhost:6060/debug/pprof/profile -``` - -### Zig Debugging - -```bash -# Debug build -zig build-exe -O Debug -fstrip=false your_file.zig - -# Test with debugging -zig test --gdb your_file.zig -``` - -### Container Debugging - -```bash -# Debug containers -docker-compose exec api-server bash -docker-compose logs -f api-server - -# Inspect running processes -docker-compose exec api-server ps aux -``` - ## Performance Monitoring ### Local Monitoring ```bash # Start monitoring stack -make monitoring-start - +make dev-up + # View metrics open http://localhost:3000 # Grafana open http://localhost:9090 # Prometheus @@ -198,8 +155,8 @@ open http://localhost:9090 # Prometheus ```bash # Load test API -make load-test-api - +make load-test + # Performance benchmarks make benchmark diff --git a/README.md b/README.md index 97623d3..2d4fd43 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Verify the signature (keyless Sigstore) using cosign: cosign verify-blob \ --certificate checksums.txt.cert \ --signature checksums.txt.sig \ - --certificate-identity-regexp "^https://github.com///.github/workflows/release.yml@refs/tags/v.*$" \ + --certificate-identity-regexp "^https://github.com/jfraeysd/fetch_ml/.forgejo/workflows/release-mirror.yml@refs/tags/v.*$" \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ checksums.txt ``` @@ -40,16 +40,16 @@ Example (CLI on Linux x86_64): ```bash # Download -curl -fsSLO https://github.com///releases/download//ml-linux-x86_64.tar.gz -curl -fsSLO https://github.com///releases/download//checksums.txt -curl -fsSLO https://github.com///releases/download//checksums.txt.sig -curl -fsSLO https://github.com///releases/download//checksums.txt.cert +curl -fsSLO https://github.com/jfraeysd/fetch_ml/releases/download//ml-linux-x86_64.tar.gz +curl -fsSLO https://github.com/jfraeysd/fetch_ml/releases/download//checksums.txt +curl -fsSLO 
https://github.com/jfraeysd/fetch_ml/releases/download//checksums.txt.sig +curl -fsSLO https://github.com/jfraeysd/fetch_ml/releases/download//checksums.txt.cert # Verify cosign verify-blob \ --certificate checksums.txt.cert \ --signature checksums.txt.sig \ - --certificate-identity-regexp "^https://github.com///.github/workflows/release.yml@refs/tags/v.*$" \ + --certificate-identity-regexp "^https://github.com/jfraeysd/fetch_ml/.forgejo/workflows/release-mirror.yml@refs/tags/v.*$" \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ checksums.txt sha256sum -c --ignore-missing checksums.txt diff --git a/docs/.hugo_build.lock b/docs/.hugo_build.lock new file mode 100644 index 0000000..e69de29 diff --git a/docs/go.mod b/docs/go.mod index 03f40e4..cf710a2 100644 --- a/docs/go.mod +++ b/docs/go.mod @@ -1,3 +1,5 @@ module github.com/jfraeys/fetch_ml/docs go 1.21 + +require github.com/alex-shpak/hugo-book v0.0.0-20251118074854-b7f9c8cb0f51 // indirect diff --git a/docs/go.sum b/docs/go.sum new file mode 100644 index 0000000..f7b1c37 --- /dev/null +++ b/docs/go.sum @@ -0,0 +1,2 @@ +github.com/alex-shpak/hugo-book v0.0.0-20251118074854-b7f9c8cb0f51 h1:HHxBwO6r6h3AUflUc/X/Gf5UrfTY5rZEbD7QoGzbVvU= +github.com/alex-shpak/hugo-book v0.0.0-20251118074854-b7f9c8cb0f51/go.mod h1:L4NMyzbn15fpLIpmmtDg9ZFFyTZzw87/lk7M2bMQ7ds= diff --git a/docs/hugo.toml b/docs/hugo.toml index 6a8e5e1..b55d578 100644 --- a/docs/hugo.toml +++ b/docs/hugo.toml @@ -9,7 +9,7 @@ publishDir = "_site" enableGitInfo = true -disableKinds = ["taxonomy", "taxonomyTerm"] +disableKinds = ["taxonomy"] [module] [[module.imports]] diff --git a/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content new file mode 100644 index 0000000..63f7a65 --- /dev/null +++ b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content @@ -0,0 +1 @@ +@charset 
"UTF-8";:root{--font-size:16px;--font-size-smaller:0.875rem;--font-size-smallest:0.75rem;--body-font-weight:400;--body-background:white;--body-background-tint:transparent;--body-font-color:black;--border-radius:0.25rem}/*!modern-normalize v3.0.1 | MIT License | https://github.com/sindresorhus/modern-normalize*/*,::before,::after{box-sizing:border-box}html{font-family:system-ui,segoe ui,Roboto,Helvetica,Arial,sans-serif,apple color emoji,segoe ui emoji;line-height:1.15;-webkit-text-size-adjust:100%;tab-size:4}body{margin:0}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Consolas,liberation mono,Menlo,monospace;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-color:initial}button,input,optgroup,select,textarea{font-family:inherit;font-size:100%;line-height:1.15;margin:0}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button}legend{padding:0}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}.flex{display:flex}.flex.gap{gap:1rem}.flex-auto{flex:auto}.flex-even{flex:1 1}.flex-wrap{flex-wrap:wrap}.justify-start{justify-content:flex-start}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.align-center{align-items:center}.mx-auto{margin:0 
auto}.text-center{text-align:center}.text-left{text-align:left}.text-right{text-align:right}.text-small,small{font-size:.875em}.hidden{display:none}input.toggle{height:0;width:0;overflow:hidden;opacity:0;position:absolute}html{font-size:var(--font-size);scroll-behavior:smooth;touch-action:manipulation;scrollbar-gutter:stable}body{min-width:20rem;color:var(--body-font-color);background:var(--body-background)var(--body-background-tint);font-weight:var(--body-font-weight);text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}h1,h2,h3,h4,h5,h6{font-weight:inherit}a{flex:auto;align-items:center;gap:.5em;text-decoration:none;cursor:default}a[href],a[role=button]{color:var(--color-link);cursor:pointer}:focus-visible,input.toggle:focus-visible+label{outline-style:auto;outline-color:var(--color-link)}nav ul{padding:0;margin:0;list-style:none}nav ul li{position:relative}nav ul a{padding:.5em 0;display:flex;transition:opacity .1s ease-in-out}nav ul a[href]:hover,nav ul a[role=button]:hover{opacity:.5}nav ul ul{padding-inline-start:1.5em}ul.pagination{display:flex;justify-content:center;list-style-type:none;padding-inline-start:0}ul.pagination .page-item a{padding:1rem}.container{max-width:80rem;margin:0 auto}.book-icon{filter:var(--icon-filter)}a .book-icon{height:1em;width:1em}.book-brand{margin-top:0;margin-bottom:1rem}.book-brand img{height:1.5em;width:1.5em}.book-menu{flex:0 0 16rem;font-size:var(--font-size-smaller)}.book-menu .book-menu-content{width:16rem;padding:1rem;position:fixed;top:0;bottom:0;overflow-x:hidden;overflow-y:auto}.book-menu a,.book-menu label{color:inherit;word-wrap:break-word;display:flex}.book-menu a.active{color:var(--color-link)}.book-menu label>img:last-child{height:1em;width:1em;cursor:pointer;align-self:center;transition:transform .1s ease-in-out}.book-menu input.toggle+label+ul{display:none}.book-menu input.toggle:checked+label>img:last-child{transform:rotate(90deg)}.book-menu 
input.toggle:checked+label+ul{display:block}body[dir=rtl] .book-menu input.toggle+label>img:last-child{transform:rotate(180deg)}body[dir=rtl] .book-menu input.toggle:checked+label>img:last-child{transform:rotate(90deg)}.book-section-flat{margin:1rem 0}.book-section-flat>a,.book-section-flat>span,.book-section-flat>label{font-weight:bolder}.book-section-flat>ul{padding-inline-start:0}.book-page{min-width:20rem;flex-grow:1;padding:1rem}.book-post{margin-bottom:4rem}.book-post .book-post-date img{height:1em;width:1em;margin-inline-end:.5em}.book-post .book-post-content{margin-top:1rem}.book-post .book-post-thumbnail{flex:0 0 34%}.book-post .book-post-thumbnail img{width:100%;aspect-ratio:4/3;object-fit:cover}.book-header{margin-bottom:1rem}.book-header label{line-height:0}.book-header h3{overflow:hidden;text-overflow:ellipsis;margin:0 1rem}.book-layout-landing .book-header{display:block;position:relative;z-index:1}.book-layout-landing .book-header nav>ul{display:flex;gap:1rem;justify-content:end}.book-layout-landing .book-header nav>ul>li{display:block;white-space:nowrap}.book-layout-landing .book-header nav>ul>li>ul{display:none;position:absolute;padding:0}.book-layout-landing .book-header nav>ul>li:hover>ul,.book-layout-landing .book-header nav>ul>li:focus-within>ul{display:block}.book-search{position:relative;margin:.5rem 0}.book-search input{width:100%;padding:.5rem;border:1px solid var(--gray-200);border-radius:var(--border-radius);background:var(--gray-100);color:var(--body-font-color)}.book-search input:required+.book-search-spinner{display:block}.book-search .book-search-spinner{position:absolute;top:0;margin:.5rem;margin-inline-start:calc(100% - 1.5rem);width:1rem;height:1rem;border:1px solid transparent;border-top-color:var(--body-font-color);border-radius:50%;animation:spin 1s ease infinite}@keyframes spin{100%{transform:rotate(360deg)}}.book-search ul a{padding-bottom:0}.book-search small{opacity:.5}.book-toc{flex:0 0 
16rem;font-size:var(--font-size-smallest)}.book-toc .book-toc-content{width:16rem;padding:1rem;position:fixed;top:0;bottom:0;overflow-x:hidden;overflow-y:auto}.book-toc a{display:block}.book-toc img{height:1em;width:1em}.book-toc nav>ul>li:first-child{margin-top:0}.book-footer{padding-top:1rem;font-size:var(--font-size-smaller)}.book-footer a{margin:.25rem 0;padding:.25rem 0}.book-comments{margin-top:1rem}.book-copyright{margin-top:1rem}.book-languages{margin-bottom:1rem}.book-languages span{padding:0}.book-languages ul{padding-inline-start:1.5em}.book-menu-content,.book-toc-content{transition:.2s ease-in-out;transition-property:transform,margin,opacity,visibility;will-change:transform,margin,opacity}@media screen and (max-width:56rem){.book-menu{visibility:hidden;margin-inline-start:-16rem;z-index:1}.book-menu .book-menu-content{background:var(--body-background)}.book-toc{display:none}.book-header{display:block}.book-post-container{flex-direction:column-reverse}#menu-control,#toc-control{display:inline}#menu-control:checked~main .book-menu{visibility:initial}#menu-control:checked~main .book-menu .book-menu-content{transform:translateX(16rem);box-shadow:0 0 .5rem rgba(0,0,0,.1)}#menu-control:checked~main .book-page{opacity:.25}#menu-control:checked~main .book-menu-overlay{display:block;position:fixed;top:0;bottom:0;left:0;right:0}#toc-control:checked~main .book-header aside{display:block}body[dir=rtl] #menu-control:checked~main .book-menu .book-menu-content{transform:translateX(-16rem)}}@media screen and (min-width:80rem){.book-page,.book-menu .book-menu-content,.book-toc .book-toc-content{padding:2rem 1rem}}@media print{.book-menu,.book-footer,.book-toc{display:none}.book-header,.book-header aside{display:block}main{display:block!important}}.markdown{line-height:1.6}.markdown>:first-child{margin-top:0}.markdown h1,.markdown h2,.markdown h3,.markdown h4,.markdown h5,.markdown h6{font-weight:inherit;line-height:1;margin-top:1.5em;margin-bottom:1rem}.markdown h1 
a.anchor,.markdown h2 a.anchor,.markdown h3 a.anchor,.markdown h4 a.anchor,.markdown h5 a.anchor,.markdown h6 a.anchor{opacity:0;font-size:.75em;margin-inline-start:.25em}.markdown h1:hover a.anchor,.markdown h1 a.anchor:focus-visible,.markdown h2:hover a.anchor,.markdown h2 a.anchor:focus-visible,.markdown h3:hover a.anchor,.markdown h3 a.anchor:focus-visible,.markdown h4:hover a.anchor,.markdown h4 a.anchor:focus-visible,.markdown h5:hover a.anchor,.markdown h5 a.anchor:focus-visible,.markdown h6:hover a.anchor,.markdown h6 a.anchor:focus-visible{opacity:initial;text-decoration:none}.markdown h1{font-size:2rem}.markdown h2{font-size:1.5rem}.markdown h3{font-size:1.25rem}.markdown h4{font-size:1.125rem}.markdown h5{font-size:1rem}.markdown h6{font-size:.875rem}.markdown b,.markdown optgroup,.markdown strong{font-weight:bolder}.markdown a{text-decoration:none}.markdown a[href]:hover{text-decoration:underline}.markdown a[href]:visited{color:var(--color-visited-link)}.markdown img{max-width:100%;height:auto}.markdown code{direction:ltr;unicode-bidi:embed;padding:.125em .25em;background:var(--gray-100);border:1px solid var(--gray-200);border-radius:var(--border-radius);font-size:.875em}.markdown pre{padding:1rem;background:var(--gray-100);border:1px solid var(--gray-200);border-radius:var(--border-radius);overflow-x:auto}.markdown pre:focus{outline-style:auto;outline-color:var(--color-link)}.markdown pre code{padding:0;border:0;background:0 0}.markdown p{word-wrap:break-word}.markdown blockquote{margin:1rem 0;padding:.5rem 1rem .5rem .75rem;border-inline-start:.25rem solid var(--gray-200);border-radius:var(--border-radius)}.markdown blockquote :first-child{margin-top:0}.markdown blockquote :last-child{margin-bottom:0}.markdown table{overflow:auto;display:block;border-spacing:0;border-collapse:collapse;margin-top:1rem;margin-bottom:1rem}.markdown table tr th,.markdown table tr td{padding:.5rem 1rem;border:1px solid var(--gray-200);text-align:start}.markdown table 
tr:nth-child(2n){background:var(--gray-100)}.markdown hr{height:1px;border:none;background:var(--gray-200)}.markdown ul,.markdown ol{padding-inline-start:2rem;word-wrap:break-word}.markdown dl dt{font-weight:bolder;margin-top:1rem}.markdown dl dd{margin-inline-start:0;margin-bottom:1rem}.markdown .highlight{direction:ltr;unicode-bidi:embed;border-radius:var(--border-radius)}.markdown .highlight table tbody{border:1px solid var(--gray-200)}.markdown .highlight table tr pre{border:0}.markdown .highlight table tr td pre code>span{display:flex}.markdown .highlight table tr td:nth-child(1) pre{margin:0;padding-inline-end:0}.markdown .highlight table tr td:nth-child(2) pre{margin:0;padding-inline-start:0}.markdown details{padding:1rem;margin:1rem 0;border:1px solid var(--gray-200);border-radius:var(--border-radius)}.markdown details summary{line-height:1;padding:1rem;margin:-1rem;cursor:pointer;list-style:none}.markdown details summary::before{content:"›";display:inline-block;margin-inline-end:.5rem;transition:transform .1s ease-in-out}.markdown details[open] summary{margin-bottom:0}.markdown details[open] summary::before{transform:rotate(90deg)}.markdown figure{margin:1rem 0}.markdown figure figcaption{margin-top:1rem}.markdown-inner>:first-child,.markdown .book-steps>ol>li>:first-child,.markdown figure figcaption>:first-child{margin-top:0}.markdown-inner>:last-child,.markdown .book-steps>ol>li>:last-child,.markdown figure figcaption>:last-child{margin-bottom:0}.markdown .book-tabs{margin-top:1rem;margin-bottom:1rem;border:1px solid var(--gray-200);border-radius:var(--border-radius);display:flex;flex-wrap:wrap}.markdown .book-tabs label{display:inline-block;padding:.5rem 1rem;border-bottom:1px transparent;cursor:pointer}.markdown .book-tabs .book-tabs-content{order:999;width:100%;border-top:1px solid var(--gray-100);padding:1rem;display:none}.markdown .book-tabs input[type=radio]:checked+label{border-bottom:1px solid var(--color-link)}.markdown .book-tabs 
input[type=radio]:checked+label+.book-tabs-content{display:block}.markdown .book-columns{gap:1rem}.markdown .book-columns>div{margin:1rem 0;min-width:13.2rem}.markdown .book-columns>ul{list-style:none;display:flex;padding:0;flex-wrap:wrap;gap:1rem}.markdown .book-columns>ul>li{flex:1 1;min-width:13.2rem}.markdown a.book-btn[href]{display:inline-block;font-size:var(--font-size-smaller);color:var(--color-link);line-height:2rem;padding:0 1rem;border:1px solid var(--color-link);border-radius:var(--border-radius);cursor:pointer}.markdown a.book-btn[href]:hover{text-decoration:none}.markdown .book-hint.note{border-color:var(--color-accent-note);background-color:var(--color-accent-note-tint)}.markdown .book-hint.tip{border-color:var(--color-accent-tip);background-color:var(--color-accent-tip-tint)}.markdown .book-hint.important{border-color:var(--color-accent-important);background-color:var(--color-accent-important-tint)}.markdown .book-hint.warning{border-color:var(--color-accent-warning);background-color:var(--color-accent-warning-tint)}.markdown .book-hint.caution{border-color:var(--color-accent-caution);background-color:var(--color-accent-caution-tint)}.markdown .book-hint.default{border-color:var(--color-accent-default);background-color:var(--color-accent-default-tint)}.markdown .book-hint.info{border-color:var(--color-accent-info);background-color:var(--color-accent-info-tint)}.markdown .book-hint.success{border-color:var(--color-accent-success);background-color:var(--color-accent-success-tint)}.markdown .book-hint.danger{border-color:var(--color-accent-danger);background-color:var(--color-accent-danger-tint)}.markdown .book-badge{display:inline-block;font-size:var(--font-size-smaller);font-weight:var(--body-font-weight);vertical-align:middle;border-radius:var(--border-radius);border:1px solid var(--accent-color);overflow:hidden;text-wrap:nowrap;color:var(--body-font-color)}.markdown .book-badge.note{--accent-color:var(--color-accent-note)}.markdown 
.book-badge.tip{--accent-color:var(--color-accent-tip)}.markdown .book-badge.important{--accent-color:var(--color-accent-important)}.markdown .book-badge.warning{--accent-color:var(--color-accent-warning)}.markdown .book-badge.caution{--accent-color:var(--color-accent-caution)}.markdown .book-badge.default{--accent-color:var(--color-accent-default)}.markdown .book-badge.info{--accent-color:var(--color-accent-info)}.markdown .book-badge.success{--accent-color:var(--color-accent-success)}.markdown .book-badge.danger{--accent-color:var(--color-accent-danger)}.markdown .book-badge span{display:inline-block;padding:0 .5rem}.markdown .book-badge span.book-badge-value{color:var(--body-background);background-color:var(--accent-color)}.markdown .book-steps{position:relative}.markdown .book-steps>ol{counter-reset:steps;list-style:none;padding-inline-start:1.25rem;margin-top:2rem}.markdown .book-steps>ol>li::before{content:counter(steps);counter-increment:steps;position:absolute;display:flex;justify-content:center;left:.5rem;height:1.5rem;width:1.5rem;padding:.25rem;border-radius:.5rem;white-space:nowrap;line-height:1rem;color:var(--body-background);background:var(--gray-500);outline:.25rem solid var(--body-background)}.markdown .book-steps>ol>li{border-inline-start:1px solid var(--gray-500);padding-inline-start:3rem;padding-bottom:2rem}.markdown .book-steps>ol>li:last-child{border:0}.markdown .book-card{display:block;overflow:hidden;height:100%;border-radius:var(--border-radius);border:1px solid var(--gray-200)}.markdown .book-card>a{display:block;height:100%}.markdown .book-card>a[href],.markdown .book-card>a[href]:visited{color:var(--body-font-color)}.markdown .book-card>a[href]:hover{text-decoration:none;background:var(--gray-100)}.markdown .book-card>a>img,.markdown .book-card>img{width:100%;display:block;aspect-ratio:4/3;object-fit:cover}.markdown .book-card .markdown-inner,.markdown .book-card figure figcaption,.markdown figure .book-card figcaption,.markdown 
.book-card .book-steps>ol>li{padding:1rem}.markdown .book-image input+img{cursor:zoom-in;transition:transform .2s ease-in-out}.markdown .book-image input:checked+img{position:fixed;top:0;left:0;right:0;bottom:0;background:var(--body-background);object-fit:contain;width:100%;height:100%;z-index:1;cursor:zoom-out;padding:1rem}.markdown .book-asciinema{margin:1rem 0}.markdown .book-hero{min-height:24rem;align-content:center}.markdown .book-hero h1{font-size:3em}.markdown .book-codeblock-filename{background:var(--gray-100);border:1px solid var(--gray-200);border-bottom:0;font-size:var(--font-size-smaller);margin-top:1rem;padding:.25rem .5rem;border-start-start-radius:var(--border-radius);border-start-end-radius:var(--border-radius)}.markdown .book-codeblock-filename a{color:var(--body-font-color)}.markdown .book-codeblock-filename+.highlight pre{margin-top:0;border-start-start-radius:0;border-start-end-radius:0}:root{--body-background:white;--body-background-tint:none;--body-font-color:black;--color-link:#0055bb;--color-visited-link:#5500bb;--icon-filter:none;--gray-100:#f8f9fa;--gray-200:#e9ecef;--gray-500:#adb5bd;--color-accent-default:#64748b;--color-accent-default-tint:rgba(100, 116, 139, 0.1);--color-accent-note:#4486dd;--color-accent-note-tint:rgba(68, 134, 221, 0.1);--color-accent-tip:#3bad3b;--color-accent-tip-tint:rgba(59, 173, 59, 0.1);--color-accent-important:#8144dd;--color-accent-important-tint:rgba(129, 68, 221, 0.1);--color-accent-warning:#f59e42;--color-accent-warning-tint:rgba(245, 158, 66, 0.1);--color-accent-caution:#d84747;--color-accent-caution-tint:rgba(216, 71, 71, 0.1);--color-accent-info:#4486dd;--color-accent-info-tint:rgba(68, 134, 221, 0.1);--color-accent-success:#3bad3b;--color-accent-success-tint:rgba(59, 173, 59, 0.1);--color-accent-danger:#d84747;--color-accent-danger-tint:rgba(216, 71, 71, 
0.1)}@media(prefers-color-scheme:dark){:root{--body-background:#343a40;--body-background-tint:none;--body-font-color:#e9ecef;--color-link:#84b2ff;--color-visited-link:#b88dff;--icon-filter:brightness(0) invert(1);--gray-100:#494e54;--gray-200:#5c6165;--gray-500:#999d9f;--color-accent-default:#64748b;--color-accent-default-tint:rgba(100, 116, 139, 0.1);--color-accent-note:#4486dd;--color-accent-note-tint:rgba(68, 134, 221, 0.1);--color-accent-tip:#3bad3b;--color-accent-tip-tint:rgba(59, 173, 59, 0.1);--color-accent-important:#8144dd;--color-accent-important-tint:rgba(129, 68, 221, 0.1);--color-accent-warning:#f59e42;--color-accent-warning-tint:rgba(245, 158, 66, 0.1);--color-accent-caution:#d84747;--color-accent-caution-tint:rgba(216, 71, 71, 0.1);--color-accent-info:#4486dd;--color-accent-info-tint:rgba(68, 134, 221, 0.1);--color-accent-success:#3bad3b;--color-accent-success-tint:rgba(59, 173, 59, 0.1);--color-accent-danger:#d84747;--color-accent-danger-tint:rgba(216, 71, 71, 0.1)}} \ No newline at end of file diff --git a/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json new file mode 100644 index 0000000..2fff0b2 --- /dev/null +++ b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json @@ -0,0 +1 @@ +{"Target":"book.min.6970156cec683193d93c9c4edaf0d56574e4361df2e0c1be4f697ae81c3ba55f.css","MediaType":"text/css","Data":{"Integrity":"sha256-aXAVbOxoMZPZPJxO2vDVZXTkNh3y4MG+T2l66Bw7pV8="}} \ No newline at end of file diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 88edd90..a2940e2 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -1,8 +1,7 @@ --- -layout: page title: "Homelab Architecture" -permalink: /architecture/ -nav_order: 1 +url: "/architecture/" +weight: 1 --- # Homelab Architecture diff --git a/docs/src/cicd.md b/docs/src/cicd.md index 4ad86a9..326ad14 100644 --- a/docs/src/cicd.md +++ 
b/docs/src/cicd.md @@ -1,8 +1,7 @@ --- -layout: page title: "CI/CD Pipeline" -permalink: /cicd/ -nav_order: 5 +url: "/cicd/" +weight: 5 --- # CI/CD Pipeline @@ -11,7 +10,7 @@ Automated testing, building, and releasing for fetch_ml. ## Workflows -### CI Workflow (`.github/workflows/ci.yml`) +### CI Workflow (`.forgejo/workflows/ci.yml`) Runs on every push to `main`/`develop` and all pull requests. @@ -29,7 +28,7 @@ Runs on every push to `main`/`develop` and all pull requests. - Integration tests - Security audits -### Release Workflow (`.github/workflows/release.yml`) +### Release Workflow (`.forgejo/workflows/release-mirror.yml`) Runs on version tags (e.g., `v1.0.0`). @@ -49,7 +48,7 @@ Runs on version tags (e.g., `v1.0.0`). 3. **create-release** - Collects all artifacts - Generates SHA256 checksums - - Creates GitHub release with notes + - Mirrors release artifacts to GitHub Releases ## Release Process @@ -141,7 +140,8 @@ ZIG_VERSION: '0.15.2' ### Secrets Required for releases: -- `GITHUB_TOKEN` - Automatic, provided by GitHub Actions +- `GH_MIRROR_TOKEN` - GitHub token for publishing mirrored releases +- `GH_MIRROR_REPO` (variable) - GitHub repo slug, e.g. 
`jfraeysd/fetch_ml` ## Monitoring @@ -149,7 +149,7 @@ Required for releases: Check workflow runs at: ``` -https://github.com/jfraeys/fetch_ml/actions +https://git.jfraeys.com/jfraeysd/fetch_ml/actions ``` ### Artifacts @@ -161,5 +161,5 @@ Download build artifacts from: --- For implementation details: -- [.github/workflows/ci.yml](https://github.com/jfraeys/fetch_ml/blob/main/.github/workflows/ci.yml) -- [.github/workflows/release.yml](https://github.com/jfraeys/fetch_ml/blob/main/.github/workflows/release.yml) +- `.forgejo/workflows/ci.yml` +- `.forgejo/workflows/release-mirror.yml` diff --git a/docs/src/cli-reference.md b/docs/src/cli-reference.md index 41c1a0c..0731b40 100644 --- a/docs/src/cli-reference.md +++ b/docs/src/cli-reference.md @@ -1,8 +1,7 @@ --- -layout: page title: "CLI Reference" -permalink: /cli-reference/ -nav_order: 2 +url: "/cli-reference/" +weight: 2 --- # Fetch ML CLI Reference @@ -37,6 +36,7 @@ High-performance command-line interface for experiment management, written in Zi | `jupyter` | Manage Jupyter notebook services | `ml jupyter start --name my-nb` | | `validate` | Validate provenance/integrity for a commit or task | `ml validate --verbose` | | `info` | Show run info from `run_manifest.json` | `ml info ` | +| `requeue` | Re-submit an existing run/commit with new args/resources | `ml requeue -- --epochs 20` | ### Command Details @@ -72,8 +72,11 @@ ml sync ./my-project --priority 9 # Queue with commit ID ml queue my-job --commit abc123def456 -# Queue with priority (1-10, default 5) +# Queue with commit ID prefix (>=7 hex chars; must be unique) ml queue my-job --commit abc123 --priority 8 + +# Queue with extra runner args (stored as task.Args) +ml queue my-job --commit abc123 -- --epochs 5 --lr 1e-3 ``` **Features:** @@ -81,6 +84,34 @@ ml queue my-job --commit abc123 --priority 8 - Priority queuing system - API key authentication +**Notes:** +- `--priority` is passed to the server as a single byte (0-255). 
+- Args are sent via a dedicated queue opcode and become `task.Args` on the worker. +- `--commit` may be a full 40-hex commit id or a unique prefix (>=7 hex chars) resolvable under `worker_base`. + +#### `requeue` - Re-submit a Previous Run +```bash +# Requeue directly by commit_id +ml requeue -- --epochs 20 + +# Requeue by commit_id prefix (>=7 hex chars; must be unique) +ml requeue -- --epochs 20 + +# Requeue by run_id/task_id (CLI scans run_manifest.json under worker_base) +ml requeue -- --epochs 20 + +# Requeue by a run directory or run_manifest.json path +ml requeue /data/ml-experiments/finished/ -- --epochs 20 + +# Override priority/resources on requeue +ml requeue --priority 10 --gpu 1 -- --epochs 20 +``` + +**What it does:** +- Locates `run_manifest.json` +- Extracts `commit_id` +- Submits a new queue request using that `commit_id` with optional overridden args/resources + **Notes:** - Tasks support optional `snapshot_id` and `dataset_specs` fields server-side (for provenance and dataset resolution). diff --git a/docs/src/installation.md b/docs/src/installation.md index 6656913..19b216c 100644 --- a/docs/src/installation.md +++ b/docs/src/installation.md @@ -12,7 +12,7 @@ make install ./bin/ml setup # 3. Run experiments -./bin/ml run my-experiment.py +./cli/zig-out/bin/ml queue my-job ``` That's it. Everything else is optional. 
diff --git a/docs/src/jupyter-workflow.md b/docs/src/jupyter-workflow.md index 2ebf5d9..361326b 100644 --- a/docs/src/jupyter-workflow.md +++ b/docs/src/jupyter-workflow.md @@ -75,9 +75,32 @@ environment: security: trusted_channels: ["conda-forge", "defaults", "pytorch"] - blocked_packages: ["requests", "urllib3"] + blocked_packages: ["aiohttp", "telnetlib"] ``` +You can also override the blocked package list at runtime using an environment variable on the worker: + +```bash +export FETCHML_JUPYTER_BLOCKED_PACKAGES="aiohttp,telnetlib" +``` + +Some base images (including the default `quay.io/jupyter/base-notebook`) ship with common HTTP client libraries +like `requests`, `urllib3`, and `httpx` preinstalled. + +If you want to **block installing** packages like `requests`, `urllib3`, and `httpx` for security reasons but still +use a base image that already includes them, you can disable the **startup image scan** separately: + +```bash +# Block installs (user requests) +export FETCHML_JUPYTER_BLOCKED_PACKAGES="requests,urllib3,httpx" + +# Allow base images that already contain these packages to start +export FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES="off" +``` + +If you want startup scanning enabled, set `FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES` to a comma-separated list. 
+ + ### Access Control ```bash diff --git a/docs/src/index.md b/docs/src/landing.md similarity index 98% rename from docs/src/index.md rename to docs/src/landing.md index 38dfd59..6f44890 100644 --- a/docs/src/index.md +++ b/docs/src/landing.md @@ -1,6 +1,5 @@ --- -layout: default -title: Fetch ML Documentation +title: "Fetch ML Documentation" bookHidden: true --- diff --git a/docs/src/operations.md b/docs/src/operations.md index 3a580a7..7f78ede 100644 --- a/docs/src/operations.md +++ b/docs/src/operations.md @@ -1,8 +1,7 @@ --- -layout: page title: "Operations Runbook" -permalink: /operations/ -nav_order: 6 +url: "/operations/" +weight: 6 --- # Operations Runbook diff --git a/docs/src/queue.md b/docs/src/queue.md index 08e9f37..2ae52fd 100644 --- a/docs/src/queue.md +++ b/docs/src/queue.md @@ -1,8 +1,7 @@ --- -layout: page title: "Task Queue Architecture" -permalink: /queue/ -nav_order: 3 +url: "/queue/" +weight: 3 --- # Task Queue Architecture diff --git a/docs/src/quick-start.md b/docs/src/quick-start.md index 3b275e6..9d41d47 100644 --- a/docs/src/quick-start.md +++ b/docs/src/quick-start.md @@ -9,8 +9,8 @@ Get Fetch ML running in minutes with Docker Compose and integrated monitoring. 
- **Podman**: For production experiment execution **Requirements:** -- Go 1.21+ -- Zig 0.11+ +- Go 1.25+ +- Zig 0.15+ - Docker Compose (testing only) - 4GB+ RAM - 2GB+ disk space @@ -137,8 +137,7 @@ cd cli && zig build --release=fast # Common operations ./cli/zig-out/bin/ml status # Check system status ./cli/zig-out/bin/ml queue job-name # Queue job -./cli/zig-out/bin/ml list # List jobs -./cli/zig-out/bin/ml help # Show help +./cli/zig-out/bin/ml --help # Show help ``` ### Monitoring Commands diff --git a/docs/src/redis-ha.md b/docs/src/redis-ha.md index 4b7f17b..1d6f783 100644 --- a/docs/src/redis-ha.md +++ b/docs/src/redis-ha.md @@ -1,8 +1,7 @@ --- -layout: page title: "Redis High Availability (Optional)" -permalink: /redis-ha/ -nav_order: 7 +url: "/redis-ha/" +weight: 7 --- # Redis High Availability diff --git a/docs/src/research-runner-plan.md b/docs/src/research-runner-plan.md new file mode 100644 index 0000000..9653924 --- /dev/null +++ b/docs/src/research-runner-plan.md @@ -0,0 +1,667 @@ +# Research-First Runner: Missing Themes Plan + +This file captures additional themes that are commonly missing in existing ML runners/experiment tools, translated into actionable design targets for a lightweight, research-first runner. + +## Quick Overview + +**What makes this different:** +- **Your server, not their cloud**: Everything runs on your homelab/workstation/uni server +- **Dual interfaces**: Zig CLI for scripting + SSH-accessible TUI for interactive work +- **Fair queueing**: `ml queue` (not `run`) makes resource sharing explicit +- **Research narrative**: Capture why you ran experiments, not just what ran +- **Zero SaaS**: No accounts, web dashboards, or external services +- **Plain text everything**: Human-readable manifests, long-term reproducibility + +**Perfect for:** Researchers in uni labs, homelab enthusiasts, small research groups who want control over their infrastructure without cloud vendor lock-in. 
+ +## Architecture Context + +**Server-Centric Model for Homelab/Workstation/Uni Lab:** +- **Two client interfaces**: + - **Zig CLI**: Thin WebSocket client for scripting, automation, remote access + - **SSH-accessible TUI**: Interactive Bubble Tea UI for monitoring when SSH'd into server +- Go API server with embedded rsync (reduces dependencies) +- Worker pulls from flexible queue backend (Redis/SQLite/filesystem) +- Priority-based scheduling with prewarm mechanism +- NAS integration for data prefetching +- Target: single server, workstation, or small uni lab cluster (not cloud/SaaS) + +**Client Access Patterns:** +```bash +# CLI (from anywhere via WebSocket) +ml queue train.py --epochs 100 +ml status --watch +ml info + +# TUI (when SSH'd into server or jump box) +ssh mluser@worker.local +ml-tui # Interactive terminal UI +# Navigate with keyboard, see live updates +``` + +**Configuration:** +```toml +# ~/.ml/config.toml (shared by both CLI and TUI) +worker_host = "worker.local" +worker_user = "mluser" +worker_base = "/data/ml-experiments" +worker_port = 22 +api_key = "your-api-key" +``` + +## Plan (Missing Themes) + +## Implemented Today (in this repo) + +- Runs are queued via `ml queue` and processed by workers. +- Run provenance is written to `run_manifest.json`. +- You can attach queue-time notes with `ml queue --note "..."` (persisted under `run_manifest.json` → `metadata.note`). +- Queue backends support Redis / SQLite / filesystem (and optional filesystem fallback). +- CLI + SSH-launched TUI are both available (`ml monitor` launches the TUI). + +## Future Ideas (this document) + +### 1. Own-infrastructure-first, research-centric by default + +### 2. Minimal server dependencies (simple operations) + +### 3. 
Text-first tracking (logs > dashboards) + +- **Research narrative completion**: post-run outcome/learnings/next steps captured in the manifest +- **Auto-captured context**: + - Command + args (as sent from CLI) + - Timestamps (queue time, start time, end time) + - Git commit hash (and optionally diff) + - Environment snapshot (pip freeze, conda export, container image digest) + - Hardware context (GPU model, driver version, CUDA version) +- **Plain text manifests**: JSON or YAML, never binary blobs +- **Stable formats**: Can read experiments from 5 years ago without the runner + +**Implementation note**: Server writes `run_manifest.json` to experiment directory. CLI can display it via `ml info`. + +### 4. CLI and TUI as complementary interfaces + +- **Consistent CLI scripting UX**: Future idea (uniform `--json`, quiet modes, and stable exit codes across commands) +- **TUI feature parity**: Future idea (surface the same key details in TUI + CLI: queue position/ETA, narrative, validation results) + +### 5. Failure-tolerant, messy-research friendly + +- **Failure is first-class**: Failed runs stay visible and queryable +- **Partial artifacts preserved**: Keep artifacts/logs up to failure point (including checkpoints, if the script produces them) +- **No punishment for refactors**: Script renames don't break history +- **Grouping/tagging**: Label attempts (baseline/ablation/debug/exploration) + +**Server implementation**: Worker should catch exceptions, record failure reason, preserve state. Queue should track failure modes (OOM, timeout, code error, data error). + +### 6. 
Minimal abstraction over Python (transparent execution) + +- **Run scripts as-is**: No decorators, no framework rewrites +- **Preserve debuggability**: Clean stack traces, pdb works +- **Optional instrumentation**: Explicit metric logging via simple API + ```python + # Optional, not required + from ml_runner import log_metric + log_metric("loss", 0.5, step=100) + ``` +- **Standard I/O works**: `print()` goes to logs, arguments via `sys.argv` + +**Server implementation**: Worker spawns process, captures stdout/stderr, parses optional structured logs. No magic wrappers that hide what's happening. + +### 7. Reproducibility that survives time + +- **Immutable run folders**: Server never modifies completed runs +- **Environment capture** (best-effort, pluggable): + - Container image digest (primary method) + - `pip freeze` / `uv pip freeze` / `poetry.lock` + - `conda env export` + - `nix flake.lock` (if available) +- **Hardware fingerprint**: GPU model, driver, CUDA, CPU, RAM +- **Data provenance**: Dataset checksums, NAS paths, version identifiers +- **Commit everything**: Store full environment, even if verbose + +**Server implementation**: Pre-run hook captures environment. Store in `run_manifest.json`. Validate on `ml validate `. + +### 8. Small compute and shared machine friendliness + +### 9. Server-side storage with client-side visibility +- **Energy awareness**: Respect that homelabs pay electricity bills +- **Laptop-friendly**: Support thermal/power throttling +- **Single-GPU to 4-GPU range**: Optimize for typical research setups +- **No cluster assumptions**: Don't require Kubernetes/SLURM/etc. + +**Why this matters**: Researchers want to `ls` experiment directories but don't want to manually sync. Server handles storage, CLI provides views. + +### 11. Research narrative (lab notebook, not job IDs) + +- **Queue-time narrative capture**: Future idea (add `--hypothesis`, `--context`, `--intent`, etc. 
to `ml queue`) +- **Post-run learning capture**: Future idea (explicit `outcome`, `learnings[]`, `next_steps[]`, and validation status) +- **Narrative UX**: Future idea (view/edit narrative from TUI/CLI without hand-editing JSON) + +**CLI commands**: +```bash +ml queue train.py --note "Testing warmup hypothesis from paper X" +``` + + - CLI: WebSocket streaming for `--watch` and `--follow` + - TUI: Live refresh (500ms tick), immediate queue updates +- **No magic**: Minimize implicit behavior + - Explicit is better than clever + - Defaults should be obvious and documented + - Side effects should be visible (both in CLI and TUI) + - Configuration hierarchy clear: CLI flags > env > config file > defaults + +**TUI advantages for observability:** +- See everything at once: jobs, queue, GPUs, containers, logs +- Keyboard shortcuts for common operations +- Instant feedback on actions (queue, cancel, delete) +- Prewarm state visible in GPU panel +- No need to run multiple `ml status` commands + +### 13. Support clear thinking during experimentation + +- **Optimize for cognitive throughput**: + - Make it easy to remember what you were thinking + - Surface patterns across experiments + - Warn about near-duplicates before running +- **Built-in comparison**: + ```bash + # Future ideas: + # ml diff + # ml similar + ``` +- **Learning from history**: + ```bash + # Future ideas: + # ml lessons --tag ablation + # ml dead-ends + ``` +- **Hypothesis tracking**: + - Link hypothesis → experiment → outcome → next hypothesis + - Mark outcomes: validates/refutes/inconclusive +- **Reduce cognitive load**: + - Natural queries: Future idea (search over manifests/notes) + - Show relevant history when queueing + - Don't make researchers remember IDs + +**Server implementation**: Maintain index (rebuildable from filesystem). Support semantic queries over manifests, notes, tags. + +### 14. 
Fast iteration velocity + +- **Easy modification**: + ```bash + # Future ideas: + # ml clone + # ml fork + ``` +- **Batch operations**: + ```bash + # Future idea: ml sweep + ``` + +**Why prewarm matters**: Your NAS prefetch in prewarm means jobs start training immediately instead of waiting for data. This dramatically improves iteration velocity. + +### 15. Full research lifecycle support + +- **Exploration phase**: Minimal metadata, quick runs +- **Development phase**: Group attempts, compare variations +- **Validation phase**: Strict reproducibility, complete capture +- **Publication phase**: Export bundles, generate reproduction instructions +- **Maintenance phase**: Long-term readable, re-executable years later + +**Reproducibility levels** (your strict/best-effort model): +```bash +# Future idea: --repro-level +ml validate # Future idea: expand validation coverage + outputs +``` + +### 16. Collaboration without platforms + +- **Async collaboration** (no shared server required): + ```bash + # Future ideas: + # ml export --bundle run_42.tar.gz + # ml import run_42.tar.gz + ``` +- **Selective sharing**: + ```bash + # Future ideas: + # ml export --metadata-only + # ml export --include-artifacts + ``` +- **Review-friendly**: + - Self-contained bundles + - All provenance included + - Reproducibility instructions + - No "install our platform" friction + +**Server implementation**: Export packages `run_manifest.json` + artifacts into tarball. Import validates and unpacks into experiments directory. + +### 17. 
Graceful degradation + +- **Core works with minimal setup**: + - Filesystem-only queue (no Redis required) + - SQLite for metadata (no Postgres) + - Local execution (no remote targets needed) +- **Optional enhancements**: + - Redis for better multi-worker queueing + - Git integration (works without git) + - NAS prewarm (falls back to on-demand fetch) + - WebSocket updates (falls back to polling) +- **Progressive disclosure**: + - Simple commands for simple cases + - Advanced flags for power users + - Features activate when available + +**Implementation note**: + +### 18. Concrete features (derived from above) + +#### Findability +```bash +# Future ideas: +# ml find "failed runs on GPU2 last week" +# ml find --note "warmup" +``` +Server maintains rebuildable index over manifests, logs, tags. + +#### Dataset provenance +```json +{ + "datasets": [ + { + "name": "imagenet-train", + "nas_path": "/nas/datasets/imagenet/train", + "checksum": "sha256:abc123...", + "fetched_at": "2024-01-15T10:30:00Z", + "fetch_method": "prewarm" + } + ] +} +``` +Server validates checksums, warns on drift. + +#### Prewarm observability +```bash +ml status +# Shows: +# Next in queue: run_xyz (priority 5) +# Prewarming: dataset imagenet-train (2/5 complete) +# GPU 0: running run_abc (50% complete, ETA 2h) +# GPU 1: idle +``` + +#### CLI queue/requeue workflows + +**Core principle**: the runner does not introduce checkpoint conventions. The script should run identically when executed directly vs via `ml`. + +**Passive artifact tracking** (future idea): worker records what files exist in the run directory after completion (or via configured glob patterns). Checkpoints are just artifacts. 
+ +**Requeue = replay command with modifications** (future idea): +```bash +# Original run +ml queue train.py --epochs 100 --save-dir ./checkpoints + +# Requeue (continue) +ml requeue run_abc -- --resume ./checkpoints/best.pt --epochs 200 +``` + +**Arg merge strategies** (future idea): +```bash +# Append new args (default) +ml requeue run_abc --append -- --resume ./checkpoints/best.pt + +# Replace (rerun with only new args) +ml requeue run_abc --replace -- --epochs 200 --lr 3e-4 + +# Merge (override matching flags, keep the rest) +ml requeue run_abc --merge -- --epochs 200 +``` + +**Optional staging** (future idea): copy an artifact from the source run into the new run directory, then reference it with a placeholder. +```bash +ml requeue run_abc --stage checkpoints/best.pt -- \ + --resume {staged}/best.pt --epochs 200 +``` + +#### Hardware/resource management +```json +{ + "resources": { + "gpus": 2, + "gpu_memory_gb": 40, + "cpu_cores": 16, + "ram_gb": 64, + "disk_gb": 100, + "max_runtime_hours": 24 + } +} +``` +Worker validates resources before pulling from queue. Server tracks utilization. + +--- + +## Design Philosophy Summary (Server-Centric) + +The goal is to build a **research assistant that runs on YOUR server**, not a platform that runs on someone else's cloud. + +### Every feature should answer: + +1. Does this help researchers **understand** what happened on the server? +2. Does this make the server **transparent** instead of a black box? +3. Does this work on a **single workstation** or small lab server? +4. Does this respect that researchers **SSH into the server**? +5. Does this make **local data** (NAS, scratch drives) first-class? 
+ +### Architecture principles: + +- **Server is the control plane**: All logic, storage, scheduling on server +- **CLI is a thin client**: Just communicates via WebSocket, no local state +- **Filesystem is still king**: Server writes plain text, CLI reads via API +- **Queue-first for fairness**: `ml queue` not `ml run` - explicit resource requests +- **Priority without hogging**: Higher priority = earlier in queue, not exclusive access +- **Prewarm is a performance optimization**: Best-effort, never required for correctness +- **NAS integration is native**: Server understands mounted storage + +### When in doubt: + +- **Server-side is better** than client-side (for logic) +- **WebSocket is better** than REST (for interactivity) +- **Embedded is better** than external deps (rsync in server) +- **Flexible backend is better** than required service (Redis OR SQLite OR filesystem) +- **Plain text is better** than binary +- **Your hardware is better** than their cloud + +The runner should feel like **SSH into your well-organized research server with powerful tools**, not like operating a cloud platform. Whether you're using the CLI for automation or the TUI for interactive work, the experience should be transparent, fair, and research-focused. + +--- + +## Typical Research Workflows (CLI + TUI) + +### Morning Routine: Check What Happened Overnight +```bash +# From your laptop (via WebSocket) +ml status +# Shows: 2 finished, 1 running, 3 in queue + +ml info run_abc --show-metrics +# Quick check: did the overnight run validate the hypothesis? + +# If you need deep investigation, SSH in +ssh mluser@worker.local +ml-tui +# Visual inspection of logs, GPU usage, etc. 
+``` + +### Starting a New Experiment Series +```bash +# Script a parameter sweep (CLI automation) +for lr in 1e-3 3e-4 1e-4; do + ml queue train.py --lr $lr \ + # Future idea: --hypothesis / --experiment-group + --priority 5 +done + +# Monitor in TUI (interactive) +ssh mluser@worker.local +ml-tui +# Watch queue, see ETA, check prewarm status +``` + +### Debugging a Failed Run +```bash +# Notice failure via CLI +ml status +# run_xyz: failed (exit code 137) - OOM? + +# Jump into TUI for investigation +ssh mluser@worker.local +ml-tui +# Navigate to run_xyz, press 'l' for logs +# See OOM error at batch 128 +# Future idea: narrative/annotation UX in the TUI +``` + +### End-of-Day Review +```bash +# TUI for visual summary +ssh mluser@worker.local +ml-tui +# Scroll through today's runs +# Future ideas: compare views, export bundles +``` + +### Paper Writing Time (6 months later) +```bash +# Today: use the filesystem + run manifests +ml info + +# Future ideas: searching/filtering + comparison reports + +# TUI for visual exploration +ssh mluser@worker.local +ml-tui +# Navigate through old experiments +# Press 'n' to read narratives +# Reconstruct your thought process +``` + +### Collaborative Debugging with Advisor +```bash +# Both SSH into server simultaneously +ssh mluser@worker.local + +# You run TUI to show current state +ml-tui +# Navigate to problem run, show logs live + +# Advisor suggests fix +# You queue new run with their suggestion +ml queue train.py --lr 1e-4 \ + --note "Per advisor: try smaller LR with warmup" \ + # Future idea: --parent-run + --priority 7 + +# Watch it start in TUI immediately +# Queue position visible, prewarm status shown +``` + +This dual-interface approach gives researchers the best of both worlds: **scriptability when they need it, visibility when they want it**. 
+ +--- + +## How This Maps to Your Current Architecture + +✅ **Already correct**: +- Server-centric with dual client interfaces (CLI + TUI) +- WebSocket communication (CLI) +- SSH-based TUI with Bubble Tea (interactive monitoring) +- Embedded rsync in server +- Flexible queue backend (Redis/SQLite/filesystem) +- Priority scheduling +- Prewarm mechanism for NAS prefetch +- **Fair queueing philosophy** - `queue` not `run` +- TUI shows live updates: jobs, queue, GPU status, logs + +🎯 **Natural extensions**: +- Queue-time narrative flags for `ml queue` (hypothesis/context/intent/etc.) +- CLI commands for diffing and finding (and higher-level comparison workflows) +- TUI panels for hypothesis/learnings (in job details) +- Reproducibility validation improvements (extend `ml validate`) +- Export/import for collaboration +- Graceful degradation (filesystem-only mode) +- Visible queue position and fairness metrics + +📝 **Design considerations**: +- Show prewarm state/progress in `ml status` +- Show queue position and ETA in both CLI and TUI +- Add research context fields to manifests +- Build comparison workflows (diff, similar, why-different) +- Support hypothesis tracking in both interfaces +- Create export bundles for sharing +- Expose fairness metrics (wait time distribution, resource utilization) +- TUI could show narrative snippets in job list (hypothesis as subtitle?) 
+ +**TUI Research Narrative Integration Ideas:** +``` +┌─ ML Jobs & Queue ─────────────────────────────────────┐ +│ > imagenet_baseline │ +│ ✓ finished | Priority: 5 │ +│ "Testing baseline performance before ablations" │ +│ │ +│ batch_size_64 │ +│ ▶ running (epoch 45/100) | Priority: 5 │ +│ "Validating linear LR scaling hypothesis" │ +│ │ +│ warmup_test │ +│ ⏳ queued (position 2) | Priority: 3 │ +│ "Following up on advisor suggestion about warmup" │ +└───────────────────────────────────────────────────────┘ + +Press 'n' to view narrative, 'a' to annotate +``` + +**Implementation status (today):** +- **Annotations are implemented** and stored at the **root** of `run_manifest.json` as `annotations[]`. +- **Narrative fields are implemented** and stored under `run_manifest.json` as `narrative` (set/update via CLI). +- Use `ml annotate --note "..." [--author "..."]` to append an entry. +- Remaining gaps are around **queue-time capture**, **post-run learnings/outcomes**, and **TUI-first narrative UX**. + +Example manifest.json +```json +{ + // === Standard Execution Metadata === + "run_id": "2024-01-15_abc123", + "status": "completed", + "command": "train.py --lr 0.001 --epochs 100 --batch-size 64", + "queued_at": "2024-01-15T10:25:00Z", + "started_at": "2024-01-15T10:30:00Z", + "ended_at": "2024-01-15T14:45:00Z", + "exit_code": 0, + "priority": 5, + + // === Research Narrative (The Important Part) === + "narrative": { + // WHY did you run this? + "hypothesis": "Larger batch size with linear LR scaling should improve convergence speed without hurting final accuracy", + + // WHAT were you thinking at the time? + "context": "Previous run (run_789) with batch=32 took 8 hours and plateaued at 0.85. Paper XYZ suggests linear scaling rule should work.", + + // WHAT were you trying to accomplish? + "intent": "Test if doubling batch size (32→64) with 2x learning rate maintains accuracy while reducing training time", + + // WHAT did you expect to happen? 
+ "expected_outcome": "Similar final accuracy (~0.85) but ~4 hour training time instead of 8", + + // HOW is this related to other experiments? + "parent_run": "2024-01-14_run789", + "experiment_group": "batch-size-scaling-ablation", + "tags": ["ablation", "batch-size", "convergence-speed", "paper-xyz-reproduction"], + + // WHAT did you learn? (filled in post-run or during) + "outcome": "Success: accuracy=0.87 (+0.02), time=3.5h (-56%). Linear scaling rule validated.", + "learnings": [ + "Linear LR scaling worked as expected from paper XYZ", + "GPU memory utilization went from 60% to 95% - near limit", + "Convergence was actually smoother (fewer spikes in loss curve)", + "Could probably push to batch=96 before OOM" + ], + "next_steps": [ + "Try batch=96 to maximize GPU utilization", + "Test if this scales to batch=128 with gradient accumulation", + "Validate on other datasets (currently only tested on ImageNet)" + ], + "validation_status": "validates", // or "refutes", "inconclusive", "partial" + }, + + // Human annotations added later + "annotations": [ + { + "timestamp": "2024-01-15T15:00:00Z", + "author": "user@lab.edu", + "note": "This result is strong enough for the paper. Use these hyperparams for final training." + }, + { + "timestamp": "2024-01-16T09:00:00Z", + "author": "advisor@lab.edu", + "note": "Good work. Also compare with warmup schedule before finalizing." 
+ } + ], + + // === Reproducibility Metadata === + "environment": { + "git_commit": "a1b2c3d4", + "git_dirty": false, + "git_branch": "experiment/batch-scaling", + "container_image": "pytorch/pytorch:2.0.1-cuda11.8-cudnn8-runtime", + "container_digest": "sha256:abc123...", + "pip_freeze": "torch==2.0.1\ntorchvision==0.15.2\n...", + "cuda_version": "11.8", + "gpu_driver": "525.105.17", + "python_version": "3.10.12" + }, + + // === Data Provenance === + "datasets": [ + { + "name": "imagenet-train", + "nas_path": "/nas/datasets/imagenet/ILSVRC2012/train", + "checksum": "sha256:def456...", + "size_gb": 144.2, + "num_samples": 1281167, + "version": "ILSVRC2012", + "fetched_via": "prewarm", + "fetch_time_seconds": 180 + } + ], + + // === Resource Usage === + "resources": { + "requested": { + "gpus": 1, + "gpu_memory_gb": 24, + "cpu_cores": 8, + "ram_gb": 32 + }, + "actual": { + "gpu_utilization_avg": 95, + "gpu_memory_peak_gb": 22.8, + "cpu_utilization_avg": 45, + "ram_peak_gb": 28.5, + "disk_read_gb": 145, + "disk_write_gb": 12 + }, + "gpu_model": "NVIDIA RTX 3090", + "host": "ml-server-01" + }, + + // === Results === + "metrics": { + "final_train_accuracy": 0.891, + "final_val_accuracy": 0.873, + "final_train_loss": 0.234, + "final_val_loss": 0.287, + "best_val_accuracy": 0.876, + "best_epoch": 87, + "total_epochs": 100, + "training_time_hours": 3.52 + }, + + // === Artifacts === + "artifacts": { + "discovery_time": "2024-01-15T14:45:00Z", + "files": [ + { + "path": "checkpoints/epoch_010.pth", + "size_bytes": 450000000, + "modified": "2024-01-15T11:30:00Z" + }, + { + "path": "checkpoints/best.pth", + "size_bytes": 450000000, + "modified": "2024-01-15T13:45:00Z" + } + ], + "total_size_bytes": 900000000 + } +} \ No newline at end of file diff --git a/docs/src/user-permissions.md b/docs/src/user-permissions.md index 828068e..df101e6 100644 --- a/docs/src/user-permissions.md +++ b/docs/src/user-permissions.md @@ -61,13 +61,16 @@ User roles and permissions are configured 
on the server side by administrators. ### Data Scientist Workflow ```bash # Submit your experiment -ml run my-experiment +ml queue my-experiment # Check your experiments (only shows yours) ml status # Cancel your own experiment ml cancel my-experiment + +# Requeue a previous run with different args +ml requeue <run_id> -- --epochs 20 ``` ### Administrator Workflow diff --git a/docs/src/validate.md b/docs/src/validate.md index 7e91dd9..136b39f 100644 --- a/docs/src/validate.md +++ b/docs/src/validate.md @@ -1,7 +1,6 @@ --- -layout: page title: "Validation (ml validate)" -permalink: /validate/ +url: "/validate/" --- # Validation (`ml validate`) diff --git a/docs/src/zig-cli.md b/docs/src/zig-cli.md index c2ab7af..b8fb241 100644 --- a/docs/src/zig-cli.md +++ b/docs/src/zig-cli.md @@ -1,8 +1,7 @@ --- -layout: page title: "Zig CLI Guide" -permalink: /zig-cli/ -nav_order: 3 +url: "/zig-cli/" +weight: 3 --- # Zig CLI Guide @@ -28,7 +27,7 @@ The CLI reads `~/.ml/config.toml` and respects `FETCH_ML_CLI_*` env vars: worker_host = "127.0.0.1" worker_user = "dev_user" worker_base = "/tmp/ml-experiments" -worker_port = 9101 +worker_port = 22 api_key = "your-api-key" ``` @@ -59,4 +58,4 @@ All use `zig build-exe` with `-OReleaseSmall -fstrip` and are compatible with Li ## CI/CD -The release workflow builds cross‑platform binaries and packages them with checksums. See `.github/workflows/release.yml`. +The release workflow builds cross‑platform binaries and packages them with checksums. See `.forgejo/workflows/release-mirror.yml`.