From 5144d291cbb01a7f77e1e5549bc5539e19023e19 Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Thu, 12 Feb 2026 12:05:27 -0500 Subject: [PATCH] docs: comprehensive documentation updates - Add architecture, CI/CD, CLI reference documentation - Update installation, operations, and quick-start guides - Add Jupyter workflow and queue documentation - New landing page and research runner plan --- CHANGELOG.md | 1 + DEVELOPMENT.md | 79 +-- README.md | 12 +- docs/.hugo_build.lock | 0 docs/go.mod | 2 + docs/go.sum | 2 + docs/hugo.toml | 2 +- ...s_b807c86e8030af4cdc30edccea379f5f.content | 1 + ...scss_b807c86e8030af4cdc30edccea379f5f.json | 1 + docs/src/architecture.md | 5 +- docs/src/cicd.md | 20 +- docs/src/cli-reference.md | 39 +- docs/src/installation.md | 2 +- docs/src/jupyter-workflow.md | 25 +- docs/src/{index.md => landing.md} | 3 +- docs/src/operations.md | 5 +- docs/src/queue.md | 5 +- docs/src/quick-start.md | 7 +- docs/src/redis-ha.md | 5 +- docs/src/research-runner-plan.md | 667 ++++++++++++++++++ docs/src/user-permissions.md | 5 +- docs/src/validate.md | 3 +- docs/src/zig-cli.md | 9 +- 23 files changed, 790 insertions(+), 110 deletions(-) create mode 100644 docs/.hugo_build.lock create mode 100644 docs/go.sum create mode 100644 docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content create mode 100644 docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json rename docs/src/{index.md => landing.md} (98%) create mode 100644 docs/src/research-runner-plan.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c319c8..685a5c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Worker: stage verified `snapshot_id` into each task workspace and expose it to training code via `FETCH_ML_SNAPSHOT_DIR`. - Worker: provenance enforcement is trustworthiness-by-default (fail-closed) with `provenance_best_effort` opt-in. - CLI/API: add `ml validate` to fetch a validation report (commit/task) for provenance + integrity checks. 
+- Worker: persist discovered artifacts into `run_manifest.json` (`artifacts.discovery_time`, `artifacts.files[]`, `artifacts.total_size_bytes`) at task completion. - Worker: best-effort environment prewarm can build a warmed Podman image keyed by `deps_manifest_sha256` and reuse it for subsequent tasks. - Worker: export env prewarm hit/miss/built counters and total build time via the worker Prometheus metrics endpoint. - API/Worker: `ml prune` also triggers best-effort garbage collection of warmed env images. diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 20e0574..e03e3df 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -9,11 +9,8 @@ This guide helps developers set up their environment and contribute effectively git clone cd fetch_ml -# Install dependencies -make setup-dev - # Start development environment -make dev-start +make dev-up # Run tests make test @@ -24,11 +21,10 @@ make test ### Prerequisites - Go 1.25+ -- Zig 0.11+ +- Zig 0.15+ - Python 3.11+ - Docker & Docker Compose - Redis -- Node.js (for some tools) ### Local Development Setup @@ -40,15 +36,15 @@ make test 2. **Install Zig tools** ```bash - # Install Zig language server - zig build --install zls + # Zig is required for building the CLI and running CLI tests + zig version ``` 3. **Setup Python environment** ```bash python -m venv venv source venv/bin/activate # or venv\Scripts\activate on Windows - pip install -r requirements-dev.txt + # Python is optional (used for a few helper scripts) ``` 4. **Optional: Install pre-commit hooks** @@ -69,11 +65,8 @@ make test 2. 
Make your changes with live feedback: ```bash - # Go development with hot reload - make dev-go - - # Zig development with build on save - make dev-zig + # Build Go services + Zig CLI + make dev # Run specific tests make test-unit @@ -84,11 +77,10 @@ make test ```bash # Lint and format (if you have tools configured) make lint - make format # Full test suite - make test-all - + make test-full + # Optional: Pre-commit checks pre-commit run --all-files ``` @@ -105,13 +97,14 @@ make test make test-unit # Unit tests only make test-integration # Integration tests only make test-e2e # End-to-end tests only -make test-performance # Performance tests only - +make benchmark # Benchmarks +make load-test # Load tests + # Run with coverage make test-coverage - + # Watch mode for development -make test-watch +# (no watch mode target; run specific package tests with go test -run) ``` ## Code Quality @@ -145,50 +138,14 @@ test: add or update tests chore: maintenance tasks ``` -## Debugging - -### Go Debugging - -```bash -# Debug with delve -dlv debug cmd/api-server/main.go - -# Debug tests -dlv test ./internal/... 
- -# Profile with pprof -go tool pprof http://localhost:6060/debug/pprof/profile -``` - -### Zig Debugging - -```bash -# Debug build -zig build-exe -O Debug -fstrip=false your_file.zig - -# Test with debugging -zig test --gdb your_file.zig -``` - -### Container Debugging - -```bash -# Debug containers -docker-compose exec api-server bash -docker-compose logs -f api-server - -# Inspect running processes -docker-compose exec api-server ps aux -``` - ## Performance Monitoring ### Local Monitoring ```bash # Start monitoring stack -make monitoring-start - +make dev-up + # View metrics open http://localhost:3000 # Grafana open http://localhost:9090 # Prometheus @@ -198,8 +155,8 @@ open http://localhost:9090 # Prometheus ```bash # Load test API -make load-test-api - +make load-test + # Performance benchmarks make benchmark diff --git a/README.md b/README.md index 97623d3..2d4fd43 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Verify the signature (keyless Sigstore) using cosign: cosign verify-blob \ --certificate checksums.txt.cert \ --signature checksums.txt.sig \ - --certificate-identity-regexp "^https://github.com///.github/workflows/release.yml@refs/tags/v.*$" \ + --certificate-identity-regexp "^https://github.com/jfraeysd/fetch_ml/.forgejo/workflows/release-mirror.yml@refs/tags/v.*$" \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ checksums.txt ``` @@ -40,16 +40,16 @@ Example (CLI on Linux x86_64): ```bash # Download -curl -fsSLO https://github.com///releases/download//ml-linux-x86_64.tar.gz -curl -fsSLO https://github.com///releases/download//checksums.txt -curl -fsSLO https://github.com///releases/download//checksums.txt.sig -curl -fsSLO https://github.com///releases/download//checksums.txt.cert +curl -fsSLO https://github.com/jfraeysd/fetch_ml/releases/download//ml-linux-x86_64.tar.gz +curl -fsSLO https://github.com/jfraeysd/fetch_ml/releases/download//checksums.txt +curl -fsSLO 
https://github.com/jfraeysd/fetch_ml/releases/download//checksums.txt.sig +curl -fsSLO https://github.com/jfraeysd/fetch_ml/releases/download//checksums.txt.cert # Verify cosign verify-blob \ --certificate checksums.txt.cert \ --signature checksums.txt.sig \ - --certificate-identity-regexp "^https://github.com///.github/workflows/release.yml@refs/tags/v.*$" \ + --certificate-identity-regexp "^https://github.com/jfraeysd/fetch_ml/.forgejo/workflows/release-mirror.yml@refs/tags/v.*$" \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ checksums.txt sha256sum -c --ignore-missing checksums.txt diff --git a/docs/.hugo_build.lock b/docs/.hugo_build.lock new file mode 100644 index 0000000..e69de29 diff --git a/docs/go.mod b/docs/go.mod index 03f40e4..cf710a2 100644 --- a/docs/go.mod +++ b/docs/go.mod @@ -1,3 +1,5 @@ module github.com/jfraeys/fetch_ml/docs go 1.21 + +require github.com/alex-shpak/hugo-book v0.0.0-20251118074854-b7f9c8cb0f51 // indirect diff --git a/docs/go.sum b/docs/go.sum new file mode 100644 index 0000000..f7b1c37 --- /dev/null +++ b/docs/go.sum @@ -0,0 +1,2 @@ +github.com/alex-shpak/hugo-book v0.0.0-20251118074854-b7f9c8cb0f51 h1:HHxBwO6r6h3AUflUc/X/Gf5UrfTY5rZEbD7QoGzbVvU= +github.com/alex-shpak/hugo-book v0.0.0-20251118074854-b7f9c8cb0f51/go.mod h1:L4NMyzbn15fpLIpmmtDg9ZFFyTZzw87/lk7M2bMQ7ds= diff --git a/docs/hugo.toml b/docs/hugo.toml index 6a8e5e1..b55d578 100644 --- a/docs/hugo.toml +++ b/docs/hugo.toml @@ -9,7 +9,7 @@ publishDir = "_site" enableGitInfo = true -disableKinds = ["taxonomy", "taxonomyTerm"] +disableKinds = ["taxonomy"] [module] [[module.imports]] diff --git a/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content new file mode 100644 index 0000000..63f7a65 --- /dev/null +++ b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.content @@ -0,0 +1 @@ +@charset 
"UTF-8";:root{--font-size:16px;--font-size-smaller:0.875rem;--font-size-smallest:0.75rem;--body-font-weight:400;--body-background:white;--body-background-tint:transparent;--body-font-color:black;--border-radius:0.25rem}/*!modern-normalize v3.0.1 | MIT License | https://github.com/sindresorhus/modern-normalize*/*,::before,::after{box-sizing:border-box}html{font-family:system-ui,segoe ui,Roboto,Helvetica,Arial,sans-serif,apple color emoji,segoe ui emoji;line-height:1.15;-webkit-text-size-adjust:100%;tab-size:4}body{margin:0}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Consolas,liberation mono,Menlo,monospace;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-color:initial}button,input,optgroup,select,textarea{font-family:inherit;font-size:100%;line-height:1.15;margin:0}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button}legend{padding:0}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}.flex{display:flex}.flex.gap{gap:1rem}.flex-auto{flex:auto}.flex-even{flex:1 1}.flex-wrap{flex-wrap:wrap}.justify-start{justify-content:flex-start}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.align-center{align-items:center}.mx-auto{margin:0 
auto}.text-center{text-align:center}.text-left{text-align:left}.text-right{text-align:right}.text-small,small{font-size:.875em}.hidden{display:none}input.toggle{height:0;width:0;overflow:hidden;opacity:0;position:absolute}html{font-size:var(--font-size);scroll-behavior:smooth;touch-action:manipulation;scrollbar-gutter:stable}body{min-width:20rem;color:var(--body-font-color);background:var(--body-background)var(--body-background-tint);font-weight:var(--body-font-weight);text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}h1,h2,h3,h4,h5,h6{font-weight:inherit}a{flex:auto;align-items:center;gap:.5em;text-decoration:none;cursor:default}a[href],a[role=button]{color:var(--color-link);cursor:pointer}:focus-visible,input.toggle:focus-visible+label{outline-style:auto;outline-color:var(--color-link)}nav ul{padding:0;margin:0;list-style:none}nav ul li{position:relative}nav ul a{padding:.5em 0;display:flex;transition:opacity .1s ease-in-out}nav ul a[href]:hover,nav ul a[role=button]:hover{opacity:.5}nav ul ul{padding-inline-start:1.5em}ul.pagination{display:flex;justify-content:center;list-style-type:none;padding-inline-start:0}ul.pagination .page-item a{padding:1rem}.container{max-width:80rem;margin:0 auto}.book-icon{filter:var(--icon-filter)}a .book-icon{height:1em;width:1em}.book-brand{margin-top:0;margin-bottom:1rem}.book-brand img{height:1.5em;width:1.5em}.book-menu{flex:0 0 16rem;font-size:var(--font-size-smaller)}.book-menu .book-menu-content{width:16rem;padding:1rem;position:fixed;top:0;bottom:0;overflow-x:hidden;overflow-y:auto}.book-menu a,.book-menu label{color:inherit;word-wrap:break-word;display:flex}.book-menu a.active{color:var(--color-link)}.book-menu label>img:last-child{height:1em;width:1em;cursor:pointer;align-self:center;transition:transform .1s ease-in-out}.book-menu input.toggle+label+ul{display:none}.book-menu input.toggle:checked+label>img:last-child{transform:rotate(90deg)}.book-menu 
input.toggle:checked+label+ul{display:block}body[dir=rtl] .book-menu input.toggle+label>img:last-child{transform:rotate(180deg)}body[dir=rtl] .book-menu input.toggle:checked+label>img:last-child{transform:rotate(90deg)}.book-section-flat{margin:1rem 0}.book-section-flat>a,.book-section-flat>span,.book-section-flat>label{font-weight:bolder}.book-section-flat>ul{padding-inline-start:0}.book-page{min-width:20rem;flex-grow:1;padding:1rem}.book-post{margin-bottom:4rem}.book-post .book-post-date img{height:1em;width:1em;margin-inline-end:.5em}.book-post .book-post-content{margin-top:1rem}.book-post .book-post-thumbnail{flex:0 0 34%}.book-post .book-post-thumbnail img{width:100%;aspect-ratio:4/3;object-fit:cover}.book-header{margin-bottom:1rem}.book-header label{line-height:0}.book-header h3{overflow:hidden;text-overflow:ellipsis;margin:0 1rem}.book-layout-landing .book-header{display:block;position:relative;z-index:1}.book-layout-landing .book-header nav>ul{display:flex;gap:1rem;justify-content:end}.book-layout-landing .book-header nav>ul>li{display:block;white-space:nowrap}.book-layout-landing .book-header nav>ul>li>ul{display:none;position:absolute;padding:0}.book-layout-landing .book-header nav>ul>li:hover>ul,.book-layout-landing .book-header nav>ul>li:focus-within>ul{display:block}.book-search{position:relative;margin:.5rem 0}.book-search input{width:100%;padding:.5rem;border:1px solid var(--gray-200);border-radius:var(--border-radius);background:var(--gray-100);color:var(--body-font-color)}.book-search input:required+.book-search-spinner{display:block}.book-search .book-search-spinner{position:absolute;top:0;margin:.5rem;margin-inline-start:calc(100% - 1.5rem);width:1rem;height:1rem;border:1px solid transparent;border-top-color:var(--body-font-color);border-radius:50%;animation:spin 1s ease infinite}@keyframes spin{100%{transform:rotate(360deg)}}.book-search ul a{padding-bottom:0}.book-search small{opacity:.5}.book-toc{flex:0 0 
16rem;font-size:var(--font-size-smallest)}.book-toc .book-toc-content{width:16rem;padding:1rem;position:fixed;top:0;bottom:0;overflow-x:hidden;overflow-y:auto}.book-toc a{display:block}.book-toc img{height:1em;width:1em}.book-toc nav>ul>li:first-child{margin-top:0}.book-footer{padding-top:1rem;font-size:var(--font-size-smaller)}.book-footer a{margin:.25rem 0;padding:.25rem 0}.book-comments{margin-top:1rem}.book-copyright{margin-top:1rem}.book-languages{margin-bottom:1rem}.book-languages span{padding:0}.book-languages ul{padding-inline-start:1.5em}.book-menu-content,.book-toc-content{transition:.2s ease-in-out;transition-property:transform,margin,opacity,visibility;will-change:transform,margin,opacity}@media screen and (max-width:56rem){.book-menu{visibility:hidden;margin-inline-start:-16rem;z-index:1}.book-menu .book-menu-content{background:var(--body-background)}.book-toc{display:none}.book-header{display:block}.book-post-container{flex-direction:column-reverse}#menu-control,#toc-control{display:inline}#menu-control:checked~main .book-menu{visibility:initial}#menu-control:checked~main .book-menu .book-menu-content{transform:translateX(16rem);box-shadow:0 0 .5rem rgba(0,0,0,.1)}#menu-control:checked~main .book-page{opacity:.25}#menu-control:checked~main .book-menu-overlay{display:block;position:fixed;top:0;bottom:0;left:0;right:0}#toc-control:checked~main .book-header aside{display:block}body[dir=rtl] #menu-control:checked~main .book-menu .book-menu-content{transform:translateX(-16rem)}}@media screen and (min-width:80rem){.book-page,.book-menu .book-menu-content,.book-toc .book-toc-content{padding:2rem 1rem}}@media print{.book-menu,.book-footer,.book-toc{display:none}.book-header,.book-header aside{display:block}main{display:block!important}}.markdown{line-height:1.6}.markdown>:first-child{margin-top:0}.markdown h1,.markdown h2,.markdown h3,.markdown h4,.markdown h5,.markdown h6{font-weight:inherit;line-height:1;margin-top:1.5em;margin-bottom:1rem}.markdown h1 
a.anchor,.markdown h2 a.anchor,.markdown h3 a.anchor,.markdown h4 a.anchor,.markdown h5 a.anchor,.markdown h6 a.anchor{opacity:0;font-size:.75em;margin-inline-start:.25em}.markdown h1:hover a.anchor,.markdown h1 a.anchor:focus-visible,.markdown h2:hover a.anchor,.markdown h2 a.anchor:focus-visible,.markdown h3:hover a.anchor,.markdown h3 a.anchor:focus-visible,.markdown h4:hover a.anchor,.markdown h4 a.anchor:focus-visible,.markdown h5:hover a.anchor,.markdown h5 a.anchor:focus-visible,.markdown h6:hover a.anchor,.markdown h6 a.anchor:focus-visible{opacity:initial;text-decoration:none}.markdown h1{font-size:2rem}.markdown h2{font-size:1.5rem}.markdown h3{font-size:1.25rem}.markdown h4{font-size:1.125rem}.markdown h5{font-size:1rem}.markdown h6{font-size:.875rem}.markdown b,.markdown optgroup,.markdown strong{font-weight:bolder}.markdown a{text-decoration:none}.markdown a[href]:hover{text-decoration:underline}.markdown a[href]:visited{color:var(--color-visited-link)}.markdown img{max-width:100%;height:auto}.markdown code{direction:ltr;unicode-bidi:embed;padding:.125em .25em;background:var(--gray-100);border:1px solid var(--gray-200);border-radius:var(--border-radius);font-size:.875em}.markdown pre{padding:1rem;background:var(--gray-100);border:1px solid var(--gray-200);border-radius:var(--border-radius);overflow-x:auto}.markdown pre:focus{outline-style:auto;outline-color:var(--color-link)}.markdown pre code{padding:0;border:0;background:0 0}.markdown p{word-wrap:break-word}.markdown blockquote{margin:1rem 0;padding:.5rem 1rem .5rem .75rem;border-inline-start:.25rem solid var(--gray-200);border-radius:var(--border-radius)}.markdown blockquote :first-child{margin-top:0}.markdown blockquote :last-child{margin-bottom:0}.markdown table{overflow:auto;display:block;border-spacing:0;border-collapse:collapse;margin-top:1rem;margin-bottom:1rem}.markdown table tr th,.markdown table tr td{padding:.5rem 1rem;border:1px solid var(--gray-200);text-align:start}.markdown table 
tr:nth-child(2n){background:var(--gray-100)}.markdown hr{height:1px;border:none;background:var(--gray-200)}.markdown ul,.markdown ol{padding-inline-start:2rem;word-wrap:break-word}.markdown dl dt{font-weight:bolder;margin-top:1rem}.markdown dl dd{margin-inline-start:0;margin-bottom:1rem}.markdown .highlight{direction:ltr;unicode-bidi:embed;border-radius:var(--border-radius)}.markdown .highlight table tbody{border:1px solid var(--gray-200)}.markdown .highlight table tr pre{border:0}.markdown .highlight table tr td pre code>span{display:flex}.markdown .highlight table tr td:nth-child(1) pre{margin:0;padding-inline-end:0}.markdown .highlight table tr td:nth-child(2) pre{margin:0;padding-inline-start:0}.markdown details{padding:1rem;margin:1rem 0;border:1px solid var(--gray-200);border-radius:var(--border-radius)}.markdown details summary{line-height:1;padding:1rem;margin:-1rem;cursor:pointer;list-style:none}.markdown details summary::before{content:"›";display:inline-block;margin-inline-end:.5rem;transition:transform .1s ease-in-out}.markdown details[open] summary{margin-bottom:0}.markdown details[open] summary::before{transform:rotate(90deg)}.markdown figure{margin:1rem 0}.markdown figure figcaption{margin-top:1rem}.markdown-inner>:first-child,.markdown .book-steps>ol>li>:first-child,.markdown figure figcaption>:first-child{margin-top:0}.markdown-inner>:last-child,.markdown .book-steps>ol>li>:last-child,.markdown figure figcaption>:last-child{margin-bottom:0}.markdown .book-tabs{margin-top:1rem;margin-bottom:1rem;border:1px solid var(--gray-200);border-radius:var(--border-radius);display:flex;flex-wrap:wrap}.markdown .book-tabs label{display:inline-block;padding:.5rem 1rem;border-bottom:1px transparent;cursor:pointer}.markdown .book-tabs .book-tabs-content{order:999;width:100%;border-top:1px solid var(--gray-100);padding:1rem;display:none}.markdown .book-tabs input[type=radio]:checked+label{border-bottom:1px solid var(--color-link)}.markdown .book-tabs 
input[type=radio]:checked+label+.book-tabs-content{display:block}.markdown .book-columns{gap:1rem}.markdown .book-columns>div{margin:1rem 0;min-width:13.2rem}.markdown .book-columns>ul{list-style:none;display:flex;padding:0;flex-wrap:wrap;gap:1rem}.markdown .book-columns>ul>li{flex:1 1;min-width:13.2rem}.markdown a.book-btn[href]{display:inline-block;font-size:var(--font-size-smaller);color:var(--color-link);line-height:2rem;padding:0 1rem;border:1px solid var(--color-link);border-radius:var(--border-radius);cursor:pointer}.markdown a.book-btn[href]:hover{text-decoration:none}.markdown .book-hint.note{border-color:var(--color-accent-note);background-color:var(--color-accent-note-tint)}.markdown .book-hint.tip{border-color:var(--color-accent-tip);background-color:var(--color-accent-tip-tint)}.markdown .book-hint.important{border-color:var(--color-accent-important);background-color:var(--color-accent-important-tint)}.markdown .book-hint.warning{border-color:var(--color-accent-warning);background-color:var(--color-accent-warning-tint)}.markdown .book-hint.caution{border-color:var(--color-accent-caution);background-color:var(--color-accent-caution-tint)}.markdown .book-hint.default{border-color:var(--color-accent-default);background-color:var(--color-accent-default-tint)}.markdown .book-hint.info{border-color:var(--color-accent-info);background-color:var(--color-accent-info-tint)}.markdown .book-hint.success{border-color:var(--color-accent-success);background-color:var(--color-accent-success-tint)}.markdown .book-hint.danger{border-color:var(--color-accent-danger);background-color:var(--color-accent-danger-tint)}.markdown .book-badge{display:inline-block;font-size:var(--font-size-smaller);font-weight:var(--body-font-weight);vertical-align:middle;border-radius:var(--border-radius);border:1px solid var(--accent-color);overflow:hidden;text-wrap:nowrap;color:var(--body-font-color)}.markdown .book-badge.note{--accent-color:var(--color-accent-note)}.markdown 
.book-badge.tip{--accent-color:var(--color-accent-tip)}.markdown .book-badge.important{--accent-color:var(--color-accent-important)}.markdown .book-badge.warning{--accent-color:var(--color-accent-warning)}.markdown .book-badge.caution{--accent-color:var(--color-accent-caution)}.markdown .book-badge.default{--accent-color:var(--color-accent-default)}.markdown .book-badge.info{--accent-color:var(--color-accent-info)}.markdown .book-badge.success{--accent-color:var(--color-accent-success)}.markdown .book-badge.danger{--accent-color:var(--color-accent-danger)}.markdown .book-badge span{display:inline-block;padding:0 .5rem}.markdown .book-badge span.book-badge-value{color:var(--body-background);background-color:var(--accent-color)}.markdown .book-steps{position:relative}.markdown .book-steps>ol{counter-reset:steps;list-style:none;padding-inline-start:1.25rem;margin-top:2rem}.markdown .book-steps>ol>li::before{content:counter(steps);counter-increment:steps;position:absolute;display:flex;justify-content:center;left:.5rem;height:1.5rem;width:1.5rem;padding:.25rem;border-radius:.5rem;white-space:nowrap;line-height:1rem;color:var(--body-background);background:var(--gray-500);outline:.25rem solid var(--body-background)}.markdown .book-steps>ol>li{border-inline-start:1px solid var(--gray-500);padding-inline-start:3rem;padding-bottom:2rem}.markdown .book-steps>ol>li:last-child{border:0}.markdown .book-card{display:block;overflow:hidden;height:100%;border-radius:var(--border-radius);border:1px solid var(--gray-200)}.markdown .book-card>a{display:block;height:100%}.markdown .book-card>a[href],.markdown .book-card>a[href]:visited{color:var(--body-font-color)}.markdown .book-card>a[href]:hover{text-decoration:none;background:var(--gray-100)}.markdown .book-card>a>img,.markdown .book-card>img{width:100%;display:block;aspect-ratio:4/3;object-fit:cover}.markdown .book-card .markdown-inner,.markdown .book-card figure figcaption,.markdown figure .book-card figcaption,.markdown 
.book-card .book-steps>ol>li{padding:1rem}.markdown .book-image input+img{cursor:zoom-in;transition:transform .2s ease-in-out}.markdown .book-image input:checked+img{position:fixed;top:0;left:0;right:0;bottom:0;background:var(--body-background);object-fit:contain;width:100%;height:100%;z-index:1;cursor:zoom-out;padding:1rem}.markdown .book-asciinema{margin:1rem 0}.markdown .book-hero{min-height:24rem;align-content:center}.markdown .book-hero h1{font-size:3em}.markdown .book-codeblock-filename{background:var(--gray-100);border:1px solid var(--gray-200);border-bottom:0;font-size:var(--font-size-smaller);margin-top:1rem;padding:.25rem .5rem;border-start-start-radius:var(--border-radius);border-start-end-radius:var(--border-radius)}.markdown .book-codeblock-filename a{color:var(--body-font-color)}.markdown .book-codeblock-filename+.highlight pre{margin-top:0;border-start-start-radius:0;border-start-end-radius:0}:root{--body-background:white;--body-background-tint:none;--body-font-color:black;--color-link:#0055bb;--color-visited-link:#5500bb;--icon-filter:none;--gray-100:#f8f9fa;--gray-200:#e9ecef;--gray-500:#adb5bd;--color-accent-default:#64748b;--color-accent-default-tint:rgba(100, 116, 139, 0.1);--color-accent-note:#4486dd;--color-accent-note-tint:rgba(68, 134, 221, 0.1);--color-accent-tip:#3bad3b;--color-accent-tip-tint:rgba(59, 173, 59, 0.1);--color-accent-important:#8144dd;--color-accent-important-tint:rgba(129, 68, 221, 0.1);--color-accent-warning:#f59e42;--color-accent-warning-tint:rgba(245, 158, 66, 0.1);--color-accent-caution:#d84747;--color-accent-caution-tint:rgba(216, 71, 71, 0.1);--color-accent-info:#4486dd;--color-accent-info-tint:rgba(68, 134, 221, 0.1);--color-accent-success:#3bad3b;--color-accent-success-tint:rgba(59, 173, 59, 0.1);--color-accent-danger:#d84747;--color-accent-danger-tint:rgba(216, 71, 71, 
0.1)}@media(prefers-color-scheme:dark){:root{--body-background:#343a40;--body-background-tint:none;--body-font-color:#e9ecef;--color-link:#84b2ff;--color-visited-link:#b88dff;--icon-filter:brightness(0) invert(1);--gray-100:#494e54;--gray-200:#5c6165;--gray-500:#999d9f;--color-accent-default:#64748b;--color-accent-default-tint:rgba(100, 116, 139, 0.1);--color-accent-note:#4486dd;--color-accent-note-tint:rgba(68, 134, 221, 0.1);--color-accent-tip:#3bad3b;--color-accent-tip-tint:rgba(59, 173, 59, 0.1);--color-accent-important:#8144dd;--color-accent-important-tint:rgba(129, 68, 221, 0.1);--color-accent-warning:#f59e42;--color-accent-warning-tint:rgba(245, 158, 66, 0.1);--color-accent-caution:#d84747;--color-accent-caution-tint:rgba(216, 71, 71, 0.1);--color-accent-info:#4486dd;--color-accent-info-tint:rgba(68, 134, 221, 0.1);--color-accent-success:#3bad3b;--color-accent-success-tint:rgba(59, 173, 59, 0.1);--color-accent-danger:#d84747;--color-accent-danger-tint:rgba(216, 71, 71, 0.1)}} \ No newline at end of file diff --git a/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json new file mode 100644 index 0000000..2fff0b2 --- /dev/null +++ b/docs/resources/_gen/assets/book.scss_b807c86e8030af4cdc30edccea379f5f.json @@ -0,0 +1 @@ +{"Target":"book.min.6970156cec683193d93c9c4edaf0d56574e4361df2e0c1be4f697ae81c3ba55f.css","MediaType":"text/css","Data":{"Integrity":"sha256-aXAVbOxoMZPZPJxO2vDVZXTkNh3y4MG+T2l66Bw7pV8="}} \ No newline at end of file diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 88edd90..a2940e2 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -1,8 +1,7 @@ --- -layout: page title: "Homelab Architecture" -permalink: /architecture/ -nav_order: 1 +url: "/architecture/" +weight: 1 --- # Homelab Architecture diff --git a/docs/src/cicd.md b/docs/src/cicd.md index 4ad86a9..326ad14 100644 --- a/docs/src/cicd.md +++ 
b/docs/src/cicd.md @@ -1,8 +1,7 @@ --- -layout: page title: "CI/CD Pipeline" -permalink: /cicd/ -nav_order: 5 +url: "/cicd/" +weight: 5 --- # CI/CD Pipeline @@ -11,7 +10,7 @@ Automated testing, building, and releasing for fetch_ml. ## Workflows -### CI Workflow (`.github/workflows/ci.yml`) +### CI Workflow (`.forgejo/workflows/ci.yml`) Runs on every push to `main`/`develop` and all pull requests. @@ -29,7 +28,7 @@ Runs on every push to `main`/`develop` and all pull requests. - Integration tests - Security audits -### Release Workflow (`.github/workflows/release.yml`) +### Release Workflow (`.forgejo/workflows/release-mirror.yml`) Runs on version tags (e.g., `v1.0.0`). @@ -49,7 +48,7 @@ Runs on version tags (e.g., `v1.0.0`). 3. **create-release** - Collects all artifacts - Generates SHA256 checksums - - Creates GitHub release with notes + - Mirrors release artifacts to GitHub Releases ## Release Process @@ -141,7 +140,8 @@ ZIG_VERSION: '0.15.2' ### Secrets Required for releases: -- `GITHUB_TOKEN` - Automatic, provided by GitHub Actions +- `GH_MIRROR_TOKEN` - GitHub token for publishing mirrored releases +- `GH_MIRROR_REPO` (variable) - GitHub repo slug, e.g. 
`jfraeysd/fetch_ml` ## Monitoring @@ -149,7 +149,7 @@ Required for releases: Check workflow runs at: ``` -https://github.com/jfraeys/fetch_ml/actions +https://git.jfraeys.com/jfraeysd/fetch_ml/actions ``` ### Artifacts @@ -161,5 +161,5 @@ Download build artifacts from: --- For implementation details: -- [.github/workflows/ci.yml](https://github.com/jfraeys/fetch_ml/blob/main/.github/workflows/ci.yml) -- [.github/workflows/release.yml](https://github.com/jfraeys/fetch_ml/blob/main/.github/workflows/release.yml) +- `.forgejo/workflows/ci.yml` +- `.forgejo/workflows/release-mirror.yml` diff --git a/docs/src/cli-reference.md b/docs/src/cli-reference.md index 41c1a0c..0731b40 100644 --- a/docs/src/cli-reference.md +++ b/docs/src/cli-reference.md @@ -1,8 +1,7 @@ --- -layout: page title: "CLI Reference" -permalink: /cli-reference/ -nav_order: 2 +url: "/cli-reference/" +weight: 2 --- # Fetch ML CLI Reference @@ -37,6 +36,7 @@ High-performance command-line interface for experiment management, written in Zi | `jupyter` | Manage Jupyter notebook services | `ml jupyter start --name my-nb` | | `validate` | Validate provenance/integrity for a commit or task | `ml validate --verbose` | | `info` | Show run info from `run_manifest.json` | `ml info ` | +| `requeue` | Re-submit an existing run/commit with new args/resources | `ml requeue -- --epochs 20` | ### Command Details @@ -72,8 +72,11 @@ ml sync ./my-project --priority 9 # Queue with commit ID ml queue my-job --commit abc123def456 -# Queue with priority (1-10, default 5) +# Queue with commit ID prefix (>=7 hex chars; must be unique) ml queue my-job --commit abc123 --priority 8 + +# Queue with extra runner args (stored as task.Args) +ml queue my-job --commit abc123 -- --epochs 5 --lr 1e-3 ``` **Features:** @@ -81,6 +84,34 @@ ml queue my-job --commit abc123 --priority 8 - Priority queuing system - API key authentication +**Notes:** +- `--priority` is passed to the server as a single byte (0-255). 
+- Args are sent via a dedicated queue opcode and become `task.Args` on the worker. +- `--commit` may be a full 40-hex commit id or a unique prefix (>=7 hex chars) resolvable under `worker_base`. + +#### `requeue` - Re-submit a Previous Run +```bash +# Requeue directly by commit_id +ml requeue -- --epochs 20 + +# Requeue by commit_id prefix (>=7 hex chars; must be unique) +ml requeue -- --epochs 20 + +# Requeue by run_id/task_id (CLI scans run_manifest.json under worker_base) +ml requeue -- --epochs 20 + +# Requeue by a run directory or run_manifest.json path +ml requeue /data/ml-experiments/finished/ -- --epochs 20 + +# Override priority/resources on requeue +ml requeue --priority 10 --gpu 1 -- --epochs 20 +``` + +**What it does:** +- Locates `run_manifest.json` +- Extracts `commit_id` +- Submits a new queue request using that `commit_id` with optional overridden args/resources + **Notes:** - Tasks support optional `snapshot_id` and `dataset_specs` fields server-side (for provenance and dataset resolution). diff --git a/docs/src/installation.md b/docs/src/installation.md index 6656913..19b216c 100644 --- a/docs/src/installation.md +++ b/docs/src/installation.md @@ -12,7 +12,7 @@ make install ./bin/ml setup # 3. Run experiments -./bin/ml run my-experiment.py +./cli/zig-out/bin/ml queue my-job ``` That's it. Everything else is optional. 
diff --git a/docs/src/jupyter-workflow.md b/docs/src/jupyter-workflow.md index 2ebf5d9..361326b 100644 --- a/docs/src/jupyter-workflow.md +++ b/docs/src/jupyter-workflow.md @@ -75,9 +75,32 @@ environment: security: trusted_channels: ["conda-forge", "defaults", "pytorch"] - blocked_packages: ["requests", "urllib3"] + blocked_packages: ["aiohttp", "telnetlib"] ``` +You can also override the blocked package list at runtime using an environment variable on the worker: + +```bash +export FETCHML_JUPYTER_BLOCKED_PACKAGES="aiohttp,telnetlib" +``` + +Some base images (including the default `quay.io/jupyter/base-notebook`) ship with common HTTP client libraries +like `requests`, `urllib3`, and `httpx` preinstalled. + +If you want to **block installing** packages like `requests`, `urllib3`, and `httpx` for security reasons but still +use a base image that already includes them, you can disable the **startup image scan** separately: + +```bash +# Block installs (user requests) +export FETCHML_JUPYTER_BLOCKED_PACKAGES="requests,urllib3,httpx" + +# Allow base images that already contain these packages to start +export FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES="off" +``` + +If you want startup scanning enabled, set `FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES` to a comma-separated list. 
+ + ### Access Control ```bash diff --git a/docs/src/index.md b/docs/src/landing.md similarity index 98% rename from docs/src/index.md rename to docs/src/landing.md index 38dfd59..6f44890 100644 --- a/docs/src/index.md +++ b/docs/src/landing.md @@ -1,6 +1,5 @@ --- -layout: default -title: Fetch ML Documentation +title: "Fetch ML Documentation" bookHidden: true --- diff --git a/docs/src/operations.md b/docs/src/operations.md index 3a580a7..7f78ede 100644 --- a/docs/src/operations.md +++ b/docs/src/operations.md @@ -1,8 +1,7 @@ --- -layout: page title: "Operations Runbook" -permalink: /operations/ -nav_order: 6 +url: "/operations/" +weight: 6 --- # Operations Runbook diff --git a/docs/src/queue.md b/docs/src/queue.md index 08e9f37..2ae52fd 100644 --- a/docs/src/queue.md +++ b/docs/src/queue.md @@ -1,8 +1,7 @@ --- -layout: page title: "Task Queue Architecture" -permalink: /queue/ -nav_order: 3 +url: "/queue/" +weight: 3 --- # Task Queue Architecture diff --git a/docs/src/quick-start.md b/docs/src/quick-start.md index 3b275e6..9d41d47 100644 --- a/docs/src/quick-start.md +++ b/docs/src/quick-start.md @@ -9,8 +9,8 @@ Get Fetch ML running in minutes with Docker Compose and integrated monitoring. 
- **Podman**: For production experiment execution **Requirements:** -- Go 1.21+ -- Zig 0.11+ +- Go 1.25+ +- Zig 0.15+ - Docker Compose (testing only) - 4GB+ RAM - 2GB+ disk space @@ -137,8 +137,7 @@ cd cli && zig build --release=fast # Common operations ./cli/zig-out/bin/ml status # Check system status ./cli/zig-out/bin/ml queue job-name # Queue job -./cli/zig-out/bin/ml list # List jobs -./cli/zig-out/bin/ml help # Show help +./cli/zig-out/bin/ml --help # Show help ``` ### Monitoring Commands diff --git a/docs/src/redis-ha.md b/docs/src/redis-ha.md index 4b7f17b..1d6f783 100644 --- a/docs/src/redis-ha.md +++ b/docs/src/redis-ha.md @@ -1,8 +1,7 @@ --- -layout: page title: "Redis High Availability (Optional)" -permalink: /redis-ha/ -nav_order: 7 +url: "/redis-ha/" +weight: 7 --- # Redis High Availability diff --git a/docs/src/research-runner-plan.md b/docs/src/research-runner-plan.md new file mode 100644 index 0000000..9653924 --- /dev/null +++ b/docs/src/research-runner-plan.md @@ -0,0 +1,667 @@ +# Research-First Runner: Missing Themes Plan + +This file captures additional themes that are commonly missing in existing ML runners/experiment tools, translated into actionable design targets for a lightweight, research-first runner. + +## Quick Overview + +**What makes this different:** +- **Your server, not their cloud**: Everything runs on your homelab/workstation/uni server +- **Dual interfaces**: Zig CLI for scripting + SSH-accessible TUI for interactive work +- **Fair queueing**: `ml queue` (not `run`) makes resource sharing explicit +- **Research narrative**: Capture why you ran experiments, not just what ran +- **Zero SaaS**: No accounts, web dashboards, or external services +- **Plain text everything**: Human-readable manifests, long-term reproducibility + +**Perfect for:** Researchers in uni labs, homelab enthusiasts, small research groups who want control over their infrastructure without cloud vendor lock-in. 
+ +## Architecture Context + +**Server-Centric Model for Homelab/Workstation/Uni Lab:** +- **Two client interfaces**: + - **Zig CLI**: Thin WebSocket client for scripting, automation, remote access + - **SSH-accessible TUI**: Interactive Bubble Tea UI for monitoring when SSH'd into server +- Go API server with embedded rsync (reduces dependencies) +- Worker pulls from flexible queue backend (Redis/SQLite/filesystem) +- Priority-based scheduling with prewarm mechanism +- NAS integration for data prefetching +- Target: single server, workstation, or small uni lab cluster (not cloud/SaaS) + +**Client Access Patterns:** +```bash +# CLI (from anywhere via WebSocket) +ml queue train.py --epochs 100 +ml status --watch +ml info + +# TUI (when SSH'd into server or jump box) +ssh mluser@worker.local +ml-tui # Interactive terminal UI +# Navigate with keyboard, see live updates +``` + +**Configuration:** +```toml +# ~/.ml/config.toml (shared by both CLI and TUI) +worker_host = "worker.local" +worker_user = "mluser" +worker_base = "/data/ml-experiments" +worker_port = 22 +api_key = "your-api-key" +``` + +## Plan (Missing Themes) + +## Implemented Today (in this repo) + +- Runs are queued via `ml queue` and processed by workers. +- Run provenance is written to `run_manifest.json`. +- You can attach queue-time notes with `ml queue --note "..."` (persisted under `run_manifest.json` → `metadata.note`). +- Queue backends support Redis / SQLite / filesystem (and optional filesystem fallback). +- CLI + SSH-launched TUI are both available (`ml monitor` launches the TUI). + +## Future Ideas (this document) + +### 1. Own-infrastructure-first, research-centric by default + +### 2. Minimal server dependencies (simple operations) + +### 3. 
Text-first tracking (logs > dashboards) + +- **Research narrative completion**: post-run outcome/learnings/next steps captured in the manifest +- **Auto-captured context**: + - Command + args (as sent from CLI) + - Timestamps (queue time, start time, end time) + - Git commit hash (and optionally diff) + - Environment snapshot (pip freeze, conda export, container image digest) + - Hardware context (GPU model, driver version, CUDA version) +- **Plain text manifests**: JSON or YAML, never binary blobs +- **Stable formats**: Can read experiments from 5 years ago without the runner + +**Implementation note**: Server writes `run_manifest.json` to experiment directory. CLI can display it via `ml info`. + +### 4. CLI and TUI as complementary interfaces + +- **Consistent CLI scripting UX**: Future idea (uniform `--json`, quiet modes, and stable exit codes across commands) +- **TUI feature parity**: Future idea (surface the same key details in TUI + CLI: queue position/ETA, narrative, validation results) + +### 5. Failure-tolerant, messy-research friendly + +- **Failure is first-class**: Failed runs stay visible and queryable +- **Partial artifacts preserved**: Keep artifacts/logs up to failure point (including checkpoints, if the script produces them) +- **No punishment for refactors**: Script renames don't break history +- **Grouping/tagging**: Label attempts (baseline/ablation/debug/exploration) + +**Server implementation**: Worker should catch exceptions, record failure reason, preserve state. Queue should track failure modes (OOM, timeout, code error, data error). + +### 6. 
Minimal abstraction over Python (transparent execution) + +- **Run scripts as-is**: No decorators, no framework rewrites +- **Preserve debuggability**: Clean stack traces, pdb works +- **Optional instrumentation**: Explicit metric logging via simple API + ```python + # Optional, not required + from ml_runner import log_metric + log_metric("loss", 0.5, step=100) + ``` +- **Standard I/O works**: `print()` goes to logs, arguments via `sys.argv` + +**Server implementation**: Worker spawns process, captures stdout/stderr, parses optional structured logs. No magic wrappers that hide what's happening. + +### 7. Reproducibility that survives time + +- **Immutable run folders**: Server never modifies completed runs +- **Environment capture** (best-effort, pluggable): + - Container image digest (primary method) + - `pip freeze` / `uv pip freeze` / `poetry.lock` + - `conda env export` + - `nix flake.lock` (if available) +- **Hardware fingerprint**: GPU model, driver, CUDA, CPU, RAM +- **Data provenance**: Dataset checksums, NAS paths, version identifiers +- **Commit everything**: Store full environment, even if verbose + +**Server implementation**: Pre-run hook captures environment. Store in `run_manifest.json`. Validate on `ml validate `. + +### 8. Small compute and shared machine friendliness + +### 9. Server-side storage with client-side visibility +- **Energy awareness**: Respect that homelabs pay electricity bills +- **Laptop-friendly**: Support thermal/power throttling +- **Single-GPU to 4-GPU range**: Optimize for typical research setups +- **No cluster assumptions**: Don't require Kubernetes/SLURM/etc. + +**Why this matters**: Researchers want to `ls` experiment directories but don't want to manually sync. Server handles storage, CLI provides views. + +### 11. Research narrative (lab notebook, not job IDs) + +- **Queue-time narrative capture**: Future idea (add `--hypothesis`, `--context`, `--intent`, etc. 
to `ml queue`) +- **Post-run learning capture**: Future idea (explicit `outcome`, `learnings[]`, `next_steps[]`, and validation status) +- **Narrative UX**: Future idea (view/edit narrative from TUI/CLI without hand-editing JSON) + +**CLI commands**: +```bash +ml queue train.py --note "Testing warmup hypothesis from paper X" +``` + + - CLI: WebSocket streaming for `--watch` and `--follow` + - TUI: Live refresh (500ms tick), immediate queue updates +- **No magic**: Minimize implicit behavior + - Explicit is better than clever + - Defaults should be obvious and documented + - Side effects should be visible (both in CLI and TUI) + - Configuration hierarchy clear: CLI flags > env > config file > defaults + +**TUI advantages for observability:** +- See everything at once: jobs, queue, GPUs, containers, logs +- Keyboard shortcuts for common operations +- Instant feedback on actions (queue, cancel, delete) +- Prewarm state visible in GPU panel +- No need to run multiple `ml status` commands + +### 13. Support clear thinking during experimentation + +- **Optimize for cognitive throughput**: + - Make it easy to remember what you were thinking + - Surface patterns across experiments + - Warn about near-duplicates before running +- **Built-in comparison**: + ```bash + # Future ideas: + # ml diff + # ml similar + ``` +- **Learning from history**: + ```bash + # Future ideas: + # ml lessons --tag ablation + # ml dead-ends + ``` +- **Hypothesis tracking**: + - Link hypothesis → experiment → outcome → next hypothesis + - Mark outcomes: validates/refutes/inconclusive +- **Reduce cognitive load**: + - Natural queries: Future idea (search over manifests/notes) + - Show relevant history when queueing + - Don't make researchers remember IDs + +**Server implementation**: Maintain index (rebuildable from filesystem). Support semantic queries over manifests, notes, tags. + +### 14. 
Fast iteration velocity + +- **Easy modification**: + ```bash + # Future ideas: + # ml clone + # ml fork + ``` +- **Batch operations**: + ```bash + # Future idea: ml sweep + ``` + +**Why prewarm matters**: Your NAS prefetch in prewarm means jobs start training immediately instead of waiting for data. This dramatically improves iteration velocity. + +### 15. Full research lifecycle support + +- **Exploration phase**: Minimal metadata, quick runs +- **Development phase**: Group attempts, compare variations +- **Validation phase**: Strict reproducibility, complete capture +- **Publication phase**: Export bundles, generate reproduction instructions +- **Maintenance phase**: Long-term readable, re-executable years later + +**Reproducibility levels** (your strict/best-effort model): +```bash +# Future idea: --repro-level +ml validate # Future idea: expand validation coverage + outputs +``` + +### 16. Collaboration without platforms + +- **Async collaboration** (no shared server required): + ```bash + # Future ideas: + # ml export --bundle run_42.tar.gz + # ml import run_42.tar.gz + ``` +- **Selective sharing**: + ```bash + # Future ideas: + # ml export --metadata-only + # ml export --include-artifacts + ``` +- **Review-friendly**: + - Self-contained bundles + - All provenance included + - Reproducibility instructions + - No "install our platform" friction + +**Server implementation**: Export packages `run_manifest.json` + artifacts into tarball. Import validates and unpacks into experiments directory. + +### 17. 
Graceful degradation + +- **Core works with minimal setup**: + - Filesystem-only queue (no Redis required) + - SQLite for metadata (no Postgres) + - Local execution (no remote targets needed) +- **Optional enhancements**: + - Redis for better multi-worker queueing + - Git integration (works without git) + - NAS prewarm (falls back to on-demand fetch) + - WebSocket updates (falls back to polling) +- **Progressive disclosure**: + - Simple commands for simple cases + - Advanced flags for power users + - Features activate when available + +**Implementation note**: + +### 18. Concrete features (derived from above) + +#### Findability +```bash +# Future ideas: +# ml find "failed runs on GPU2 last week" +# ml find --note "warmup" +``` +Server maintains rebuildable index over manifests, logs, tags. + +#### Dataset provenance +```json +{ + "datasets": [ + { + "name": "imagenet-train", + "nas_path": "/nas/datasets/imagenet/train", + "checksum": "sha256:abc123...", + "fetched_at": "2024-01-15T10:30:00Z", + "fetch_method": "prewarm" + } + ] +} +``` +Server validates checksums, warns on drift. + +#### Prewarm observability +```bash +ml status +# Shows: +# Next in queue: run_xyz (priority 5) +# Prewarming: dataset imagenet-train (2/5 complete) +# GPU 0: running run_abc (50% complete, ETA 2h) +# GPU 1: idle +``` + +#### CLI queue/requeue workflows + +**Core principle**: the runner does not introduce checkpoint conventions. The script should run identically when executed directly vs via `ml`. + +**Passive artifact tracking** (future idea): worker records what files exist in the run directory after completion (or via configured glob patterns). Checkpoints are just artifacts. 
+ +**Requeue = replay command with modifications** (future idea): +```bash +# Original run +ml queue train.py --epochs 100 --save-dir ./checkpoints + +# Requeue (continue) +ml requeue run_abc -- --resume ./checkpoints/best.pt --epochs 200 +``` + +**Arg merge strategies** (future idea): +```bash +# Append new args (default) +ml requeue run_abc --append -- --resume ./checkpoints/best.pt + +# Replace (rerun with only new args) +ml requeue run_abc --replace -- --epochs 200 --lr 3e-4 + +# Merge (override matching flags, keep the rest) +ml requeue run_abc --merge -- --epochs 200 +``` + +**Optional staging** (future idea): copy an artifact from the source run into the new run directory, then reference it with a placeholder. +```bash +ml requeue run_abc --stage checkpoints/best.pt -- \ + --resume {staged}/best.pt --epochs 200 +``` + +#### Hardware/resource management +```json +{ + "resources": { + "gpus": 2, + "gpu_memory_gb": 40, + "cpu_cores": 16, + "ram_gb": 64, + "disk_gb": 100, + "max_runtime_hours": 24 + } +} +``` +Worker validates resources before pulling from queue. Server tracks utilization. + +--- + +## Design Philosophy Summary (Server-Centric) + +The goal is to build a **research assistant that runs on YOUR server**, not a platform that runs on someone else's cloud. + +### Every feature should answer: + +1. Does this help researchers **understand** what happened on the server? +2. Does this make the server **transparent** instead of a black box? +3. Does this work on a **single workstation** or small lab server? +4. Does this respect that researchers **SSH into the server**? +5. Does this make **local data** (NAS, scratch drives) first-class? 
+ +### Architecture principles: + +- **Server is the control plane**: All logic, storage, scheduling on server +- **CLI is a thin client**: Just communicates via WebSocket, no local state +- **Filesystem is still king**: Server writes plain text, CLI reads via API +- **Queue-first for fairness**: `ml queue` not `ml run` - explicit resource requests +- **Priority without hogging**: Higher priority = earlier in queue, not exclusive access +- **Prewarm is a performance optimization**: Best-effort, never required for correctness +- **NAS integration is native**: Server understands mounted storage + +### When in doubt: + +- **Server-side is better** than client-side (for logic) +- **WebSocket is better** than REST (for interactivity) +- **Embedded is better** than external deps (rsync in server) +- **Flexible backend is better** than required service (Redis OR SQLite OR filesystem) +- **Plain text is better** than binary +- **Your hardware is better** than their cloud + +The runner should feel like **SSH into your well-organized research server with powerful tools**, not like operating a cloud platform. Whether you're using the CLI for automation or the TUI for interactive work, the experience should be transparent, fair, and research-focused. + +--- + +## Typical Research Workflows (CLI + TUI) + +### Morning Routine: Check What Happened Overnight +```bash +# From your laptop (via WebSocket) +ml status +# Shows: 2 finished, 1 running, 3 in queue + +ml info run_abc --show-metrics +# Quick check: did the overnight run validate the hypothesis? + +# If you need deep investigation, SSH in +ssh mluser@worker.local +ml-tui +# Visual inspection of logs, GPU usage, etc. 
+``` + +### Starting a New Experiment Series +```bash +# Script a parameter sweep (CLI automation) +for lr in 1e-3 3e-4 1e-4; do + ml queue train.py --lr $lr \ + # Future idea: --hypothesis / --experiment-group + --priority 5 +done + +# Monitor in TUI (interactive) +ssh mluser@worker.local +ml-tui +# Watch queue, see ETA, check prewarm status +``` + +### Debugging a Failed Run +```bash +# Notice failure via CLI +ml status +# run_xyz: failed (exit code 137) - OOM? + +# Jump into TUI for investigation +ssh mluser@worker.local +ml-tui +# Navigate to run_xyz, press 'l' for logs +# See OOM error at batch 128 +# Future idea: narrative/annotation UX in the TUI +``` + +### End-of-Day Review +```bash +# TUI for visual summary +ssh mluser@worker.local +ml-tui +# Scroll through today's runs +# Future ideas: compare views, export bundles +``` + +### Paper Writing Time (6 months later) +```bash +# Today: use the filesystem + run manifests +ml info + +# Future ideas: searching/filtering + comparison reports + +# TUI for visual exploration +ssh mluser@worker.local +ml-tui +# Navigate through old experiments +# Press 'n' to read narratives +# Reconstruct your thought process +``` + +### Collaborative Debugging with Advisor +```bash +# Both SSH into server simultaneously +ssh mluser@worker.local + +# You run TUI to show current state +ml-tui +# Navigate to problem run, show logs live + +# Advisor suggests fix +# You queue new run with their suggestion +ml queue train.py --lr 1e-4 \ + --note "Per advisor: try smaller LR with warmup" \ + # Future idea: --parent-run + --priority 7 + +# Watch it start in TUI immediately +# Queue position visible, prewarm status shown +``` + +This dual-interface approach gives researchers the best of both worlds: **scriptability when they need it, visibility when they want it**. 
+ +--- + +## How This Maps to Your Current Architecture + +✅ **Already correct**: +- Server-centric with dual client interfaces (CLI + TUI) +- WebSocket communication (CLI) +- SSH-based TUI with Bubble Tea (interactive monitoring) +- Embedded rsync in server +- Flexible queue backend (Redis/SQLite/filesystem) +- Priority scheduling +- Prewarm mechanism for NAS prefetch +- **Fair queueing philosophy** - `queue` not `run` +- TUI shows live updates: jobs, queue, GPU status, logs + +🎯 **Natural extensions**: +- Queue-time narrative flags for `ml queue` (hypothesis/context/intent/etc.) +- CLI commands for diffing and finding (and higher-level comparison workflows) +- TUI panels for hypothesis/learnings (in job details) +- Reproducibility validation improvements (extend `ml validate`) +- Export/import for collaboration +- Graceful degradation (filesystem-only mode) +- Visible queue position and fairness metrics + +📝 **Design considerations**: +- Show prewarm state/progress in `ml status` +- Show queue position and ETA in both CLI and TUI +- Add research context fields to manifests +- Build comparison workflows (diff, similar, why-different) +- Support hypothesis tracking in both interfaces +- Create export bundles for sharing +- Expose fairness metrics (wait time distribution, resource utilization) +- TUI could show narrative snippets in job list (hypothesis as subtitle?) 
+ +**TUI Research Narrative Integration Ideas:** +``` +┌─ ML Jobs & Queue ─────────────────────────────────────┐ +│ > imagenet_baseline │ +│ ✓ finished | Priority: 5 │ +│ "Testing baseline performance before ablations" │ +│ │ +│ batch_size_64 │ +│ ▶ running (epoch 45/100) | Priority: 5 │ +│ "Validating linear LR scaling hypothesis" │ +│ │ +│ warmup_test │ +│ ⏳ queued (position 2) | Priority: 3 │ +│ "Following up on advisor suggestion about warmup" │ +└───────────────────────────────────────────────────────┘ + +Press 'n' to view narrative, 'a' to annotate +``` + +**Implementation status (today):** +- **Annotations are implemented** and stored at the **root** of `run_manifest.json` as `annotations[]`. +- **Narrative fields are implemented** and stored under `run_manifest.json` as `narrative` (set/update via CLI). +- Use `ml annotate --note "..." [--author "..."]` to append an entry. +- Remaining gaps are around **queue-time capture**, **post-run learnings/outcomes**, and **TUI-first narrative UX**. + +Example manifest.json +```json +{ + // === Standard Execution Metadata === + "run_id": "2024-01-15_abc123", + "status": "completed", + "command": "train.py --lr 0.001 --epochs 100 --batch-size 64", + "queued_at": "2024-01-15T10:25:00Z", + "started_at": "2024-01-15T10:30:00Z", + "ended_at": "2024-01-15T14:45:00Z", + "exit_code": 0, + "priority": 5, + + // === Research Narrative (The Important Part) === + "narrative": { + // WHY did you run this? + "hypothesis": "Larger batch size with linear LR scaling should improve convergence speed without hurting final accuracy", + + // WHAT were you thinking at the time? + "context": "Previous run (run_789) with batch=32 took 8 hours and plateaued at 0.85. Paper XYZ suggests linear scaling rule should work.", + + // WHAT were you trying to accomplish? + "intent": "Test if doubling batch size (32→64) with 2x learning rate maintains accuracy while reducing training time", + + // WHAT did you expect to happen? 
+ "expected_outcome": "Similar final accuracy (~0.85) but ~4 hour training time instead of 8", + + // HOW is this related to other experiments? + "parent_run": "2024-01-14_run789", + "experiment_group": "batch-size-scaling-ablation", + "tags": ["ablation", "batch-size", "convergence-speed", "paper-xyz-reproduction"], + + // WHAT did you learn? (filled in post-run or during) + "outcome": "Success: accuracy=0.87 (+0.02), time=3.5h (-56%). Linear scaling rule validated.", + "learnings": [ + "Linear LR scaling worked as expected from paper XYZ", + "GPU memory utilization went from 60% to 95% - near limit", + "Convergence was actually smoother (fewer spikes in loss curve)", + "Could probably push to batch=96 before OOM" + ], + "next_steps": [ + "Try batch=96 to maximize GPU utilization", + "Test if this scales to batch=128 with gradient accumulation", + "Validate on other datasets (currently only tested on ImageNet)" + ], + "validation_status": "validates", // or "refutes", "inconclusive", "partial" + }, + + // Human annotations added later + "annotations": [ + { + "timestamp": "2024-01-15T15:00:00Z", + "author": "user@lab.edu", + "note": "This result is strong enough for the paper. Use these hyperparams for final training." + }, + { + "timestamp": "2024-01-16T09:00:00Z", + "author": "advisor@lab.edu", + "note": "Good work. Also compare with warmup schedule before finalizing." 
+ } + ], + + // === Reproducibility Metadata === + "environment": { + "git_commit": "a1b2c3d4", + "git_dirty": false, + "git_branch": "experiment/batch-scaling", + "container_image": "pytorch/pytorch:2.0.1-cuda11.8-cudnn8-runtime", + "container_digest": "sha256:abc123...", + "pip_freeze": "torch==2.0.1\ntorchvision==0.15.2\n...", + "cuda_version": "11.8", + "gpu_driver": "525.105.17", + "python_version": "3.10.12" + }, + + // === Data Provenance === + "datasets": [ + { + "name": "imagenet-train", + "nas_path": "/nas/datasets/imagenet/ILSVRC2012/train", + "checksum": "sha256:def456...", + "size_gb": 144.2, + "num_samples": 1281167, + "version": "ILSVRC2012", + "fetched_via": "prewarm", + "fetch_time_seconds": 180 + } + ], + + // === Resource Usage === + "resources": { + "requested": { + "gpus": 1, + "gpu_memory_gb": 24, + "cpu_cores": 8, + "ram_gb": 32 + }, + "actual": { + "gpu_utilization_avg": 95, + "gpu_memory_peak_gb": 22.8, + "cpu_utilization_avg": 45, + "ram_peak_gb": 28.5, + "disk_read_gb": 145, + "disk_write_gb": 12 + }, + "gpu_model": "NVIDIA RTX 3090", + "host": "ml-server-01" + }, + + // === Results === + "metrics": { + "final_train_accuracy": 0.891, + "final_val_accuracy": 0.873, + "final_train_loss": 0.234, + "final_val_loss": 0.287, + "best_val_accuracy": 0.876, + "best_epoch": 87, + "total_epochs": 100, + "training_time_hours": 3.52 + }, + + // === Artifacts === + "artifacts": { + "discovery_time": "2024-01-15T14:45:00Z", + "files": [ + { + "path": "checkpoints/epoch_010.pth", + "size_bytes": 450000000, + "modified": "2024-01-15T11:30:00Z" + }, + { + "path": "checkpoints/best.pth", + "size_bytes": 450000000, + "modified": "2024-01-15T13:45:00Z" + } + ], + "total_size_bytes": 900000000 + } +} \ No newline at end of file diff --git a/docs/src/user-permissions.md b/docs/src/user-permissions.md index 828068e..df101e6 100644 --- a/docs/src/user-permissions.md +++ b/docs/src/user-permissions.md @@ -61,13 +61,16 @@ User roles and permissions are configured 
on the server side by administrators. ### Data Scientist Workflow ```bash # Submit your experiment -ml run my-experiment +ml queue my-experiment # Check your experiments (only shows yours) ml status # Cancel your own experiment ml cancel my-experiment + +# Requeue a previous run with different args +ml requeue <run_id> -- --epochs 20 ``` ### Administrator Workflow diff --git a/docs/src/validate.md b/docs/src/validate.md index 7e91dd9..136b39f 100644 --- a/docs/src/validate.md +++ b/docs/src/validate.md @@ -1,7 +1,6 @@ --- -layout: page title: "Validation (ml validate)" -permalink: /validate/ +url: "/validate/" --- # Validation (`ml validate`) diff --git a/docs/src/zig-cli.md b/docs/src/zig-cli.md index c2ab7af..b8fb241 100644 --- a/docs/src/zig-cli.md +++ b/docs/src/zig-cli.md @@ -1,8 +1,7 @@ --- -layout: page title: "Zig CLI Guide" -permalink: /zig-cli/ -nav_order: 3 +url: "/zig-cli/" +weight: 3 --- # Zig CLI Guide @@ -28,7 +27,7 @@ The CLI reads `~/.ml/config.toml` and respects `FETCH_ML_CLI_*` env vars: worker_host = "127.0.0.1" worker_user = "dev_user" worker_base = "/tmp/ml-experiments" -worker_port = 9101 +worker_port = 22 api_key = "your-api-key" ``` @@ -59,4 +58,4 @@ All use `zig build-exe` with `-OReleaseSmall -fstrip` and are compatible with Li ## CI/CD -The release workflow builds cross‑platform binaries and packages them with checksums. See `.github/workflows/release.yml`. +The release workflow builds cross‑platform binaries and packages them with checksums. See `.forgejo/workflows/release-mirror.yml`.