From d673bce2160242356ccfccb6bbf5e6c8341274c6 Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Mon, 16 Feb 2026 20:37:38 -0500 Subject: [PATCH] docs: fix mermaid graphs and update outdated content - Fix mermaid graph syntax errors (escape parentheses in node labels) - Move mermaid-init.js to Hugo static directory for correct MIME type - Update Future Extensions section in cli-tui-ux-contract-v1.md to match current roadmap - Add ADR-004 through ADR-007 documenting C++ native optimization strategy --- docs/layouts/partials/docs/inject/head.html | 2 + ...e-cpp-for-selective-native-optimization.md | 92 +++++++++++ ...5-batch-first-apis-for-cgo-amortization.md | 123 ++++++++++++++ .../src/adr/ADR-006-runtime-simd-detection.md | 122 ++++++++++++++ .../adr/ADR-007-simplified-streaming-io.md | 154 ++++++++++++++++++ docs/src/adr/README.md | 4 + docs/src/architecture.md | 2 +- docs/src/cli-tui-ux-contract-v1.md | 28 +++- docs/static/js/mermaid-init.js | 84 ++++++++++ 9 files changed, 606 insertions(+), 5 deletions(-) create mode 100644 docs/layouts/partials/docs/inject/head.html create mode 100644 docs/src/adr/ADR-004-use-cpp-for-selective-native-optimization.md create mode 100644 docs/src/adr/ADR-005-batch-first-apis-for-cgo-amortization.md create mode 100644 docs/src/adr/ADR-006-runtime-simd-detection.md create mode 100644 docs/src/adr/ADR-007-simplified-streaming-io.md create mode 100644 docs/static/js/mermaid-init.js diff --git a/docs/layouts/partials/docs/inject/head.html b/docs/layouts/partials/docs/inject/head.html new file mode 100644 index 0000000..13fa818 --- /dev/null +++ b/docs/layouts/partials/docs/inject/head.html @@ -0,0 +1,2 @@ +{{/* Custom head partial for Hugo Book theme - loads Mermaid initialization */}} + diff --git a/docs/src/adr/ADR-004-use-cpp-for-selective-native-optimization.md b/docs/src/adr/ADR-004-use-cpp-for-selective-native-optimization.md new file mode 100644 index 0000000..028fe78 --- /dev/null +++ 
b/docs/src/adr/ADR-004-use-cpp-for-selective-native-optimization.md @@ -0,0 +1,92 @@ +# ADR-004: Use C++ for Selective Native Optimization + +## Status +Proposed + +## Context +Profiling identified ~80 functions in the Go codebase. Of these, only 4 functions showed significant syscall overhead that could benefit from native optimization: + +| Function | Syscall Reduction | CGo Overhead Risk | +|----------|-------------------|-------------------| +| queue_index | 96% | Low (batch operations) | +| dataset_hash | 78% | High (per-file calls) | +| artifact_scanner | 87% | Low (single scan) | +| streaming_io | 95% | N/A (streaming) | + +The key question is whether to implement all 4 libraries or focus on the highest ROI ones first, and how to manage the CGo overhead risk. + +## Decision +We will implement **selective C++ optimization** for 4 specific libraries, prioritized by ROI (syscall reduction × operational simplicity). + +**Phase 1: queue_index** (Highest ROI) +- 96% syscall reduction +- Natural batch operations minimize CGo overhead +- Heap-based priority queue in C++ + +**Phase 2: dataset_hash** +- Requires batching layer to amortize CGo overhead +- Single-file interface explicitly removed +- Only batch and combined interfaces exposed + +**Phase 3: artifact_scanner** +- 87% syscall reduction +- Single scan operation (low CGo overhead risk) + +**Phase 4: streaming_io** +- Simplified implementation: mmap + thread pool only +- **Explicitly excludes io_uring** (complexity exceeds benefit) + +## Consequences + +### Positive +- Significant syscall reduction (78-96%) for hot paths +- Maintains Go for business logic (fast iteration) +- Gradual implementation reduces risk +- Batch-first design amortizes CGo overhead + +### Negative +- CGo boundary adds ~100ns per call +- Cross-platform complexity (Intel vs ARM SIMD) +- Additional build complexity (CMake, C++ toolchain) +- Runtime CPU feature detection required +- Two implementations to maintain (Go + C++) + +## Options 
Considered + +### Option 1: Pure Go Only +**Pros:** Simple build, single codebase, no CGo complexity +**Cons:** Cannot achieve syscall reduction for file-heavy operations +**Verdict:** Rejected - syscall overhead is the bottleneck + +### Option 2: All Functions in C++ +**Pros:** Maximum performance +**Cons:** Most functions don't benefit; adds CGo overhead where not needed +**Verdict:** Rejected - 4/80 functions show measurable benefit + +### Option 3: Selective C++ (Chosen) +**Pros:** Focus on measurable wins, manageable complexity +**Cons:** Need discipline to avoid CGo creep +**Verdict:** Accepted - data-driven approach + +## Risk Mitigations + +### CGo Overhead Risk +- **Rule:** Native library ONLY used via batch interface +- **Enforcement:** Code review + benchmarks fail if CGo > 1% of runtime +- **Fallback:** Go implementation always available via build tags + +### SIMD Cross-Platform Risk +- Runtime CPU feature detection (never assume) +- CI tests on both Intel and Apple Silicon +- Generic C++ fallback always compiled + +### Zero-Allocation Discipline +- Hot path uses bump allocator with asserts +- ASan-enabled CI catches leaks +- Code review: any malloc/new in hot path requires justification + +## Rationale + +The 96% syscall reduction for queue_index alone justifies the C++ approach. The key discipline is **selectivity**: only functions with >75% syscall reduction and low CGo overhead risk are candidates. The simplified streaming_io (no io_uring) keeps operational complexity manageable while retaining 95% of the benefit. + +The batch-first API design is mandatory, not optional. Single-file CGo calls are slower than Go stdlib due to boundary crossing overhead. 
diff --git a/docs/src/adr/ADR-005-batch-first-apis-for-cgo-amortization.md b/docs/src/adr/ADR-005-batch-first-apis-for-cgo-amortization.md new file mode 100644 index 0000000..f7b632b --- /dev/null +++ b/docs/src/adr/ADR-005-batch-first-apis-for-cgo-amortization.md @@ -0,0 +1,123 @@ +# ADR-005: Use Batch-First APIs for CGo Overhead Amortization + +## Status +Proposed + +## Context +CGo has a boundary crossing overhead of approximately 100ns per call. If native functions are called individually for many small files, the CGo overhead negates syscall gains. This is particularly problematic for `dataset_hash`, which historically operated on a per-file basis. + +The naive approach: +```go +// SLOW: CGo overhead dominates +for _, file := range files { + hash := C.fh_hash_file(file) // 100ns + syscall +} +``` + +The optimized approach must batch operations to amortize the 100ns overhead across many files. + +## Decision +All C++ native libraries MUST expose **batch-first APIs**. Single-function interfaces are explicitly prohibited for high-call-frequency operations. + +**Required API Pattern:** +```go +// Batch hash directory contents +// Single CGo call for entire directory +func dirOverallSHA256HexNative(root string) (string, error) { + croot := C.CString(root) + defer C.free(unsafe.Pointer(croot)) + + result := C.fh_hash_directory_combined(croot, 0) // 0 = auto threads + if result == nil { + return "", errors.New("hash failed") + } + defer C.fh_free_string(result) + + return C.GoString(result), nil +} +``` + +**C++ Interface Design:** +```c +// dataset_hash.h - REVISED (batch only) +// ALWAYS use batch interface. Single-file interface removed. 
+ +// Batch hash directory contents +// out_hashes: pre-allocated array of 65-char buffers +int fh_hash_directory_batch( + const char* dir_path, + char** out_hashes, + char** out_paths, + uint32_t max_results, + uint32_t* out_count, + uint32_t num_threads +); + +// Simple combined hash (single CGo call) +char* fh_hash_directory_combined(const char* dir_path, uint32_t num_threads); +``` + +## Consequences + +### Positive +- CGo overhead amortized across N files (100ns total vs 100ns × N) +- Single CGo call per operation simplifies error handling +- Forces coarse-grained operations that match Go concurrency model + +### Negative +- API less granular than pure Go equivalent +- Caller must batch requests or use Go fallback +- Memory management for pre-allocated buffers + +## Options Considered + +### Option 1: Single-File Native with Caching +**Pros:** Simple API, matches Go stdlib +**Cons:** CGo overhead > syscall win for small files +**Verdict:** Rejected - overhead dominates + +### Option 2: Automatic Batching Layer in Go +**Pros:** Transparent to caller +**Cons:** Complexity, unpredictable latency, buffering issues +**Verdict:** Rejected - explicit is better than implicit + +### Option 3: Batch-First APIs (Chosen) +**Pros:** Predictable performance, caller controls batching +**Cons:** Requires caller awareness +**Verdict:** Accepted - performance-critical paths use batch explicitly + +## Enforcement + +### Build-Time +- Single-file C functions removed from headers +- `native_bridge.go` only exposes batch interfaces + +### Run-Time +- Go fallback for small batches (<10 files or <1MB total) +- Native only used when batch size justifies overhead + +```go +func hashFilesBatch(paths []string) ([]string, error) { + if !UseNativeLibs || len(paths) == 0 { + return hashFilesBatchGo(paths) + } + + // Only use native for batches >10 files or total size >1MB + totalSize := estimateSize(paths) + if len(paths) < 10 && totalSize < 1024*1024 { + return hashFilesBatchGo(paths) + } 
+ + return hashFilesBatchNative(paths) +} +``` + +### CI Enforcement +- Benchmarks fail if CGo > 1% of total runtime +- Profile-guided check: `go test -cpuprofile` analysis + +## Rationale + +The 100ns CGo overhead is fixed and unavoidable. The only solution is to make each crossing do more work. Batch-first APIs ensure that syscall wins (78-96% reduction) exceed CGo costs by operating on coarse-grained units (directories, not files; task arrays, not individual tasks). + +This is a trade-off: less granular APIs for predictable performance. The batch constraint is documented and enforced at API boundary, not hidden in implementation. diff --git a/docs/src/adr/ADR-006-runtime-simd-detection.md b/docs/src/adr/ADR-006-runtime-simd-detection.md new file mode 100644 index 0000000..9889ca9 --- /dev/null +++ b/docs/src/adr/ADR-006-runtime-simd-detection.md @@ -0,0 +1,122 @@ +# ADR-006: Use Runtime SIMD Detection for Cross-Platform Correctness + +## Status +Proposed + +## Context +The C++ native libraries use SIMD instructions for performance (SHA-NI on Intel, ARMv8 crypto extensions on Apple Silicon). However, macOS universal binaries support both x86_64 and arm64, and not all CPUs support the same extensions. Compile-time detection (e.g., `#ifdef __AVX__`) is insufficient because: + +1. Universal binaries must compile for the lowest common denominator +2. Runtime CPU detection is required for correct operation on heterogeneous hardware +3. Silent failures or illegal instruction crashes occur if SIMD is assumed + +## Decision +Use **runtime CPU feature detection** with compile-time guards. Function pointers are resolved at library initialization based on detected CPU capabilities. 
+ +**Implementation Pattern:** +```cpp +// sha256_simd.cpp +#include +#include + +enum class Sha256Impl { + GENERIC, // Pure C++ (fallback) + SHA_NI, // Intel SHA-NI (x86_64) + ARMV8_CRYPTO // ARMv8 crypto extensions +}; + +// Runtime detection (called once at init) +Sha256Impl detect_best_impl() { +#if defined(__aarch64__) + // Apple Silicon - always has ARMv8 crypto + return Sha256Impl::ARMV8_CRYPTO; +#elif defined(__x86_64__) + unsigned int eax, ebx, ecx, edx; + __get_cpuid(7, &eax, &ebx, &ecx, &edx); + if (ebx & bit_SHA) { // SHA-NI bit + return Sha256Impl::SHA_NI; + } + return Sha256Impl::GENERIC; +#else + return Sha256Impl::GENERIC; +#endif +} + +// Function pointer set at library init +void (*sha256_block_fn)(uint32_t* state, const uint8_t* data, size_t blocks) = nullptr; + +extern "C" void fh_init_impl() { + auto impl = detect_best_impl(); + switch (impl) { + case Sha256Impl::SHA_NI: sha256_block_fn = sha256_block_sha_ni; break; + case Sha256Impl::ARMV8_CRYPTO: sha256_block_fn = sha256_block_armv8; break; + default: sha256_block_fn = sha256_block_generic; break; + } +} +``` + +## Consequences + +### Positive +- Single binary works on all supported platforms +- Graceful degradation to generic implementation +- No runtime crashes from illegal instructions +- Apple Silicon and Intel Macs supported equally + +### Negative +- Slight initialization overhead (one-time, negligible) +- Function pointer indirection in hot path +- More complex build (multiple SIMD variants compiled) +- Larger binary (generic + SIMD code paths) + +## Options Considered + +### Option 1: Compile-Time Detection Only +**Pros:** Simple, no runtime overhead +**Cons:** Universal binary fails on one architecture or the other +**Verdict:** Rejected - macOS requires universal binaries + +### Option 2: Separate Binaries per Architecture +**Pros:** Maximum performance, simple code +**Cons:** Complex distribution, user must choose correct binary +**Verdict:** Rejected - poor user experience + +### 
Option 3: Runtime Detection with Function Pointers (Chosen) +**Pros:** Single binary, correct on all platforms +**Cons:** Slight indirection overhead +**Verdict:** Accepted - correctness over micro-optimization + +## Enforcement + +### Build Requirements +- Compile with `-march=x86-64-v2` (baseline) + separate SHA-NI object +- Compile with `-march=armv8-a` (baseline) + separate crypto object +- Linker combines all variants into single binary + +### CI Strategy +```yaml +# Build and test on both architectures +jobs: + test-intel: + runs-on: macos-latest # Intel + steps: + - build + - test -bench=. ./tests/benchmarks/ + + test-arm: + runs-on: macos-latest # Apple Silicon + steps: + - build + - test -bench=. ./tests/benchmarks/ +``` + +### Correctness Verification +- Same hashes produced on Intel and ARM +- Benchmarks verify SIMD path is faster than generic +- Unit tests run on both architectures in CI + +## Rationale + +Runtime detection is mandatory for macOS universal binaries. The function pointer indirection cost (~1-2 cycles) is negligible compared to the SIMD speedup (2-4x for SHA-256). The generic fallback ensures the code never crashes, even on unexpected hardware. + +The key discipline: **never assume CPU features**. Detection happens once at init, then the optimal path is used for all subsequent operations. diff --git a/docs/src/adr/ADR-007-simplified-streaming-io.md b/docs/src/adr/ADR-007-simplified-streaming-io.md new file mode 100644 index 0000000..2b93f15 --- /dev/null +++ b/docs/src/adr/ADR-007-simplified-streaming-io.md @@ -0,0 +1,154 @@ +# ADR-007: Simplified Streaming I/O Without io_uring + +## Status +Proposed + +## Context +The streaming_io library handles large artifact uploads/downloads with gzip/tar decompression. Initial plans included io_uring for asynchronous I/O, which promised maximum performance. 
However, io_uring adds significant operational complexity: + +- Kernel version requirements (Linux 5.10+) +- Container compatibility issues (Docker/Podman restrictions) +- Security surface area (ring buffer attacks) +- Fallback complexity (must handle unsupported kernels) +- macOS incompatibility (io_uring is Linux-only) + +The 95% syscall reduction from streaming_io primarily comes from: +1. Memory-mapped I/O (mmap) for reading archive headers +2. Thread pool for parallel gzip decompression +3. Direct I/O (O_DIRECT) for large writes + +io_uring adds only marginal benefit beyond these techniques. + +## Decision +Use **simplified streaming I/O** without io_uring. The implementation uses: + +1. **mmap** for archive header inspection (gzip/tar structure) +2. **Thread pool** for parallel decompression of independent gzip blocks +3. **Direct I/O (O_DIRECT)** for large file writes (portable, simple fallback to buffered) +4. **Standard pread/pwrite** for random access (no io_uring) + +**Revised Implementation:** +```cpp +// streaming_io.h - REVISED (no io_uring) + +struct streaming_ctx { + // Memory mapping + void* mmap_base; + size_t mmap_size; + + // Thread pool + ThreadPool thread_pool{4}; // Configurable + + // Direct I/O for large writes + bool use_direct_io; + int output_fd; +}; + +// Main entry point - single CGo call +extern "C" int extract_tar_gz( + const char* archive_path, + const char* output_dir, + uint32_t num_threads +); +``` + +**Thread Pool Decompression:** +```cpp +void decompress_blocks_parallel( + const std::vector& blocks, + const std::string& output_dir +) { + // Each block independent - perfect for thread pool + thread_pool.parallel_for(blocks.size(), [&](size_t i) { + auto decompressed = gzip_decompress(blocks[i].data); + direct_write(output_dir + "/" + blocks[i].filename, decompressed); + }); +} +``` + +## Consequences + +### Positive +- Works on all platforms (Linux, macOS, containers) +- No kernel version dependencies +- Simpler code (no ring 
buffer management) +- Easier debugging (standard syscalls) +- 95% of syscall win retained + +### Negative +- ~5% performance loss vs io_uring for bulk I/O +- Thread pool overhead for small archives +- O_DIRECT alignment requirements (platform-specific) + +## Options Considered + +### Option 1: Full io_uring Implementation +**Pros:** Maximum performance on modern Linux +**Cons:** Container issues, kernel requirements, macOS incompatible, security surface +**Verdict:** Rejected - complexity exceeds benefit + +### Option 2: Hybrid (io_uring when available, fallback otherwise) +**Pros:** Best of both worlds +**Cons:** Double the code, double the testing, runtime detection complexity +**Verdict:** Rejected - fallback paths often have bugs + +### Option 3: Simplified (mmap + thread pool) (Chosen) +**Pros:** 95% of benefit, works everywhere +**Cons:** Slightly lower peak performance +**Verdict:** Accepted - pragmatic performance + +## Performance Comparison + +| Approach | Syscall Reduction | Complexity | Portability | +|----------|-------------------|------------|-------------| +| Go stdlib | Baseline | Low | Universal | +| Simplified C++ | 95% | Medium | Universal | +| io_uring C++ | 98% | High | Linux 5.10+ only | + +**3% difference not worth the complexity cost.** + +## Enforcement + +### Code Review Checklist +- [ ] No io_uring headers included +- [ ] Standard POSIX I/O only (pread/pwrite, mmap) +- [ ] Thread pool size configurable +- [ ] O_DIRECT with automatic fallback + +### Testing Requirements +- CI on Linux (containerized and native) +- CI on macOS (Intel and ARM) +- Large file tests (>1GB archives) +- Concurrent extraction tests + +### Benchmarks +```bash +# Verify 95% syscall reduction +make benchmark-streaming-io + +# Compare with Go implementation +go test -bench=BenchmarkStreaming -benchmem ./tests/benchmarks/ +``` + +## Rationale + +The 95/5 rule applies: 95% of the syscall win comes from parallel decompression and mmap, only 5% from io_uring. 
The operational cost of io_uring (kernel requirements, container restrictions, macOS incompatibility) exceeds the marginal benefit. + +The simplified approach is **correct first, fast second**. A universal 95% win beats a fragmented 98% win. + +## Future Considerations + +If kernel requirements change (e.g., minimum becomes Linux 6.0+), io_uring can be reconsidered. The simplified architecture is forward-compatible: io_uring can be added as an optional backend without changing the Go API. + +```cpp +// Future: pluggable backend +class IOBackend { + virtual ssize_t read(int fd, void* buf, size_t count, off_t offset) = 0; + virtual ssize_t write(int fd, const void* buf, size_t count, off_t offset) = 0; +}; + +class PosixBackend : public IOBackend { /* current impl */ }; +// class UringBackend : public IOBackend { /* future */ }; +``` + +Until then, the simplified implementation is the correct choice. diff --git a/docs/src/adr/README.md b/docs/src/adr/README.md index 2f6abe6..19c6e18 100644 --- a/docs/src/adr/README.md +++ b/docs/src/adr/README.md @@ -36,6 +36,10 @@ Each ADR follows this structure: | ADR-001 | Use Go for API Server | Accepted | | ADR-002 | Use SQLite for Local Development | Accepted | | ADR-003 | Use Redis for Job Queue | Accepted | +| ADR-004 | Use C++ for Selective Native Optimization | Proposed | +| ADR-005 | Use Batch-First APIs for CGo Amortization | Proposed | +| ADR-006 | Use Runtime SIMD Detection | Proposed | +| ADR-007 | Simplified Streaming I/O Without io_uring | Proposed | ## How to Add a New ADR diff --git a/docs/src/architecture.md b/docs/src/architecture.md index a2940e2..7e4fbcc 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -14,7 +14,7 @@ Simple, secure architecture for ML experiments in your homelab. 
graph TB subgraph "Homelab Stack" CLI[Zig CLI] - API[API Server (HTTPS + WebSocket)] + API["API Server (HTTPS + WebSocket)"] REDIS[Redis Cache] DB[(SQLite/PostgreSQL)] FS[Local Storage] diff --git a/docs/src/cli-tui-ux-contract-v1.md b/docs/src/cli-tui-ux-contract-v1.md index 014413d..33b7288 100644 --- a/docs/src/cli-tui-ux-contract-v1.md +++ b/docs/src/cli-tui-ux-contract-v1.md @@ -329,9 +329,29 @@ ml status $JOB_ID The v1 contract is intentionally minimal but designed for extension: -- **v1.1**: Add job dependencies and workflows -- **v1.2**: Add experiment templates and scaffolding -- **v1.3**: Add distributed execution across multiple workers -- **v2.0**: Add advanced scheduling and resource optimization +### Phase 0: Trust and usability (highest priority) + +1. **Make `ml status` excellent** - Compact summary with queue counts, relevant tasks, prewarm state +2. **Add `ml explain`** - Dry-run preview command showing resolved execution plan +3. **Tighten run manifest completeness** - Require timestamps, exit codes, dataset identities +4. **Dataset identity** - Structured `dataset_specs` with checksums (strict-by-default) + +### Phase 1: Simple performance wins + +- Keep prewarming single-level (next task only) +- Improve observability first (status output + metrics) + +### Phase 2+: Research workflows + +- `ml compare `: Manifest-driven diff of provenance +- `ml reproduce `: Submit task from recorded manifest +- `ml export `: Package provenance + artifacts + +### Phase 3: Infrastructure (only if needed) + +- Multi-level prewarming, predictive scheduling +- Optional S3-compatible storage (MinIO) +- Optional integrations (MLflow, W&B) +- Optional Kubernetes deployment All extensions will maintain backward compatibility with the v1 contract. 
// Initialize Mermaid for the Hugo Book theme.
// Loaded via Hugo's custom head partial; renders every `.mermaid` element
// on the page, supporting both the promise-based (Mermaid v10+) and the
// synchronous (v9 and earlier) forms of mermaid.render().

(function () {
  "use strict";

  // No-op outside a browser (e.g. if this file is ever executed during
  // static analysis or tests): everything below needs the DOM.
  if (typeof document === "undefined" || typeof window === "undefined") {
    return;
  }

  var initialized = false;

  // Display a render error inside `node`.
  // Built with createElement/textContent rather than an innerHTML string so
  // that markup-like characters in err.message are shown literally instead
  // of being parsed as HTML.
  function showError(node, err) {
    var pre = document.createElement("pre");
    pre.textContent =
      "Error: " + (err && err.message ? err.message : String(err));
    node.innerHTML = "";
    node.appendChild(pre);
  }

  // Render a single `.mermaid` element in place, replacing its text
  // definition with the generated SVG (or an error / raw-text fallback).
  function renderNode(node) {
    var graphDefinition = node.textContent;
    node.innerHTML = "";
    // Clear the flag Mermaid uses to mark already-processed nodes so a
    // re-render is possible.
    node.removeAttribute("data-processed");

    if (typeof window.mermaid.render === "function") {
      try {
        // mermaid.render() needs a unique element id for its temporary SVG.
        // slice() replaces the deprecated String.prototype.substr().
        var id = "mermaid-" + Math.random().toString(36).slice(2, 11);
        var result = window.mermaid.render(id, graphDefinition);

        // Mermaid v10+: render() returns a promise resolving to { svg }.
        if (result && typeof result.then === "function") {
          result
            .then(function (out) {
              node.innerHTML = out && out.svg ? out.svg : "";
            })
            .catch(function (err) {
              showError(node, err);
            });
          return;
        }

        // Mermaid v9: render() may return an object carrying the SVG...
        if (result && result.svg) {
          node.innerHTML = result.svg;
          return;
        }

        // ...or the raw SVG markup directly as a string.
        if (typeof result === "string") {
          node.innerHTML = result;
          return;
        }
      } catch (err) {
        showError(node, err);
        return;
      }
    }

    // Fallback: no usable render API. Restore the original definition via
    // textContent (it was read via textContent above), so characters such
    // as `<` and `&` round-trip exactly instead of being parsed as HTML.
    node.textContent = graphDefinition;
  }

  // Initialize Mermaid once, then render all `.mermaid` nodes on the page.
  // The mermaid script is loaded asynchronously, so poll every 100ms until
  // it is available.
  function renderMermaid() {
    if (typeof window.mermaid === "undefined") {
      setTimeout(renderMermaid, 100);
      return;
    }

    if (!initialized) {
      window.mermaid.initialize({
        startOnLoad: false,
        securityLevel: "loose",
        theme: "default",
      });
      initialized = true;
    }

    var nodes = document.querySelectorAll(".mermaid");
    if (!nodes || nodes.length === 0) {
      return;
    }

    for (var i = 0; i < nodes.length; i++) {
      renderNode(nodes[i]);
    }
  }

  // Initial load: wait for the DOM if it is still parsing; otherwise defer
  // slightly so an async-loaded mermaid script has a chance to attach.
  if (document.readyState === "loading") {
    document.addEventListener("DOMContentLoaded", renderMermaid);
  } else {
    setTimeout(renderMermaid, 100);
  }
})();