diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3a16116..792a430 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,13 +1,10 @@ name: Documentation on: + workflow_dispatch: push: branches: [ main ] - paths: [ 'docs/**', 'README.md' ] pull_request: - branches: [ main ] - paths: [ 'docs/**', 'README.md' ] - workflow_dispatch: permissions: contents: read @@ -26,16 +23,27 @@ jobs: uses: actions/checkout@v4 - name: Setup Pages + id: pages uses: actions/configure-pages@v3 - - name: Build with Jekyll - uses: actions/jekyll-build-pages@v1 + - name: Set up Go (for Hugo Modules) + uses: actions/setup-go@v5 with: - source: ./docs - destination: ./_site + go-version: '1.21' + + - name: Setup Hugo + uses: peaceiris/actions-hugo@v3 + with: + hugo-version: '0.125.7' + extended: true + + - name: Build with Hugo + run: hugo --source docs --minify --baseURL "${{ steps.pages.outputs.base_url }}/" - name: Upload artifact uses: actions/upload-pages-artifact@v2 + with: + path: docs/_site deploy: environment: diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index 31a47be..0000000 --- a/docs/_config.yml +++ /dev/null @@ -1,90 +0,0 @@ -# GitHub Pages configuration - -# Site settings -title: "Fetch ML Documentation" -description: "Secure Machine Learning Platform" -baseurl: "/fetch_ml" -url: "https://fetch-ml.github.io" - -# Build settings -markdown: kramdown -highlighter: rouge -theme: minima -plugins: - - jekyll-sitemap - - jekyll-feed - - jekyll-optional-front-matter - - jekyll-readme-index - - jekyll-titles-from-headings - - jekyll-seo-tag - -# Versioning -version: "1.0.0" -versions: - - "1.0.0" - - "0.9.0" -latest_version: "1.0.0" - -# Navigation -nav: - - title: "Getting Started" - subnav: - - title: "Quick Start" - url: "/quick-start/" - - title: "Guides" - subnav: - - title: "CLI Reference" - url: "/cli-reference/" - - title: "Architecture" - url: "/architecture/" - - title: "Server Setup" - url: 
"/server-setup/" - - title: "Development" - subnav: - - title: "Contributing" - url: "/contributing/" - - title: "API Reference" - url: "/api/" - - title: "Performance Monitoring" - url: "/performance-monitoring/" - -# Collections -collections: - docs: - output: true - permalink: /:collection/:name/ - api: - output: true - permalink: /api/:name/ - -# Exclude files from processing -exclude: - - Gemfile - - Gemfile.lock - - node_modules - - vendor - - .gitignore - - README.md - - Makefile - -# Include files -include: - - _pages - -# SEO -author: "Fetch ML Team" -twitter: - username: "fetch_ml" - card: "summary" - -# Google Analytics (optional) -google_analytics: "" - -# Mermaid diagrams for architecture -mermaid: - enabled: true - -# Code highlighting -kramdown: - input: GFM - syntax_highlighter: rouge diff --git a/docs/_site/404.html b/docs/_site/404.html deleted file mode 100644 index a07fa07..0000000 --- a/docs/_site/404.html +++ /dev/null @@ -1,1630 +0,0 @@ - - - -
- - - - - - - - - - - - - - - - - - - -Accepted
-We needed to choose a programming language for the Fetch ML API server that would provide: -- High performance for ML experiment management -- Strong concurrency support for handling multiple experiments -- Good ecosystem for HTTP APIs and WebSocket connections -- Easy deployment and containerization -- Strong type safety and reliability
-We chose Go as the primary language for the API server implementation.
-Pros: -- Rich ML ecosystem (TensorFlow, PyTorch, scikit-learn) -- Easy to learn and write -- Great for data science teams -- FastAPI provides good performance
-Cons: -- Global Interpreter Lock limits true parallelism -- Higher memory usage -- Slower performance for high-throughput scenarios -- More complex deployment (multiple files, dependencies)
-Pros: -- Excellent WebSocket support -- Large ecosystem -- Fast development cycle
-Cons: -- Single-threaded event loop can be limiting -- Not ideal for CPU-intensive ML operations -- Dynamic typing can lead to runtime errors
-Pros: -- Maximum performance and memory safety -- Strong type system -- Growing ecosystem
-Cons: -- Very steep learning curve -- Longer development time -- Smaller ecosystem for web frameworks
-Pros: -- Mature ecosystem -- Good performance -- Strong typing
-Cons: -- Higher memory usage -- More verbose syntax -- Slower startup time -- Heavier deployment footprint
-Go provides the best balance of performance, concurrency support, and deployment simplicity for our API server needs. The ability to handle many concurrent ML experiments efficiently with goroutines is a key advantage. The single binary deployment model also simplifies our containerization and distribution strategy.
- - - - - - - - - - - - - -Accepted
-For local development and testing, we needed a database solution that: -- Requires minimal setup and configuration -- Works well with Go's database drivers -- Supports the same SQL features as production databases -- Allows easy reset and recreation of test data -- Doesn't require external services running locally
-We chose SQLite as the default database for local development and testing environments.
-Pros: -- Production-grade database -- Excellent feature support -- Good Go driver support -- Consistent with production environment
-Cons: -- Requires external service installation and configuration -- Higher resource usage -- More complex setup for new developers -- Overkill for simple local development
-Pros: -- Popular and well-supported -- Good Go drivers available
-Cons: -- Requires external service -- More complex setup -- Different SQL dialect than PostgreSQL
-Pros: -- Very fast -- No persistence needed for some tests
-Cons: -- Limited query capabilities -- Not suitable for complex relational data -- Different data model than production
-Pros: -- Simple implementation -- No dependencies
-Cons: -- Limited query capabilities -- No transaction support -- Hard to scale to complex data needs
-SQLite provides the perfect balance of simplicity and functionality for local development. It requires zero setup - developers can just run the application and it works. The file-based nature makes it easy to reset test data by deleting the database file. While it differs from our production PostgreSQL database, it supports the same core SQL features needed for development and testing.
-The main limitation is single-writer access, but this is acceptable for local development where typically only one developer is working with the database at a time. For integration tests that need concurrent access, we can use PostgreSQL or Redis.
- - - - - - - - - - - - - -Accepted
-For the ML experiment job queue system, we needed a solution that: -- Provides reliable job queuing and distribution -- Supports multiple workers consuming jobs concurrently -- Offers persistence and durability -- Handles job priorities and retries -- Integrates well with our Go-based API server -- Can scale horizontally with multiple workers
-We chose Redis as the job queue backend using its list data structures and pub/sub capabilities.
-Pros: -- No additional infrastructure -- ACID transactions -- Complex queries and joins possible -- Integrated with primary database
-Cons: -- Higher latency for queue operations -- Database contention under high load -- More complex implementation for reliable polling -- Limited scalability for high-frequency operations
-Pros: -- Purpose-built message broker -- Advanced routing and filtering -- Built-in acknowledgments and retries -- Good clustering support
-Cons: -- More complex setup and configuration -- Higher resource requirements -- Steeper learning curve -- Overkill for simple queue needs
-Pros: -- Extremely high throughput -- Built-in partitioning and replication -- Good for event streaming
-Cons: -- Complex setup and operations -- Designed for streaming, not job queuing -- Higher latency for individual job processing -- More resource intensive
-Pros: -- Zero external dependencies -- Very fast -- Simple implementation
-Cons: -- No persistence (jobs lost on restart) -- Limited to single process -- No monitoring or observability -- Not suitable for distributed systems
-Redis provides the optimal balance of simplicity, performance, and reliability for our job queue needs. The list-based queue implementation (LPUSH/RPOP) is straightforward and highly performant. Redis's persistence options ensure jobs aren't lost during restarts, and the pub/sub capabilities enable real-time notifications for workers.
-The Go client library is excellent and provides connection pooling, automatic reconnection, and good error handling. Redis's low memory footprint and fast operations make it ideal for high-frequency job queuing scenarios common in ML workloads.
-While RabbitMQ offers more advanced features, Redis is sufficient for our current needs and much simpler to operate. The simple queue model also makes it easier to understand and debug when issues arise.
- - - - - - - - - - - - - -This directory contains Architecture Decision Records (ADRs) for the Fetch ML project.
-Architecture Decision Records are short text files, each documenting a single architectural decision. They capture the context, options considered, decision made, and consequences of that decision.
-Each ADR follows this structure:
-# ADR-XXX: [Title]
-
-## Status
-[Proposed | Accepted | Deprecated | Superseded]
-
-## Context
-[What is the issue that we're facing that needs a decision?]
-
-## Decision
-[What is the change that we're proposing and/or doing?]
-
-## Consequences
-[What becomes easier or more difficult to do because of this change?]
-
-## Options Considered
-[What other approaches did we consider and why did we reject them?]
| ADR | Title | Status |
|---|---|---|
| ADR-001 | Use Go for API Server | Accepted |
| ADR-002 | Use SQLite for Local Development | Accepted |
| ADR-003 | Use Redis for Job Queue | Accepted |
ADR-XXX-title.md where XXX is the next sequential numberThis document describes how API keys are issued and how team members should configure the ml CLI to use them.
The goal is to keep access easy for your homelab while treating API keys as sensitive secrets.
-ml CLI to authenticate to the FetchML API.There are two supported ways to receive your key:
-./scripts/create_bitwarden_fetchml_item.sh <username> <api_key> <api_key_hash>
-This script:
-FetchML API – <username>.Stores:
-<username><api_key> (the actual API key)api_key_hash: <api_key_hash>Share that item with the user in Bitwarden (for example, via a shared collection like FetchML).
Open Bitwarden and locate the item:
-Name: FetchML API – <your-name>
Copy the password field (this is your FetchML API key).
-Configure the CLI, e.g. in ~/.ml/config.toml:
api_key = "<paste-from-bitwarden>"
-worker_host = "localhost"
-worker_port = 9100
-api_url = "ws://localhost:9100/ws"
-ml status
-If the command works, your key and tunnel/config are correct.
-For users who do not use Bitwarden, a lightweight alternative is a direct one-to-one share.
-Share only the API key with the user via a direct channel you both trust, such as:
-Signal / WhatsApp direct message
-Short call/meeting where you read it to them
-Ask the user to:
-Paste the key into their local config.
-~/.ml/config.toml:api_key = "<your-api-key>"
-worker_host = "localhost"
-worker_port = 9100
-api_url = "ws://localhost:9100/ws"
-ml status
-ml queue my-training-job
-ml cancel my-training-job
-api_key_hash is as sensitive as the API key itself.Do not commit keys or hashes to Git or share them in screenshots or tickets.
-Rotation
-The admin will revoke the old key, generate a new one, and update Bitwarden or share a new key.
-Transport security
-api_url is typically ws://localhost:9100/ws when used through an SSH tunnel to the homelab.Following these steps keeps API access easy for the team while maintaining a reasonable security posture for a personal homelab deployment.
- - - - - - - - - - - - - -Simple, secure architecture for ML experiments in your homelab.
-graph TB
- subgraph "Homelab Stack"
- CLI[Zig CLI]
- API[HTTPS API]
- REDIS[Redis Cache]
- FS[Local Storage]
- end
-
- CLI --> API
- API --> REDIS
- API --> FS
-graph LR
- USER[User] --> AUTH[API Key Auth]
- AUTH --> RATE[Rate Limiting]
- RATE --> WHITELIST[IP Whitelist]
- WHITELIST --> API[Secure API]
- API --> AUDIT[Audit Logging]
-sequenceDiagram
- participant CLI
- participant API
- participant Redis
- participant Storage
-
- CLI->>API: HTTPS Request
- API->>API: Validate Auth
- API->>Redis: Cache/Queue
- API->>Storage: Experiment Data
- Storage->>API: Results
- API->>CLI: Response
-services:
- redis:
- image: redis:7-alpine
- ports: ["6379:6379"]
- volumes: [redis_data:/data]
-
- api-server:
- build: .
- ports: ["9101:9101"]
- depends_on: [redis]
-./setup.sh && ./manage.sh start
-data/
-├── experiments/ # ML experiment results
-├── cache/ # Temporary cache files
-└── backups/ # Local backups
-
-logs/
-├── app.log # Application logs
-├── audit.log # Security events
-└── access.log # API access logs
-Simple, lightweight monitoring: -- Health Checks: Service availability -- Log Files: Structured logging -- Basic Metrics: Request counts, error rates -- Security Events: Failed auth, rate limits
-graph TB
- subgraph "Client Layer"
- CLI[CLI Tools]
- TUI[Terminal UI]
- API[REST API]
- end
-
- subgraph "Authentication Layer"
- Auth[Authentication Service]
- RBAC[Role-Based Access Control]
- Perm[Permission Manager]
- end
-
- subgraph "Core Services"
- Worker[ML Worker Service]
- DataMgr[Data Manager Service]
- Queue[Job Queue]
- end
-
- subgraph "Storage Layer"
- Redis[(Redis Cache)]
- DB[(SQLite/PostgreSQL)]
- Files[File Storage]
- end
-
- subgraph "Container Runtime"
- Podman[Podman/Docker]
- Containers[ML Containers]
- end
-
- CLI --> Auth
- TUI --> Auth
- API --> Auth
-
- Auth --> RBAC
- RBAC --> Perm
-
- Worker --> Queue
- Worker --> DataMgr
- Worker --> Podman
-
- DataMgr --> DB
- DataMgr --> Files
-
- Queue --> Redis
-
- Podman --> Containers
-graph TB
- subgraph "Zig CLI Components"
- Main[main.zig] --> Commands[commands/]
- Commands --> Config[config.zig]
- Commands --> Utils[utils/]
- Commands --> Net[net/]
- Commands --> Errors[errors.zig]
-
- subgraph "Commands"
- Init[init.zig]
- Sync[sync.zig]
- Queue[queue.zig]
- Watch[watch.zig]
- Status[status.zig]
- Monitor[monitor.zig]
- Cancel[cancel.zig]
- Prune[prune.zig]
- end
-
- subgraph "Utils"
- Crypto[crypto.zig]
- Storage[storage.zig]
- Rsync[rsync.zig]
- end
-
- subgraph "Network"
- WS[ws.zig]
- end
- end
-graph LR
- subgraph "CLI Security"
- Config[Config File] --> Hash[SHA256 Hashing]
- Hash --> Auth[API Authentication]
- Auth --> SSH[SSH Transfer]
- SSH --> WS[WebSocket Security]
- end
-graph LR
- subgraph "Auth Flow"
- Client[Client] --> APIKey[API Key]
- APIKey --> Hash[Hash Validation]
- Hash --> Roles[Role Resolution]
- Roles --> Perms[Permission Check]
- Perms --> Access[Grant/Deny Access]
- end
-
- subgraph "Permission Sources"
- YAML[YAML Config]
- Inline[Inline Fallback]
- Roles --> YAML
- Roles --> Inline
- end
-Features: -- API key-based authentication -- Role-based access control (RBAC) -- YAML-based permission configuration -- Fallback to inline permissions -- Admin wildcard permissions
-graph TB
- subgraph "Worker Architecture"
- API[HTTP API] --> Router[Request Router]
- Router --> Auth[Auth Middleware]
- Auth --> Queue[Job Queue]
- Queue --> Processor[Job Processor]
- Processor --> Runtime[Container Runtime]
- Runtime --> Storage[Result Storage]
-
- subgraph "Job Lifecycle"
- Submit[Submit Job] --> Queue
- Queue --> Execute[Execute]
- Execute --> Monitor[Monitor]
- Monitor --> Complete[Complete]
- Complete --> Store[Store Results]
- end
- end
-Responsibilities: -- HTTP API for job submission -- Job queue management -- Container orchestration -- Result collection and storage -- Metrics and monitoring
-graph TB
- subgraph "Data Management"
- API[Data API] --> Storage[Storage Layer]
- Storage --> Metadata[Metadata DB]
- Storage --> Files[File System]
- Storage --> Cache[Redis Cache]
-
- subgraph "Data Operations"
- Upload[Upload Data] --> Validate[Validate]
- Validate --> Store[Store]
- Store --> Index[Index]
- Index --> Catalog[Catalog]
- end
- end
-Features: -- Data upload and validation -- Metadata management -- File system abstraction -- Caching layer -- Data catalog
-graph TB
- subgraph "TUI Architecture"
- UI[UI Components] --> Model[Data Model]
- Model --> Update[Update Loop]
- Update --> Render[Render]
-
- subgraph "UI Panels"
- Jobs[Job List]
- Details[Job Details]
- Logs[Log Viewer]
- Status[Status Bar]
- end
-
- UI --> Jobs
- UI --> Details
- UI --> Logs
- UI --> Status
- end
-Components: -- Bubble Tea framework -- Component-based architecture -- Real-time updates -- Keyboard navigation -- Theme support
-sequenceDiagram
- participant Client
- participant Auth
- participant Worker
- participant Queue
- participant Container
- participant Storage
-
- Client->>Auth: Submit job with API key
- Auth->>Client: Validate and return job ID
-
- Client->>Worker: Execute job request
- Worker->>Queue: Queue job
- Queue->>Worker: Job ready
- Worker->>Container: Start ML container
- Container->>Worker: Execute experiment
- Worker->>Storage: Store results
- Worker->>Client: Return results
-sequenceDiagram
- participant Client
- participant Auth
- participant PermMgr
- participant Config
-
- Client->>Auth: Request with API key
- Auth->>Auth: Validate key hash
- Auth->>PermMgr: Get user permissions
- PermMgr->>Config: Load YAML permissions
- Config->>PermMgr: Return permissions
- PermMgr->>Auth: Return resolved permissions
- Auth->>Client: Grant/deny access
-graph TB
- subgraph "Security Layers"
- Network[Network Security]
- Auth[Authentication]
- AuthZ[Authorization]
- Container[Container Security]
- Data[Data Protection]
- Audit[Audit Logging]
- end
-
- Network --> Auth
- Auth --> AuthZ
- AuthZ --> Container
- Container --> Data
- Data --> Audit
-Security Features: -- API key authentication -- Role-based permissions -- Container isolation -- File system sandboxing -- Comprehensive audit logs -- Input validation and sanitization
-graph TB
- subgraph "Container Isolation"
- Host[Host System]
- Podman[Podman Runtime]
- Network[Network Isolation]
- FS[File System Isolation]
- User[User Namespaces]
- ML[ML Container]
-
- Host --> Podman
- Podman --> Network
- Podman --> FS
- Podman --> User
- User --> ML
- end
-Isolation Features: -- Rootless containers -- Network isolation -- File system sandboxing -- User namespace mapping -- Resource limits
-graph TB
- subgraph "Config Sources"
- Env[Environment Variables]
- File[Config Files]
- CLI[CLI Flags]
- Defaults[Default Values]
- end
-
- subgraph "Config Processing"
- Merge[Config Merger]
- Validate[Schema Validator]
- Apply[Config Applier]
- end
-
- Env --> Merge
- File --> Merge
- CLI --> Merge
- Defaults --> Merge
-
- Merge --> Validate
- Validate --> Apply
-Configuration Priority: -1. CLI flags (highest) -2. Environment variables -3. Configuration files -4. Default values (lowest)
-graph TB
- subgraph "Scaled Architecture"
- LB[Load Balancer]
- W1[Worker 1]
- W2[Worker 2]
- W3[Worker N]
- Redis[Redis Cluster]
- Storage[Shared Storage]
-
- LB --> W1
- LB --> W2
- LB --> W3
-
- W1 --> Redis
- W2 --> Redis
- W3 --> Redis
-
- W1 --> Storage
- W2 --> Storage
- W3 --> Storage
- end
-Scaling Features: -- Stateless worker services -- Shared job queue (Redis) -- Distributed storage -- Load balancer ready -- Health checks and monitoring
| Component | Technology | Purpose |
|---|---|---|
| Language | Go 1.25+ | Core application |
| Web Framework | Standard library | HTTP server |
| Authentication | Custom | API key + RBAC |
| Database | SQLite/PostgreSQL | Metadata storage |
| Cache | Redis | Job queue & caching |
| Containers | Podman/Docker | Job isolation |
| UI Framework | Bubble Tea | Terminal UI |
// Core dependencies
-require (
- github.com/charmbracelet/bubbletea v1.3.10 // TUI framework
- github.com/go-redis/redis/v8 v8.11.5 // Redis client
- github.com/google/uuid v1.6.0 // UUID generation
- github.com/mattn/go-sqlite3 v1.14.32 // SQLite driver
- golang.org/x/crypto v0.45.0 // Crypto utilities
- gopkg.in/yaml.v3 v3.0.1 // YAML parsing
-)
-fetch_ml/
-├── cmd/ # CLI applications
-│ ├── worker/ # ML worker service
-│ ├── tui/ # Terminal UI
-│ ├── data_manager/ # Data management
-│ └── user_manager/ # User management
-├── internal/ # Internal packages
-│ ├── auth/ # Authentication system
-│ ├── config/ # Configuration management
-│ ├── container/ # Container operations
-│ ├── database/ # Database operations
-│ ├── logging/ # Logging utilities
-│ ├── metrics/ # Metrics collection
-│ └── network/ # Network utilities
-├── configs/ # Configuration files
-├── scripts/ # Setup and utility scripts
-├── tests/ # Test suites
-└── docs/ # Documentation
-graph TB
- subgraph "Application Layer"
- Worker[cmd/worker]
- TUI[cmd/tui]
- DataMgr[cmd/data_manager]
- UserMgr[cmd/user_manager]
- end
-
- subgraph "Service Layer"
- Auth[internal/auth]
- Config[internal/config]
- Container[internal/container]
- Database[internal/database]
- end
-
- subgraph "Utility Layer"
- Logging[internal/logging]
- Metrics[internal/metrics]
- Network[internal/network]
- end
-
- Worker --> Auth
- Worker --> Config
- Worker --> Container
- TUI --> Auth
- DataMgr --> Database
- UserMgr --> Auth
-
- Auth --> Logging
- Container --> Network
- Database --> Metrics
-graph TB
- subgraph "Metrics Pipeline"
- App[Application] --> Metrics[Metrics Collector]
- Metrics --> Export[Prometheus Exporter]
- Export --> Prometheus[Prometheus Server]
- Prometheus --> Grafana[Grafana Dashboard]
-
- subgraph "Metric Types"
- Counter[Counters]
- Gauge[Gauges]
- Histogram[Histograms]
- Timer[Timers]
- end
-
- App --> Counter
- App --> Gauge
- App --> Histogram
- App --> Timer
- end
-graph TB
- subgraph "Logging Pipeline"
- App[Application] --> Logger[Structured Logger]
- Logger --> File[File Output]
- Logger --> Console[Console Output]
- Logger --> Syslog[Syslog Forwarder]
- Syslog --> Aggregator[Log Aggregator]
- Aggregator --> Storage[Log Storage]
- Storage --> Viewer[Log Viewer]
- end
-graph TB
- subgraph "Deployment Stack"
- Image[Container Image]
- Registry[Container Registry]
- Orchestrator[Docker Compose]
- Config[ConfigMaps/Secrets]
- Storage[Persistent Storage]
-
- Image --> Registry
- Registry --> Orchestrator
- Config --> Orchestrator
- Storage --> Orchestrator
- end
-graph TB
- subgraph "Service Mesh"
- Gateway[API Gateway]
- Discovery[Service Discovery]
- Worker[Worker Service]
- Data[Data Service]
- Redis[Redis Cluster]
-
- Gateway --> Discovery
- Discovery --> Worker
- Discovery --> Data
- Discovery --> Redis
- end
-This architecture provides a solid foundation for secure, scalable machine learning experiments while maintaining simplicity and developer productivity.
- - - - - - - - - - - - - - - - -0&&i[i.length-1])&&(p[0]===6||p[0]===2)){r=0;continue}if(p[0]===3&&(!i||p[1]>i[0]&&p[1]=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function K(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],s;try{for(;(t===void 0||t-- >0)&&!(n=o.next()).done;)i.push(n.value)}catch(a){s={error:a}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(s)throw s.error}}return i}function B(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o