From 94112f0af546a30314592f885db497438bb0b903 Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Mon, 5 Jan 2026 12:34:23 -0500 Subject: [PATCH] ci: align workflows, build scripts, and docs with current architecture --- .github/workflows/benchmark-metrics.yml | 12 +- .github/workflows/ci.yml | 24 +-- .github/workflows/label.yml | 3 +- .github/workflows/license-check.yml | 10 +- .github/workflows/release.yml | 36 ++-- .golangci.yml | 134 ++++++++------ DEVELOPMENT.md | 9 +- LICENSE | 89 ++++++--- Makefile | 230 +++++++++++++----------- README.md | 100 ++++++++++- SECURITY.md | 10 +- build/docker/api-server.Dockerfile | 14 +- build/docker/full-prod.Dockerfile | 9 +- build/docker/homelab-secure.Dockerfile | 8 +- build/docker/secure-prod.Dockerfile | 11 +- build/docker/simple.Dockerfile | 19 +- build/docker/test.Dockerfile | 19 +- cmd/configlint/main.go | 40 +++-- cmd/data_manager/data_manager_config.go | 8 +- cmd/data_manager/data_sync.go | 204 +++++++++++++++------ cmd/db-utils/init_multi_user.go | 2 +- 21 files changed, 649 insertions(+), 342 deletions(-) diff --git a/.github/workflows/benchmark-metrics.yml b/.github/workflows/benchmark-metrics.yml index b6d7e45..bf5b1e3 100644 --- a/.github/workflows/benchmark-metrics.yml +++ b/.github/workflows/benchmark-metrics.yml @@ -1,12 +1,6 @@ name: Benchmark Metrics on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main ] - schedule: - - cron: '0 6 * * *' # Daily at 6 AM UTC workflow_dispatch: jobs: @@ -65,12 +59,14 @@ jobs: done < clean_benchmarks.txt - name: Push to Prometheus Pushgateway + env: + PROMETHEUS_PUSHGATEWAY_URL: ${{ secrets['PROMETHEUS_PUSHGATEWAY_URL'] }} run: | # Push metrics to Prometheus Pushgateway (if configured) - if [ -n "${{ secrets.PROMETHEUS_PUSHGATEWAY_URL }}" ]; then + if [ -n "$PROMETHEUS_PUSHGATEWAY_URL" ]; then echo "Pushing metrics to Prometheus..." curl --data-binary @prometheus_metrics.txt \ - "${{ secrets.PROMETHEUS_PUSHGATEWAY_URL }}/metrics/job/benchmark/instance/${{ github.run_id }}" + "$PROMETHEUS_PUSHGATEWAY_URL/metrics/job/benchmark/instance/${{ github.run_id }}" else echo "PROMETHEUS_PUSHGATEWAY_URL not configured, skipping push" fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0bb3f16..3b85d89 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,7 @@ name: CI/CD Pipeline on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main ] + workflow_dispatch: # Concurrency control to prevent multiple runs of the same workflow concurrency: @@ -85,18 +82,12 @@ jobs: - name: Run tests run: make test - env: - REDIS_URL: redis://localhost:6379 - name: Test internal/queue package run: go test -v -race -coverprofile=queue-coverage.out ./internal/queue/... - env: - REDIS_URL: redis://localhost:6379 - name: Run comprehensive tests run: make test-all - env: - REDIS_URL: redis://localhost:6379 - name: Run linters run: make lint @@ -111,6 +102,19 @@ jobs: flags: unittests name: codecov-umbrella + dev-smoke: + name: Dev Compose Smoke Test + runs-on: ubuntu-latest + needs: test + timeout-minutes: 20 + + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Run dev smoke test + run: make dev-smoke + build: name: Build runs-on: ubuntu-latest diff --git a/.github/workflows/label.yml b/.github/workflows/label.yml index 2f2f131..5603b88 100644 --- a/.github/workflows/label.yml +++ b/.github/workflows/label.yml @@ -1,8 +1,7 @@ name: Label Pull Request on: - pull_request: - types: [opened, edited, synchronize] + workflow_dispatch: jobs: label: diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml index 83a39db..24f4b85 100644 --- a/.github/workflows/license-check.yml +++ b/.github/workflows/license-check.yml @@ -21,13 +21,7 @@ jobs: echo "LICENSE file is missing" exit 1 fi - - # Check if it's MIT license - if ! grep -q "MIT License" LICENSE; then - echo "License file should be MIT License" - exit 1 - fi - + echo "License file OK" - name: Check Go files for license headers @@ -36,7 +30,7 @@ jobs: missing_headers=0 for file in $(find . -name "*.go" -not -path "./vendor/*" -not -path "./.git/*"); do - if ! head -10 "$file" | grep -q "Copyright" && ! head -10 "$file" | grep -q "MIT"; then + if ! head -10 "$file" | grep -q "Copyright"; then echo "Missing license header in: $file" missing_headers=$((missing_headers + 1)) fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index abf62f8..68ca145 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,13 +1,12 @@ name: Release on: - push: - tags: - - 'v*' # Trigger on version tags like v1.0.0 + workflow_dispatch: permissions: contents: write packages: write + id-token: write env: GO_VERSION: '1.25.0' @@ -113,10 +112,10 @@ jobs: - name: Package binaries run: | cd dist - for binary in api-server worker tui data_manager user_manager; do + for binary in fetch_ml_*; do if [[ -f "${binary}" ]]; then - tar -czf "fetch_ml_${binary}.tar.gz" "${binary}" - sha256sum "fetch_ml_${binary}.tar.gz" > "fetch_ml_${binary}.tar.gz.sha256" + tar -czf "${binary}.tar.gz" "${binary}" + sha256sum "${binary}.tar.gz" > "${binary}.tar.gz.sha256" fi done @@ -155,6 +154,20 @@ jobs: sha256sum *.tar.gz > checksums.txt ls -lh + - name: Install cosign + uses: sigstore/cosign-installer@v3 + + - name: Sign checksums.txt (keyless) + working-directory: release + env: + COSIGN_YES: "true" + run: | + cosign sign-blob \ + --output-signature checksums.txt.sig \ + --output-certificate checksums.txt.cert \ + checksums.txt + ls -lh checksums.txt checksums.txt.sig checksums.txt.cert + - name: Create Release uses: softprops/action-gh-release@v2 with: @@ -170,15 +183,12 @@ jobs: - **`ml-macos-arm64.tar.gz`** - macOS Apple Silicon ### Go Backend Binaries - - **`fetch_ml_api-server.tar.gz`** - API Server - - **`fetch_ml_worker.tar.gz`** - Worker - - **`fetch_ml_tui.tar.gz`** - Terminal UI - - **`fetch_ml_data_manager.tar.gz`** - Data Manager - - **`fetch_ml_user_manager.tar.gz`** - User Manager - + Included as per-platform tarballs named: + - `fetch_ml___[.exe].tar.gz` + ### Installation ```bash - # Download and extract + # Download and extract (example: CLI) tar -xzf ml-.tar.gz # Make executable and move to PATH diff --git a/.golangci.yml b/.golangci.yml index 86f8587..5d3f279 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -4,27 +4,11 @@ run: timeout: 5m tests: true output: - format: colored-line-number -linters-settings: - govet: - enable: - - shadow - - fieldalignment - gocyclo: - min-complexity: 15 - dupl: - threshold: 100 - goconst: - min-len: 3 - min-occurrences: 3 - misspell: - locale: US - lll: - line-length: 100 - revive: - confidence: 0.8 + formats: + text: + path: stdout linters: - disable-all: true + default: none enable: - bodyclose - dogsled @@ -51,40 +35,84 @@ linters: - unused - whitespace - revive + settings: + nolintlint: + allow-unused: true + govet: + enable: + - shadow + - fieldalignment + gocyclo: + min-complexity: 15 + dupl: + threshold: 100 + goconst: + min-len: 3 + min-occurrences: 3 + misspell: + locale: US + ignore-rules: + - cancelled + - cancelling + - behaviour + - colour + - sanitise + - initialise + - optimise + - normalised + lll: + line-length: 100 + revive: + confidence: 0.8 + exclusions: + rules: + # G306: File permissions - acceptable for test files and scripts + - text: "G306:" + linters: + - gosec + # Exclude linters for test files + - path: ".*_test\\.go" + linters: + - gocyclo + - errcheck + - dupl + - lll + - gosec + - revive + # Exclude errcheck for tests directory + - path: "^tests/" + linters: + - errcheck + # approve insecureSkipVerify in test files + - path: _test\\.go + text: "insecureSkipVerify" + linters: + - gosec + # Exclude gosec G204 for tests and tools via source match + - source: "exec\\.CommandContext" + path: "(tests|tools)/" + linters: + - gosec + # Exclude revive for api package naming via source match + - source: "^package api$" + linters: + - revive + # Known legacy staticcheck issue + - path: "^internal/worker/snapshot_store\\.go$" + text: "SA1019" + linters: + - staticcheck + # Known legacy unparam issue + - path: "^internal/resources/manager\\.go$" + text: "gpuSlotsForTask" + linters: + - unparam + # Ignore whitespace-only lint noise in this file + - path: "^internal/api/ws_jobs\\.go$" + linters: + - whitespace issues: - exclude-rules: - # G306: File permissions - acceptable for test files and scripts - - text: "G306:" - linters: - - gosec - # Exclude linters for test files - - path: ".*_test\\.go" - linters: - - gocyclo - - errcheck - - dupl - - lll - - gosec - - revive - # Exclude errcheck for tests directory - - path: "^tests/" - linters: - - errcheck - # approve insecureSkipVerify in test files - - path: _test\.go - text: "insecureSkipVerify" - linters: - - gosec - # Exclude gosec G204 for tests and tools via source match - - source: "exec\\.CommandContext" - path: "(tests|tools)/" - linters: - - gosec - # Exclude revive for api package naming via source match - - source: "^package api$" - linters: - - revive max-issues-per-linter: 0 max-same-issues: 0 severity: - default-severity: error + default: error diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 90bb66e..20e0574 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -239,15 +239,16 @@ cp .env.example .env.local ``` Key variables for development: -- `REDIS_URL`: Redis connection string - `LOG_LEVEL`: Set to `debug` for verbose logging - `API_PORT`: API server port (default: 9101) ### Configuration Files -- `configs/config-dev.yaml`: Development configuration -- `configs/config-local.yaml`: Local overrides -- `configs/config-prod.yaml`: Production settings +- `configs/api/dev.yaml`: Development (Docker) API server configuration +- `configs/api/homelab-secure.yaml`: Homelab secure API server configuration +- `configs/api/prod.yaml`: Production API server configuration +- `configs/workers/docker.yaml`: Docker worker configuration +- `configs/workers/worker-prod.toml`: Production worker configuration ## IDE Setup diff --git a/LICENSE b/LICENSE index 00e9572..ab27f15 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,68 @@ -MIT License - -Copyright (c) 2024 Fetch ML - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +FetchML Source-Available Research & Audit License (SARAL) + + Copyright (c) 2026 Fetch ML + + This software is source-available for transparency and auditability. It is not + open-source. + + 1. Definitions + 1.1 "Software" means the FetchML source code, binaries, documentation, and any + accompanying files. + 1.2 "You" means any individual or entity exercising permissions under this + license. + 1.3 "Commercial Use" means use of the Software (or any Derivative Works) in a + way that is primarily intended for or directed toward commercial advantage + or monetary compensation, including use by or for a for-profit entity in + its business operations. Commercial Use does not include personal hobby use + or academic/non-profit research that is not for a commercial purpose. + 1.4 "Hosted Service" means making the Software (or substantially similar + functionality) available to third parties as a service, including via SaaS, + managed service, hosted API, or similar offering. + 1.5 "Redistribute" means to copy, publish, sublicense, sell, rent, lease, + transfer, or otherwise provide the Software (or any portion) to any third + party, whether in source or binary form. + 1.6 "Derivative Works" means any modified version of the Software, or any work + based on the Software. + + 2. Grant of Transparency (Source Viewing and Audit) + You may view, read, and audit the source code of the Software for purposes of + security review, privacy review, correctness validation, reproducibility of + results, and internal evaluation. + + 3. Permitted Use + Subject to the restrictions in this license, You may use the Software solely + for: + - personal use; and + - non-commercial research, development, and experimentation on systems that You + own or control (including homelabs and lab servers). + + You may modify the Software solely for Your internal Permitted Use. + + 4. Prohibited Use + You may not: + - use the Software for Commercial Use; + - offer the Software or Derivative Works as a Hosted Service; + - Redistribute the Software or any Derivative Works, in source or binary form; + - use the Software in a way that enables third parties to access or benefit + from the Software as a service; + - remove or alter this license. + + 5. No Trademark Rights + This license does not grant any rights to use the names, logos, or trademarks + of Fetch ML or FetchML, except as necessary to comply with attribution + requirements in this license. + + 6. No Warranty + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + + 7. Limitation of Liability + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE + OR OTHER DEALINGS IN THE SOFTWARE. + + 8. Termination + Any violation of this license automatically terminates Your rights under this + license. diff --git a/Makefile b/Makefile index 0c13aed..b07a0b4 100644 --- a/Makefile +++ b/Makefile @@ -1,88 +1,106 @@ -.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install setup validate configlint ci-local docs benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status load-test chaos-test profile-tools detect-regressions tech-excellence docker-build docker-run docker-stop docker-logs monitoring-performance monitoring-performance-stop dashboard-performance self-cleanup auto-cleanup test-full test-auth test-status - +.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install configlint worker-configlint ci-local docs docs-setup docs-build benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status size load-test chaos-test profile-load profile-load-norate profile-ws-queue profile-tools detect-regressions tech-excellence docker-build dev-smoke self-cleanup test-full test-auth deploy-up deploy-down deploy-status deploy-clean dev-up dev-down dev-status dev-logs prod-up prod-down prod-status prod-logs +OK = ✓ # Default target all: build -# Build all components +# Build all components (Go binaries + optimized CLI) build: go build -o bin/api-server cmd/api-server/main.go - go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go + go build -o bin/worker cmd/worker/worker_server.go go build -o bin/tui ./cmd/tui - cd cli && zig build prod - @echo "✓ All components built" + cd cli && zig build --release=small + @echo "${OK} All components built" # Build production-optimized binaries prod: go build -ldflags="-s -w" -o bin/api-server cmd/api-server/main.go - go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go + go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go go build -ldflags="-s -w" -o bin/tui ./cmd/tui - cd cli && zig build prod && strip zig-out/prod/ml - @echo "✓ Production binaries built" + cd cli && zig build --release=small + @echo "${OK} Production binaries built" + +cross-platform: + @rm -rf dist + @mkdir -p dist + @set -e; \ + LDFLAGS='-s -w -buildid='; \ + for target in linux/amd64 linux/arm64 darwin/amd64 darwin/arm64 windows/amd64; do \ + goos=$${target%/*}; \ + goarch=$${target#*/}; \ + ext=''; \ + if [ "$$goos" = "windows" ]; then ext='.exe'; fi; \ + echo "Building $$goos/$$goarch..."; \ + CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_api-server_$${goos}_$${goarch}$${ext} cmd/api-server/main.go; \ + CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_worker_$${goos}_$${goarch}$${ext} cmd/worker/worker_server.go; \ + CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_tui_$${goos}_$${goarch}$${ext} ./cmd/tui; \ + CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_data_manager_$${goos}_$${goarch}$${ext} ./cmd/data_manager; \ + CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_user_manager_$${goos}_$${goarch}$${ext} ./cmd/user_manager; \ + done + @echo "${OK} Cross-platform binaries built in dist/" # Development build (faster compilation) dev: - go build -o bin/api-server cmd/api-server/main.go - go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go - go build -o bin/tui ./cmd/tui - cd cli && zig build dev - @echo "✓ Development binaries built" + go build -buildvcs=false -o bin/api-server cmd/api-server/main.go + go build -buildvcs=false -o bin/worker cmd/worker/worker_server.go + go build -buildvcs=false -o bin/tui ./cmd/tui + cd cli && zig build --release=fast + @echo "${OK} Development binaries built" -# Clean build artifacts +# Clean build artifacts (Go + Zig + test outputs) clean: - rm -rf bin/ coverage/ - rm -rf cli/zig-out/ - rm -rf cli/.zig-cache/ + rm -rf bin/ coverage/ tests/bin/ + rm -rf cli/zig-out/ cli/.zig-cache/ .zig-cache/ go clean - @echo "✓ Cleaned" + @echo "${OK} Cleaned" clean-docs: rm -rf docs/_site/ - @echo "✓ Cleaned docs" + @echo "${OK} Cleaned docs" # Run tests test: go test ./tests/... cd cli && zig build test - @echo "✓ All tests passed" + @echo "${OK} All tests passed" # Lint Go and Zig code lint: gofmt -w ./cmd ./internal ./tests || true go vet ./... cd cli && zig fmt . - @echo "✓ Lint completed" + @echo "${OK} Lint completed" # Install to system (requires sudo) install: prod sudo cp bin/api-server /usr/local/bin/fetchml-api sudo cp bin/worker /usr/local/bin/fetchml-worker sudo cp bin/tui /usr/local/bin/fetchml-tui - sudo cp cli/zig-out/prod/ml /usr/local/bin/ml - @echo "✓ Installed" - -# Setup production environment -setup: - @if [ "$(shell uname)" = "Linux" ]; then \ - sudo ./scripts/setup-prod.sh; \ - else \ - echo "Production setup is for Linux only. You're on $(shell uname)."; \ - echo "Use docker-compose for local development."; \ - fi - -# Validate production configuration -validate: - ./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml + sudo cp cli/zig-out/bin/ml /usr/local/bin/ml + @echo "${OK} Installed" # Validate YAML configs against JSON schema configlint: - go run ./cmd/configlint --schema configs/schema/config_schema.yaml \ - configs/config-prod.yaml \ - configs/config-no-tls.yaml \ - configs/config-dev.yaml + go run ./cmd/configlint --schema configs/schema/api_server_config.yaml \ + configs/api/dev.yaml \ + configs/api/homelab-secure.yaml \ + configs/api/multi-user.yaml \ + configs/api/prod.yaml worker-configlint: go run ./cmd/configlint --schema configs/schema/worker_config_schema.yaml \ - configs/worker-prod.toml + configs/workers/worker-prod.toml \ + configs/workers/docker.yaml \ + configs/workers/docker-dev.yaml \ + configs/workers/docker-prod.yaml \ + configs/workers/homelab-secure.yaml + +dev-smoke: + bash ./scripts/smoke-test.sh dev + @echo "dev smoke: OK" + +prod-smoke: + bash ./scripts/smoke-test.sh prod + @echo "prod smoke: OK" # Run a local approximation of the CI pipeline ci-local: @@ -95,36 +113,19 @@ ci-local: @echo "Running coverage..." make test-coverage -# Docker targets +# Docker image build (no direct docker-compose run; use deploy-* targets instead) docker-build: docker build -f build/docker/simple.Dockerfile -t fetchml:latest . - @echo "✓ Docker image built" - -docker-run: - docker-compose up -d - @echo "✓ Services started" - -docker-stop: - docker-compose down - @echo "✓ Services stopped" - -docker-logs: - docker-compose logs -f - -# Monitoring setup (Linux only) -setup-monitoring: - @if [ "$(shell uname)" = "Linux" ]; then \ - sudo ./scripts/setup-monitoring-prod.sh; \ - else \ - echo "Monitoring setup is for Linux production. Use docker-compose for local development."; \ - fi + @echo "${OK} Docker image built" # Enhanced test targets test-unit: - go test -v -short ./... + go test -v -short ./tests/unit/... + cd cli && zig build test test-integration: - go test -v ./... + go test -v ./tests/integration/... ./tests + cd cli && zig build test test-e2e: go test -v ./tests/e2e/... @@ -132,7 +133,7 @@ test-e2e: test-coverage: go test -coverprofile=coverage/coverage.out ./... go tool cover -html=coverage/coverage.out -o coverage/coverage.html - @echo "✓ Coverage report: coverage/coverage.html" + @echo "${OK} Coverage report: coverage/coverage.html" # Documentation setup docs-setup: @@ -170,7 +171,7 @@ benchmark: # Run benchmarks locally with artifact management benchmark-local: @echo "Running benchmarks locally with full workflow..." - ./scripts/run-benchmarks-local.sh + ./scripts/benchmarks/run-benchmarks-local.sh # Manage benchmark artifacts artifacts: @@ -180,41 +181,27 @@ artifacts: # Clean benchmark artifacts (keep last 10) clean-benchmarks: @echo "Cleaning benchmark artifacts..." - ./scripts/cleanup-benchmarks.sh benchmarks + ./scripts/maintenance/cleanup-benchmarks.sh benchmarks # Comprehensive cleanup (keep last 5 runs) clean-all: @echo "Running comprehensive cleanup..." - ./scripts/cleanup-benchmarks.sh all + ./scripts/maintenance/cleanup-benchmarks.sh all # Aggressive cleanup (removes more data) clean-aggressive: @echo "Running aggressive cleanup..." - ./scripts/cleanup-benchmarks.sh aggressive + ./scripts/maintenance/cleanup-benchmarks.sh aggressive # Show disk usage status status: @echo "Checking disk usage..." - ./scripts/cleanup-benchmarks.sh status + ./scripts/maintenance/cleanup-benchmarks.sh status -# Start performance monitoring stack -monitoring-performance: - @echo "Starting performance monitoring stack..." - cd monitoring && docker-compose -f docker-compose.performance.yml up -d - @echo "Grafana available at: http://localhost:3001 (admin/admin)" - @echo "Loki available at: http://localhost:3100" - @echo "Pushgateway available at: http://localhost:9091" - @echo "Quick start guide: docs/src/performance-quick-start.md" +size: + @echo "Binary sizes:" + @ls -lh bin/* cli/zig-out/bin/ml 2>/dev/null || true -# Stop performance monitoring stack -monitoring-performance-stop: - @echo "Stopping performance monitoring stack..." - cd monitoring && docker-compose -f docker-compose.performance.yml down - -# View performance dashboard -dashboard-performance: - @echo "Opening performance dashboard..." - @echo "URL: http://localhost:3001/d/fetchml-performance/fetch-ml-performance-dashboard" # Load testing load-test: @@ -225,18 +212,18 @@ load-test: profile-load: @echo "CPU profiling MediumLoad HTTP load test..." go test ./tests/load -run TestLoadProfile_Medium -count=1 -cpuprofile tests/bin/cpu_load.out - @echo "✓ CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)" + @echo "${OK} CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)" profile-load-norate: @echo "CPU profiling MediumLoad HTTP load test (no rate limiting)..." go test ./tests/load -run TestLoadProfile_Medium -count=1 -cpuprofile tests/bin/cpu_load.out -v -args -profile-norate - @echo "✓ CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)" + @echo "${OK} CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)" # CPU profiling for WebSocket → Redis queue → worker path profile-ws-queue: @echo "CPU profiling WebSocket queue integration test..." go test ./tests/integration -run WebSocketQueue -count=5 -cpuprofile tests/bin/cpu_ws.out - @echo "✓ CPU profile written to cpu_ws.out (inspect with: go tool pprof tests/bin/cpu_ws.out)" + @echo "${OK} CPU profile written to cpu_ws.out (inspect with: go tool pprof tests/bin/cpu_ws.out)" # Chaos engineering tests chaos-test: @@ -282,24 +269,20 @@ help: @echo "" @echo "Docker Targets:" @echo " make docker-build - Build Docker image" - @echo " make docker-run - Start services with docker-compose" - @echo " make docker-stop - Stop docker-compose services" - @echo " make docker-logs - View docker-compose logs" @echo "" @echo "Test Targets:" @echo " make test - Run all tests" @echo " make test-unit - Run unit tests only" @echo " make test-integration - Run integration tests" + @echo " make test-e2e - Run end-to-end tests (Podman test is opt-in via FETCH_ML_E2E_PODMAN=1)" @echo " make test-coverage - Generate coverage report" @echo " make lint - Run formatters and linters" @echo " make ci-local - Run local CI dry-run (tests, lint, config validation, coverage)" @echo " make configlint - Validate YAML configs against schema" + @echo " make worker-configlint - Validate worker configs against schema" @echo "" @echo "Setup Targets:" @echo " make install - Install binaries to /usr/local/bin (requires sudo)" - @echo " make setup - Run production setup (Linux only)" - @echo " make setup-monitoring - Setup monitoring stack (Linux only)" - @echo " make validate - Validate production configuration" @echo "" @echo "Performance Testing:" @echo " make benchmark - Run performance benchmarks" @@ -325,10 +308,8 @@ help: @echo "Utility:" @echo " make size - Show binary sizes" @echo " make self-cleanup - Clean up Docker resources" - @echo " make auto-cleanup - Setup daily auto-cleanup service" @echo " make test-full - Run complete test suite" @echo " make test-auth - Test multi-user authentication" - @echo " make test-status - Check cleanup status" @echo " make help - Show this help" # Self-cleaning for Docker resources @@ -336,15 +317,10 @@ self-cleanup: @echo "Running self-cleanup..." @./scripts/maintenance/cleanup.sh -# Setup auto-cleanup service -auto-cleanup: - @echo "Setting up auto-cleanup service..." - @./scripts/deployment/setup-auto-cleanup.sh - # Run full test suite test-full: @echo "Running full test suite..." - @./scripts/testing/run-full-test-suite.sh + @$(MAKE) ci-local # Quick authentication test test-auth: @@ -353,7 +329,43 @@ test-auth: @echo "Testing researcher user..." && cp ~/.ml/config-researcher.toml ~/.ml/config.toml && ./cli/zig-out/bin/ml status @echo "Testing analyst user..." && cp ~/.ml/config-analyst.toml ~/.ml/config.toml && ./cli/zig-out/bin/ml status -# Test cleanup status -test-status: - @echo "Checking cleanup status..." - @./scripts/maintenance/cleanup-status.sh +# Deployment management (using organized docker-compose files) +deploy-up: + @echo "Starting development environment..." + @./deployments/deploy.sh dev up + +deploy-down: + @echo "Stopping development environment..." + @./deployments/deploy.sh dev down + +deploy-status: + @echo "Checking deployment status..." + @./deployments/deploy.sh dev status + +deploy-clean: + @echo "Cleaning all deployments..." + @cd deployments && make clean + +dev-up: + @./deployments/deploy.sh dev up + +dev-down: + @./deployments/deploy.sh dev down + +dev-status: + @./deployments/deploy.sh dev status + +dev-logs: + @./deployments/deploy.sh dev logs + +prod-up: + @./deployments/deploy.sh prod up + +prod-down: + @./deployments/deploy.sh prod down + +prod-status: + @./deployments/deploy.sh prod status + +prod-logs: + @./deployments/deploy.sh prod logs diff --git a/README.md b/README.md index 1a18509..c5158d9 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,66 @@ A lightweight ML experiment platform with a tiny Zig CLI and a Go backend. Designed for homelabs and small teams. +## Installation (recommended) + +FetchML publishes pre-built release artifacts (CLI + Go services) on GitHub Releases. + +If you prefer a one-shot check (recommended for most users), you can use: + +```bash +./scripts/verify_release.sh --dir . --repo / +``` + +1) Download the right archive for your platform + +2) Verify `checksums.txt` signature (recommended) + +The release includes a signed `checksums.txt` plus: + +- `checksums.txt.sig` +- `checksums.txt.cert` + +Verify the signature (keyless Sigstore) using cosign: + +```bash +cosign verify-blob \ + --certificate checksums.txt.cert \ + --signature checksums.txt.sig \ + --certificate-identity-regexp "^https://github.com///.github/workflows/release.yml@refs/tags/v.*$" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + checksums.txt +``` + +3) Verify the SHA256 checksum against `checksums.txt` + +4) Extract and install + +Example (CLI on Linux x86_64): + +```bash +# Download +curl -fsSLO https://github.com///releases/download//ml-linux-x86_64.tar.gz +curl -fsSLO https://github.com///releases/download//checksums.txt +curl -fsSLO https://github.com///releases/download//checksums.txt.sig +curl -fsSLO https://github.com///releases/download//checksums.txt.cert + +# Verify +cosign verify-blob \ + --certificate checksums.txt.cert \ + --signature checksums.txt.sig \ + --certificate-identity-regexp "^https://github.com///.github/workflows/release.yml@refs/tags/v.*$" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + checksums.txt +sha256sum -c --ignore-missing checksums.txt + +# Install +tar -xzf ml-linux-x86_64.tar.gz +chmod +x ml-linux-x86_64 +sudo mv ml-linux-x86_64 /usr/local/bin/ml + +ml --help +``` + ## Quick start ```bash @@ -12,7 +72,7 @@ docker-compose up -d # Or build the CLI locally cd cli && make all -./build/ml --help +./zig-out/bin/ml --help ``` ## What you get @@ -42,6 +102,17 @@ ml dataset list ml monitor # SSH to run TUI remotely ``` +## Phase 1 (V1) notes + +- **Task schema** supports optional `snapshot_id` (opaque identifier) and `dataset_specs` (structured dataset inputs). If `dataset_specs` is present it takes precedence over legacy `datasets` / `--datasets` args. +- **Snapshot restore (S1)** stages verified `snapshot_id` into each task workspace and exposes it via `FETCH_ML_SNAPSHOT_DIR` and `FETCH_ML_SNAPSHOT_ID`. If `snapshot_store.enabled: true` in the worker config, the worker will pull `/.tar.gz` from an S3-compatible store (e.g. MinIO), verify `snapshot_sha256`, and cache it under `data_dir/snapshots/sha256/`. +- **Prewarm (best-effort)** can fetch datasets for the next queued task while another task is running. Prewarm state is surfaced in `ml status --json` under the optional `prewarm` field. +- **Env prewarm (best-effort)** can build a warmed Podman image keyed by `deps_manifest_sha256` and reuse it for later tasks. + +## Changelog + +See `CHANGELOG.md`. + ## Build ```bash @@ -66,6 +137,31 @@ See `docs/` for detailed guides: - `docs/src/quick-start.md` – Full setup guide - `docs/src/deployment.md` – Production deployment +## Source code + +The FetchML source code is intentionally not hosted on GitHub. + +The canonical source repository is available at: ``. + +## Contributing + +Contributions are welcome. + +- **Questions / bug reports**: Use GitHub Issues: ``. Include: + - how to reproduce + - expected vs actual behavior + - logs/config snippets (sanitize secrets) + - OS + versions (Go, Zig, Podman/Docker if relevant) +- **Changes**: Submit a patch in a GitHub issue. + - Create a topic branch. + - Run tests/linters. + - Export your change as either: + - a patch series: `git format-patch -N origin/main`, or + - a single bundle: `git bundle create fetchml.bundle origin/main..HEAD` + - Attach the generated files to a GitHub issue at ``. + ## License -See LICENSE. +FetchML is source-available for transparency and auditability. It is not open-source. + +See `LICENSE`. diff --git a/SECURITY.md b/SECURITY.md index a0a44be..ae3b948 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -27,7 +27,7 @@ This will: - **TLS/SSL**: HTTPS encrypted communication - **IP Whitelisting**: Restrict access to trusted networks - **Rate Limiting**: Prevent abuse and DoS attacks -- **Reverse Proxy**: Nginx with security headers +- **Reverse Proxy**: Caddy with security headers ### Data Protection - **Path Traversal Protection**: Prevents directory escape attacks @@ -37,7 +37,7 @@ This will: ## Configuration Files ### Secure Config Location -- `configs/environments/config-homelab-secure.yaml` - Main secure configuration +- `configs/api/homelab-secure.yaml` - Main secure configuration ### API Keys - `.api-keys` - Generated API keys (600 permissions) @@ -71,7 +71,7 @@ docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d source .env.secure # Start server -./api-server -config configs/environments/config-homelab-secure.yaml +./api-server -config configs/api/homelab-secure.yaml ``` ## Security Checklist @@ -87,7 +87,7 @@ source .env.secure ### Network Security - [ ] Use HTTPS only (disable HTTP) - [ ] Restrict API access to trusted IPs -- [ ] Use reverse proxy (nginx) +- [ ] Use reverse proxy (caddy) - [ ] Enable security headers - [ ] Monitor access logs @@ -174,7 +174,7 @@ curl -s https://api.ipify.org Monitor these files: - `logs/fetch_ml.log` - Application logs -- `/var/log/nginx/security.log` - Nginx access logs +- Caddy access logs (configure if enabled) - Docker logs: `docker logs ml-experiments-api` ## Best Practices diff --git a/build/docker/api-server.Dockerfile b/build/docker/api-server.Dockerfile index f9e93c5..6933c52 100644 --- a/build/docker/api-server.Dockerfile +++ b/build/docker/api-server.Dockerfile @@ -53,19 +53,23 @@ WORKDIR /app COPY --from=go-builder /app/bin/ /usr/local/bin/ COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/ +# Copy configs +COPY --from=go-builder /app/configs/ /app/configs/ + # Create directories -RUN mkdir -p /data/ml-experiments /home/appuser/.ml && \ - chown -R appuser:appgroup /data /home/appuser +RUN mkdir -p /data/experiments /data/datasets /data/snapshots /home/appuser/.ml && \ + mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \ + chown -R appuser:appgroup /data /app /home/appuser # Switch to app user USER appuser # Expose ports -EXPOSE 9100 9101 +EXPOSE 9101 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD wget --no-verbose --tries=1 --spider http://localhost:9100/health || exit 1 + CMD wget --no-verbose --tries=1 --no-check-certificate --spider https://localhost:9101/health || exit 1 # Default command -CMD ["/usr/local/bin/api-server"] +CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"] diff --git a/build/docker/full-prod.Dockerfile b/build/docker/full-prod.Dockerfile index 03a293f..3968d0b 100644 --- a/build/docker/full-prod.Dockerfile +++ b/build/docker/full-prod.Dockerfile @@ -40,12 +40,15 @@ COPY --from=builder /app/bin/ /usr/local/bin/ COPY --from=builder /app/configs/ /app/configs/ # Create necessary directories -RUN mkdir -p /app/data/experiments /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs +RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs && \ + mkdir -p /data/active/datasets /data/active/snapshots && \ + mkdir -p /logs && \ + chown -R appuser:appgroup /app /data /logs # Generate SSL certificates RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \ -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \ - chmod 644 /app/ssl/cert.pem /app/ssl/key.pem + chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem # Generate SSH keys for container communication RUN ssh-keygen -t rsa -b 2048 -f /app/ssh/id_rsa -N "" && \ @@ -70,4 +73,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ CMD curl -k -f https://localhost:9101/health || exit 1 # Default command for API server -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] +CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"] diff --git a/build/docker/homelab-secure.Dockerfile b/build/docker/homelab-secure.Dockerfile index 075ba26..fdda694 100644 --- a/build/docker/homelab-secure.Dockerfile +++ b/build/docker/homelab-secure.Dockerfile @@ -55,8 +55,10 @@ COPY --from=builder /app/bin/ /usr/local/bin/ COPY --from=builder /app/configs/ /app/configs/ # Create necessary directories with proper permissions -RUN mkdir -p /app/data/experiments /app/logs /app/ssl /tmp/fetchml-jobs && \ - chown -R appuser:appgroup /app && \ +RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \ + mkdir -p /data/active/datasets /data/active/snapshots && \ + mkdir -p /logs && \ + chown -R appuser:appgroup /app /data /logs && \ chmod 750 /app/data/experiments /app/logs # Generate SSL certificates with stronger crypto @@ -144,4 +146,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ CMD curl -k -f https://localhost:9101/health || exit 1 # Default command for API server -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] +CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"] diff --git a/build/docker/secure-prod.Dockerfile b/build/docker/secure-prod.Dockerfile index a35e28b..7b0ce3c 100644 --- a/build/docker/secure-prod.Dockerfile +++ b/build/docker/secure-prod.Dockerfile @@ -45,13 +45,16 @@ COPY --from=builder /app/bin/ /usr/local/bin/ COPY --from=builder /app/configs/ /app/configs/ # Create necessary directories -RUN mkdir -p /app/data/experiments /app/logs /app/ssl /tmp/fetchml-jobs && \ - chown -R appuser:appgroup /app +RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \ + mkdir -p /data/active/datasets /data/active/snapshots && \ + mkdir -p /logs && \ + chown -R appuser:appgroup /app /data /logs # Generate SSL certificates RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \ -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \ - chmod 644 /app/ssl/cert.pem /app/ssl/key.pem + chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem && \ + chown -R appuser:appgroup /app/ssl # Generate SSH keys for worker user RUN ssh-keygen -t rsa -b 4096 -f /home/worker/.ssh/id_rsa -N "" && \ @@ -99,4 +102,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ CMD curl -k -f https://localhost:9101/health || exit 1 # Default command for API server -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] +CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"] diff --git a/build/docker/simple.Dockerfile b/build/docker/simple.Dockerfile index c6610cb..2092c13 100644 --- a/build/docker/simple.Dockerfile +++ b/build/docker/simple.Dockerfile @@ -2,7 +2,7 @@ FROM golang:1.25-alpine AS builder # Install dependencies -RUN apk add --no-cache git make +RUN apk add --no-cache git make gcc musl-dev # Set working directory WORKDIR /app @@ -17,13 +17,14 @@ RUN go mod download COPY . . # Build Go binaries -RUN go build -o bin/api-server cmd/api-server/main.go +RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \ + CGO_ENABLED=1 go build -o bin/worker ./cmd/worker # Final stage FROM alpine:3.19 # Install runtime dependencies -RUN apk add --no-cache ca-certificates redis openssl +RUN apk add --no-cache bash ca-certificates redis openssl curl podman fuse-overlayfs slirp4netns iptables # Create app user RUN addgroup -g 1001 -S appgroup && \ @@ -37,15 +38,17 @@ COPY --from=builder /app/bin/ /usr/local/bin/ # Copy configs and templates COPY --from=builder /app/configs/ /app/configs/ -COPY --from=builder /app/nginx/ /app/nginx/ # Create necessary directories -RUN mkdir -p /app/data/experiments /app/logs /app/ssl +RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl # Generate SSL certificates for container use RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \ -subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \ - chmod 644 /app/ssl/cert.pem /app/ssl/key.pem + chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem + +# Ensure app user can write to data/logs and read TLS material +RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs # Switch to app user USER appuser @@ -55,7 +58,7 @@ EXPOSE 9101 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ - CMD curl -k -f https://localhost:9101/health || exit 1 + CMD curl -f http://localhost:9101/health || curl -k -f https://localhost:9101/health || exit 1 # Default command -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"] +CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"] diff --git a/build/docker/test.Dockerfile b/build/docker/test.Dockerfile index 0b3da5a..41b2639 100644 --- a/build/docker/test.Dockerfile +++ b/build/docker/test.Dockerfile @@ -2,7 +2,7 @@ FROM golang:1.25-alpine AS builder # Install dependencies -RUN apk add --no-cache git +RUN apk add --no-cache git gcc musl-dev # Set working directory WORKDIR /app @@ -17,7 +17,7 @@ RUN go mod download COPY . . # Build only Go binaries (skip Zig) -RUN go build -o bin/api-server cmd/api-server/main.go && \ +RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \ go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go && \ go build -o bin/tui ./cmd/tui @@ -25,7 +25,7 @@ RUN go build -o bin/api-server cmd/api-server/main.go && \ FROM alpine:3.19 # Install runtime dependencies -RUN apk add --no-cache ca-certificates curl +RUN apk add --no-cache ca-certificates curl openssl # Create app user RUN addgroup -g 1001 -S appgroup && \ @@ -41,7 +41,16 @@ COPY --from=builder /app/bin/ /usr/local/bin/ COPY --from=builder /app/configs/ /app/configs/ # Create necessary directories -RUN mkdir -p /app/data/experiments /app/logs +RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \ + mkdir -p /data/experiments /data/datasets /data/snapshots + +# Generate SSL certificates for container use +RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \ + -subj "/C=US/ST=Test/L=Local/O=FetchML/OU=Tests/CN=localhost" && \ + chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem + +# Ensure app user can write to data/logs and read TLS material +RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs /data # Switch to app user USER appuser @@ -50,4 +59,4 @@ USER appuser EXPOSE 9101 # Default command -CMD ["/usr/local/bin/api-server", "-config", "/app/configs/environments/config-local.yaml"] +CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"] diff --git a/cmd/configlint/main.go b/cmd/configlint/main.go index 65b6e1a..d3f8758 100644 --- a/cmd/configlint/main.go +++ b/cmd/configlint/main.go @@ -2,6 +2,7 @@ package main import ( + "bytes" "encoding/json" "flag" "fmt" @@ -10,6 +11,7 @@ import ( "path/filepath" "strings" + "github.com/BurntSushi/toml" "github.com/jfraeys/fetch_ml/internal/fileutil" "github.com/xeipuuv/gojsonschema" "gopkg.in/yaml.v3" @@ -68,20 +70,7 @@ func loadSchema(schemaPath string) (gojsonschema.JSONLoader, error) { return nil, err } - tmpFile, err := os.CreateTemp("", "fetchml-schema-*.json") - if err != nil { - return nil, err - } - defer func() { - _ = tmpFile.Close() - _ = os.Remove(tmpFile.Name()) - }() - - if _, err := tmpFile.Write(schemaJSON); err != nil { - return nil, err - } - - return gojsonschema.NewReferenceLoader("file://" + filepath.ToSlash(tmpFile.Name())), nil + return gojsonschema.NewBytesLoader(schemaJSON), nil } func validateConfig(schemaLoader gojsonschema.JSONLoader, configPath string) error { @@ -90,12 +79,27 @@ func validateConfig(schemaLoader gojsonschema.JSONLoader, configPath string) err return err } - var configYAML interface{} - if err := yaml.Unmarshal(data, &configYAML); err != nil { - return fmt.Errorf("failed to parse YAML: %w", err) + ext := strings.ToLower(filepath.Ext(configPath)) + var decoded any + switch ext { + case ".toml": + var configTOML map[string]any + if _, err := toml.Decode(string(data), &configTOML); err != nil { + return fmt.Errorf("failed to parse TOML: %w", err) + } + decoded = configTOML + default: + // YAML (default) + var configYAML any + dec := yaml.NewDecoder(bytes.NewReader(data)) + dec.KnownFields(false) + if err := dec.Decode(&configYAML); err != nil { + return fmt.Errorf("failed to parse YAML: %w", err) + } + decoded = configYAML } - configJSON, err := json.Marshal(configYAML) + configJSON, err := json.Marshal(decoded) if err != nil { return err } diff --git a/cmd/data_manager/data_manager_config.go b/cmd/data_manager/data_manager_config.go index e78aa6b..1309db9 100644 --- a/cmd/data_manager/data_manager_config.go +++ b/cmd/data_manager/data_manager_config.go @@ -40,10 +40,10 @@ type DataConfig struct { CleanupInterval int `yaml:"cleanup_interval_min"` // Run cleanup every X minutes // Podman integration - PodmanImage string `yaml:"podman_image"` - ContainerWorkspace string `yaml:"container_workspace"` - ContainerResults string `yaml:"container_results"` - GPUAccess bool `yaml:"gpu_access"` + PodmanImage string `yaml:"podman_image"` + ContainerWorkspace string `yaml:"container_workspace"` + ContainerResults string `yaml:"container_results"` + GPUDevices []string `yaml:"gpu_devices"` } // LoadDataConfig loads data manager configuration from a YAML file. diff --git a/cmd/data_manager/data_sync.go b/cmd/data_manager/data_sync.go index ba02814..fadbd26 100644 --- a/cmd/data_manager/data_sync.go +++ b/cmd/data_manager/data_sync.go @@ -10,6 +10,7 @@ import ( "os" "os/signal" "path/filepath" + "sort" "strings" "syscall" "time" @@ -23,6 +24,13 @@ import ( "github.com/jfraeys/fetch_ml/internal/telemetry" ) +func shellQuote(s string) string { + if s == "" { + return "''" + } + return "'" + strings.ReplaceAll(s, "'", "'\"'\"'") + "'" +} + // SSHClient alias for convenience. type SSHClient = network.SSHClient @@ -37,6 +45,32 @@ type DataManager struct { logger *logging.Logger } +func (dm *DataManager) archiveDatasetOnML(datasetName string) (string, error) { + datasetName = strings.TrimSpace(datasetName) + if err := container.ValidateJobName(datasetName); err != nil { + return "", fmt.Errorf("invalid dataset name: %w", err) + } + if strings.TrimSpace(dm.config.MLDataDir) == "" { + return "", fmt.Errorf("missing ml_data_dir") + } + + stamp := time.Now().UTC().Format("20060102-150405") + archiveRoot := filepath.Join(dm.config.MLDataDir, ".archive", stamp) + src := filepath.Join(dm.config.MLDataDir, datasetName) + dst := filepath.Join(archiveRoot, datasetName) + + cmd := fmt.Sprintf( + "mkdir -p %s && mv %s %s", + shellQuote(archiveRoot), + shellQuote(src), + shellQuote(dst), + ) + if _, err := dm.mlServer.Exec(cmd); err != nil { + return "", err + } + return dst, nil +} + // DataFetchRequest represents a request to fetch datasets. type DataFetchRequest struct { JobName string `json:"job_name"` @@ -141,7 +175,11 @@ func (dm *DataManager) FetchDataset(jobName, datasetName string) error { }) } -func (dm *DataManager) fetchDatasetInternal(ctx context.Context, jobName, datasetName string) error { +func (dm *DataManager) fetchDatasetInternal( + ctx context.Context, + jobName string, + datasetName string, +) error { if err := container.ValidateJobName(datasetName); err != nil { return &errtypes.DataFetchError{ Dataset: datasetName, @@ -225,9 +263,14 @@ func (dm *DataManager) fetchDatasetInternal(ctx context.Context, jobName, datase ioBefore, ioErr := telemetry.ReadProcessIO() start := time.Now() - out, err := telemetry.ExecWithMetrics(dm.logger, "dataset transfer", time.Since(start), func() (string, error) { - return dm.nasServer.ExecContext(ctx, rsyncCmd) - }) + out, err := telemetry.ExecWithMetrics( + dm.logger, + "dataset transfer", + time.Since(start), + func() (string, error) { + return dm.nasServer.ExecContext(ctx, rsyncCmd) + }, + ) duration := time.Since(start) if err != nil { @@ -413,64 +456,101 @@ func (dm *DataManager) CleanupOldData() error { "total_size_gb", totalSizeGB, "dataset_count", len(datasets)) - // Delete datasets older than max age or if over size limit + // Archive datasets older than max age or if over size limit maxAge := time.Duration(dm.config.MaxAgeHours) * time.Hour maxSize := int64(dm.config.MaxSizeGB) * 1024 * 1024 * 1024 - var deleted []string + // Ensure deterministic ordering when needing to reduce size. + sort.Slice(datasets, func(i, j int) bool { + ai := datasets[i].LastAccess + aj := datasets[j].LastAccess + if ai.IsZero() && aj.IsZero() { + return datasets[i].Name < datasets[j].Name + } + if ai.IsZero() { + return true + } + if aj.IsZero() { + return false + } + if ai.Equal(aj) { + return datasets[i].Name < datasets[j].Name + } + return ai.Before(aj) + }) + + valid := make([]DatasetInfo, 0, len(datasets)) for _, ds := range datasets { - shouldDelete := false - - // Check age - if !ds.LastAccess.IsZero() && time.Since(ds.LastAccess) > maxAge { - logger.Info("dataset is old, marking for deletion", - "dataset", ds.Name, - "last_access", ds.LastAccess, - "age_hours", time.Since(ds.LastAccess).Hours()) - shouldDelete = true + name := strings.TrimSpace(ds.Name) + if err := container.ValidateJobName(name); err != nil { + logger.Warn("skipping dataset with invalid name", "dataset", ds.Name) + continue } + ds.Name = name + valid = append(valid, ds) + } - // Check if over size limit - if totalSize > maxSize { - logger.Info("over size limit, deleting oldest dataset", - "dataset", ds.Name, - "current_size_gb", totalSizeGB, - "max_size_gb", dm.config.MaxSizeGB) - shouldDelete = true + archivedSet := make(map[string]struct{}, len(valid)) + var archived []string + archiveOne := func(ds DatasetInfo, reason string) { + if _, ok := archivedSet[ds.Name]; ok { + return } - - if shouldDelete { - path := filepath.Join(dm.config.MLDataDir, ds.Name) - logger.Info("deleting dataset", "dataset", ds.Name, "path", path) - - if _, err := dm.mlServer.Exec(fmt.Sprintf("rm -rf %s", path)); err != nil { - logger.Error("failed to delete dataset", + path := filepath.Join(dm.config.MLDataDir, ds.Name) + logger.Info("archiving dataset", "dataset", ds.Name, "path", path, "reason", reason) + if _, err := dm.archiveDatasetOnML(ds.Name); err != nil { + logger.Error("failed to archive dataset", + "dataset", ds.Name, + "error", err) + return + } + archivedSet[ds.Name] = struct{}{} + archived = append(archived, ds.Name) + totalSize -= ds.SizeBytes + totalSizeGB = float64(totalSize) / (1024 * 1024 * 1024) + if dm.taskQueue != nil { + redisClient := dm.taskQueue.GetRedisClient() + if err := redisClient.Del(dm.ctx, fmt.Sprintf("ml:dataset:%s", ds.Name)).Err(); err != nil { + logger.Warn("failed to delete dataset from Redis", "dataset", ds.Name, "error", err) - continue - } - - deleted = append(deleted, ds.Name) - totalSize -= ds.SizeBytes - - // FIXED: Remove from Redis only if available, with error handling - if dm.taskQueue != nil { - redisClient := dm.taskQueue.GetRedisClient() - if err := redisClient.Del(dm.ctx, fmt.Sprintf("ml:dataset:%s", ds.Name)).Err(); err != nil { - logger.Warn("failed to delete dataset from Redis", - "dataset", ds.Name, - "error", err) - } } } } - if len(deleted) > 0 { + // First archive datasets older than maxAge. + now := time.Now() + for _, ds := range valid { + if ds.LastAccess.IsZero() { + continue + } + if now.Sub(ds.LastAccess) > maxAge { + archiveOne(ds, "max_age") + } + } + + // Then archive additional oldest datasets until we're under maxSize. + for totalSize > maxSize { + found := false + for _, ds := range valid { + if _, ok := archivedSet[ds.Name]; ok { + continue + } + archiveOne(ds, "max_size") + found = true + break + } + if !found { + break + } + } + + if len(archived) > 0 { logger.Info("cleanup complete", - "deleted_count", len(deleted), - "deleted_datasets", deleted) + "archived_count", len(archived), + "archived_datasets", archived) } else { - logger.Info("cleanup complete", "deleted_count", 0) + logger.Info("cleanup complete", "archived_count", 0) } return nil @@ -625,17 +705,29 @@ func (dm *DataManager) Close() { if dm.mlServer != nil { if err := dm.mlServer.Close(); err != nil { - dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing ML server connection", "error", err) + dm.logger.Job(dm.ctx, "data_manager", "").Warn( + "error closing ML server connection", + "error", + err, + ) } } if dm.nasServer != nil { if err := dm.nasServer.Close(); err != nil { - dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing NAS server connection", "error", err) + dm.logger.Job(dm.ctx, "data_manager", "").Warn( + "error closing NAS server connection", + "error", + err, + ) } } if dm.taskQueue != nil { if err := dm.taskQueue.Close(); err != nil { - dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing Redis connection", "error", err) + dm.logger.Job(dm.ctx, "data_manager", "").Warn( + "error closing Redis connection", + "error", + err, + ) } } } @@ -650,7 +742,7 @@ func main() { // Get API key from various sources apiKey := auth.GetAPIKeyFromSources(authFlags) - configFile := "configs/config-local.yaml" + configFile := "configs/api/dev.yaml" if authFlags.ConfigFile != "" { configFile = authFlags.ConfigFile } @@ -658,12 +750,12 @@ func main() { // Parse command line args if len(os.Args) < 2 { fmt.Println("Usage:") - fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key ] " + + fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key ] " + "fetch [dataset...]") - fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key ] list") - fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key ] cleanup") - fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key ] validate ") - fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key ] daemon") + fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key ] list") + fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key ] cleanup") + fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key ] validate ") + fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key ] daemon") fmt.Println() auth.PrintAuthHelp() os.Exit(1) diff --git a/cmd/db-utils/init_multi_user.go b/cmd/db-utils/init_multi_user.go index 36e781f..6f0455a 100644 --- a/cmd/db-utils/init_multi_user.go +++ b/cmd/db-utils/init_multi_user.go @@ -78,7 +78,7 @@ func main() { for _, user := range users { insert := ` - INSERT OR REPLACE INTO api_keys (user_id, key_hash, admin, roles, permissions) + INSERT OR REPLACE INTO api_keys (user_id, key_hash, admin, roles, permissions) VALUES (?, ?, ?, ?, ?)` if _, err := db.ExecContext(context.Background(), insert,