ci: align workflows, build scripts, and docs with current architecture

This commit is contained in:
Jeremie Fraeys 2026-01-05 12:34:23 -05:00
parent dab680a60d
commit 94112f0af5
21 changed files with 649 additions and 342 deletions

View file

@ -1,12 +1,6 @@
name: Benchmark Metrics
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
schedule:
- cron: '0 6 * * *' # Daily at 6 AM UTC
workflow_dispatch:
jobs:
@ -65,12 +59,14 @@ jobs:
done < clean_benchmarks.txt
- name: Push to Prometheus Pushgateway
env:
PROMETHEUS_PUSHGATEWAY_URL: ${{ secrets['PROMETHEUS_PUSHGATEWAY_URL'] }}
run: |
# Push metrics to Prometheus Pushgateway (if configured)
if [ -n "${{ secrets.PROMETHEUS_PUSHGATEWAY_URL }}" ]; then
if [ -n "$PROMETHEUS_PUSHGATEWAY_URL" ]; then
echo "Pushing metrics to Prometheus..."
curl --data-binary @prometheus_metrics.txt \
"${{ secrets.PROMETHEUS_PUSHGATEWAY_URL }}/metrics/job/benchmark/instance/${{ github.run_id }}"
"$PROMETHEUS_PUSHGATEWAY_URL/metrics/job/benchmark/instance/${{ github.run_id }}"
else
echo "PROMETHEUS_PUSHGATEWAY_URL not configured, skipping push"
fi

View file

@ -1,10 +1,7 @@
name: CI/CD Pipeline
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
workflow_dispatch:
# Concurrency control to prevent multiple runs of the same workflow
concurrency:
@ -85,18 +82,12 @@ jobs:
- name: Run tests
run: make test
env:
REDIS_URL: redis://localhost:6379
- name: Test internal/queue package
run: go test -v -race -coverprofile=queue-coverage.out ./internal/queue/...
env:
REDIS_URL: redis://localhost:6379
- name: Run comprehensive tests
run: make test-all
env:
REDIS_URL: redis://localhost:6379
- name: Run linters
run: make lint
@ -111,6 +102,19 @@ jobs:
flags: unittests
name: codecov-umbrella
dev-smoke:
name: Dev Compose Smoke Test
runs-on: ubuntu-latest
needs: test
timeout-minutes: 20
steps:
- name: Checkout code
uses: actions/checkout@v5
- name: Run dev smoke test
run: make dev-smoke
build:
name: Build
runs-on: ubuntu-latest

View file

@ -1,8 +1,7 @@
name: Label Pull Request
on:
pull_request:
types: [opened, edited, synchronize]
workflow_dispatch:
jobs:
label:

View file

@ -21,13 +21,7 @@ jobs:
echo "LICENSE file is missing"
exit 1
fi
# Check if it's MIT license
if ! grep -q "MIT License" LICENSE; then
echo "License file should be MIT License"
exit 1
fi
echo "License file OK"
- name: Check Go files for license headers
@ -36,7 +30,7 @@ jobs:
missing_headers=0
for file in $(find . -name "*.go" -not -path "./vendor/*" -not -path "./.git/*"); do
if ! head -10 "$file" | grep -q "Copyright" && ! head -10 "$file" | grep -q "MIT"; then
if ! head -10 "$file" | grep -q "Copyright"; then
echo "Missing license header in: $file"
missing_headers=$((missing_headers + 1))
fi

View file

@ -1,13 +1,12 @@
name: Release
on:
push:
tags:
- 'v*' # Trigger on version tags like v1.0.0
workflow_dispatch:
permissions:
contents: write
packages: write
id-token: write
env:
GO_VERSION: '1.25.0'
@ -113,10 +112,10 @@ jobs:
- name: Package binaries
run: |
cd dist
for binary in api-server worker tui data_manager user_manager; do
for binary in fetch_ml_*; do
if [[ -f "${binary}" ]]; then
tar -czf "fetch_ml_${binary}.tar.gz" "${binary}"
sha256sum "fetch_ml_${binary}.tar.gz" > "fetch_ml_${binary}.tar.gz.sha256"
tar -czf "${binary}.tar.gz" "${binary}"
sha256sum "${binary}.tar.gz" > "${binary}.tar.gz.sha256"
fi
done
@ -155,6 +154,20 @@ jobs:
sha256sum *.tar.gz > checksums.txt
ls -lh
- name: Install cosign
uses: sigstore/cosign-installer@v3
- name: Sign checksums.txt (keyless)
working-directory: release
env:
COSIGN_YES: "true"
run: |
cosign sign-blob \
--output-signature checksums.txt.sig \
--output-certificate checksums.txt.cert \
checksums.txt
ls -lh checksums.txt checksums.txt.sig checksums.txt.cert
- name: Create Release
uses: softprops/action-gh-release@v2
with:
@ -170,15 +183,12 @@ jobs:
- **`ml-macos-arm64.tar.gz`** - macOS Apple Silicon
### Go Backend Binaries
- **`fetch_ml_api-server.tar.gz`** - API Server
- **`fetch_ml_worker.tar.gz`** - Worker
- **`fetch_ml_tui.tar.gz`** - Terminal UI
- **`fetch_ml_data_manager.tar.gz`** - Data Manager
- **`fetch_ml_user_manager.tar.gz`** - User Manager
Included as per-platform tarballs named:
- `fetch_ml_<component>_<os>_<arch>[.exe].tar.gz`
### Installation
```bash
# Download and extract
# Download and extract (example: CLI)
tar -xzf ml-<platform>.tar.gz
# Make executable and move to PATH

View file

@ -4,27 +4,11 @@ run:
timeout: 5m
tests: true
output:
format: colored-line-number
linters-settings:
govet:
enable:
- shadow
- fieldalignment
gocyclo:
min-complexity: 15
dupl:
threshold: 100
goconst:
min-len: 3
min-occurrences: 3
misspell:
locale: US
lll:
line-length: 100
revive:
confidence: 0.8
formats:
text:
path: stdout
linters:
disable-all: true
default: none
enable:
- bodyclose
- dogsled
@ -51,40 +35,84 @@ linters:
- unused
- whitespace
- revive
settings:
nolintlint:
allow-unused: true
govet:
enable:
- shadow
- fieldalignment
gocyclo:
min-complexity: 15
dupl:
threshold: 100
goconst:
min-len: 3
min-occurrences: 3
misspell:
locale: US
ignore-rules:
- cancelled
- cancelling
- behaviour
- colour
- sanitise
- initialise
- optimise
- normalised
lll:
line-length: 100
revive:
confidence: 0.8
exclusions:
rules:
# G306: File permissions - acceptable for test files and scripts
- text: "G306:"
linters:
- gosec
# Exclude linters for test files
- path: ".*_test\\.go"
linters:
- gocyclo
- errcheck
- dupl
- lll
- gosec
- revive
# Exclude errcheck for tests directory
- path: "^tests/"
linters:
- errcheck
# approve insecureSkipVerify in test files
- path: _test\.go
text: "insecureSkipVerify"
linters:
- gosec
# Exclude gosec G204 for tests and tools via source match
- source: "exec\\.CommandContext"
path: "(tests|tools)/"
linters:
- gosec
# Exclude revive for api package naming via source match
- source: "^package api$"
linters:
- revive
# Known legacy staticcheck issue
- path: "^internal/worker/snapshot_store\\.go$"
text: "SA1019"
linters:
- staticcheck
# Known legacy unparam issue
- path: "^internal/resources/manager\\.go$"
text: "gpuSlotsForTask"
linters:
- unparam
# Ignore whitespace-only lint noise in this file
- path: "^internal/api/ws_jobs\\.go$"
linters:
- whitespace
issues:
exclude-rules:
# G306: File permissions - acceptable for test files and scripts
- text: "G306:"
linters:
- gosec
# Exclude linters for test files
- path: ".*_test\\.go"
linters:
- gocyclo
- errcheck
- dupl
- lll
- gosec
- revive
# Exclude errcheck for tests directory
- path: "^tests/"
linters:
- errcheck
# approve insecureSkipVerify in test files
- path: _test\.go
text: "insecureSkipVerify"
linters:
- gosec
# Exclude gosec G204 for tests and tools via source match
- source: "exec\\.CommandContext"
path: "(tests|tools)/"
linters:
- gosec
# Exclude revive for api package naming via source match
- source: "^package api$"
linters:
- revive
max-issues-per-linter: 0
max-same-issues: 0
severity:
default-severity: error
default: error

View file

@ -239,15 +239,16 @@ cp .env.example .env.local
```
Key variables for development:
- `REDIS_URL`: Redis connection string
- `LOG_LEVEL`: Set to `debug` for verbose logging
- `API_PORT`: API server port (default: 9101)
### Configuration Files
- `configs/config-dev.yaml`: Development configuration
- `configs/config-local.yaml`: Local overrides
- `configs/config-prod.yaml`: Production settings
- `configs/api/dev.yaml`: Development (Docker) API server configuration
- `configs/api/homelab-secure.yaml`: Homelab secure API server configuration
- `configs/api/prod.yaml`: Production API server configuration
- `configs/workers/docker.yaml`: Docker worker configuration
- `configs/workers/worker-prod.toml`: Production worker configuration
## IDE Setup

89
LICENSE
View file

@ -1,21 +1,68 @@
MIT License
Copyright (c) 2024 Fetch ML
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
FetchML Source-Available Research & Audit License (SARAL)
Copyright (c) 2026 Fetch ML
This software is source-available for transparency and auditability. It is not
open-source.
1. Definitions
1.1 "Software" means the FetchML source code, binaries, documentation, and any
accompanying files.
1.2 "You" means any individual or entity exercising permissions under this
license.
1.3 "Commercial Use" means use of the Software (or any Derivative Works) in a
way that is primarily intended for or directed toward commercial advantage
or monetary compensation, including use by or for a for-profit entity in
its business operations. Commercial Use does not include personal hobby use
or academic/non-profit research that is not for a commercial purpose.
1.4 "Hosted Service" means making the Software (or substantially similar
functionality) available to third parties as a service, including via SaaS,
managed service, hosted API, or similar offering.
1.5 "Redistribute" means to copy, publish, sublicense, sell, rent, lease,
transfer, or otherwise provide the Software (or any portion) to any third
party, whether in source or binary form.
1.6 "Derivative Works" means any modified version of the Software, or any work
based on the Software.
2. Grant of Transparency (Source Viewing and Audit)
You may view, read, and audit the source code of the Software for purposes of
security review, privacy review, correctness validation, reproducibility of
results, and internal evaluation.
3. Permitted Use
Subject to the restrictions in this license, You may use the Software solely
for:
- personal use; and
- non-commercial research, development, and experimentation on systems that You
own or control (including homelabs and lab servers).
You may modify the Software solely for Your internal Permitted Use.
4. Prohibited Use
You may not:
- use the Software for Commercial Use;
- offer the Software or Derivative Works as a Hosted Service;
- Redistribute the Software or any Derivative Works, in source or binary form;
- use the Software in a way that enables third parties to access or benefit
from the Software as a service;
- remove or alter this license.
5. No Trademark Rights
This license does not grant any rights to use the names, logos, or trademarks
of Fetch ML or FetchML, except as necessary to comply with attribution
requirements in this license.
6. No Warranty
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
7. Limitation of Liability
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
8. Termination
Any violation of this license automatically terminates Your rights under this
license.

230
Makefile
View file

@ -1,88 +1,106 @@
.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install setup validate configlint ci-local docs benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status load-test chaos-test profile-tools detect-regressions tech-excellence docker-build docker-run docker-stop docker-logs monitoring-performance monitoring-performance-stop dashboard-performance self-cleanup auto-cleanup test-full test-auth test-status
.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install configlint worker-configlint ci-local docs docs-setup docs-build benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status size load-test chaos-test profile-load profile-load-norate profile-ws-queue profile-tools detect-regressions tech-excellence docker-build dev-smoke self-cleanup test-full test-auth deploy-up deploy-down deploy-status deploy-clean dev-up dev-down dev-status dev-logs prod-up prod-down prod-status prod-logs
OK = ✓
# Default target
all: build
# Build all components
# Build all components (Go binaries + optimized CLI)
build:
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
go build -o bin/worker cmd/worker/worker_server.go
go build -o bin/tui ./cmd/tui
cd cli && zig build prod
@echo "✓ All components built"
cd cli && zig build --release=small
@echo "${OK} All components built"
# Build production-optimized binaries
prod:
go build -ldflags="-s -w" -o bin/api-server cmd/api-server/main.go
go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go
go build -ldflags="-s -w" -o bin/tui ./cmd/tui
cd cli && zig build prod && strip zig-out/prod/ml
@echo "✓ Production binaries built"
cd cli && zig build --release=small
@echo "${OK} Production binaries built"
cross-platform:
@rm -rf dist
@mkdir -p dist
@set -e; \
LDFLAGS='-s -w -buildid='; \
for target in linux/amd64 linux/arm64 darwin/amd64 darwin/arm64 windows/amd64; do \
goos=$${target%/*}; \
goarch=$${target#*/}; \
ext=''; \
if [ "$$goos" = "windows" ]; then ext='.exe'; fi; \
echo "Building $$goos/$$goarch..."; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_api-server_$${goos}_$${goarch}$${ext} cmd/api-server/main.go; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_worker_$${goos}_$${goarch}$${ext} cmd/worker/worker_server.go; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_tui_$${goos}_$${goarch}$${ext} ./cmd/tui; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_data_manager_$${goos}_$${goarch}$${ext} ./cmd/data_manager; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_user_manager_$${goos}_$${goarch}$${ext} ./cmd/user_manager; \
done
@echo "${OK} Cross-platform binaries built in dist/"
# Development build (faster compilation)
dev:
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
go build -o bin/tui ./cmd/tui
cd cli && zig build dev
@echo "✓ Development binaries built"
go build -buildvcs=false -o bin/api-server cmd/api-server/main.go
go build -buildvcs=false -o bin/worker cmd/worker/worker_server.go
go build -buildvcs=false -o bin/tui ./cmd/tui
cd cli && zig build --release=fast
@echo "${OK} Development binaries built"
# Clean build artifacts
# Clean build artifacts (Go + Zig + test outputs)
clean:
rm -rf bin/ coverage/
rm -rf cli/zig-out/
rm -rf cli/.zig-cache/
rm -rf bin/ coverage/ tests/bin/
rm -rf cli/zig-out/ cli/.zig-cache/ .zig-cache/
go clean
@echo "✓ Cleaned"
@echo "${OK} Cleaned"
clean-docs:
rm -rf docs/_site/
@echo "✓ Cleaned docs"
@echo "${OK} Cleaned docs"
# Run tests
test:
go test ./tests/...
cd cli && zig build test
@echo "✓ All tests passed"
@echo "${OK} All tests passed"
# Lint Go and Zig code
lint:
gofmt -w ./cmd ./internal ./tests || true
go vet ./...
cd cli && zig fmt .
@echo "✓ Lint completed"
@echo "${OK} Lint completed"
# Install to system (requires sudo)
install: prod
sudo cp bin/api-server /usr/local/bin/fetchml-api
sudo cp bin/worker /usr/local/bin/fetchml-worker
sudo cp bin/tui /usr/local/bin/fetchml-tui
sudo cp cli/zig-out/prod/ml /usr/local/bin/ml
@echo "✓ Installed"
# Setup production environment
setup:
@if [ "$(shell uname)" = "Linux" ]; then \
sudo ./scripts/setup-prod.sh; \
else \
echo "Production setup is for Linux only. You're on $(shell uname)."; \
echo "Use docker-compose for local development."; \
fi
# Validate production configuration
validate:
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml
sudo cp cli/zig-out/bin/ml /usr/local/bin/ml
@echo "${OK} Installed"
# Validate YAML configs against JSON schema
configlint:
go run ./cmd/configlint --schema configs/schema/config_schema.yaml \
configs/config-prod.yaml \
configs/config-no-tls.yaml \
configs/config-dev.yaml
go run ./cmd/configlint --schema configs/schema/api_server_config.yaml \
configs/api/dev.yaml \
configs/api/homelab-secure.yaml \
configs/api/multi-user.yaml \
configs/api/prod.yaml
worker-configlint:
go run ./cmd/configlint --schema configs/schema/worker_config_schema.yaml \
configs/worker-prod.toml
configs/workers/worker-prod.toml \
configs/workers/docker.yaml \
configs/workers/docker-dev.yaml \
configs/workers/docker-prod.yaml \
configs/workers/homelab-secure.yaml
dev-smoke:
bash ./scripts/smoke-test.sh dev
@echo "dev smoke: OK"
prod-smoke:
bash ./scripts/smoke-test.sh prod
@echo "prod smoke: OK"
# Run a local approximation of the CI pipeline
ci-local:
@ -95,36 +113,19 @@ ci-local:
@echo "Running coverage..."
make test-coverage
# Docker targets
# Docker image build (no direct docker-compose run; use deploy-* targets instead)
docker-build:
docker build -f build/docker/simple.Dockerfile -t fetchml:latest .
@echo "✓ Docker image built"
docker-run:
docker-compose up -d
@echo "✓ Services started"
docker-stop:
docker-compose down
@echo "✓ Services stopped"
docker-logs:
docker-compose logs -f
# Monitoring setup (Linux only)
setup-monitoring:
@if [ "$(shell uname)" = "Linux" ]; then \
sudo ./scripts/setup-monitoring-prod.sh; \
else \
echo "Monitoring setup is for Linux production. Use docker-compose for local development."; \
fi
@echo "${OK} Docker image built"
# Enhanced test targets
test-unit:
go test -v -short ./...
go test -v -short ./tests/unit/...
cd cli && zig build test
test-integration:
go test -v ./...
go test -v ./tests/integration/... ./tests
cd cli && zig build test
test-e2e:
go test -v ./tests/e2e/...
@ -132,7 +133,7 @@ test-e2e:
test-coverage:
go test -coverprofile=coverage/coverage.out ./...
go tool cover -html=coverage/coverage.out -o coverage/coverage.html
@echo "✓ Coverage report: coverage/coverage.html"
@echo "${OK} Coverage report: coverage/coverage.html"
# Documentation setup
docs-setup:
@ -170,7 +171,7 @@ benchmark:
# Run benchmarks locally with artifact management
benchmark-local:
@echo "Running benchmarks locally with full workflow..."
./scripts/run-benchmarks-local.sh
./scripts/benchmarks/run-benchmarks-local.sh
# Manage benchmark artifacts
artifacts:
@ -180,41 +181,27 @@ artifacts:
# Clean benchmark artifacts (keep last 10)
clean-benchmarks:
@echo "Cleaning benchmark artifacts..."
./scripts/cleanup-benchmarks.sh benchmarks
./scripts/maintenance/cleanup-benchmarks.sh benchmarks
# Comprehensive cleanup (keep last 5 runs)
clean-all:
@echo "Running comprehensive cleanup..."
./scripts/cleanup-benchmarks.sh all
./scripts/maintenance/cleanup-benchmarks.sh all
# Aggressive cleanup (removes more data)
clean-aggressive:
@echo "Running aggressive cleanup..."
./scripts/cleanup-benchmarks.sh aggressive
./scripts/maintenance/cleanup-benchmarks.sh aggressive
# Show disk usage status
status:
@echo "Checking disk usage..."
./scripts/cleanup-benchmarks.sh status
./scripts/maintenance/cleanup-benchmarks.sh status
# Start performance monitoring stack
monitoring-performance:
@echo "Starting performance monitoring stack..."
cd monitoring && docker-compose -f docker-compose.performance.yml up -d
@echo "Grafana available at: http://localhost:3001 (admin/admin)"
@echo "Loki available at: http://localhost:3100"
@echo "Pushgateway available at: http://localhost:9091"
@echo "Quick start guide: docs/src/performance-quick-start.md"
size:
@echo "Binary sizes:"
@ls -lh bin/* cli/zig-out/bin/ml 2>/dev/null || true
# Stop performance monitoring stack
monitoring-performance-stop:
@echo "Stopping performance monitoring stack..."
cd monitoring && docker-compose -f docker-compose.performance.yml down
# View performance dashboard
dashboard-performance:
@echo "Opening performance dashboard..."
@echo "URL: http://localhost:3001/d/fetchml-performance/fetch-ml-performance-dashboard"
# Load testing
load-test:
@ -225,18 +212,18 @@ load-test:
profile-load:
@echo "CPU profiling MediumLoad HTTP load test..."
go test ./tests/load -run TestLoadProfile_Medium -count=1 -cpuprofile tests/bin/cpu_load.out
@echo " CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
@echo "${OK} CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
profile-load-norate:
@echo "CPU profiling MediumLoad HTTP load test (no rate limiting)..."
go test ./tests/load -run TestLoadProfile_Medium -count=1 -cpuprofile tests/bin/cpu_load.out -v -args -profile-norate
@echo " CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
@echo "${OK} CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
# CPU profiling for WebSocket → Redis queue → worker path
profile-ws-queue:
@echo "CPU profiling WebSocket queue integration test..."
go test ./tests/integration -run WebSocketQueue -count=5 -cpuprofile tests/bin/cpu_ws.out
@echo " CPU profile written to cpu_ws.out (inspect with: go tool pprof tests/bin/cpu_ws.out)"
@echo "${OK} CPU profile written to cpu_ws.out (inspect with: go tool pprof tests/bin/cpu_ws.out)"
# Chaos engineering tests
chaos-test:
@ -282,24 +269,20 @@ help:
@echo ""
@echo "Docker Targets:"
@echo " make docker-build - Build Docker image"
@echo " make docker-run - Start services with docker-compose"
@echo " make docker-stop - Stop docker-compose services"
@echo " make docker-logs - View docker-compose logs"
@echo ""
@echo "Test Targets:"
@echo " make test - Run all tests"
@echo " make test-unit - Run unit tests only"
@echo " make test-integration - Run integration tests"
@echo " make test-e2e - Run end-to-end tests (Podman test is opt-in via FETCH_ML_E2E_PODMAN=1)"
@echo " make test-coverage - Generate coverage report"
@echo " make lint - Run formatters and linters"
@echo " make ci-local - Run local CI dry-run (tests, lint, config validation, coverage)"
@echo " make configlint - Validate YAML configs against schema"
@echo " make worker-configlint - Validate worker configs against schema"
@echo ""
@echo "Setup Targets:"
@echo " make install - Install binaries to /usr/local/bin (requires sudo)"
@echo " make setup - Run production setup (Linux only)"
@echo " make setup-monitoring - Setup monitoring stack (Linux only)"
@echo " make validate - Validate production configuration"
@echo ""
@echo "Performance Testing:"
@echo " make benchmark - Run performance benchmarks"
@ -325,10 +308,8 @@ help:
@echo "Utility:"
@echo " make size - Show binary sizes"
@echo " make self-cleanup - Clean up Docker resources"
@echo " make auto-cleanup - Setup daily auto-cleanup service"
@echo " make test-full - Run complete test suite"
@echo " make test-auth - Test multi-user authentication"
@echo " make test-status - Check cleanup status"
@echo " make help - Show this help"
# Self-cleaning for Docker resources
@ -336,15 +317,10 @@ self-cleanup:
@echo "Running self-cleanup..."
@./scripts/maintenance/cleanup.sh
# Setup auto-cleanup service
auto-cleanup:
@echo "Setting up auto-cleanup service..."
@./scripts/deployment/setup-auto-cleanup.sh
# Run full test suite
test-full:
@echo "Running full test suite..."
@./scripts/testing/run-full-test-suite.sh
@$(MAKE) ci-local
# Quick authentication test
test-auth:
@ -353,7 +329,43 @@ test-auth:
@echo "Testing researcher user..." && cp ~/.ml/config-researcher.toml ~/.ml/config.toml && ./cli/zig-out/bin/ml status
@echo "Testing analyst user..." && cp ~/.ml/config-analyst.toml ~/.ml/config.toml && ./cli/zig-out/bin/ml status
# Test cleanup status
test-status:
@echo "Checking cleanup status..."
@./scripts/maintenance/cleanup-status.sh
# Deployment management (using organized docker-compose files)
deploy-up:
@echo "Starting development environment..."
@./deployments/deploy.sh dev up
deploy-down:
@echo "Stopping development environment..."
@./deployments/deploy.sh dev down
deploy-status:
@echo "Checking deployment status..."
@./deployments/deploy.sh dev status
deploy-clean:
@echo "Cleaning all deployments..."
@cd deployments && make clean
dev-up:
@./deployments/deploy.sh dev up
dev-down:
@./deployments/deploy.sh dev down
dev-status:
@./deployments/deploy.sh dev status
dev-logs:
@./deployments/deploy.sh dev logs
prod-up:
@./deployments/deploy.sh prod up
prod-down:
@./deployments/deploy.sh prod down
prod-status:
@./deployments/deploy.sh prod status
prod-logs:
@./deployments/deploy.sh prod logs

100
README.md
View file

@ -2,6 +2,66 @@
A lightweight ML experiment platform with a tiny Zig CLI and a Go backend. Designed for homelabs and small teams.
## Installation (recommended)
FetchML publishes pre-built release artifacts (CLI + Go services) on GitHub Releases.
If you prefer a one-shot check (recommended for most users), you can use:
```bash
./scripts/verify_release.sh --dir . --repo <org>/<repo>
```
1) Download the right archive for your platform
2) Verify `checksums.txt` signature (recommended)
The release includes a signed `checksums.txt` plus:
- `checksums.txt.sig`
- `checksums.txt.cert`
Verify the signature (keyless Sigstore) using cosign:
```bash
cosign verify-blob \
--certificate checksums.txt.cert \
--signature checksums.txt.sig \
--certificate-identity-regexp "^https://github.com/<org>/<repo>/.github/workflows/release.yml@refs/tags/v.*$" \
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
checksums.txt
```
3) Verify the SHA256 checksum against `checksums.txt`
4) Extract and install
Example (CLI on Linux x86_64):
```bash
# Download
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/ml-linux-x86_64.tar.gz
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/checksums.txt
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/checksums.txt.sig
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/checksums.txt.cert
# Verify
cosign verify-blob \
--certificate checksums.txt.cert \
--signature checksums.txt.sig \
--certificate-identity-regexp "^https://github.com/<org>/<repo>/.github/workflows/release.yml@refs/tags/v.*$" \
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
checksums.txt
sha256sum -c --ignore-missing checksums.txt
# Install
tar -xzf ml-linux-x86_64.tar.gz
chmod +x ml-linux-x86_64
sudo mv ml-linux-x86_64 /usr/local/bin/ml
ml --help
```
## Quick start
```bash
@ -12,7 +72,7 @@ docker-compose up -d
# Or build the CLI locally
cd cli && make all
./build/ml --help
./zig-out/bin/ml --help
```
## What you get
@ -42,6 +102,17 @@ ml dataset list
ml monitor # SSH to run TUI remotely
```
## Phase 1 (V1) notes
- **Task schema** supports optional `snapshot_id` (opaque identifier) and `dataset_specs` (structured dataset inputs). If `dataset_specs` is present it takes precedence over legacy `datasets` / `--datasets` args.
- **Snapshot restore (S1)** stages verified `snapshot_id` into each task workspace and exposes it via `FETCH_ML_SNAPSHOT_DIR` and `FETCH_ML_SNAPSHOT_ID`. If `snapshot_store.enabled: true` in the worker config, the worker will pull `<prefix>/<snapshot_id>.tar.gz` from an S3-compatible store (e.g. MinIO), verify `snapshot_sha256`, and cache it under `data_dir/snapshots/sha256/<snapshot_sha256>`.
- **Prewarm (best-effort)** can fetch datasets for the next queued task while another task is running. Prewarm state is surfaced in `ml status --json` under the optional `prewarm` field.
- **Env prewarm (best-effort)** can build a warmed Podman image keyed by `deps_manifest_sha256` and reuse it for later tasks.
## Changelog
See `CHANGELOG.md`.
## Build
```bash
@ -66,6 +137,31 @@ See `docs/` for detailed guides:
- `docs/src/quick-start.md` Full setup guide
- `docs/src/deployment.md` Production deployment
## Source code
The FetchML source code is intentionally not hosted on GitHub.
The canonical source repository is available at: `<SOURCE_REPO_URL>`.
## Contributing
Contributions are welcome.
- **Questions / bug reports**: Use GitHub Issues: `<GITHUB_ISSUES_URL>`. Include:
- how to reproduce
- expected vs actual behavior
- logs/config snippets (sanitize secrets)
- OS + versions (Go, Zig, Podman/Docker if relevant)
- **Changes**: Submit a patch in a GitHub issue.
- Create a topic branch.
- Run tests/linters.
- Export your change as either:
- a patch series: `git format-patch -N origin/main`, or
- a single bundle: `git bundle create fetchml.bundle origin/main..HEAD`
- Attach the generated files to a GitHub issue at `<GITHUB_ISSUES_URL>`.
## License
See LICENSE.
FetchML is source-available for transparency and auditability. It is not open-source.
See `LICENSE`.

View file

@ -27,7 +27,7 @@ This will:
- **TLS/SSL**: HTTPS encrypted communication
- **IP Whitelisting**: Restrict access to trusted networks
- **Rate Limiting**: Prevent abuse and DoS attacks
- **Reverse Proxy**: Nginx with security headers
- **Reverse Proxy**: Caddy with security headers
### Data Protection
- **Path Traversal Protection**: Prevents directory escape attacks
@ -37,7 +37,7 @@ This will:
## Configuration Files
### Secure Config Location
- `configs/environments/config-homelab-secure.yaml` - Main secure configuration
- `configs/api/homelab-secure.yaml` - Main secure configuration
### API Keys
- `.api-keys` - Generated API keys (600 permissions)
@ -71,7 +71,7 @@ docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
source .env.secure
# Start server
./api-server -config configs/environments/config-homelab-secure.yaml
./api-server -config configs/api/homelab-secure.yaml
```
## Security Checklist
@ -87,7 +87,7 @@ source .env.secure
### Network Security
- [ ] Use HTTPS only (disable HTTP)
- [ ] Restrict API access to trusted IPs
- [ ] Use reverse proxy (nginx)
- [ ] Use reverse proxy (caddy)
- [ ] Enable security headers
- [ ] Monitor access logs
@ -174,7 +174,7 @@ curl -s https://api.ipify.org
Monitor these files:
- `logs/fetch_ml.log` - Application logs
- `/var/log/nginx/security.log` - Nginx access logs
- Caddy access logs (configure if enabled)
- Docker logs: `docker logs ml-experiments-api`
## Best Practices

View file

@ -53,19 +53,23 @@ WORKDIR /app
COPY --from=go-builder /app/bin/ /usr/local/bin/
COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/
# Copy configs
COPY --from=go-builder /app/configs/ /app/configs/
# Create directories
RUN mkdir -p /data/ml-experiments /home/appuser/.ml && \
chown -R appuser:appgroup /data /home/appuser
RUN mkdir -p /data/experiments /data/datasets /data/snapshots /home/appuser/.ml && \
mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
chown -R appuser:appgroup /data /app /home/appuser
# Switch to app user
USER appuser
# Expose ports
EXPOSE 9100 9101
EXPOSE 9101
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:9100/health || exit 1
CMD wget --no-verbose --tries=1 --no-check-certificate --spider https://localhost:9101/health || exit 1
# Default command
CMD ["/usr/local/bin/api-server"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -40,12 +40,15 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs && \
mkdir -p /data/active/datasets /data/active/snapshots && \
mkdir -p /logs && \
chown -R appuser:appgroup /app /data /logs
# Generate SSL certificates
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
chmod 644 /app/ssl/cert.pem /app/ssl/key.pem
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Generate SSH keys for container communication
RUN ssh-keygen -t rsa -b 2048 -f /app/ssh/id_rsa -N "" && \
@ -70,4 +73,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -55,8 +55,10 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories with proper permissions
RUN mkdir -p /app/data/experiments /app/logs /app/ssl /tmp/fetchml-jobs && \
chown -R appuser:appgroup /app && \
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \
mkdir -p /data/active/datasets /data/active/snapshots && \
mkdir -p /logs && \
chown -R appuser:appgroup /app /data /logs && \
chmod 750 /app/data/experiments /app/logs
# Generate SSL certificates with stronger crypto
@ -144,4 +146,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -45,13 +45,16 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs /app/ssl /tmp/fetchml-jobs && \
chown -R appuser:appgroup /app
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \
mkdir -p /data/active/datasets /data/active/snapshots && \
mkdir -p /logs && \
chown -R appuser:appgroup /app /data /logs
# Generate SSL certificates
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
chmod 644 /app/ssl/cert.pem /app/ssl/key.pem
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem && \
chown -R appuser:appgroup /app/ssl
# Generate SSH keys for worker user
RUN ssh-keygen -t rsa -b 4096 -f /home/worker/.ssh/id_rsa -N "" && \
@ -99,4 +102,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -2,7 +2,7 @@
FROM golang:1.25-alpine AS builder
# Install dependencies
RUN apk add --no-cache git make
RUN apk add --no-cache git make gcc musl-dev
# Set working directory
WORKDIR /app
@ -17,13 +17,14 @@ RUN go mod download
COPY . .
# Build Go binaries
RUN go build -o bin/api-server cmd/api-server/main.go
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
CGO_ENABLED=1 go build -o bin/worker ./cmd/worker
# Final stage
FROM alpine:3.19
# Install runtime dependencies
RUN apk add --no-cache ca-certificates redis openssl
RUN apk add --no-cache bash ca-certificates redis openssl curl podman fuse-overlayfs slirp4netns iptables
# Create app user
RUN addgroup -g 1001 -S appgroup && \
@ -37,15 +38,17 @@ COPY --from=builder /app/bin/ /usr/local/bin/
# Copy configs and templates
COPY --from=builder /app/configs/ /app/configs/
COPY --from=builder /app/nginx/ /app/nginx/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs /app/ssl
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl
# Generate SSL certificates for container use
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
chmod 644 /app/ssl/cert.pem /app/ssl/key.pem
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Ensure app user can write to data/logs and read TLS material
RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs
# Switch to app user
USER appuser
@ -55,7 +58,7 @@ EXPOSE 9101
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
CMD curl -f http://localhost:9101/health || curl -k -f https://localhost:9101/health || exit 1
# Default command
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -2,7 +2,7 @@
FROM golang:1.25-alpine AS builder
# Install dependencies
RUN apk add --no-cache git
RUN apk add --no-cache git gcc musl-dev
# Set working directory
WORKDIR /app
@ -17,7 +17,7 @@ RUN go mod download
COPY . .
# Build only Go binaries (skip Zig)
RUN go build -o bin/api-server cmd/api-server/main.go && \
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go && \
go build -o bin/tui ./cmd/tui
@ -25,7 +25,7 @@ RUN go build -o bin/api-server cmd/api-server/main.go && \
FROM alpine:3.19
# Install runtime dependencies
RUN apk add --no-cache ca-certificates curl
RUN apk add --no-cache ca-certificates curl openssl
# Create app user
RUN addgroup -g 1001 -S appgroup && \
@ -41,7 +41,16 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
mkdir -p /data/experiments /data/datasets /data/snapshots
# Generate SSL certificates for container use
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Test/L=Local/O=FetchML/OU=Tests/CN=localhost" && \
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Ensure app user can write to data/logs and read TLS material
RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs /data
# Switch to app user
USER appuser
@ -50,4 +59,4 @@ USER appuser
EXPOSE 9101
# Default command
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/environments/config-local.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -2,6 +2,7 @@
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
@ -10,6 +11,7 @@ import (
"path/filepath"
"strings"
"github.com/BurntSushi/toml"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/xeipuuv/gojsonschema"
"gopkg.in/yaml.v3"
@ -68,20 +70,7 @@ func loadSchema(schemaPath string) (gojsonschema.JSONLoader, error) {
return nil, err
}
tmpFile, err := os.CreateTemp("", "fetchml-schema-*.json")
if err != nil {
return nil, err
}
defer func() {
_ = tmpFile.Close()
_ = os.Remove(tmpFile.Name())
}()
if _, err := tmpFile.Write(schemaJSON); err != nil {
return nil, err
}
return gojsonschema.NewReferenceLoader("file://" + filepath.ToSlash(tmpFile.Name())), nil
return gojsonschema.NewBytesLoader(schemaJSON), nil
}
func validateConfig(schemaLoader gojsonschema.JSONLoader, configPath string) error {
@ -90,12 +79,27 @@ func validateConfig(schemaLoader gojsonschema.JSONLoader, configPath string) err
return err
}
var configYAML interface{}
if err := yaml.Unmarshal(data, &configYAML); err != nil {
return fmt.Errorf("failed to parse YAML: %w", err)
ext := strings.ToLower(filepath.Ext(configPath))
var decoded any
switch ext {
case ".toml":
var configTOML map[string]any
if _, err := toml.Decode(string(data), &configTOML); err != nil {
return fmt.Errorf("failed to parse TOML: %w", err)
}
decoded = configTOML
default:
// YAML (default)
var configYAML any
dec := yaml.NewDecoder(bytes.NewReader(data))
dec.KnownFields(false)
if err := dec.Decode(&configYAML); err != nil {
return fmt.Errorf("failed to parse YAML: %w", err)
}
decoded = configYAML
}
configJSON, err := json.Marshal(configYAML)
configJSON, err := json.Marshal(decoded)
if err != nil {
return err
}

View file

@ -40,10 +40,10 @@ type DataConfig struct {
CleanupInterval int `yaml:"cleanup_interval_min"` // Run cleanup every X minutes
// Podman integration
PodmanImage string `yaml:"podman_image"`
ContainerWorkspace string `yaml:"container_workspace"`
ContainerResults string `yaml:"container_results"`
GPUAccess bool `yaml:"gpu_access"`
PodmanImage string `yaml:"podman_image"`
ContainerWorkspace string `yaml:"container_workspace"`
ContainerResults string `yaml:"container_results"`
GPUDevices []string `yaml:"gpu_devices"`
}
// LoadDataConfig loads data manager configuration from a YAML file.

View file

@ -10,6 +10,7 @@ import (
"os"
"os/signal"
"path/filepath"
"sort"
"strings"
"syscall"
"time"
@ -23,6 +24,13 @@ import (
"github.com/jfraeys/fetch_ml/internal/telemetry"
)
// shellQuote wraps s in single quotes for safe interpolation into a
// POSIX shell command line. Embedded single quotes are escaped with the
// standard '"'"' sequence; an empty string yields the literal token ''.
func shellQuote(s string) string {
	if len(s) == 0 {
		return "''"
	}
	var b strings.Builder
	b.WriteByte('\'')
	for _, r := range s {
		if r == '\'' {
			// Close the quote, emit a double-quoted ', reopen the quote.
			b.WriteString(`'"'"'`)
		} else {
			b.WriteRune(r)
		}
	}
	b.WriteByte('\'')
	return b.String()
}
// SSHClient alias for convenience.
type SSHClient = network.SSHClient
@ -37,6 +45,32 @@ type DataManager struct {
logger *logging.Logger
}
// archiveDatasetOnML moves the named dataset into a timestamped
// .archive directory on the ML server (instead of deleting it) and
// returns the remote destination path. The name is validated before
// any remote command is issued, and ml_data_dir must be configured.
func (dm *DataManager) archiveDatasetOnML(datasetName string) (string, error) {
	name := strings.TrimSpace(datasetName)
	if err := container.ValidateJobName(name); err != nil {
		return "", fmt.Errorf("invalid dataset name: %w", err)
	}
	if strings.TrimSpace(dm.config.MLDataDir) == "" {
		return "", fmt.Errorf("missing ml_data_dir")
	}
	// Bucket each run under a UTC timestamp so repeated archives of the
	// same dataset never collide with an earlier one.
	stamp := time.Now().UTC().Format("20060102-150405")
	root := filepath.Join(dm.config.MLDataDir, ".archive", stamp)
	source := filepath.Join(dm.config.MLDataDir, name)
	target := filepath.Join(root, name)
	remoteCmd := "mkdir -p " + shellQuote(root) +
		" && mv " + shellQuote(source) +
		" " + shellQuote(target)
	if _, err := dm.mlServer.Exec(remoteCmd); err != nil {
		return "", err
	}
	return target, nil
}
// DataFetchRequest represents a request to fetch datasets.
type DataFetchRequest struct {
JobName string `json:"job_name"`
@ -141,7 +175,11 @@ func (dm *DataManager) FetchDataset(jobName, datasetName string) error {
})
}
func (dm *DataManager) fetchDatasetInternal(ctx context.Context, jobName, datasetName string) error {
func (dm *DataManager) fetchDatasetInternal(
ctx context.Context,
jobName string,
datasetName string,
) error {
if err := container.ValidateJobName(datasetName); err != nil {
return &errtypes.DataFetchError{
Dataset: datasetName,
@ -225,9 +263,14 @@ func (dm *DataManager) fetchDatasetInternal(ctx context.Context, jobName, datase
ioBefore, ioErr := telemetry.ReadProcessIO()
start := time.Now()
out, err := telemetry.ExecWithMetrics(dm.logger, "dataset transfer", time.Since(start), func() (string, error) {
return dm.nasServer.ExecContext(ctx, rsyncCmd)
})
out, err := telemetry.ExecWithMetrics(
dm.logger,
"dataset transfer",
time.Since(start),
func() (string, error) {
return dm.nasServer.ExecContext(ctx, rsyncCmd)
},
)
duration := time.Since(start)
if err != nil {
@ -413,64 +456,101 @@ func (dm *DataManager) CleanupOldData() error {
"total_size_gb", totalSizeGB,
"dataset_count", len(datasets))
// Delete datasets older than max age or if over size limit
// Archive datasets older than max age or if over size limit
maxAge := time.Duration(dm.config.MaxAgeHours) * time.Hour
maxSize := int64(dm.config.MaxSizeGB) * 1024 * 1024 * 1024
var deleted []string
// Ensure deterministic ordering when needing to reduce size.
sort.Slice(datasets, func(i, j int) bool {
ai := datasets[i].LastAccess
aj := datasets[j].LastAccess
if ai.IsZero() && aj.IsZero() {
return datasets[i].Name < datasets[j].Name
}
if ai.IsZero() {
return true
}
if aj.IsZero() {
return false
}
if ai.Equal(aj) {
return datasets[i].Name < datasets[j].Name
}
return ai.Before(aj)
})
valid := make([]DatasetInfo, 0, len(datasets))
for _, ds := range datasets {
shouldDelete := false
// Check age
if !ds.LastAccess.IsZero() && time.Since(ds.LastAccess) > maxAge {
logger.Info("dataset is old, marking for deletion",
"dataset", ds.Name,
"last_access", ds.LastAccess,
"age_hours", time.Since(ds.LastAccess).Hours())
shouldDelete = true
name := strings.TrimSpace(ds.Name)
if err := container.ValidateJobName(name); err != nil {
logger.Warn("skipping dataset with invalid name", "dataset", ds.Name)
continue
}
ds.Name = name
valid = append(valid, ds)
}
// Check if over size limit
if totalSize > maxSize {
logger.Info("over size limit, deleting oldest dataset",
"dataset", ds.Name,
"current_size_gb", totalSizeGB,
"max_size_gb", dm.config.MaxSizeGB)
shouldDelete = true
archivedSet := make(map[string]struct{}, len(valid))
var archived []string
archiveOne := func(ds DatasetInfo, reason string) {
if _, ok := archivedSet[ds.Name]; ok {
return
}
if shouldDelete {
path := filepath.Join(dm.config.MLDataDir, ds.Name)
logger.Info("deleting dataset", "dataset", ds.Name, "path", path)
if _, err := dm.mlServer.Exec(fmt.Sprintf("rm -rf %s", path)); err != nil {
logger.Error("failed to delete dataset",
path := filepath.Join(dm.config.MLDataDir, ds.Name)
logger.Info("archiving dataset", "dataset", ds.Name, "path", path, "reason", reason)
if _, err := dm.archiveDatasetOnML(ds.Name); err != nil {
logger.Error("failed to archive dataset",
"dataset", ds.Name,
"error", err)
return
}
archivedSet[ds.Name] = struct{}{}
archived = append(archived, ds.Name)
totalSize -= ds.SizeBytes
totalSizeGB = float64(totalSize) / (1024 * 1024 * 1024)
if dm.taskQueue != nil {
redisClient := dm.taskQueue.GetRedisClient()
if err := redisClient.Del(dm.ctx, fmt.Sprintf("ml:dataset:%s", ds.Name)).Err(); err != nil {
logger.Warn("failed to delete dataset from Redis",
"dataset", ds.Name,
"error", err)
continue
}
deleted = append(deleted, ds.Name)
totalSize -= ds.SizeBytes
// FIXED: Remove from Redis only if available, with error handling
if dm.taskQueue != nil {
redisClient := dm.taskQueue.GetRedisClient()
if err := redisClient.Del(dm.ctx, fmt.Sprintf("ml:dataset:%s", ds.Name)).Err(); err != nil {
logger.Warn("failed to delete dataset from Redis",
"dataset", ds.Name,
"error", err)
}
}
}
}
if len(deleted) > 0 {
// First archive datasets older than maxAge.
now := time.Now()
for _, ds := range valid {
if ds.LastAccess.IsZero() {
continue
}
if now.Sub(ds.LastAccess) > maxAge {
archiveOne(ds, "max_age")
}
}
// Then archive additional oldest datasets until we're under maxSize.
for totalSize > maxSize {
found := false
for _, ds := range valid {
if _, ok := archivedSet[ds.Name]; ok {
continue
}
archiveOne(ds, "max_size")
found = true
break
}
if !found {
break
}
}
if len(archived) > 0 {
logger.Info("cleanup complete",
"deleted_count", len(deleted),
"deleted_datasets", deleted)
"archived_count", len(archived),
"archived_datasets", archived)
} else {
logger.Info("cleanup complete", "deleted_count", 0)
logger.Info("cleanup complete", "archived_count", 0)
}
return nil
@ -625,17 +705,29 @@ func (dm *DataManager) Close() {
if dm.mlServer != nil {
if err := dm.mlServer.Close(); err != nil {
dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing ML server connection", "error", err)
dm.logger.Job(dm.ctx, "data_manager", "").Warn(
"error closing ML server connection",
"error",
err,
)
}
}
if dm.nasServer != nil {
if err := dm.nasServer.Close(); err != nil {
dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing NAS server connection", "error", err)
dm.logger.Job(dm.ctx, "data_manager", "").Warn(
"error closing NAS server connection",
"error",
err,
)
}
}
if dm.taskQueue != nil {
if err := dm.taskQueue.Close(); err != nil {
dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing Redis connection", "error", err)
dm.logger.Job(dm.ctx, "data_manager", "").Warn(
"error closing Redis connection",
"error",
err,
)
}
}
}
@ -650,7 +742,7 @@ func main() {
// Get API key from various sources
apiKey := auth.GetAPIKeyFromSources(authFlags)
configFile := "configs/config-local.yaml"
configFile := "configs/api/dev.yaml"
if authFlags.ConfigFile != "" {
configFile = authFlags.ConfigFile
}
@ -658,12 +750,12 @@ func main() {
// Parse command line args
if len(os.Args) < 2 {
fmt.Println("Usage:")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] " +
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] " +
"fetch <job-name> <dataset> [dataset...]")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] list")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] cleanup")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] validate <dataset>")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] daemon")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] list")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] cleanup")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] validate <dataset>")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] daemon")
fmt.Println()
auth.PrintAuthHelp()
os.Exit(1)

View file

@ -78,7 +78,7 @@ func main() {
for _, user := range users {
insert := `
INSERT OR REPLACE INTO api_keys (user_id, key_hash, admin, roles, permissions)
INSERT OR REPLACE INTO api_keys (user_id, key_hash, admin, roles, permissions)
VALUES (?, ?, ?, ?, ?)`
if _, err := db.ExecContext(context.Background(), insert,