diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..e831378 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,35 @@ +.git +.github +.vscode +.windsurf + +# Build outputs +bin/ +dist/ +coverage/ + +# Local data outputs (compose bind mounts) +data/ +logs/ +data_active/ +workspaces/ + +# Local artifacts +.local-artifacts/ + +# Zig build cache/output +cli/.zig-cache/ +cli/zig-out/ +.zig-cache/ + +# Python caches +__pycache__/ +*.pyc + +# OS junk +.DS_Store + +# Not needed for building Go binaries +monitoring/ +docs/ +examples/ diff --git a/.env.example b/.env.example index e438e0e..94ddb87 100644 --- a/.env.example +++ b/.env.example @@ -8,10 +8,6 @@ FETCH_ML_CLI_BASE="/tmp/ml-experiments" FETCH_ML_CLI_PORT="9101" FETCH_ML_CLI_API_KEY="your-api-key-here" -# Redis (if used) -REDIS_URL="redis://localhost:6379" -REDIS_PASSWORD="your-redis-password" - # Optional: TLS (if enabled) TLS_CERT_FILE="" TLS_KEY_FILE="" \ No newline at end of file diff --git a/.gitignore b/.gitignore index c61c8ac..47bf62e 100644 --- a/.gitignore +++ b/.gitignore @@ -212,6 +212,9 @@ coverage.html queue-coverage.out coverage/ +# documents +docs/_site/** + # Redis dump dump.rdb @@ -230,8 +233,12 @@ db/*.db secrets/ cli/src/assets/rsync_release.bin +# Local artifacts (e.g. test run outputs) +.local-artifacts/ + # Test files -test_*.go +# test_*.go +# *_test.go *_test_output/ # Build artifacts @@ -244,6 +251,7 @@ zig-out/ # Experiment data (local testing) experiments/ data/ +workspaces/ # SQLite temporary files db/*.db-shm @@ -257,3 +265,6 @@ db/*.db ssl/ *.pem *.key + +# Windsurf AI +.windsurf/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..7c319c8 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,13 @@ +## [Unreleased] + +- Deployments: production now terminates TLS/WSS at Caddy (reverse proxy) and keeps the API server on internal HTTP/WS. +- Tests: add e2e coverage for `wss://` upgrade through a TLS-terminating reverse proxy. 
+- Worker: verify `dataset_specs[].checksum` when provided and fail tasks on mismatch. +- Worker: verify `snapshot_id` using `snapshot_sha256` and fail closed (supports local `data_dir/snapshots/` and optional S3-backed `snapshot_store`). +- Worker: stage verified `snapshot_id` into each task workspace and expose it to training code via `FETCH_ML_SNAPSHOT_DIR`. +- Worker: provenance enforcement is strict by default (fail-closed), with `provenance_best_effort` as an opt-in relaxation. +- CLI/API: add `ml validate` to fetch a validation report (commit/task) for provenance and integrity checks. +- Worker: best-effort environment prewarm can build a warmed Podman image keyed by `deps_manifest_sha256` and reuse it for subsequent tasks. +- Worker: export env prewarm hit/miss/built counters and total build time via the worker Prometheus metrics endpoint. +- API/Worker: `ml prune` also triggers best-effort garbage collection of warmed env images. +- API: add `/health/ok` (when health checks are enabled) and wrap HTTP handlers with Prometheus HTTP request metrics when Prometheus is enabled.