ci: align workflows, build scripts, and docs with current architecture

This commit is contained in:
Jeremie Fraeys 2026-01-05 12:34:23 -05:00
parent dab680a60d
commit 94112f0af5
21 changed files with 649 additions and 342 deletions

View file

@ -1,12 +1,6 @@
name: Benchmark Metrics
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
schedule:
- cron: '0 6 * * *' # Daily at 6 AM UTC
workflow_dispatch:
jobs:
@ -65,12 +59,14 @@ jobs:
done < clean_benchmarks.txt
- name: Push to Prometheus Pushgateway
env:
PROMETHEUS_PUSHGATEWAY_URL: ${{ secrets['PROMETHEUS_PUSHGATEWAY_URL'] }}
run: |
# Push metrics to Prometheus Pushgateway (if configured)
if [ -n "${{ secrets.PROMETHEUS_PUSHGATEWAY_URL }}" ]; then
if [ -n "$PROMETHEUS_PUSHGATEWAY_URL" ]; then
echo "Pushing metrics to Prometheus..."
curl --data-binary @prometheus_metrics.txt \
"${{ secrets.PROMETHEUS_PUSHGATEWAY_URL }}/metrics/job/benchmark/instance/${{ github.run_id }}"
"$PROMETHEUS_PUSHGATEWAY_URL/metrics/job/benchmark/instance/${{ github.run_id }}"
else
echo "PROMETHEUS_PUSHGATEWAY_URL not configured, skipping push"
fi

View file

@ -1,10 +1,7 @@
name: CI/CD Pipeline
on:
push:
branches: [ main, develop ]
pull_request:
branches: [ main ]
workflow_dispatch:
# Concurrency control to prevent multiple runs of the same workflow
concurrency:
@ -85,18 +82,12 @@ jobs:
- name: Run tests
run: make test
env:
REDIS_URL: redis://localhost:6379
- name: Test internal/queue package
run: go test -v -race -coverprofile=queue-coverage.out ./internal/queue/...
env:
REDIS_URL: redis://localhost:6379
- name: Run comprehensive tests
run: make test-all
env:
REDIS_URL: redis://localhost:6379
- name: Run linters
run: make lint
@ -111,6 +102,19 @@ jobs:
flags: unittests
name: codecov-umbrella
dev-smoke:
name: Dev Compose Smoke Test
runs-on: ubuntu-latest
needs: test
timeout-minutes: 20
steps:
- name: Checkout code
uses: actions/checkout@v5
- name: Run dev smoke test
run: make dev-smoke
build:
name: Build
runs-on: ubuntu-latest

View file

@ -1,8 +1,7 @@
name: Label Pull Request
on:
pull_request:
types: [opened, edited, synchronize]
workflow_dispatch:
jobs:
label:

View file

@ -21,13 +21,7 @@ jobs:
echo "LICENSE file is missing"
exit 1
fi
# Check if it's MIT license
if ! grep -q "MIT License" LICENSE; then
echo "License file should be MIT License"
exit 1
fi
echo "License file OK"
- name: Check Go files for license headers
@ -36,7 +30,7 @@ jobs:
missing_headers=0
for file in $(find . -name "*.go" -not -path "./vendor/*" -not -path "./.git/*"); do
if ! head -10 "$file" | grep -q "Copyright" && ! head -10 "$file" | grep -q "MIT"; then
if ! head -10 "$file" | grep -q "Copyright"; then
echo "Missing license header in: $file"
missing_headers=$((missing_headers + 1))
fi

View file

@ -1,13 +1,12 @@
name: Release
on:
push:
tags:
- 'v*' # Trigger on version tags like v1.0.0
workflow_dispatch:
permissions:
contents: write
packages: write
id-token: write
env:
GO_VERSION: '1.25.0'
@ -113,10 +112,10 @@ jobs:
- name: Package binaries
run: |
cd dist
for binary in api-server worker tui data_manager user_manager; do
for binary in fetch_ml_*; do
if [[ -f "${binary}" ]]; then
tar -czf "fetch_ml_${binary}.tar.gz" "${binary}"
sha256sum "fetch_ml_${binary}.tar.gz" > "fetch_ml_${binary}.tar.gz.sha256"
tar -czf "${binary}.tar.gz" "${binary}"
sha256sum "${binary}.tar.gz" > "${binary}.tar.gz.sha256"
fi
done
@ -155,6 +154,20 @@ jobs:
sha256sum *.tar.gz > checksums.txt
ls -lh
- name: Install cosign
uses: sigstore/cosign-installer@v3
- name: Sign checksums.txt (keyless)
working-directory: release
env:
COSIGN_YES: "true"
run: |
cosign sign-blob \
--output-signature checksums.txt.sig \
--output-certificate checksums.txt.cert \
checksums.txt
ls -lh checksums.txt checksums.txt.sig checksums.txt.cert
- name: Create Release
uses: softprops/action-gh-release@v2
with:
@ -170,15 +183,12 @@ jobs:
- **`ml-macos-arm64.tar.gz`** - macOS Apple Silicon
### Go Backend Binaries
- **`fetch_ml_api-server.tar.gz`** - API Server
- **`fetch_ml_worker.tar.gz`** - Worker
- **`fetch_ml_tui.tar.gz`** - Terminal UI
- **`fetch_ml_data_manager.tar.gz`** - Data Manager
- **`fetch_ml_user_manager.tar.gz`** - User Manager
Included as per-platform tarballs named:
- `fetch_ml_<component>_<os>_<arch>[.exe].tar.gz`
### Installation
```bash
# Download and extract
# Download and extract (example: CLI)
tar -xzf ml-<platform>.tar.gz
# Make executable and move to PATH

View file

@ -4,27 +4,11 @@ run:
timeout: 5m
tests: true
output:
format: colored-line-number
linters-settings:
govet:
enable:
- shadow
- fieldalignment
gocyclo:
min-complexity: 15
dupl:
threshold: 100
goconst:
min-len: 3
min-occurrences: 3
misspell:
locale: US
lll:
line-length: 100
revive:
confidence: 0.8
formats:
text:
path: stdout
linters:
disable-all: true
default: none
enable:
- bodyclose
- dogsled
@ -51,40 +35,84 @@ linters:
- unused
- whitespace
- revive
settings:
nolintlint:
allow-unused: true
govet:
enable:
- shadow
- fieldalignment
gocyclo:
min-complexity: 15
dupl:
threshold: 100
goconst:
min-len: 3
min-occurrences: 3
misspell:
locale: US
ignore-rules:
- cancelled
- cancelling
- behaviour
- colour
- sanitise
- initialise
- optimise
- normalised
lll:
line-length: 100
revive:
confidence: 0.8
exclusions:
rules:
# G306: File permissions - acceptable for test files and scripts
- text: "G306:"
linters:
- gosec
# Exclude linters for test files
- path: ".*_test\\.go"
linters:
- gocyclo
- errcheck
- dupl
- lll
- gosec
- revive
# Exclude errcheck for tests directory
- path: "^tests/"
linters:
- errcheck
# approve insecureSkipVerify in test files
- path: _test\.go
text: "insecureSkipVerify"
linters:
- gosec
# Exclude gosec G204 for tests and tools via source match
- source: "exec\\.CommandContext"
path: "(tests|tools)/"
linters:
- gosec
# Exclude revive for api package naming via source match
- source: "^package api$"
linters:
- revive
# Known legacy staticcheck issue
- path: "^internal/worker/snapshot_store\\.go$"
text: "SA1019"
linters:
- staticcheck
# Known legacy unparam issue
- path: "^internal/resources/manager\\.go$"
text: "gpuSlotsForTask"
linters:
- unparam
# Ignore whitespace-only lint noise in this file
- path: "^internal/api/ws_jobs\\.go$"
linters:
- whitespace
issues:
exclude-rules:
# G306: File permissions - acceptable for test files and scripts
- text: "G306:"
linters:
- gosec
# Exclude linters for test files
- path: ".*_test\\.go"
linters:
- gocyclo
- errcheck
- dupl
- lll
- gosec
- revive
# Exclude errcheck for tests directory
- path: "^tests/"
linters:
- errcheck
# approve insecureSkipVerify in test files
- path: _test\.go
text: "insecureSkipVerify"
linters:
- gosec
# Exclude gosec G204 for tests and tools via source match
- source: "exec\\.CommandContext"
path: "(tests|tools)/"
linters:
- gosec
# Exclude revive for api package naming via source match
- source: "^package api$"
linters:
- revive
max-issues-per-linter: 0
max-same-issues: 0
severity:
default-severity: error
default: error

View file

@ -239,15 +239,16 @@ cp .env.example .env.local
```
Key variables for development:
- `REDIS_URL`: Redis connection string
- `LOG_LEVEL`: Set to `debug` for verbose logging
- `API_PORT`: API server port (default: 9101)
### Configuration Files
- `configs/config-dev.yaml`: Development configuration
- `configs/config-local.yaml`: Local overrides
- `configs/config-prod.yaml`: Production settings
- `configs/api/dev.yaml`: Development (Docker) API server configuration
- `configs/api/homelab-secure.yaml`: Homelab secure API server configuration
- `configs/api/prod.yaml`: Production API server configuration
- `configs/workers/docker.yaml`: Docker worker configuration
- `configs/workers/worker-prod.toml`: Production worker configuration
## IDE Setup

89
LICENSE
View file

@ -1,21 +1,68 @@
MIT License
Copyright (c) 2024 Fetch ML
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
FetchML Source-Available Research & Audit License (SARAL)
Copyright (c) 2026 Fetch ML
This software is source-available for transparency and auditability. It is not
open-source.
1. Definitions
1.1 "Software" means the FetchML source code, binaries, documentation, and any
accompanying files.
1.2 "You" means any individual or entity exercising permissions under this
license.
1.3 "Commercial Use" means use of the Software (or any Derivative Works) in a
way that is primarily intended for or directed toward commercial advantage
or monetary compensation, including use by or for a for-profit entity in
its business operations. Commercial Use does not include personal hobby use
or academic/non-profit research that is not for a commercial purpose.
1.4 "Hosted Service" means making the Software (or substantially similar
functionality) available to third parties as a service, including via SaaS,
managed service, hosted API, or similar offering.
1.5 "Redistribute" means to copy, publish, sublicense, sell, rent, lease,
transfer, or otherwise provide the Software (or any portion) to any third
party, whether in source or binary form.
1.6 "Derivative Works" means any modified version of the Software, or any work
based on the Software.
2. Grant of Transparency (Source Viewing and Audit)
You may view, read, and audit the source code of the Software for purposes of
security review, privacy review, correctness validation, reproducibility of
results, and internal evaluation.
3. Permitted Use
Subject to the restrictions in this license, You may use the Software solely
for:
- personal use; and
- non-commercial research, development, and experimentation on systems that You
own or control (including homelabs and lab servers).
You may modify the Software solely for Your internal Permitted Use.
4. Prohibited Use
You may not:
- use the Software for Commercial Use;
- offer the Software or Derivative Works as a Hosted Service;
- Redistribute the Software or any Derivative Works, in source or binary form;
- use the Software in a way that enables third parties to access or benefit
from the Software as a service;
- remove or alter this license.
5. No Trademark Rights
This license does not grant any rights to use the names, logos, or trademarks
of Fetch ML or FetchML, except as necessary to comply with attribution
requirements in this license.
6. No Warranty
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
7. Limitation of Liability
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
8. Termination
Any violation of this license automatically terminates Your rights under this
license.

230
Makefile
View file

@ -1,88 +1,106 @@
.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install setup validate configlint ci-local docs benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status load-test chaos-test profile-tools detect-regressions tech-excellence docker-build docker-run docker-stop docker-logs monitoring-performance monitoring-performance-stop dashboard-performance self-cleanup auto-cleanup test-full test-auth test-status
.PHONY: all build prod dev clean clean-docs test test-unit test-integration test-e2e test-coverage lint install configlint worker-configlint ci-local docs docs-setup docs-build benchmark benchmark-local artifacts clean-benchmarks clean-all clean-aggressive status size load-test chaos-test profile-load profile-load-norate profile-ws-queue profile-tools detect-regressions tech-excellence docker-build dev-smoke self-cleanup test-full test-auth deploy-up deploy-down deploy-status deploy-clean dev-up dev-down dev-status dev-logs prod-up prod-down prod-status prod-logs
OK = ✓
# Default target
all: build
# Build all components
# Build all components (Go binaries + optimized CLI)
build:
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
go build -o bin/worker cmd/worker/worker_server.go
go build -o bin/tui ./cmd/tui
cd cli && zig build prod
@echo "✓ All components built"
cd cli && zig build --release=small
@echo "${OK} All components built"
# Build production-optimized binaries
prod:
go build -ldflags="-s -w" -o bin/api-server cmd/api-server/main.go
go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
go build -ldflags="-s -w" -o bin/worker cmd/worker/worker_server.go
go build -ldflags="-s -w" -o bin/tui ./cmd/tui
cd cli && zig build prod && strip zig-out/prod/ml
@echo "✓ Production binaries built"
cd cli && zig build --release=small
@echo "${OK} Production binaries built"
cross-platform:
@rm -rf dist
@mkdir -p dist
@set -e; \
LDFLAGS='-s -w -buildid='; \
for target in linux/amd64 linux/arm64 darwin/amd64 darwin/arm64 windows/amd64; do \
goos=$${target%/*}; \
goarch=$${target#*/}; \
ext=''; \
if [ "$$goos" = "windows" ]; then ext='.exe'; fi; \
echo "Building $$goos/$$goarch..."; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_api-server_$${goos}_$${goarch}$${ext} cmd/api-server/main.go; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_worker_$${goos}_$${goarch}$${ext} cmd/worker/worker_server.go; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_tui_$${goos}_$${goarch}$${ext} ./cmd/tui; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_data_manager_$${goos}_$${goarch}$${ext} ./cmd/data_manager; \
CGO_ENABLED=0 GOOS=$$goos GOARCH=$$goarch go build -trimpath -buildvcs=false -ldflags="$$LDFLAGS" -o dist/fetch_ml_user_manager_$${goos}_$${goarch}$${ext} ./cmd/user_manager; \
done
@echo "${OK} Cross-platform binaries built in dist/"
# Development build (faster compilation)
dev:
go build -o bin/api-server cmd/api-server/main.go
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go
go build -o bin/tui ./cmd/tui
cd cli && zig build dev
@echo "✓ Development binaries built"
go build -buildvcs=false -o bin/api-server cmd/api-server/main.go
go build -buildvcs=false -o bin/worker cmd/worker/worker_server.go
go build -buildvcs=false -o bin/tui ./cmd/tui
cd cli && zig build --release=fast
@echo "${OK} Development binaries built"
# Clean build artifacts
# Clean build artifacts (Go + Zig + test outputs)
clean:
rm -rf bin/ coverage/
rm -rf cli/zig-out/
rm -rf cli/.zig-cache/
rm -rf bin/ coverage/ tests/bin/
rm -rf cli/zig-out/ cli/.zig-cache/ .zig-cache/
go clean
@echo "✓ Cleaned"
@echo "${OK} Cleaned"
clean-docs:
rm -rf docs/_site/
@echo "✓ Cleaned docs"
@echo "${OK} Cleaned docs"
# Run tests
test:
go test ./tests/...
cd cli && zig build test
@echo "✓ All tests passed"
@echo "${OK} All tests passed"
# Lint Go and Zig code
lint:
gofmt -w ./cmd ./internal ./tests || true
go vet ./...
cd cli && zig fmt .
@echo "✓ Lint completed"
@echo "${OK} Lint completed"
# Install to system (requires sudo)
install: prod
sudo cp bin/api-server /usr/local/bin/fetchml-api
sudo cp bin/worker /usr/local/bin/fetchml-worker
sudo cp bin/tui /usr/local/bin/fetchml-tui
sudo cp cli/zig-out/prod/ml /usr/local/bin/ml
@echo "✓ Installed"
# Setup production environment
setup:
@if [ "$(shell uname)" = "Linux" ]; then \
sudo ./scripts/setup-prod.sh; \
else \
echo "Production setup is for Linux only. You're on $(shell uname)."; \
echo "Use docker-compose for local development."; \
fi
# Validate production configuration
validate:
./scripts/validate-prod-config.sh configs/config-prod.yaml configs/worker-prod.toml
sudo cp cli/zig-out/bin/ml /usr/local/bin/ml
@echo "${OK} Installed"
# Validate YAML configs against JSON schema
configlint:
go run ./cmd/configlint --schema configs/schema/config_schema.yaml \
configs/config-prod.yaml \
configs/config-no-tls.yaml \
configs/config-dev.yaml
go run ./cmd/configlint --schema configs/schema/api_server_config.yaml \
configs/api/dev.yaml \
configs/api/homelab-secure.yaml \
configs/api/multi-user.yaml \
configs/api/prod.yaml
worker-configlint:
go run ./cmd/configlint --schema configs/schema/worker_config_schema.yaml \
configs/worker-prod.toml
configs/workers/worker-prod.toml \
configs/workers/docker.yaml \
configs/workers/docker-dev.yaml \
configs/workers/docker-prod.yaml \
configs/workers/homelab-secure.yaml
dev-smoke:
bash ./scripts/smoke-test.sh dev
@echo "dev smoke: OK"
prod-smoke:
bash ./scripts/smoke-test.sh prod
@echo "prod smoke: OK"
# Run a local approximation of the CI pipeline
ci-local:
@ -95,36 +113,19 @@ ci-local:
@echo "Running coverage..."
make test-coverage
# Docker targets
# Docker image build (no direct docker-compose run; use deploy-* targets instead)
docker-build:
docker build -f build/docker/simple.Dockerfile -t fetchml:latest .
@echo "✓ Docker image built"
docker-run:
docker-compose up -d
@echo "✓ Services started"
docker-stop:
docker-compose down
@echo "✓ Services stopped"
docker-logs:
docker-compose logs -f
# Monitoring setup (Linux only)
setup-monitoring:
@if [ "$(shell uname)" = "Linux" ]; then \
sudo ./scripts/setup-monitoring-prod.sh; \
else \
echo "Monitoring setup is for Linux production. Use docker-compose for local development."; \
fi
@echo "${OK} Docker image built"
# Enhanced test targets
test-unit:
go test -v -short ./...
go test -v -short ./tests/unit/...
cd cli && zig build test
test-integration:
go test -v ./...
go test -v ./tests/integration/... ./tests
cd cli && zig build test
test-e2e:
go test -v ./tests/e2e/...
@ -132,7 +133,7 @@ test-e2e:
test-coverage:
go test -coverprofile=coverage/coverage.out ./...
go tool cover -html=coverage/coverage.out -o coverage/coverage.html
@echo "✓ Coverage report: coverage/coverage.html"
@echo "${OK} Coverage report: coverage/coverage.html"
# Documentation setup
docs-setup:
@ -170,7 +171,7 @@ benchmark:
# Run benchmarks locally with artifact management
benchmark-local:
@echo "Running benchmarks locally with full workflow..."
./scripts/run-benchmarks-local.sh
./scripts/benchmarks/run-benchmarks-local.sh
# Manage benchmark artifacts
artifacts:
@ -180,41 +181,27 @@ artifacts:
# Clean benchmark artifacts (keep last 10)
clean-benchmarks:
@echo "Cleaning benchmark artifacts..."
./scripts/cleanup-benchmarks.sh benchmarks
./scripts/maintenance/cleanup-benchmarks.sh benchmarks
# Comprehensive cleanup (keep last 5 runs)
clean-all:
@echo "Running comprehensive cleanup..."
./scripts/cleanup-benchmarks.sh all
./scripts/maintenance/cleanup-benchmarks.sh all
# Aggressive cleanup (removes more data)
clean-aggressive:
@echo "Running aggressive cleanup..."
./scripts/cleanup-benchmarks.sh aggressive
./scripts/maintenance/cleanup-benchmarks.sh aggressive
# Show disk usage status
status:
@echo "Checking disk usage..."
./scripts/cleanup-benchmarks.sh status
./scripts/maintenance/cleanup-benchmarks.sh status
# Start performance monitoring stack
monitoring-performance:
@echo "Starting performance monitoring stack..."
cd monitoring && docker-compose -f docker-compose.performance.yml up -d
@echo "Grafana available at: http://localhost:3001 (admin/admin)"
@echo "Loki available at: http://localhost:3100"
@echo "Pushgateway available at: http://localhost:9091"
@echo "Quick start guide: docs/src/performance-quick-start.md"
size:
@echo "Binary sizes:"
@ls -lh bin/* cli/zig-out/bin/ml 2>/dev/null || true
# Stop performance monitoring stack
monitoring-performance-stop:
@echo "Stopping performance monitoring stack..."
cd monitoring && docker-compose -f docker-compose.performance.yml down
# View performance dashboard
dashboard-performance:
@echo "Opening performance dashboard..."
@echo "URL: http://localhost:3001/d/fetchml-performance/fetch-ml-performance-dashboard"
# Load testing
load-test:
@ -225,18 +212,18 @@ load-test:
profile-load:
@echo "CPU profiling MediumLoad HTTP load test..."
go test ./tests/load -run TestLoadProfile_Medium -count=1 -cpuprofile tests/bin/cpu_load.out
@echo " CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
@echo "${OK} CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
profile-load-norate:
@echo "CPU profiling MediumLoad HTTP load test (no rate limiting)..."
go test ./tests/load -run TestLoadProfile_Medium -count=1 -cpuprofile tests/bin/cpu_load.out -v -args -profile-norate
@echo " CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
@echo "${OK} CPU profile written to cpu_load.out (inspect with: go tool pprof tests/bin/cpu_load.out)"
# CPU profiling for WebSocket → Redis queue → worker path
profile-ws-queue:
@echo "CPU profiling WebSocket queue integration test..."
go test ./tests/integration -run WebSocketQueue -count=5 -cpuprofile tests/bin/cpu_ws.out
@echo " CPU profile written to cpu_ws.out (inspect with: go tool pprof tests/bin/cpu_ws.out)"
@echo "${OK} CPU profile written to cpu_ws.out (inspect with: go tool pprof tests/bin/cpu_ws.out)"
# Chaos engineering tests
chaos-test:
@ -282,24 +269,20 @@ help:
@echo ""
@echo "Docker Targets:"
@echo " make docker-build - Build Docker image"
@echo " make docker-run - Start services with docker-compose"
@echo " make docker-stop - Stop docker-compose services"
@echo " make docker-logs - View docker-compose logs"
@echo ""
@echo "Test Targets:"
@echo " make test - Run all tests"
@echo " make test-unit - Run unit tests only"
@echo " make test-integration - Run integration tests"
@echo " make test-e2e - Run end-to-end tests (Podman test is opt-in via FETCH_ML_E2E_PODMAN=1)"
@echo " make test-coverage - Generate coverage report"
@echo " make lint - Run formatters and linters"
@echo " make ci-local - Run local CI dry-run (tests, lint, config validation, coverage)"
@echo " make configlint - Validate YAML configs against schema"
@echo " make worker-configlint - Validate worker configs against schema"
@echo ""
@echo "Setup Targets:"
@echo " make install - Install binaries to /usr/local/bin (requires sudo)"
@echo " make setup - Run production setup (Linux only)"
@echo " make setup-monitoring - Setup monitoring stack (Linux only)"
@echo " make validate - Validate production configuration"
@echo ""
@echo "Performance Testing:"
@echo " make benchmark - Run performance benchmarks"
@ -325,10 +308,8 @@ help:
@echo "Utility:"
@echo " make size - Show binary sizes"
@echo " make self-cleanup - Clean up Docker resources"
@echo " make auto-cleanup - Setup daily auto-cleanup service"
@echo " make test-full - Run complete test suite"
@echo " make test-auth - Test multi-user authentication"
@echo " make test-status - Check cleanup status"
@echo " make help - Show this help"
# Self-cleaning for Docker resources
@ -336,15 +317,10 @@ self-cleanup:
@echo "Running self-cleanup..."
@./scripts/maintenance/cleanup.sh
# Setup auto-cleanup service
auto-cleanup:
@echo "Setting up auto-cleanup service..."
@./scripts/deployment/setup-auto-cleanup.sh
# Run full test suite
test-full:
@echo "Running full test suite..."
@./scripts/testing/run-full-test-suite.sh
@$(MAKE) ci-local
# Quick authentication test
test-auth:
@ -353,7 +329,43 @@ test-auth:
@echo "Testing researcher user..." && cp ~/.ml/config-researcher.toml ~/.ml/config.toml && ./cli/zig-out/bin/ml status
@echo "Testing analyst user..." && cp ~/.ml/config-analyst.toml ~/.ml/config.toml && ./cli/zig-out/bin/ml status
# Test cleanup status
test-status:
@echo "Checking cleanup status..."
@./scripts/maintenance/cleanup-status.sh
# Deployment management (using organized docker-compose files)
deploy-up:
@echo "Starting development environment..."
@./deployments/deploy.sh dev up
deploy-down:
@echo "Stopping development environment..."
@./deployments/deploy.sh dev down
deploy-status:
@echo "Checking deployment status..."
@./deployments/deploy.sh dev status
deploy-clean:
@echo "Cleaning all deployments..."
@cd deployments && make clean
dev-up:
@./deployments/deploy.sh dev up
dev-down:
@./deployments/deploy.sh dev down
dev-status:
@./deployments/deploy.sh dev status
dev-logs:
@./deployments/deploy.sh dev logs
prod-up:
@./deployments/deploy.sh prod up
prod-down:
@./deployments/deploy.sh prod down
prod-status:
@./deployments/deploy.sh prod status
prod-logs:
@./deployments/deploy.sh prod logs

100
README.md
View file

@ -2,6 +2,66 @@
A lightweight ML experiment platform with a tiny Zig CLI and a Go backend. Designed for homelabs and small teams.
## Installation (recommended)
FetchML publishes pre-built release artifacts (CLI + Go services) on GitHub Releases.
If you prefer a one-shot check (recommended for most users), you can use:
```bash
./scripts/verify_release.sh --dir . --repo <org>/<repo>
```
1) Download the right archive for your platform
2) Verify `checksums.txt` signature (recommended)
The release includes a signed `checksums.txt` plus:
- `checksums.txt.sig`
- `checksums.txt.cert`
Verify the signature (keyless Sigstore) using cosign:
```bash
cosign verify-blob \
--certificate checksums.txt.cert \
--signature checksums.txt.sig \
--certificate-identity-regexp "^https://github.com/<org>/<repo>/.github/workflows/release.yml@refs/tags/v.*$" \
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
checksums.txt
```
3) Verify the SHA256 checksum against `checksums.txt`
4) Extract and install
Example (CLI on Linux x86_64):
```bash
# Download
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/ml-linux-x86_64.tar.gz
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/checksums.txt
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/checksums.txt.sig
curl -fsSLO https://github.com/<org>/<repo>/releases/download/<tag>/checksums.txt.cert
# Verify
cosign verify-blob \
--certificate checksums.txt.cert \
--signature checksums.txt.sig \
--certificate-identity-regexp "^https://github.com/<org>/<repo>/.github/workflows/release.yml@refs/tags/v.*$" \
--certificate-oidc-issuer https://token.actions.githubusercontent.com \
checksums.txt
sha256sum -c --ignore-missing checksums.txt
# Install
tar -xzf ml-linux-x86_64.tar.gz
chmod +x ml-linux-x86_64
sudo mv ml-linux-x86_64 /usr/local/bin/ml
ml --help
```
## Quick start
```bash
@ -12,7 +72,7 @@ docker-compose up -d
# Or build the CLI locally
cd cli && make all
./build/ml --help
./zig-out/bin/ml --help
```
## What you get
@ -42,6 +102,17 @@ ml dataset list
ml monitor # SSH to run TUI remotely
```
## Phase 1 (V1) notes
- **Task schema** supports optional `snapshot_id` (opaque identifier) and `dataset_specs` (structured dataset inputs). If `dataset_specs` is present it takes precedence over legacy `datasets` / `--datasets` args.
- **Snapshot restore (S1)** stages verified `snapshot_id` into each task workspace and exposes it via `FETCH_ML_SNAPSHOT_DIR` and `FETCH_ML_SNAPSHOT_ID`. If `snapshot_store.enabled: true` in the worker config, the worker will pull `<prefix>/<snapshot_id>.tar.gz` from an S3-compatible store (e.g. MinIO), verify `snapshot_sha256`, and cache it under `data_dir/snapshots/sha256/<snapshot_sha256>`.
- **Prewarm (best-effort)** can fetch datasets for the next queued task while another task is running. Prewarm state is surfaced in `ml status --json` under the optional `prewarm` field.
- **Env prewarm (best-effort)** can build a warmed Podman image keyed by `deps_manifest_sha256` and reuse it for later tasks.
## Changelog
See `CHANGELOG.md`.
## Build
```bash
@ -66,6 +137,31 @@ See `docs/` for detailed guides:
- `docs/src/quick-start.md` Full setup guide
- `docs/src/deployment.md` Production deployment
## Source code
The FetchML source code is intentionally not hosted on GitHub.
The canonical source repository is available at: `<SOURCE_REPO_URL>`.
## Contributing
Contributions are welcome.
- **Questions / bug reports**: Use GitHub Issues: `<GITHUB_ISSUES_URL>`. Include:
- how to reproduce
- expected vs actual behavior
- logs/config snippets (sanitize secrets)
- OS + versions (Go, Zig, Podman/Docker if relevant)
- **Changes**: Submit a patch in a GitHub issue.
- Create a topic branch.
- Run tests/linters.
- Export your change as either:
- a patch series: `git format-patch -N origin/main`, or
- a single bundle: `git bundle create fetchml.bundle origin/main..HEAD`
- Attach the generated files to a GitHub issue at `<GITHUB_ISSUES_URL>`.
## License
See LICENSE.
FetchML is source-available for transparency and auditability. It is not open-source.
See `LICENSE`.

View file

@ -27,7 +27,7 @@ This will:
- **TLS/SSL**: HTTPS encrypted communication
- **IP Whitelisting**: Restrict access to trusted networks
- **Rate Limiting**: Prevent abuse and DoS attacks
- **Reverse Proxy**: Nginx with security headers
- **Reverse Proxy**: Caddy with security headers
### Data Protection
- **Path Traversal Protection**: Prevents directory escape attacks
@ -37,7 +37,7 @@ This will:
## Configuration Files
### Secure Config Location
- `configs/environments/config-homelab-secure.yaml` - Main secure configuration
- `configs/api/homelab-secure.yaml` - Main secure configuration
### API Keys
- `.api-keys` - Generated API keys (600 permissions)
@ -71,7 +71,7 @@ docker-compose -f docker-compose.yml -f docker-compose.homelab-secure.yml up -d
source .env.secure
# Start server
./api-server -config configs/environments/config-homelab-secure.yaml
./api-server -config configs/api/homelab-secure.yaml
```
## Security Checklist
@ -87,7 +87,7 @@ source .env.secure
### Network Security
- [ ] Use HTTPS only (disable HTTP)
- [ ] Restrict API access to trusted IPs
- [ ] Use reverse proxy (nginx)
- [ ] Use reverse proxy (caddy)
- [ ] Enable security headers
- [ ] Monitor access logs
@ -174,7 +174,7 @@ curl -s https://api.ipify.org
Monitor these files:
- `logs/fetch_ml.log` - Application logs
- `/var/log/nginx/security.log` - Nginx access logs
- Caddy access logs (configure if enabled)
- Docker logs: `docker logs ml-experiments-api`
## Best Practices

View file

@ -53,19 +53,23 @@ WORKDIR /app
COPY --from=go-builder /app/bin/ /usr/local/bin/
COPY --from=zig-builder /app/cli/zig-out/bin/ml /usr/local/bin/
# Copy configs
COPY --from=go-builder /app/configs/ /app/configs/
# Create directories
RUN mkdir -p /data/ml-experiments /home/appuser/.ml && \
chown -R appuser:appgroup /data /home/appuser
RUN mkdir -p /data/experiments /data/datasets /data/snapshots /home/appuser/.ml && \
mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
chown -R appuser:appgroup /data /app /home/appuser
# Switch to app user
USER appuser
# Expose ports
EXPOSE 9100 9101
EXPOSE 9101
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://localhost:9100/health || exit 1
CMD wget --no-verbose --tries=1 --no-check-certificate --spider https://localhost:9101/health || exit 1
# Default command
CMD ["/usr/local/bin/api-server"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -40,12 +40,15 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /app/ssh /tmp/fetchml-jobs && \
mkdir -p /data/active/datasets /data/active/snapshots && \
mkdir -p /logs && \
chown -R appuser:appgroup /app /data /logs
# Generate SSL certificates
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
chmod 644 /app/ssl/cert.pem /app/ssl/key.pem
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Generate SSH keys for container communication
RUN ssh-keygen -t rsa -b 2048 -f /app/ssh/id_rsa -N "" && \
@ -70,4 +73,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -55,8 +55,10 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories with proper permissions
RUN mkdir -p /app/data/experiments /app/logs /app/ssl /tmp/fetchml-jobs && \
chown -R appuser:appgroup /app && \
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \
mkdir -p /data/active/datasets /data/active/snapshots && \
mkdir -p /logs && \
chown -R appuser:appgroup /app /data /logs && \
chmod 750 /app/data/experiments /app/logs
# Generate SSL certificates with stronger crypto
@ -144,4 +146,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -45,13 +45,16 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs /app/ssl /tmp/fetchml-jobs && \
chown -R appuser:appgroup /app
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl /tmp/fetchml-jobs && \
mkdir -p /data/active/datasets /data/active/snapshots && \
mkdir -p /logs && \
chown -R appuser:appgroup /app /data /logs
# Generate SSL certificates
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
chmod 644 /app/ssl/cert.pem /app/ssl/key.pem
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem && \
chown -R appuser:appgroup /app/ssl
# Generate SSH keys for worker user
RUN ssh-keygen -t rsa -b 4096 -f /home/worker/.ssh/id_rsa -N "" && \
@ -99,4 +102,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
# Default command for API server
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/prod.yaml"]

View file

@ -2,7 +2,7 @@
FROM golang:1.25-alpine AS builder
# Install dependencies
RUN apk add --no-cache git make
RUN apk add --no-cache git make gcc musl-dev
# Set working directory
WORKDIR /app
@ -17,13 +17,14 @@ RUN go mod download
COPY . .
# Build Go binaries
RUN go build -o bin/api-server cmd/api-server/main.go
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
CGO_ENABLED=1 go build -o bin/worker ./cmd/worker
# Final stage
FROM alpine:3.19
# Install runtime dependencies
RUN apk add --no-cache ca-certificates redis openssl
RUN apk add --no-cache bash ca-certificates redis openssl curl podman fuse-overlayfs slirp4netns iptables
# Create app user
RUN addgroup -g 1001 -S appgroup && \
@ -37,15 +38,17 @@ COPY --from=builder /app/bin/ /usr/local/bin/
# Copy configs and templates
COPY --from=builder /app/configs/ /app/configs/
COPY --from=builder /app/nginx/ /app/nginx/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs /app/ssl
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl
# Generate SSL certificates for container use
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Homelab/L=Local/O=ML/OU=Experiments/CN=localhost" && \
chmod 644 /app/ssl/cert.pem /app/ssl/key.pem
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Ensure app user can write to data/logs and read TLS material
RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs
# Switch to app user
USER appuser
@ -55,7 +58,7 @@ EXPOSE 9101
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -k -f https://localhost:9101/health || exit 1
CMD curl -f http://localhost:9101/health || curl -k -f https://localhost:9101/health || exit 1
# Default command
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/config.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -2,7 +2,7 @@
FROM golang:1.25-alpine AS builder
# Install dependencies
RUN apk add --no-cache git
RUN apk add --no-cache git gcc musl-dev
# Set working directory
WORKDIR /app
@ -17,7 +17,7 @@ RUN go mod download
COPY . .
# Build only Go binaries (skip Zig)
RUN go build -o bin/api-server cmd/api-server/main.go && \
RUN CGO_ENABLED=1 go build -o bin/api-server cmd/api-server/main.go && \
go build -o bin/worker cmd/worker/worker_server.go cmd/worker/worker_config.go && \
go build -o bin/tui ./cmd/tui
@ -25,7 +25,7 @@ RUN go build -o bin/api-server cmd/api-server/main.go && \
FROM alpine:3.19
# Install runtime dependencies
RUN apk add --no-cache ca-certificates curl
RUN apk add --no-cache ca-certificates curl openssl
# Create app user
RUN addgroup -g 1001 -S appgroup && \
@ -41,7 +41,16 @@ COPY --from=builder /app/bin/ /usr/local/bin/
COPY --from=builder /app/configs/ /app/configs/
# Create necessary directories
RUN mkdir -p /app/data/experiments /app/logs
RUN mkdir -p /app/data/experiments /app/data/datasets /app/data/snapshots /app/logs /app/ssl && \
mkdir -p /data/experiments /data/datasets /data/snapshots
# Generate SSL certificates for container use
RUN openssl req -x509 -newkey rsa:2048 -keyout /app/ssl/key.pem -out /app/ssl/cert.pem -days 365 -nodes \
-subj "/C=US/ST=Test/L=Local/O=FetchML/OU=Tests/CN=localhost" && \
chmod 644 /app/ssl/cert.pem && chmod 600 /app/ssl/key.pem
# Ensure app user can write to data/logs and read TLS material
RUN chown -R appuser:appgroup /app/data /app/logs /app/ssl /app/configs /data
# Switch to app user
USER appuser
@ -50,4 +59,4 @@ USER appuser
EXPOSE 9101
# Default command
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/environments/config-local.yaml"]
CMD ["/usr/local/bin/api-server", "-config", "/app/configs/api/dev.yaml"]

View file

@ -2,6 +2,7 @@
package main
import (
"bytes"
"encoding/json"
"flag"
"fmt"
@ -10,6 +11,7 @@ import (
"path/filepath"
"strings"
"github.com/BurntSushi/toml"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/xeipuuv/gojsonschema"
"gopkg.in/yaml.v3"
@ -68,20 +70,7 @@ func loadSchema(schemaPath string) (gojsonschema.JSONLoader, error) {
return nil, err
}
tmpFile, err := os.CreateTemp("", "fetchml-schema-*.json")
if err != nil {
return nil, err
}
defer func() {
_ = tmpFile.Close()
_ = os.Remove(tmpFile.Name())
}()
if _, err := tmpFile.Write(schemaJSON); err != nil {
return nil, err
}
return gojsonschema.NewReferenceLoader("file://" + filepath.ToSlash(tmpFile.Name())), nil
return gojsonschema.NewBytesLoader(schemaJSON), nil
}
func validateConfig(schemaLoader gojsonschema.JSONLoader, configPath string) error {
@ -90,12 +79,27 @@ func validateConfig(schemaLoader gojsonschema.JSONLoader, configPath string) err
return err
}
var configYAML interface{}
if err := yaml.Unmarshal(data, &configYAML); err != nil {
return fmt.Errorf("failed to parse YAML: %w", err)
ext := strings.ToLower(filepath.Ext(configPath))
var decoded any
switch ext {
case ".toml":
var configTOML map[string]any
if _, err := toml.Decode(string(data), &configTOML); err != nil {
return fmt.Errorf("failed to parse TOML: %w", err)
}
decoded = configTOML
default:
// YAML (default)
var configYAML any
dec := yaml.NewDecoder(bytes.NewReader(data))
dec.KnownFields(false)
if err := dec.Decode(&configYAML); err != nil {
return fmt.Errorf("failed to parse YAML: %w", err)
}
decoded = configYAML
}
configJSON, err := json.Marshal(configYAML)
configJSON, err := json.Marshal(decoded)
if err != nil {
return err
}

View file

@ -40,10 +40,10 @@ type DataConfig struct {
CleanupInterval int `yaml:"cleanup_interval_min"` // Run cleanup every X minutes
// Podman integration
PodmanImage string `yaml:"podman_image"`
ContainerWorkspace string `yaml:"container_workspace"`
ContainerResults string `yaml:"container_results"`
GPUAccess bool `yaml:"gpu_access"`
PodmanImage string `yaml:"podman_image"`
ContainerWorkspace string `yaml:"container_workspace"`
ContainerResults string `yaml:"container_results"`
GPUDevices []string `yaml:"gpu_devices"`
}
// LoadDataConfig loads data manager configuration from a YAML file.

View file

@ -10,6 +10,7 @@ import (
"os"
"os/signal"
"path/filepath"
"sort"
"strings"
"syscall"
"time"
@ -23,6 +24,13 @@ import (
"github.com/jfraeys/fetch_ml/internal/telemetry"
)
// shellQuote wraps s in single quotes for safe interpolation into a
// POSIX shell command line. Embedded single quotes are escaped with the
// standard '"'"' sequence; an empty string yields the literal token ''.
func shellQuote(s string) string {
	if len(s) == 0 {
		return "''"
	}
	var b strings.Builder
	b.WriteByte('\'')
	for _, r := range s {
		if r == '\'' {
			// Close the quote, emit a double-quoted ', reopen the quote.
			b.WriteString(`'"'"'`)
		} else {
			b.WriteRune(r)
		}
	}
	b.WriteByte('\'')
	return b.String()
}
// SSHClient alias for convenience.
type SSHClient = network.SSHClient
@ -37,6 +45,32 @@ type DataManager struct {
logger *logging.Logger
}
// archiveDatasetOnML moves the named dataset into a timestamped
// .archive directory on the ML server (instead of deleting it) and
// returns the remote destination path. The name is validated before
// any remote command is issued, and ml_data_dir must be configured.
func (dm *DataManager) archiveDatasetOnML(datasetName string) (string, error) {
	name := strings.TrimSpace(datasetName)
	if err := container.ValidateJobName(name); err != nil {
		return "", fmt.Errorf("invalid dataset name: %w", err)
	}
	if strings.TrimSpace(dm.config.MLDataDir) == "" {
		return "", fmt.Errorf("missing ml_data_dir")
	}
	// Bucket each run under a UTC timestamp so repeated archives of the
	// same dataset never collide with an earlier one.
	stamp := time.Now().UTC().Format("20060102-150405")
	root := filepath.Join(dm.config.MLDataDir, ".archive", stamp)
	source := filepath.Join(dm.config.MLDataDir, name)
	target := filepath.Join(root, name)
	remoteCmd := "mkdir -p " + shellQuote(root) +
		" && mv " + shellQuote(source) +
		" " + shellQuote(target)
	if _, err := dm.mlServer.Exec(remoteCmd); err != nil {
		return "", err
	}
	return target, nil
}
// DataFetchRequest represents a request to fetch datasets.
type DataFetchRequest struct {
JobName string `json:"job_name"`
@ -141,7 +175,11 @@ func (dm *DataManager) FetchDataset(jobName, datasetName string) error {
})
}
func (dm *DataManager) fetchDatasetInternal(ctx context.Context, jobName, datasetName string) error {
func (dm *DataManager) fetchDatasetInternal(
ctx context.Context,
jobName string,
datasetName string,
) error {
if err := container.ValidateJobName(datasetName); err != nil {
return &errtypes.DataFetchError{
Dataset: datasetName,
@ -225,9 +263,14 @@ func (dm *DataManager) fetchDatasetInternal(ctx context.Context, jobName, datase
ioBefore, ioErr := telemetry.ReadProcessIO()
start := time.Now()
out, err := telemetry.ExecWithMetrics(dm.logger, "dataset transfer", time.Since(start), func() (string, error) {
return dm.nasServer.ExecContext(ctx, rsyncCmd)
})
out, err := telemetry.ExecWithMetrics(
dm.logger,
"dataset transfer",
time.Since(start),
func() (string, error) {
return dm.nasServer.ExecContext(ctx, rsyncCmd)
},
)
duration := time.Since(start)
if err != nil {
@ -413,64 +456,101 @@ func (dm *DataManager) CleanupOldData() error {
"total_size_gb", totalSizeGB,
"dataset_count", len(datasets))
// Delete datasets older than max age or if over size limit
// Archive datasets older than max age or if over size limit
maxAge := time.Duration(dm.config.MaxAgeHours) * time.Hour
maxSize := int64(dm.config.MaxSizeGB) * 1024 * 1024 * 1024
var deleted []string
// Ensure deterministic ordering when needing to reduce size.
sort.Slice(datasets, func(i, j int) bool {
ai := datasets[i].LastAccess
aj := datasets[j].LastAccess
if ai.IsZero() && aj.IsZero() {
return datasets[i].Name < datasets[j].Name
}
if ai.IsZero() {
return true
}
if aj.IsZero() {
return false
}
if ai.Equal(aj) {
return datasets[i].Name < datasets[j].Name
}
return ai.Before(aj)
})
valid := make([]DatasetInfo, 0, len(datasets))
for _, ds := range datasets {
shouldDelete := false
// Check age
if !ds.LastAccess.IsZero() && time.Since(ds.LastAccess) > maxAge {
logger.Info("dataset is old, marking for deletion",
"dataset", ds.Name,
"last_access", ds.LastAccess,
"age_hours", time.Since(ds.LastAccess).Hours())
shouldDelete = true
name := strings.TrimSpace(ds.Name)
if err := container.ValidateJobName(name); err != nil {
logger.Warn("skipping dataset with invalid name", "dataset", ds.Name)
continue
}
ds.Name = name
valid = append(valid, ds)
}
// Check if over size limit
if totalSize > maxSize {
logger.Info("over size limit, deleting oldest dataset",
"dataset", ds.Name,
"current_size_gb", totalSizeGB,
"max_size_gb", dm.config.MaxSizeGB)
shouldDelete = true
archivedSet := make(map[string]struct{}, len(valid))
var archived []string
archiveOne := func(ds DatasetInfo, reason string) {
if _, ok := archivedSet[ds.Name]; ok {
return
}
if shouldDelete {
path := filepath.Join(dm.config.MLDataDir, ds.Name)
logger.Info("deleting dataset", "dataset", ds.Name, "path", path)
if _, err := dm.mlServer.Exec(fmt.Sprintf("rm -rf %s", path)); err != nil {
logger.Error("failed to delete dataset",
path := filepath.Join(dm.config.MLDataDir, ds.Name)
logger.Info("archiving dataset", "dataset", ds.Name, "path", path, "reason", reason)
if _, err := dm.archiveDatasetOnML(ds.Name); err != nil {
logger.Error("failed to archive dataset",
"dataset", ds.Name,
"error", err)
return
}
archivedSet[ds.Name] = struct{}{}
archived = append(archived, ds.Name)
totalSize -= ds.SizeBytes
totalSizeGB = float64(totalSize) / (1024 * 1024 * 1024)
if dm.taskQueue != nil {
redisClient := dm.taskQueue.GetRedisClient()
if err := redisClient.Del(dm.ctx, fmt.Sprintf("ml:dataset:%s", ds.Name)).Err(); err != nil {
logger.Warn("failed to delete dataset from Redis",
"dataset", ds.Name,
"error", err)
continue
}
deleted = append(deleted, ds.Name)
totalSize -= ds.SizeBytes
// FIXED: Remove from Redis only if available, with error handling
if dm.taskQueue != nil {
redisClient := dm.taskQueue.GetRedisClient()
if err := redisClient.Del(dm.ctx, fmt.Sprintf("ml:dataset:%s", ds.Name)).Err(); err != nil {
logger.Warn("failed to delete dataset from Redis",
"dataset", ds.Name,
"error", err)
}
}
}
}
if len(deleted) > 0 {
// First archive datasets older than maxAge.
now := time.Now()
for _, ds := range valid {
if ds.LastAccess.IsZero() {
continue
}
if now.Sub(ds.LastAccess) > maxAge {
archiveOne(ds, "max_age")
}
}
// Then archive additional oldest datasets until we're under maxSize.
for totalSize > maxSize {
found := false
for _, ds := range valid {
if _, ok := archivedSet[ds.Name]; ok {
continue
}
archiveOne(ds, "max_size")
found = true
break
}
if !found {
break
}
}
if len(archived) > 0 {
logger.Info("cleanup complete",
"deleted_count", len(deleted),
"deleted_datasets", deleted)
"archived_count", len(archived),
"archived_datasets", archived)
} else {
logger.Info("cleanup complete", "deleted_count", 0)
logger.Info("cleanup complete", "archived_count", 0)
}
return nil
@ -625,17 +705,29 @@ func (dm *DataManager) Close() {
if dm.mlServer != nil {
if err := dm.mlServer.Close(); err != nil {
dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing ML server connection", "error", err)
dm.logger.Job(dm.ctx, "data_manager", "").Warn(
"error closing ML server connection",
"error",
err,
)
}
}
if dm.nasServer != nil {
if err := dm.nasServer.Close(); err != nil {
dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing NAS server connection", "error", err)
dm.logger.Job(dm.ctx, "data_manager", "").Warn(
"error closing NAS server connection",
"error",
err,
)
}
}
if dm.taskQueue != nil {
if err := dm.taskQueue.Close(); err != nil {
dm.logger.Job(dm.ctx, "data_manager", "").Warn("error closing Redis connection", "error", err)
dm.logger.Job(dm.ctx, "data_manager", "").Warn(
"error closing Redis connection",
"error",
err,
)
}
}
}
@ -650,7 +742,7 @@ func main() {
// Get API key from various sources
apiKey := auth.GetAPIKeyFromSources(authFlags)
configFile := "configs/config-local.yaml"
configFile := "configs/api/dev.yaml"
if authFlags.ConfigFile != "" {
configFile = authFlags.ConfigFile
}
@ -658,12 +750,12 @@ func main() {
// Parse command line args
if len(os.Args) < 2 {
fmt.Println("Usage:")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] " +
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] " +
"fetch <job-name> <dataset> [dataset...]")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] list")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] cleanup")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] validate <dataset>")
fmt.Println(" data_manager [--config configs/config-local.yaml] [--api-key <key>] daemon")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] list")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] cleanup")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] validate <dataset>")
fmt.Println(" data_manager [--config configs/api/dev.yaml] [--api-key <key>] daemon")
fmt.Println()
auth.PrintAuthHelp()
os.Exit(1)

View file

@ -78,7 +78,7 @@ func main() {
for _, user := range users {
insert := `
INSERT OR REPLACE INTO api_keys (user_id, key_hash, admin, roles, permissions)
INSERT OR REPLACE INTO api_keys (user_id, key_hash, admin, roles, permissions)
VALUES (?, ?, ?, ?, ?)`
if _, err := db.ExecContext(context.Background(), insert,