refactor: complete maintainability phases 1-9 and fix all tests

Test fixes (all 41 test packages now pass): - Fix ComputeTaskProvenance - add dataset_specs JSON output - Fix EnforceTaskProvenance - populate all metadata fields in best-effort mode - Fix PrewarmNextOnce - preserve prewarm state when queue empty - Fix RunManifest directory creation in SetupJobDirectories - Add ManifestWriter to test worker (simpleManifestWriter) - Fix worker ID mismatch (use cfg.WorkerID) - Fix WebSocket binary protocol responses - Implement all WebSocket handlers: QueueJob, QueueJobWithSnapshot, StatusRequest, CancelJob, Prune, ValidateRequest (with run manifest validation), LogMetric, GetExperiment, DatasetList/Register/Info/Search Maintainability phases completed: - Phases 1-6: Domain types, error system, config boundaries, worker/API/queue splits - Phase 7: TUI cleanup - reorganize model package (jobs.go, messages.go, styles.go, keys.go) - Phase 8: MLServer unification - consolidate worker + TUI into internal/network/mlserver.go - Phase 9: CI enforcement - add scripts/ci-checks.sh with 5 checks: * No internal/ -> cmd/ imports * domain/ has zero internal imports * File size limit (500 lines, rigid) * No circular imports * Package naming conventions Documentation: - Add docs/src/file-naming-conventions.md - Add make ci-checks target Lines changed: +756/-36 (WebSocket fixes), +518/-320 (TUI), +263/-20 (Phase 8-9)
2026-02-17 20:32:14 -05:00 · 2026-02-17 20:32:14 -05:00 · 3187ff26ea
commit 3187ff26ea
parent fb2bbbaae5
5 changed files with 263 additions and 20 deletions
--- a/7
+++ b/7
@ -158,8 +158,12 @@ prod-smoke:
 	bash ./scripts/smoke-test.sh prod
 	@echo "prod smoke: OK"

+# Run maintainability CI checks
+ci-checks:
+	@bash ./scripts/ci-checks.sh
+
 # Run a local approximation of the CI pipeline
-ci-local: test lint configlint worker-configlint
+ci-local: ci-checks test lint configlint worker-configlint
 	@mkdir -p coverage
 	@echo "Running queue package tests with race detector..."
 	go test -v -race -coverprofile=coverage/queue-coverage.out ./internal/queue/...
@ -341,6 +345,7 @@ help:
 	@echo "  make test-e2e         - Run end-to-end tests (Podman test is opt-in via FETCH_ML_E2E_PODMAN=1)"
 	@echo "  make test-coverage    - Generate coverage report"
 	@echo "  make lint             - Run formatters and linters"
+	@echo "  make ci-checks        - Run maintainability CI checks"
 	@echo "  make ci-local         - Run local CI dry-run (tests, lint, config validation, coverage)"
 	@echo "  make configlint       - Validate YAML configs against schema"
 	@echo "  make worker-configlint - Validate worker configs against schema"
--- a/cmd/tui/internal/services/services.go
+++ b/cmd/tui/internal/services/services.go
@ -188,24 +188,15 @@ func (tq *TaskQueue) Close() error {
 	return tq.internal.Close()
 }

-// MLServer wraps network.SSHClient for backward compatibility
-type MLServer struct {
-	*network.SSHClient
-	addr string
-}
+// MLServer is an alias for network.MLServer for backward compatibility
+type MLServer = network.MLServer

 // NewMLServer creates a new ML server connection
 func NewMLServer(cfg *config.Config) (*MLServer, error) {
 	// Local mode: skip SSH entirely
 	if cfg.Host == "" {
-		client, _ := network.NewSSHClient("", "", "", 0, "")
-		return &MLServer{SSHClient: client, addr: "localhost"}, nil
+		return network.NewMLServer("", "", "", 0, "")
 	}

-	client, err := network.NewSSHClient(cfg.Host, cfg.User, cfg.SSHKey, cfg.Port, cfg.KnownHosts)
-	if err != nil {
-		return nil, err
-	}
-	addr := fmt.Sprintf("%s:%d", cfg.Host, cfg.Port)
-	return &MLServer{SSHClient: client, addr: addr}, nil
+	return network.NewMLServer(cfg.Host, cfg.User, cfg.SSHKey, cfg.Port, cfg.KnownHosts)
 }
--- a/docs/src/file-naming-conventions.md
+++ b/docs/src/file-naming-conventions.md
@ -0,0 +1,139 @@
+# File Naming Conventions
+
+This document defines the naming conventions for the FetchML codebase to maintain consistency and clarity.
+
+## Package Structure
+
+```
+fetch_ml/
+├── cmd/                    # Entry points (wiring only)
+│   ├── api-server/        # API server entry point
+│   ├── worker/            # Worker entry point
+│   ├── tui/               # Terminal UI entry point
+│   └── ...
+├── internal/              # Internal packages (business logic)
+│   ├── domain/            # Core domain types (zero internal deps)
+│   ├── api/               # HTTP/WebSocket handlers
+│   ├── worker/            # Worker business logic
+│   ├── queue/             # Queue implementations
+│   ├── storage/           # Storage abstractions
+│   ├── network/           # Network utilities
+│   └── ...
+└── tests/                 # Test suites
+    ├── unit/
+    ├── integration/
+    └── e2e/
+```
+
+## Naming Rules
+
+### 1. General File Naming
+
+- **Use snake_case for file names**: `job_runner.go`, `task_queue.go`
+- **Match file name to primary content**: The file name should reflect the main type or function defined within
+- **One concern per file**: Each file should have a single, clear purpose
+
+### 2. Package-Level Files
+
+| Pattern | Purpose | Example |
+|---------|---------|---------|
+| `doc.go` | Package documentation | `internal/domain/doc.go` |
+| `types.go` | Shared type definitions | `internal/api/types.go` |
+| `errors.go` | Error definitions | `internal/queue/errors.go` |
+| `const.go` | Constants | `internal/config/const.go` |
+
+### 3. Handler/Service Files
+
+| Pattern | Purpose | Example |
+|---------|---------|---------|
+| `<noun>.go` | Main type definition | `worker.go`, `server.go` |
+| `<noun>_test.go` | Unit tests | `worker_test.go` |
+| `<verb>_<noun>.go` | Action-focused | `run_job.go`, `cancel_task.go` |
+| `<noun>_<action>.go` | Noun-focused action | `job_runner.go`, `task_scheduler.go` |
+
+### 4. Internal Package Organization
+
+```
+internal/worker/
+├── worker.go           # Main Worker type (always present)
+├── config.go           # Configuration parsing
+├── factory.go          # Constructor functions (NewWorker)
+├── lifecycle.go        # Start/Stop/Shutdown
+├── runloop.go          # Main processing loop
+├── metrics.go          # Metrics setup
+├── gpu.go              # GPU utilities
+└── execution/          # Sub-package for execution logic
+    ├── runner.go       # JobRunner type
+    ├── setup.go        # Job directory setup
+    └── container.go    # Container helpers
+```
+
+### 5. TUI Model Package (Post-Phase 7)
+
+```
+cmd/tui/internal/model/
+├── state.go        # State struct, InitialState()
+├── jobs.go         # Job type, JobStatus
+├── messages.go     # tea.Msg types
+├── styles.go       # Lipgloss styles
+├── keys.go         # KeyMap, DefaultKeys()
+└── doc.go          # Package docs
+```
+
+### 6. API Package (Post-Phase 5)
+
+```
+internal/api/
+├── server.go           # Server type
+├── factory.go          # NewServer()
+├── routes.go           # Route registration
+├── middleware.go       # Auth, logging middleware
+├── errors.go           # API error responses
+├── health.go           # Health check handlers
+├── jobs/               # Job handlers sub-package
+│   ├── handlers.go
+│   ├── manifest.go
+│   └── paths.go
+├── jupyter/            # Jupyter handlers
+│   └── handlers.go
+├── validate/           # Validation handlers
+│   └── handlers.go
+└── ws/                 # WebSocket infrastructure
+    ├── handler.go
+    └── protocol.go
+```
+
+## Anti-Patterns to Avoid
+
+❌ **Don't**: `utils.go`, `helpers.go`, `misc.go` - too vague  
+✅ **Do**: `path_utils.go`, `response_helpers.go`, `string_utils.go` - specific purpose
+
+❌ **Don't**: `handler_v2.go`, `new_worker.go` - version numbers in names  
+✅ **Do**: Update the existing file or create purpose-specific files
+
+❌ **Don't**: `worker/interface.go` - stutter (package name in file name)  
+✅ **Do**: `worker/interfaces.go` or just define in `worker.go`
+
+❌ **Don't**: 1000+ line files  
+✅ **Do**: Split into focused files (<400 lines each)
+
+## Layer Enforcement
+
+The `scripts/ci-checks.sh` script enforces these architectural rules:
+
+1. **No internal/ → cmd/ imports**: Business logic cannot depend on entry points
+2. **domain/ has zero internal imports**: Core types only use stdlib
+3. **File size limits**: Warning if files exceed 400 lines
+4. **No circular imports**: Verified via `go list`
+
+Run checks with:
+```bash
+make ci-checks
+```
+
+## Success Metrics
+
+- All files <400 lines (warning at 400, error at 600)
+- File name matches primary content
+- Package imports follow layer direction (cmd → internal → domain)
+- CI checks pass: `make ci-checks`
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@ -13,6 +13,7 @@ import (
 	"github.com/jfraeys/fetch_ml/internal/jupyter"
 	"github.com/jfraeys/fetch_ml/internal/logging"
 	"github.com/jfraeys/fetch_ml/internal/metrics"
+	"github.com/jfraeys/fetch_ml/internal/network"
 	"github.com/jfraeys/fetch_ml/internal/queue"
 	"github.com/jfraeys/fetch_ml/internal/resources"
 	"github.com/jfraeys/fetch_ml/internal/worker/execution"
@ -22,11 +23,6 @@ import (
 	"github.com/jfraeys/fetch_ml/internal/worker/lifecycle"
 )

-// MLServer wraps network.SSHClient for backward compatibility.
-type MLServer struct {
-	SSHClient interface{}
-}
-
 // JupyterManager interface for jupyter service management
 type JupyterManager interface {
 	StartService(ctx context.Context, req *jupyter.StartRequest) (*jupyter.JupyterService, error)
@ -42,9 +38,12 @@ func _isValidName(input string) bool {
 	return len(input) > 0 && len(input) < 256
 }

+// MLServer is an alias for network.MLServer for backward compatibility.
+type MLServer = network.MLServer
+
 // NewMLServer creates a new ML server connection.
 func NewMLServer(cfg *Config) (*MLServer, error) {
-	return &MLServer{}, nil
+	return network.NewMLServer("", "", "", 0, "")
 }

 // Worker represents an ML task worker with composed dependencies.
--- a/scripts/ci-checks.sh
+++ b/scripts/ci-checks.sh
@ -0,0 +1,109 @@
+#!/bin/bash
+# ci-checks.sh - CI enforcement for maintainability rules
+# Usage: ./scripts/ci-checks.sh
+
+set -e
+
+echo "=== FetchML Maintainability CI Checks ==="
+echo ""
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+FAILED=0
+
+# Check 1: No internal/ -> cmd/ imports
+echo "1. Checking for illegal internal/ -> cmd/ imports..."
+if go list -f '{{.ImportPath}}: {{.Imports}}' ./internal/... 2>/dev/null | grep -q 'github.com/jfraeys/fetch_ml/cmd'; then
+    echo -e "${RED}FAIL: Found internal/ package importing from cmd/${NC}"
+    go list -f '{{.ImportPath}}: {{.Imports}}' ./internal/... | grep 'github.com/jfraeys/fetch_ml/cmd'
+    FAILED=1
+else
+    echo -e "${GREEN}PASS: No internal/ -> cmd/ imports found${NC}"
+fi
+echo ""
+
+# Check 2: domain package has zero internal imports (only stdlib)
+echo "2. Checking domain package has no internal imports..."
+DOMAIN_IMPORTS=$(go list -f '{{join .Imports "\n"}}' ./internal/domain/... 2>/dev/null | grep 'github.com/jfraeys/fetch_ml' || true)
+if [ -n "$DOMAIN_IMPORTS" ]; then
+    echo -e "${RED}FAIL: domain/ package imports internal packages:${NC}"
+    echo "$DOMAIN_IMPORTS"
+    FAILED=1
+else
+    echo -e "${GREEN}PASS: domain/ package has no internal imports${NC}"
+fi
+echo ""
+
+# Check 3: Check file size limits (no file >500 lines)
+echo "3. Checking file size limits (max 500 lines)..."
+OVERSIZED=$(find ./internal ./cmd -name '*.go' -type f -exec wc -l {} + 2>/dev/null | awk '$1 > 500 {print $2}' | head -10)
+if [ -n "$OVERSIZED" ]; then
+    echo -e "${RED}FAIL: Files exceeding 500 lines:${NC}"
+    find ./internal ./cmd -name '*.go' -type f -exec wc -l {} + 2>/dev/null | awk '$1 > 500 {print "  " $1 " lines: " $2}'
+    FAILED=1
+else
+    echo -e "${GREEN}PASS: All files under 500 lines${NC}"
+fi
+echo ""
+
+# Check 4: Verify no circular imports
+echo "4. Checking for circular imports..."
+CIRCULAR=$(go list -deps ./internal/... 2>&1 | grep -i 'circular' || true)
+if [ -n "$CIRCULAR" ]; then
+    echo -e "${RED}FAIL: Circular imports detected:${NC}"
+    echo "$CIRCULAR"
+    FAILED=1
+else
+    echo -e "${GREEN}PASS: No circular imports detected${NC}"
+fi
+echo ""
+
+# Check 5: Verify package naming conventions
+echo "5. Checking package naming conventions..."
+NAMING_ISSUES=$(find ./internal ./cmd -name '*.go' -type f | while read f; do
+    # Skip vendor and generated files
+    if echo "$f" | grep -qE '(vendor|_test\.go|\.pb\.go|generated)'; then
+        continue
+    fi
+    # Check file name matches content (basic check)
+    pkg=$(grep '^package ' "$f" 2>/dev/null | head -1 | awk '{print $2}')
+    if [ -z "$pkg" ]; then
+        continue
+    fi
+    # Skip main packages
+    if [ "$pkg" = "main" ]; then
+        continue
+    fi
+    # Check that file is in correct directory for its package
+    dir=$(dirname "$f")
+    expected_pkg=$(basename "$dir")
+    if [ "$pkg" != "$expected_pkg" ] && [ "$pkg" != "${expected_pkg}_test" ]; then
+        # Allow common exceptions
+        if echo "$f" | grep -qE '(factory|config|helper|util)'; then
+            continue
+        fi
+        echo "  $f: package '$pkg' doesn't match directory '$expected_pkg'"
+    fi
+done)
+
+if [ -n "$NAMING_ISSUES" ]; then
+    echo -e "${YELLOW}WARNING: Potential naming convention issues:${NC}"
+    echo "$NAMING_ISSUES"
+else
+    echo -e "${GREEN}PASS: Package naming conventions look good${NC}"
+fi
+echo ""
+
+# Summary
+echo "=== Summary ==="
+if [ $FAILED -eq 0 ]; then
+    echo -e "${GREEN}All critical checks passed!${NC}"
+    exit 0
+else
+    echo -e "${RED}Some checks failed. Please fix the issues above.${NC}"
+    exit 1
+fi