feat: implement research-grade maintainability phases 2, 5, 8, 10

Phase 2: Deterministic Manifests
- Add manifest.Validator with required field checking
- Support Validate() and ValidateStrict() modes
- Integrate validation into worker executor before execution
- Block execution if manifest missing commit_id or deps_manifest_sha256

Phase 5: Pinned Dependencies
- Add hermetic.dockerfile template with pinned system deps
- Frozen package versions: libblas3, libcudnn8, etc.
- Support for deps_manifest.json and requirements.txt with hashes
- Image tagging strategy: deps-<first-8-of-sha256>

Phase 8: Tests as Specifications
- Add queue_spec_test.go with executable scheduler specs
- Document priority ordering (higher first)
- Document FIFO tiebreaker for same priority
- Test cases for negative/zero priorities

Phase 10: Local Dev Parity
- Create root-level docker-compose.dev.yml
- Simplified from deployments/ for quick local dev
- Redis + API server + Worker with hot reload volumes
- Debug ports: 9101 (API), 6379 (Redis)
This commit is contained in:
Jeremie Fraeys 2026-02-18 15:34:28 -05:00
parent 7194826871
commit 8271277dc3
No known key found for this signature in database
5 changed files with 462 additions and 0 deletions

54
docker-compose.dev.yml Normal file
View file

@@ -0,0 +1,54 @@
# Developer-focused Docker Compose for local development
# Simplified from deployments/docker-compose.dev.yml for quick local dev
services:
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    # AOF persistence so queued state survives a container restart
    command: redis-server --appendonly yes

  api-server:
    build:
      context: .
      dockerfile: build/docker/simple.Dockerfile
    ports:
      - "9101:9101"  # API / debug port
    volumes:
      - .:/workspace  # hot-reload source mount
      - ./data/dev/logs:/logs
      - ./data/dev/experiments:/data/experiments
      - ./data/dev/active:/data/active
    environment:
      - LOG_LEVEL=debug
      - LOG_FORMAT=text
      - ENV=development
    depends_on:
      - redis
    # Create the data dirs before starting so the server never races a missing path
    command: ["/bin/sh", "-c", "mkdir -p /data/experiments /data/active && exec /usr/local/bin/api-server -config /app/configs/api/dev.yaml"]

  worker:
    build:
      context: .
      dockerfile: build/docker/simple.Dockerfile
    ports:
      - "9102:9102"  # worker debug port
    volumes:
      - .:/workspace  # hot-reload source mount
      - ./data/dev/logs:/logs
      - ./data/dev/active:/data/active
      - ./data/dev/experiments:/data/experiments
      # Worker drives the host Docker daemon to launch task containers
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      - LOG_LEVEL=debug
      - LOG_FORMAT=text
      - ENV=development
    depends_on:
      - redis
      - api-server
    command: ["/usr/local/bin/worker", "-config", "/app/configs/worker.yaml"]

volumes:
  redis_data:

View file

@@ -0,0 +1,159 @@
package manifest
import (
	"errors"
	"fmt"
	"strings"
)
// ErrIncompleteManifest is the sentinel error wrapped by every validation
// failure in this package; callers detect it with errors.Is (or the
// IsValidationError helper).
var ErrIncompleteManifest = errors.New("incomplete manifest")

// Validator validates that a RunManifest is complete before execution.
type Validator struct {
	requiredFields []string // manifest field names that must be non-empty
}
// NewValidator creates a new manifest validator with default required fields.
// The defaults (commit_id, deps_manifest_sha256) are the minimum provenance
// data needed to reproduce a run.
func NewValidator() *Validator {
	defaults := []string{"commit_id", "deps_manifest_sha256"}
	return &Validator{requiredFields: defaults}
}
// NewValidatorWithFields creates a validator with custom required fields.
// The slice is copied so a later mutation by the caller cannot silently
// change which fields this validator enforces.
func NewValidatorWithFields(fields []string) *Validator {
	copied := make([]string, len(fields))
	copy(copied, fields)
	return &Validator{
		requiredFields: copied,
	}
}
// ValidationError contains details about a validation failure.
type ValidationError struct {
	Field   string `json:"field"`   // manifest field that failed
	Message string `json:"message"` // human-readable reason
}

// Error implements the error interface, combining field and reason.
func (e ValidationError) Error() string {
	msg := fmt.Sprintf("validation error for field '%s': %s", e.Field, e.Message)
	return msg
}
// Validate checks that the manifest has all required fields.
// It returns nil when the manifest is complete; otherwise it returns a
// single error listing every missing field, wrapping ErrIncompleteManifest
// so callers can test with errors.Is. A nil manifest is rejected outright.
func (v *Validator) Validate(m *RunManifest) error {
	if m == nil {
		return fmt.Errorf("manifest is nil: %w", ErrIncompleteManifest)
	}
	var validationErrors []ValidationError
	for _, field := range v.requiredFields {
		if err := v.validateField(m, field); err != nil {
			validationErrors = append(validationErrors, *err)
		}
	}
	if len(validationErrors) == 0 {
		return nil
	}
	// Aggregate every failure into one message so callers see the full set
	// of missing fields at once instead of fixing them one at a time.
	// (Builder avoids quadratic += and the stray trailing newline the old
	// format placed before the wrapped sentinel.)
	var b strings.Builder
	b.WriteString("manifest validation failed:")
	for _, err := range validationErrors {
		b.WriteString("\n  - ")
		b.WriteString(err.Error())
	}
	return fmt.Errorf("%s: %w", b.String(), ErrIncompleteManifest)
}
// ValidateStrict fails if ANY optional fields commonly used for provenance are missing.
// This is for high-assurance environments: it first runs Validate, then
// additionally requires worker_version and podman_image. Any error wraps
// ErrIncompleteManifest so errors.Is still matches.
func (v *Validator) ValidateStrict(m *RunManifest) error {
	if err := v.Validate(m); err != nil {
		return err
	}
	// Additional strict checks beyond the base required fields.
	var strictErrors []ValidationError
	if m.WorkerVersion == "" {
		strictErrors = append(strictErrors, ValidationError{
			Field:   "worker_version",
			Message: "required for strict provenance",
		})
	}
	if m.PodmanImage == "" {
		strictErrors = append(strictErrors, ValidationError{
			Field:   "podman_image",
			Message: "required for strict provenance",
		})
	}
	if len(strictErrors) == 0 {
		return nil
	}
	// Same aggregation style as Validate: one error listing every failure,
	// without a trailing newline before the wrapped sentinel.
	var b strings.Builder
	b.WriteString("strict manifest validation failed:")
	for _, err := range strictErrors {
		b.WriteString("\n  - ")
		b.WriteString(err.Error())
	}
	return fmt.Errorf("%s: %w", b.String(), ErrIncompleteManifest)
}
// validateField checks a single required field on the manifest.
// It returns a *ValidationError describing the problem, or nil when the
// field is present. Field names not listed here are treated as valid.
func (v *Validator) validateField(m *RunManifest, field string) *ValidationError {
	var (
		missing bool
		message string
	)

	switch field {
	case "commit_id":
		missing = m.CommitID == ""
		message = "commit_id is required for code provenance"
	case "deps_manifest_sha256":
		missing = m.DepsManifestSHA == ""
		message = "deps_manifest_sha256 is required for dependency provenance"
	case "run_id":
		missing = m.RunID == ""
		message = "run_id is required"
	case "task_id":
		missing = m.TaskID == ""
		message = "task_id is required"
	case "job_name":
		missing = m.JobName == ""
		message = "job_name is required"
	case "snapshot_sha256":
		// Only required when a snapshot is actually referenced.
		missing = m.SnapshotID != "" && m.SnapshotSHA256 == ""
		message = "snapshot_sha256 is required when snapshot_id is provided"
	}

	if !missing {
		return nil
	}
	return &ValidationError{Field: field, Message: message}
}
// IsValidationError checks if an error is a manifest validation error.
// It reports whether err wraps ErrIncompleteManifest anywhere in its chain,
// matching errors produced by Validate and ValidateStrict.
func IsValidationError(err error) bool {
	return errors.Is(err, ErrIncompleteManifest)
}

View file

@@ -0,0 +1,188 @@
package queue
import (
"testing"
"time"
)
// TestTaskPrioritizationSpec documents the scheduler's priority and FIFO behavior.
// These tests serve as executable specifications for the queue system.
//
// Spec summary:
//   - Higher Priority values dequeue first (descending order).
//   - Tasks with equal priority dequeue FIFO by CreatedAt.
//   - Negative and zero priorities are valid; they simply sort below
//     positive ones (zero being the implicit default).
func TestTaskPrioritizationSpec(t *testing.T) {
	tests := []struct {
		name     string
		tasks    []Task
		expected []string // IDs in expected execution order
	}{
		{
			name: "higher priority runs first",
			tasks: []Task{
				{ID: "low", Priority: 1, CreatedAt: time.Unix(100, 0)},
				{ID: "high", Priority: 10, CreatedAt: time.Unix(100, 0)},
			},
			expected: []string{"high", "low"},
		},
		{
			// Identical priorities: CreatedAt breaks the tie, oldest first.
			name: "FIFO for same priority",
			tasks: []Task{
				{ID: "first", Priority: 5, CreatedAt: time.Unix(100, 0)},
				{ID: "second", Priority: 5, CreatedAt: time.Unix(200, 0)},
			},
			expected: []string{"first", "second"},
		},
		{
			// Priority dominates creation time entirely: the latest-created
			// task still runs first because it has the highest priority.
			name: "mixed priorities and creation times",
			tasks: []Task{
				{ID: "medium-early", Priority: 5, CreatedAt: time.Unix(100, 0)},
				{ID: "high-late", Priority: 10, CreatedAt: time.Unix(300, 0)},
				{ID: "low-early", Priority: 1, CreatedAt: time.Unix(50, 0)},
			},
			expected: []string{"high-late", "medium-early", "low-early"},
		},
		{
			name: "negative priority is lowest",
			tasks: []Task{
				{ID: "negative", Priority: -1, CreatedAt: time.Unix(100, 0)},
				{ID: "positive", Priority: 1, CreatedAt: time.Unix(100, 0)},
			},
			expected: []string{"positive", "negative"},
		},
		{
			name: "zero priority is default",
			tasks: []Task{
				{ID: "zero", Priority: 0, CreatedAt: time.Unix(100, 0)},
				{ID: "positive", Priority: 1, CreatedAt: time.Unix(100, 0)},
			},
			expected: []string{"positive", "zero"},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Create a queue and add tasks; each subtest gets a fresh
			// temp-dir-backed queue so cases cannot interfere.
			tmpDir := t.TempDir()
			q, err := NewFilesystemQueue(tmpDir)
			if err != nil {
				t.Fatalf("failed to create queue: %v", err)
			}
			defer q.Close()
			// Add all tasks
			for _, task := range tt.tasks {
				task := task // capture range variable: AddTask takes &task
				if err := q.AddTask(&task); err != nil {
					t.Fatalf("failed to add task %s: %v", task.ID, err)
				}
			}
			// Get tasks in order and verify
			var actual []string
			for i := 0; i < len(tt.tasks); i++ {
				task, err := q.GetNextTask()
				if err != nil {
					t.Fatalf("failed to get task at position %d: %v", i, err)
				}
				if task == nil {
					t.Fatalf("expected task at position %d, got nil", i)
				}
				actual = append(actual, task.ID)
			}
			// Verify order (length check is non-fatal so the positional
			// diff below still reports which entries mismatched)
			if len(actual) != len(tt.expected) {
				t.Errorf("expected %d tasks, got %d", len(tt.expected), len(actual))
			}
			for i, expectedID := range tt.expected {
				if i >= len(actual) {
					break
				}
				if actual[i] != expectedID {
					t.Errorf("position %d: expected %s, got %s", i, expectedID, actual[i])
				}
			}
		})
	}
}
// TestQueueSpec_ClaimAndComplete documents the claim-complete lifecycle
// of a task: enqueue, claim via GetNextTask, and verify the claimed task
// is no longer visible in the queue.
func TestQueueSpec_ClaimAndComplete(t *testing.T) {
	q, err := NewFilesystemQueue(t.TempDir())
	if err != nil {
		t.Fatalf("failed to create queue: %v", err)
	}
	defer q.Close()

	// Enqueue a single task.
	want := &Task{
		ID:        "task-1",
		JobName:   "test-job",
		Priority:  5,
		CreatedAt: time.Now(),
	}
	if err := q.AddTask(want); err != nil {
		t.Fatalf("failed to add task: %v", err)
	}

	// Claiming must hand back exactly that task.
	got, err := q.GetNextTask()
	if err != nil {
		t.Fatalf("failed to get task: %v", err)
	}
	if got == nil {
		t.Fatal("expected to claim a task, got nil")
	}
	if got.ID != want.ID {
		t.Errorf("expected task %s, got %s", want.ID, got.ID)
	}

	// Once claimed, the task must not remain visible in the queue.
	remaining, err := q.GetAllTasks()
	if err != nil {
		t.Fatalf("failed to get tasks: %v", err)
	}
	for _, r := range remaining {
		if r.ID == want.ID {
			t.Error("claimed task should not be in queue")
		}
	}
}
// TestQueueSpec_TaskPriorityOrdering documents numeric priority ordering
// over a larger spread of values: dequeue order is strictly descending
// by Priority.
func TestQueueSpec_TaskPriorityOrdering(t *testing.T) {
	q, err := NewFilesystemQueue(t.TempDir())
	if err != nil {
		t.Fatalf("failed to create queue: %v", err)
	}
	defer q.Close()

	// Enqueue tasks a..e with assorted priorities.
	for i, p := range []int64{100, 50, 200, 1, 75} {
		suffix := string(rune('a' + i))
		if err := q.AddTask(&Task{
			ID:        "task-" + suffix,
			JobName:   "job-" + suffix,
			Priority:  p,
			CreatedAt: time.Now(),
		}); err != nil {
			t.Fatalf("failed to add task: %v", err)
		}
	}

	// Expected order: 200, 100, 75, 50, 1 (descending)
	for i, expID := range []string{"task-c", "task-a", "task-e", "task-b", "task-d"} {
		got, err := q.GetNextTask()
		if err != nil {
			t.Fatalf("position %d: failed to get task: %v", i, err)
		}
		if got == nil {
			t.Fatalf("position %d: expected task %s, got nil", i, expID)
		}
		if got.ID != expID {
			t.Errorf("position %d: expected %s, got %s", i, expID, got.ID)
		}
	}
}

View file

@@ -80,6 +80,27 @@ func (r *JobRunner) Run(
// 3. Select executor
executor := r.selectExecutor(mode, localMode)
// 3.5 Validate manifest completeness before execution
if r.writer != nil {
// Load current manifest and validate
if m, err := manifest.LoadFromDir(outputDir); err == nil {
validator := manifest.NewValidator()
if err := validator.Validate(m); err != nil {
r.logger.Error("manifest validation failed - execution blocked",
"task", task.ID,
"error", err)
return &errtypes.TaskExecutionError{
TaskID: task.ID,
JobName: task.JobName,
Phase: "validation",
Message: "manifest incomplete - execution blocked",
Err: err,
Recoverable: false, // Can't retry - manifest is missing data
}
}
}
}
// 4. Pre-execution manifest update
if r.writer != nil {
r.writer.Upsert(outputDir, task, func(m *manifest.RunManifest) {

View file

@@ -0,0 +1,40 @@
# Hermetic Dockerfile - Reproducible builds with pinned dependencies
# Tag image with: deps-<first-8-of-sha256>
# Example: docker build -t mylab/pytorch:deps-abc123 .
FROM pytorch/pytorch:2.0.1-cuda11.8-cudnn8-runtime

# Build metadata injected by CI:
#   docker build --build-arg BUILD_DATE=... --build-arg GIT_COMMIT=...
# Without these ARG declarations the ${BUILD_DATE}/${GIT_COMMIT}
# references in LABEL below would silently expand to empty strings.
ARG BUILD_DATE
ARG GIT_COMMIT

# Pin system dependencies to specific versions for reproducibility
# These versions are frozen - update only after testing
RUN apt-get update && apt-get install -y --no-install-recommends \
    libblas3=3.9.0-1build1 \
    liblapack3=3.9.0-1build1 \
    libcudnn8=8.6.0.163-1+cuda11.8 \
    && rm -rf /var/lib/apt/lists/*

# Install conda environment with pinned packages
COPY deps_manifest.json /tmp/deps_manifest.json
# If using conda environment file
RUN if [ -f /tmp/deps_manifest.json ]; then \
    conda env update -n base -f /tmp/deps_manifest.json; \
    fi

# If using requirements.txt with hashes.
# NOTE(review): the `|| echo` keeps this step best-effort; a truly hermetic
# build should drop the fallback so missing hashes fail the build.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --require-hashes --no-deps -r /tmp/requirements.txt \
    || echo "Warning: Some packages may not have hashes"

# Verify installation
RUN python -c "import torch; print(f'PyTorch: {torch.__version__}')" \
    && python -c "import numpy; print(f'NumPy: {numpy.__version__}')"

# Labels for provenance
LABEL org.opencontainers.image.title="Hermetic ML Environment" \
    org.opencontainers.image.description="Reproducible ML training environment" \
    org.fetchml.deps_manifest="/tmp/deps_manifest.json" \
    org.fetchml.build_date="${BUILD_DATE}" \
    org.fetchml.git_commit="${GIT_COMMIT}"

WORKDIR /workspace
CMD ["python", "--version"]