Phase 1: Event Sourcing - Add TaskEvent types (queued, started, completed, failed, etc.) - Create EventStore with Redis Streams (append-only) - Support event querying by task ID and time range Phase 3: Diagnosable Failures - Enhance TaskExecutionError with Context map, Timestamp, Recoverable flag - Update container.go to populate error context (image, GPU, duration) - Add WithContext helper for building error context - Create cmd/errors CLI for querying task errors Phase 4: Testable Security - Add security fields to PodmanConfig (Privileged, Network, ReadOnlyMounts) - Create ValidateSecurityPolicy() with ErrSecurityViolation - Add security contract tests (privileged rejection, host network rejection) - Tests serve as executable security documentation Phase 7: Reproducible Builds - Add BuildHash and BuildTime ldflags to Makefile - Create verify-build target for reproducibility testing - Add -version and -verify flags to api-server All tests pass: - go test ./internal/errtypes/... - go test ./internal/container/... -run Security - go test ./internal/queue/... - go build ./cmd/api-server/...
82 lines
2.3 KiB
Go
82 lines
2.3 KiB
Go
// Package main implements the ml errors command for querying task errors
|
|
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/errtypes"
|
|
)
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintln(os.Stderr, "Usage: errors <task_id> [--json]")
|
|
fmt.Fprintln(os.Stderr, " task_id: The task ID to query errors for")
|
|
fmt.Fprintln(os.Stderr, " --json: Output as JSON instead of formatted text")
|
|
os.Exit(1)
|
|
}
|
|
|
|
taskID := os.Args[1]
|
|
jsonOutput := len(os.Args) > 2 && os.Args[2] == "--json"
|
|
|
|
// Determine base path from environment or default
|
|
basePath := os.Getenv("FETCH_ML_BASE_PATH")
|
|
if basePath == "" {
|
|
home, err := os.UserHomeDir()
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Error: failed to get home directory: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
basePath = filepath.Join(home, "ml_jobs")
|
|
}
|
|
|
|
// Try to read error file
|
|
errorPath := filepath.Join(basePath, "errors", taskID+".json")
|
|
data, err := os.ReadFile(errorPath)
|
|
if err != nil {
|
|
// Error file may not exist - check if task exists in other states
|
|
fmt.Fprintf(os.Stderr, "Error: no error record found for task %s\n", taskID)
|
|
fmt.Fprintf(os.Stderr, "Expected: %s\n", errorPath)
|
|
os.Exit(1)
|
|
}
|
|
|
|
var execErr errtypes.TaskExecutionError
|
|
if err := json.Unmarshal(data, &execErr); err != nil {
|
|
fmt.Fprintf(os.Stderr, "Error: failed to parse error record: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
if jsonOutput {
|
|
// Output as pretty-printed JSON
|
|
output, err := json.MarshalIndent(execErr, "", " ")
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Error: failed to format error: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
fmt.Println(string(output))
|
|
} else {
|
|
// Output as formatted text
|
|
fmt.Printf("Error Report for Task: %s\n", execErr.TaskID)
|
|
fmt.Printf("Job Name: %s\n", execErr.JobName)
|
|
fmt.Printf("Phase: %s\n", execErr.Phase)
|
|
fmt.Printf("Time: %s\n", execErr.Timestamp.Format(time.RFC3339))
|
|
fmt.Printf("Recoverable: %v\n", execErr.Recoverable)
|
|
fmt.Println()
|
|
if execErr.Message != "" {
|
|
fmt.Printf("Message: %s\n", execErr.Message)
|
|
}
|
|
if execErr.Err != nil {
|
|
fmt.Printf("Underlying Error: %v\n", execErr.Err)
|
|
}
|
|
if len(execErr.Context) > 0 {
|
|
fmt.Println()
|
|
fmt.Println("Context:")
|
|
for key, value := range execErr.Context {
|
|
fmt.Printf(" %s: %s\n", key, value)
|
|
}
|
|
}
|
|
}
|
|
}
|