fetch_ml/internal/domain/errors.go
Jeremie Fraeys 6580917ba8
refactor: extract domain types and consolidate error system (Phases 1-2)
Phase 1: Extract Domain Types
=============================
- Create internal/domain/ package with canonical types:
  - domain/task.go: Task, Attempt structs
  - domain/tracking.go: TrackingConfig and MLflow/TensorBoard/Wandb configs
  - domain/dataset.go: DatasetSpec
  - domain/status.go: JobStatus constants
  - domain/errors.go: FailureClass system with classification functions
  - domain/doc.go: package documentation

- Update queue/task.go to re-export domain types (backward compatibility)
- Update TUI model/state.go to use domain types via type aliases
- Simplify TUI services: remove ~60 lines of conversion functions

Phase 2: Delete ErrorCategory System
====================================
- Remove deprecated ErrorCategory type and constants
- Remove TaskError struct and related functions
- Remove mapping functions: ClassifyError, IsRetryable, GetUserMessage, RetryDelay
- Update all queue implementations to use domain.FailureClass directly:
  - queue/metrics.go: RecordTaskFailure/Retry now take FailureClass
  - queue/queue.go: RetryTask uses domain.ClassifyFailure
  - queue/filesystem_queue.go: RetryTask and MoveToDeadLetterQueue updated
  - queue/sqlite_queue.go: RetryTask and MoveToDeadLetterQueue updated

Lines eliminated: ~190 lines of conversion and mapping code
Result: Single source of truth for domain types and error classification
2026-02-17 12:34:28 -05:00

150 lines
5 KiB
Go

package domain
import (
"os"
"strings"
"syscall"
)
// FailureClass is the category assigned to a failed task. It is used to pick
// a retry policy and the guidance shown to the user.
type FailureClass string

// Failure classes recognised by this package.
const (
	// FailureInfrastructure covers OOM kill, SIGKILL, and node failure.
	FailureInfrastructure FailureClass = "infrastructure"

	// FailureCode covers non-zero exit, exception, and assertion failures.
	FailureCode FailureClass = "code"

	// FailureData covers hash mismatch and unreachable datasets.
	FailureData FailureClass = "data"

	// FailureResource covers GPU OOM, disk full, and timeout.
	FailureResource FailureClass = "resource"

	// FailureUnknown is used when the failure cannot be classified.
	FailureUnknown FailureClass = "unknown"
)
// ClassifyFailure determines the failure class from the terminating signal,
// the exit code, and the tail of the task's log output.
//
// Checks run in priority order and the first match wins:
//
//  1. signal is SIGKILL                          -> FailureInfrastructure
//  2. CUDA / GPU memory messages in the log      -> FailureResource
//  3. general out-of-memory messages in the log  -> FailureInfrastructure
//  4. dataset hash / availability messages       -> FailureData
//  5. disk exhaustion messages                   -> FailureResource
//  6. timeout / deadline messages                -> FailureResource
//  7. network connectivity messages              -> FailureInfrastructure
//  8. any non-zero exit code                     -> FailureCode
//
// If nothing matches, FailureUnknown is returned. logTail is matched
// case-insensitively. A nil signal is allowed and simply skips step 1.
func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass {
	// Killed by OS — infrastructure failure. Checked before touching the log
	// so the lowercase copy is only made when it is actually needed.
	if signal == syscall.SIGKILL {
		return FailureInfrastructure
	}

	logLower := strings.ToLower(logTail)

	switch {
	// CUDA OOM or GPU resource issues. Must precede the general OOM check,
	// otherwise "cuda out of memory" would be reported as infrastructure.
	case logContainsAny(logLower, "cuda out of memory", "cuda error", "gpu oom"):
		return FailureResource

	// General OOM (non-GPU) — infrastructure. "oom" is matched as a token,
	// not a raw substring, so words like "zoom" or "bloom" do not count.
	case logContainsAny(logLower, "out of memory", "cannot allocate memory") || logMentionsOOM(logLower):
		return FailureInfrastructure

	// Dataset hash check failed or dataset missing — data failure.
	case logContainsAny(logLower, "hash mismatch", "checksum failed", "dataset not found", "dataset unreachable"):
		return FailureData

	// Disk/resource exhaustion.
	case logContainsAny(logLower, "no space left", "disk full", "disk quota exceeded"):
		return FailureResource

	// Timeout — resource (time budget exceeded).
	case logContainsAny(logLower, "timeout", "deadline exceeded", "context deadline"):
		return FailureResource

	// Network issues — infrastructure.
	case logContainsAny(logLower, "connection refused", "connection reset", "no route to host", "network unreachable"):
		return FailureInfrastructure

	// Non-zero exit without a more specific indicator — code failure.
	case exitCode != 0:
		return FailureCode
	}

	return FailureUnknown
}

// logContainsAny reports whether logLower contains at least one of needles.
func logContainsAny(logLower string, needles ...string) bool {
	for _, needle := range needles {
		if strings.Contains(logLower, needle) {
			return true
		}
	}
	return false
}

// logMentionsOOM reports whether the already-lowercased log contains "oom" as
// its own token (e.g. "oom", "oom-killer", "oom_kill") or as the start of
// "oomkill..." (e.g. "oomkilled", "oomkiller"). Occurrences embedded in other
// words ("zoom", "room", "bloom") are ignored.
func logMentionsOOM(logLower string) bool {
	const token = "oom"
	isLetter := func(b byte) bool { return b >= 'a' && b <= 'z' }

	for from := 0; from < len(logLower); {
		i := strings.Index(logLower[from:], token)
		if i < 0 {
			return false
		}
		start, end := from+i, from+i+len(token)

		leftOK := start == 0 || !isLetter(logLower[start-1])
		rightOK := end == len(logLower) ||
			!isLetter(logLower[end]) ||
			strings.HasPrefix(logLower[end:], "kill")
		if leftOK && rightOK {
			return true
		}

		// Advance by one byte so overlapping candidates are still examined.
		from = start + 1
	}
	return false
}
// FailureInfo is the failure record stored in the manifest. Every field
// except Class is dropped from the JSON encoding when it holds its zero value.
type FailureInfo struct {
	// Class is the failure classification; always serialized.
	Class FailureClass `json:"class"`

	// ExitCode is the exit code associated with the failure.
	ExitCode int `json:"exit_code,omitempty"`

	// Signal is a textual form of the signal, if any.
	Signal string `json:"signal,omitempty"`

	// LogTail is the tail of the log output kept with the record.
	LogTail string `json:"log_tail,omitempty"`

	// Suggestion is guidance text for the user.
	Suggestion string `json:"suggestion,omitempty"`

	// AutoRetried records whether an automatic retry took place.
	AutoRetried bool `json:"auto_retried,omitempty"`

	// RetryCount and RetryCap hold the retry counter and its limit.
	RetryCount int `json:"retry_count,omitempty"`
	RetryCap   int `json:"retry_cap,omitempty"`

	// ClassifiedAt is stored as a string; the timestamp format is not
	// defined in this file — confirm against the code that populates it.
	ClassifiedAt string `json:"classified_at,omitempty"`

	// Context carries free-form key/value details.
	Context map[string]string `json:"context,omitempty"`
}
// GetFailureSuggestion returns user guidance for the given failure class.
//
// logTail is consulted only for FailureResource, where it selects between a
// GPU-specific, a disk-specific, and a generic message. The markers checked
// here include the log phrases that ClassifyFailure treats as GPU or disk
// exhaustion ("gpu oom", "no space left"), so a failure classified from
// those phrases receives the matching advice. Matching is case-insensitive.
// Any class not listed below yields the "unknown failure" message.
func GetFailureSuggestion(class FailureClass, logTail string) string {
	switch class {
	case FailureInfrastructure:
		return "Infrastructure failure (node died, OOM kill). Auto-retry in progress."
	case FailureCode:
		return "Code error in training script. Fix before resubmitting."
	case FailureData:
		return "Data verification failed. Check dataset accessibility and hashes."
	case FailureResource:
		// Lowercase once and reuse for every marker check.
		logLower := strings.ToLower(logTail)
		switch {
		case strings.Contains(logLower, "cuda") || strings.Contains(logLower, "gpu oom"):
			return "GPU OOM. Increase --gpu-memory or use smaller batch size."
		case strings.Contains(logLower, "disk") || strings.Contains(logLower, "no space left"):
			return "Disk full. Clean up storage or request more space."
		}
		return "Resource exhausted. Try with larger allocation or reduced load."
	default:
		return "Unknown failure. Review logs and contact support if persistent."
	}
}
// ShouldAutoRetry reports whether a failure of the given class may be retried
// automatically, given the current retryCount.
//
// Each retryable class has a cap, and the result is true while retryCount is
// below it:
//
//	infrastructure: cap 3
//	resource:       cap 1
//	unknown:        cap 1 (conservative; was retryable in the old system)
//
// Every other class (code, data, or an unrecognised value) is never retried.
func ShouldAutoRetry(class FailureClass, retryCount int) bool {
	var retryCap int

	switch class {
	case FailureInfrastructure:
		retryCap = 3
	case FailureResource, FailureUnknown:
		retryCap = 1
	default:
		// Not retryable at all, whatever the counter says.
		return false
	}

	return retryCount < retryCap
}
// RetryDelayForClass returns the backoff to apply before retrying a failure
// of the given class: 30 for FailureResource and 0 (retry immediately) for
// everything else, including FailureInfrastructure.
//
// The unit is not stated in this file — presumably seconds; confirm against
// callers. retryCount is accepted but does not currently affect the result.
func RetryDelayForClass(class FailureClass, retryCount int) int {
	// Only resource failures get a short backoff.
	if class == FailureResource {
		return 30
	}
	return 0
}