Phase 1: Extract Domain Types
=============================
- Create internal/domain/ package with canonical types:
  - domain/task.go: Task, Attempt structs
  - domain/tracking.go: TrackingConfig and MLflow/TensorBoard/Wandb configs
  - domain/dataset.go: DatasetSpec
  - domain/status.go: JobStatus constants
  - domain/errors.go: FailureClass system with classification functions
  - domain/doc.go: package documentation
- Update queue/task.go to re-export domain types (backward compatibility)
- Update TUI model/state.go to use domain types via type aliases
- Simplify TUI services: remove ~60 lines of conversion functions

Phase 2: Delete ErrorCategory System
====================================
- Remove deprecated ErrorCategory type and constants
- Remove TaskError struct and related functions
- Remove mapping functions: ClassifyError, IsRetryable, GetUserMessage, RetryDelay
- Update all queue implementations to use domain.FailureClass directly:
  - queue/metrics.go: RecordTaskFailure/Retry now take FailureClass
  - queue/queue.go: RetryTask uses domain.ClassifyFailure
  - queue/filesystem_queue.go: RetryTask and MoveToDeadLetterQueue updated
  - queue/sqlite_queue.go: RetryTask and MoveToDeadLetterQueue updated

Lines eliminated: ~190 lines of conversion and mapping code
Result: Single source of truth for domain types and error classification
150 lines
5 KiB
Go
150 lines
5 KiB
Go
package domain
|
|
|
|
import (
|
|
"os"
|
|
"strings"
|
|
"syscall"
|
|
)
|
|
|
|
// FailureClass is the category assigned to a failed task. The category
// drives the retry policy and the guidance shown to the user.
type FailureClass string

// Known failure classes. The string values are the serialized form.
const (
	// FailureInfrastructure covers OOM kills, SIGKILL, and node failures.
	FailureInfrastructure = FailureClass("infrastructure")
	// FailureCode covers non-zero exits, exceptions, and assertions.
	FailureCode = FailureClass("code")
	// FailureData covers hash mismatches and unreachable datasets.
	FailureData = FailureClass("data")
	// FailureResource covers GPU OOM, full disks, and timeouts.
	FailureResource = FailureClass("resource")
	// FailureUnknown is used when the failure cannot be classified.
	FailureUnknown = FailureClass("unknown")
)
|
|
|
|
// ClassifyFailure determines the failure class from exit signals, codes, and log output
|
|
func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass {
|
|
logLower := strings.ToLower(logTail)
|
|
|
|
// Killed by OS — infrastructure failure
|
|
if signal == syscall.SIGKILL {
|
|
return FailureInfrastructure
|
|
}
|
|
|
|
// CUDA OOM or GPU resource issues
|
|
if strings.Contains(logLower, "cuda out of memory") ||
|
|
strings.Contains(logLower, "cuda error") ||
|
|
strings.Contains(logLower, "gpu oom") {
|
|
return FailureResource
|
|
}
|
|
|
|
// General OOM (non-GPU) — infrastructure
|
|
if strings.Contains(logLower, "out of memory") ||
|
|
strings.Contains(logLower, "oom") ||
|
|
strings.Contains(logLower, "cannot allocate memory") {
|
|
return FailureInfrastructure
|
|
}
|
|
|
|
// Dataset hash check failed — data failure
|
|
if strings.Contains(logLower, "hash mismatch") ||
|
|
strings.Contains(logLower, "checksum failed") ||
|
|
strings.Contains(logLower, "dataset not found") ||
|
|
strings.Contains(logLower, "dataset unreachable") {
|
|
return FailureData
|
|
}
|
|
|
|
// Disk/resource exhaustion
|
|
if strings.Contains(logLower, "no space left") ||
|
|
strings.Contains(logLower, "disk full") ||
|
|
strings.Contains(logLower, "disk quota exceeded") {
|
|
return FailureResource
|
|
}
|
|
|
|
// Timeout — resource (time budget exceeded)
|
|
if strings.Contains(logLower, "timeout") ||
|
|
strings.Contains(logLower, "deadline exceeded") ||
|
|
strings.Contains(logLower, "context deadline") {
|
|
return FailureResource
|
|
}
|
|
|
|
// Network issues — infrastructure
|
|
if strings.Contains(logLower, "connection refused") ||
|
|
strings.Contains(logLower, "connection reset") ||
|
|
strings.Contains(logLower, "no route to host") ||
|
|
strings.Contains(logLower, "network unreachable") {
|
|
return FailureInfrastructure
|
|
}
|
|
|
|
// Non-zero exit without specific signal — code failure
|
|
if exitCode != 0 {
|
|
return FailureCode
|
|
}
|
|
|
|
return FailureUnknown
|
|
}
|
|
|
|
// FailureInfo contains complete failure context for the manifest
|
|
type FailureInfo struct {
|
|
Class FailureClass `json:"class"`
|
|
ExitCode int `json:"exit_code,omitempty"`
|
|
Signal string `json:"signal,omitempty"`
|
|
LogTail string `json:"log_tail,omitempty"`
|
|
Suggestion string `json:"suggestion,omitempty"`
|
|
AutoRetried bool `json:"auto_retried,omitempty"`
|
|
RetryCount int `json:"retry_count,omitempty"`
|
|
RetryCap int `json:"retry_cap,omitempty"`
|
|
ClassifiedAt string `json:"classified_at,omitempty"`
|
|
Context map[string]string `json:"context,omitempty"`
|
|
}
|
|
|
|
// GetFailureSuggestion returns user guidance based on failure class
|
|
func GetFailureSuggestion(class FailureClass, logTail string) string {
|
|
switch class {
|
|
case FailureInfrastructure:
|
|
return "Infrastructure failure (node died, OOM kill). Auto-retry in progress."
|
|
case FailureCode:
|
|
return "Code error in training script. Fix before resubmitting."
|
|
case FailureData:
|
|
return "Data verification failed. Check dataset accessibility and hashes."
|
|
case FailureResource:
|
|
if strings.Contains(strings.ToLower(logTail), "cuda") {
|
|
return "GPU OOM. Increase --gpu-memory or use smaller batch size."
|
|
}
|
|
if strings.Contains(strings.ToLower(logTail), "disk") {
|
|
return "Disk full. Clean up storage or request more space."
|
|
}
|
|
return "Resource exhausted. Try with larger allocation or reduced load."
|
|
default:
|
|
return "Unknown failure. Review logs and contact support if persistent."
|
|
}
|
|
}
|
|
|
|
// ShouldAutoRetry determines if a failure class should auto-retry
|
|
// infrastructure: 3 retries transparent
|
|
// resource: 1 retry with backoff
|
|
// unknown: 1 retry (conservative - was retryable in old system)
|
|
// others: never auto-retry
|
|
func ShouldAutoRetry(class FailureClass, retryCount int) bool {
|
|
switch class {
|
|
case FailureInfrastructure:
|
|
return retryCount < 3
|
|
case FailureResource:
|
|
return retryCount < 1
|
|
case FailureUnknown:
|
|
// Unknown failures get 1 retry attempt (conservative, matches old behavior)
|
|
return retryCount < 1
|
|
default:
|
|
// code, data failures never auto-retry
|
|
return false
|
|
}
|
|
}
|
|
|
|
// RetryDelayForClass returns appropriate backoff for the failure class
|
|
func RetryDelayForClass(class FailureClass, retryCount int) int {
|
|
switch class {
|
|
case FailureInfrastructure:
|
|
// Immediate retry for infrastructure
|
|
return 0
|
|
case FailureResource:
|
|
// Short backoff for resource issues
|
|
return 30
|
|
default:
|
|
return 0
|
|
}
|
|
}
|