fetch_ml/internal/domain/errors.go
Jeremie Fraeys 6866ba9366
refactor(queue): integrate scheduler backend and storage improvements
Update queue and storage systems for scheduler integration:
- Queue backend with scheduler coordination
- Filesystem queue with batch operations
- Deduplication with tenant-aware keys
- Storage layer with audit logging hooks
- Domain models (Task, Events, Errors) with scheduler fields
- Database layer with tenant isolation
- Dataset storage with integrity checks
2026-02-26 12:06:46 -05:00

150 lines
5 KiB
Go

package domain
import (
"os"
"strings"
"syscall"
)
// FailureClass names the category a failed task falls into. The category is
// used to pick a retry policy and the guidance shown to the user.
type FailureClass string

// Known failure categories.
const (
	// FailureInfrastructure covers OOM kills, SIGKILL, and node failures.
	FailureInfrastructure FailureClass = "infrastructure"

	// FailureCode covers non-zero exits, exceptions, and assertions.
	FailureCode FailureClass = "code"

	// FailureData covers hash mismatches and unreachable datasets.
	FailureData FailureClass = "data"

	// FailureResource covers GPU OOM, full disks, and timeouts.
	FailureResource FailureClass = "resource"

	// FailureUnknown is used when the failure cannot be classified.
	FailureUnknown FailureClass = "unknown"
)
// ClassifyFailure maps a finished task's exit code, terminating signal, and
// log tail onto a FailureClass.
//
// Rules are evaluated in order and the first match wins:
//
//  1. signal is SIGKILL                                   -> FailureInfrastructure
//  2. log mentions CUDA OOM, a CUDA error, or "gpu oom"   -> FailureResource
//  3. log mentions a general (non-GPU) out-of-memory      -> FailureInfrastructure
//  4. log mentions a hash/checksum or dataset problem     -> FailureData
//  5. log mentions disk exhaustion                        -> FailureResource
//  6. log mentions a timeout or exceeded deadline         -> FailureResource
//  7. log mentions a network connectivity problem         -> FailureInfrastructure
//  8. exitCode is non-zero                                -> FailureCode
//  9. otherwise                                           -> FailureUnknown
//
// Log matching is case-insensitive substring matching, with one exception:
// the bare token "oom" only counts when it is not preceded by a letter, so
// "OOMKilled", "oom-killer" and "oom_kill" match while ordinary words such as
// "room" or "zoom" do not.
func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass {
	// Killed by the OS: the log is irrelevant, so skip lower-casing it.
	if signal == syscall.SIGKILL {
		return FailureInfrastructure
	}

	logLower := strings.ToLower(logTail)

	switch {
	// GPU checks must precede the general OOM check, because
	// "cuda out of memory" also contains "out of memory".
	case logContainsAny(logLower, "cuda out of memory", "cuda error", "gpu oom"):
		return FailureResource

	// General OOM (non-GPU).
	case logContainsAny(logLower, "out of memory", "cannot allocate memory"),
		logMentionsOOM(logLower):
		return FailureInfrastructure

	// Dataset verification or availability.
	case logContainsAny(logLower,
		"hash mismatch", "checksum failed", "dataset not found", "dataset unreachable"):
		return FailureData

	// Disk exhaustion.
	case logContainsAny(logLower, "no space left", "disk full", "disk quota exceeded"):
		return FailureResource

	// Time budget exceeded.
	case logContainsAny(logLower, "timeout", "deadline exceeded", "context deadline"):
		return FailureResource

	// Network connectivity.
	case logContainsAny(logLower,
		"connection refused", "connection reset", "no route to host", "network unreachable"):
		return FailureInfrastructure

	// Non-zero exit with nothing more specific in the log.
	case exitCode != 0:
		return FailureCode

	default:
		return FailureUnknown
	}
}

// logContainsAny reports whether s contains at least one of needles.
func logContainsAny(s string, needles ...string) bool {
	for _, needle := range needles {
		if strings.Contains(s, needle) {
			return true
		}
	}
	return false
}

// logMentionsOOM reports whether the lower-cased log s contains the token
// "oom" at a position that is not preceded by an ASCII letter. This keeps
// "oomkilled" / "oom-killer" while rejecting words that merely end in "oom".
func logMentionsOOM(s string) bool {
	const token = "oom"
	for from := 0; ; {
		rel := strings.Index(s[from:], token)
		if rel < 0 {
			return false
		}
		at := from + rel
		if at == 0 {
			return true
		}
		// s is already lower-cased, so only 'a'..'z' can be a letter here.
		if prev := s[at-1]; prev < 'a' || prev > 'z' {
			return true
		}
		from = at + 1
	}
}
// FailureInfo is the failure record stored in the manifest. Field order is
// significant: it determines the key order of the encoded JSON.
type FailureInfo struct {
	// Context holds free-form key/value details about the failure.
	Context map[string]string `json:"context,omitempty"`

	// Class is the failure category; it is always encoded, even when empty.
	Class FailureClass `json:"class"`

	// Signal is a textual description of the terminating signal, if any.
	Signal string `json:"signal,omitempty"`

	// LogTail is the captured tail of the task's log output.
	LogTail string `json:"log_tail,omitempty"`

	// Suggestion is the user-facing guidance for this failure.
	Suggestion string `json:"suggestion,omitempty"`

	// ClassifiedAt records when classification happened, stored as a string.
	// NOTE(review): the timestamp format is not defined here; confirm with the writer.
	ClassifiedAt string `json:"classified_at,omitempty"`

	// ExitCode is the process exit code. Because of omitempty, an exit code
	// of 0 is dropped from the encoded JSON.
	ExitCode int `json:"exit_code,omitempty"`

	// RetryCount is the number of retries recorded for the task.
	RetryCount int `json:"retry_count,omitempty"`

	// RetryCap is the retry limit recorded alongside RetryCount.
	RetryCap int `json:"retry_cap,omitempty"`

	// AutoRetried reports whether the task was retried automatically.
	AutoRetried bool `json:"auto_retried,omitempty"`
}
// GetFailureSuggestion returns the user-facing guidance for a failure class.
//
// logTail is consulted only for FailureResource, where it selects between
// GPU, disk, and generic resource guidance. Matching is case-insensitive.
// The GPU and disk markers include "gpu oom" and "no space left", so that
// logs ClassifyFailure treats as GPU or disk exhaustion receive the matching
// advice rather than the generic resource message.
// Any class not listed below yields the "Unknown failure" message.
func GetFailureSuggestion(class FailureClass, logTail string) string {
	switch class {
	case FailureInfrastructure:
		return "Infrastructure failure (node died, OOM kill). Auto-retry in progress."
	case FailureCode:
		return "Code error in training script. Fix before resubmitting."
	case FailureData:
		return "Data verification failed. Check dataset accessibility and hashes."
	case FailureResource:
		// Lower-case once; every marker below is lower-case.
		logLower := strings.ToLower(logTail)
		switch {
		// GPU is checked first, so a log mentioning both GPU and disk
		// problems gets the GPU advice.
		case strings.Contains(logLower, "cuda"), strings.Contains(logLower, "gpu oom"):
			return "GPU OOM. Increase --gpu-memory or use smaller batch size."
		case strings.Contains(logLower, "disk"), strings.Contains(logLower, "no space left"):
			return "Disk full. Clean up storage or request more space."
		default:
			return "Resource exhausted. Try with larger allocation or reduced load."
		}
	default:
		return "Unknown failure. Review logs and contact support if persistent."
	}
}
// ShouldAutoRetry reports whether a task that failed with the given class may
// be retried automatically, given how many retries it has already had.
//
// Retry limits per class:
//   - infrastructure: up to 3 retries
//   - resource:       1 retry
//   - unknown:        1 retry (conservative; these were retryable in the old system)
//   - anything else (code, data, unrecognised classes): never
func ShouldAutoRetry(class FailureClass, retryCount int) bool {
	var limit int

	switch class {
	case FailureInfrastructure:
		limit = 3
	case FailureResource, FailureUnknown:
		limit = 1
	default:
		// Code and data failures need a human fix; retrying cannot help.
		return false
	}

	return retryCount < limit
}
// RetryDelayForClass returns the backoff to apply before retrying a task that
// failed with the given class: 30 for resource failures and 0 (retry
// immediately) for every other class, including infrastructure.
//
// retryCount is accepted but does not currently affect the result.
// NOTE(review): the unit of the returned value is not stated here; presumably
// seconds. Confirm against the caller.
func RetryDelayForClass(class FailureClass, retryCount int) int {
	// Only resource failures back off.
	if class == FailureResource {
		return 30
	}
	return 0
}