package domain

import (
	"os"
	"strings"
	"syscall"
)

// FailureClass represents the classification of a task failure.
// It is used to determine the appropriate retry policy and user guidance.
type FailureClass string

// Failure classes returned by ClassifyFailure.
const (
	FailureInfrastructure FailureClass = "infrastructure" // OOM kill, SIGKILL, node failure
	FailureCode           FailureClass = "code"           // non-zero exit, exception, assertion
	FailureData           FailureClass = "data"           // hash mismatch, dataset unreachable
	FailureResource       FailureClass = "resource"       // GPU OOM, disk full, timeout
	FailureUnknown        FailureClass = "unknown"        // cannot classify
)

// ClassifyFailure determines the failure class from the exit signal, the exit
// code, and the tail of the log output.
//
// Checks run in a fixed priority order and the first match wins:
//
//  1. signal is SIGKILL                          -> FailureInfrastructure
//  2. CUDA / GPU memory messages in the log      -> FailureResource
//  3. general out-of-memory messages             -> FailureInfrastructure
//  4. dataset hash / availability messages       -> FailureData
//  5. disk exhaustion messages                   -> FailureResource
//  6. timeout / deadline messages                -> FailureResource
//  7. network connectivity messages              -> FailureInfrastructure
//  8. non-zero exitCode                          -> FailureCode
//  9. otherwise                                  -> FailureUnknown
//
// Log matching is case-insensitive. signal may be nil.
func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass {
	// Killed by OS — infrastructure failure. Decided before touching the log
	// so the lower-casing pass is skipped when the signal alone is enough.
	if signal == syscall.SIGKILL {
		return FailureInfrastructure
	}

	logLower := strings.ToLower(logTail)

	switch {
	// CUDA OOM or GPU resource issues. Must precede the general OOM check,
	// because "cuda out of memory" also contains "out of memory".
	case logContainsAny(logLower, "cuda out of memory", "cuda error", "gpu oom"):
		return FailureResource

	// General OOM (non-GPU) — infrastructure.
	case logContainsAny(logLower, "out of memory", "cannot allocate memory"),
		logMentionsOOM(logLower):
		return FailureInfrastructure

	// Dataset hash check failed or dataset missing — data failure.
	case logContainsAny(logLower, "hash mismatch", "checksum failed", "dataset not found", "dataset unreachable"):
		return FailureData

	// Disk/resource exhaustion.
	case logContainsAny(logLower, "no space left", "disk full", "disk quota exceeded"):
		return FailureResource

	// Timeout — resource (time budget exceeded).
	case logContainsAny(logLower, "timeout", "deadline exceeded", "context deadline"):
		return FailureResource

	// Network issues — infrastructure.
	case logContainsAny(logLower, "connection refused", "connection reset", "no route to host", "network unreachable"):
		return FailureInfrastructure

	// Non-zero exit without a more specific indicator — code failure.
	case exitCode != 0:
		return FailureCode
	}

	return FailureUnknown
}

// logContainsAny reports whether s contains at least one of the given
// substrings. Callers pass an already lower-cased s and lower-case patterns.
func logContainsAny(s string, patterns ...string) bool {
	for _, p := range patterns {
		if strings.Contains(s, p) {
			return true
		}
	}
	return false
}

// logMentionsOOM reports whether the lower-cased log contains "oom" as its own
// token ("oom", "oom-killer", "oom_kill") or as the prefix of "oomkill..."
// ("oomkilled"). A plain substring test would also fire on ordinary words
// such as "room", "zoom" or "boom" and misclassify unrelated failures.
func logMentionsOOM(logLower string) bool {
	const token = "oom"
	for start := 0; start < len(logLower); {
		i := strings.Index(logLower[start:], token)
		if i < 0 {
			return false
		}
		i += start
		end := i + len(token)

		startsWord := i == 0 || !isASCIIWordByte(logLower[i-1])
		endsWord := end == len(logLower) ||
			!isASCIIWordByte(logLower[end]) ||
			strings.HasPrefix(logLower[end:], "kill")
		if startsWord && endsWord {
			return true
		}
		// Two occurrences of "oom" cannot overlap, so resume after this one.
		start = end
	}
	return false
}

// isASCIIWordByte reports whether b is an ASCII letter or digit.
func isASCIIWordByte(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
}

// FailureInfo contains complete failure context for the manifest.
type FailureInfo struct {
	Class        FailureClass      `json:"class"`
	ExitCode     int               `json:"exit_code,omitempty"`
	Signal       string            `json:"signal,omitempty"`
	LogTail      string            `json:"log_tail,omitempty"`
	Suggestion   string            `json:"suggestion,omitempty"`
	AutoRetried  bool              `json:"auto_retried,omitempty"`
	RetryCount   int               `json:"retry_count,omitempty"`
	RetryCap     int               `json:"retry_cap,omitempty"`
	ClassifiedAt string            `json:"classified_at,omitempty"`
	Context      map[string]string `json:"context,omitempty"`
}

// GetFailureSuggestion returns user guidance based on the failure class.
//
// logTail is only consulted for FailureResource, where it selects between the
// GPU, disk and generic resource messages. The GPU and disk checks recognise
// the same log phrases ClassifyFailure uses for those cases ("gpu oom",
// "no space left") in addition to the broader "cuda" and "disk" keywords.
// Any class not listed below yields the "Unknown failure" message.
func GetFailureSuggestion(class FailureClass, logTail string) string {
	switch class {
	case FailureInfrastructure:
		return "Infrastructure failure (node died, OOM kill). Auto-retry in progress."
	case FailureCode:
		return "Code error in training script. Fix before resubmitting."
	case FailureData:
		return "Data verification failed. Check dataset accessibility and hashes."
	case FailureResource:
		logLower := strings.ToLower(logTail)
		switch {
		case logContainsAny(logLower, "cuda", "gpu oom"):
			return "GPU OOM. Increase --gpu-memory or use smaller batch size."
		case logContainsAny(logLower, "disk", "no space left"):
			return "Disk full. Clean up storage or request more space."
		default:
			return "Resource exhausted. Try with larger allocation or reduced load."
		}
	default:
		return "Unknown failure. Review logs and contact support if persistent."
	}
}

// ShouldAutoRetry reports whether a failure of the given class should be
// retried automatically, given how many retries have already happened.
//
//   - infrastructure: up to 3 retries
//   - resource:       1 retry
//   - unknown:        1 retry (conservative; was retryable in the old system)
//   - code, data and any other class: never auto-retried
func ShouldAutoRetry(class FailureClass, retryCount int) bool {
	switch class {
	case FailureInfrastructure:
		return retryCount < 3
	case FailureResource:
		return retryCount < 1
	case FailureUnknown:
		// Unknown failures get 1 retry attempt (conservative, matches old behavior).
		return retryCount < 1
	default:
		// Code and data failures never auto-retry.
		return false
	}
}

// RetryDelayForClass returns the backoff to apply before retrying a failure
// of the given class: 30 for FailureResource and 0 for every other class.
//
// retryCount is accepted but currently not used, so the delay does not grow
// with the number of attempts.
// NOTE(review): the unit of the returned value is not stated in this file —
// presumably seconds; confirm against the caller.
func RetryDelayForClass(class FailureClass, retryCount int) int {
	switch class {
	case FailureInfrastructure:
		// Immediate retry for infrastructure.
		return 0
	case FailureResource:
		// Short backoff for resource issues.
		return 30
	default:
		return 0
	}
}