// Package queue provides task queue functionality package queue import ( "errors" "fmt" "os" "strings" "syscall" ) // FailureClass represents the classification of a task failure // Used to determine appropriate retry policy and user guidance type FailureClass string const ( FailureInfrastructure FailureClass = "infrastructure" // OOM kill, SIGKILL, node failure FailureCode FailureClass = "code" // non-zero exit, exception, assertion FailureData FailureClass = "data" // hash mismatch, dataset unreachable FailureResource FailureClass = "resource" // GPU OOM, disk full, timeout FailureUnknown FailureClass = "unknown" // cannot classify ) // ClassifyFailure determines the failure class from exit signals, codes, and log output func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass { logLower := strings.ToLower(logTail) // Killed by OS — infrastructure failure if signal == syscall.SIGKILL { return FailureInfrastructure } // CUDA OOM or GPU resource issues if strings.Contains(logLower, "cuda out of memory") || strings.Contains(logLower, "cuda error") || strings.Contains(logLower, "gpu oom") { return FailureResource } // General OOM (non-GPU) — infrastructure if strings.Contains(logLower, "out of memory") || strings.Contains(logLower, "oom") || strings.Contains(logLower, "cannot allocate memory") { return FailureInfrastructure } // Dataset hash check failed — data failure if strings.Contains(logLower, "hash mismatch") || strings.Contains(logLower, "checksum failed") || strings.Contains(logLower, "dataset not found") || strings.Contains(logLower, "dataset unreachable") { return FailureData } // Disk/resource exhaustion if strings.Contains(logLower, "no space left") || strings.Contains(logLower, "disk full") || strings.Contains(logLower, "disk quota exceeded") { return FailureResource } // Timeout — resource (time budget exceeded) if strings.Contains(logLower, "timeout") || strings.Contains(logLower, "deadline exceeded") || strings.Contains(logLower, "context deadline") { return FailureResource } // Network issues — infrastructure if strings.Contains(logLower, "connection refused") || strings.Contains(logLower, "connection reset") || strings.Contains(logLower, "no route to host") || strings.Contains(logLower, "network unreachable") { return FailureInfrastructure } // Non-zero exit without specific signal — code failure if exitCode != 0 { return FailureCode } return FailureUnknown } // FailureInfo contains complete failure context for the manifest type FailureInfo struct { Class FailureClass `json:"class"` ExitCode int `json:"exit_code,omitempty"` Signal string `json:"signal,omitempty"` LogTail string `json:"log_tail,omitempty"` Suggestion string `json:"suggestion,omitempty"` AutoRetried bool `json:"auto_retried,omitempty"` RetryCount int `json:"retry_count,omitempty"` RetryCap int `json:"retry_cap,omitempty"` ClassifiedAt string `json:"classified_at,omitempty"` Context map[string]string `json:"context,omitempty"` } // GetFailureSuggestion returns user guidance based on failure class func GetFailureSuggestion(class FailureClass, logTail string) string { switch class { case FailureInfrastructure: return "Infrastructure failure (node died, OOM kill). Auto-retry in progress." case FailureCode: return "Code error in training script. Fix before resubmitting." case FailureData: return "Data verification failed. Check dataset accessibility and hashes." case FailureResource: if strings.Contains(strings.ToLower(logTail), "cuda") { return "GPU OOM. Increase --gpu-memory or use smaller batch size." } if strings.Contains(strings.ToLower(logTail), "disk") { return "Disk full. Clean up storage or request more space." } return "Resource exhausted. Try with larger allocation or reduced load." default: return "Unknown failure. Review logs and contact support if persistent." } } // ShouldAutoRetry determines if a failure class should auto-retry // infrastructure: 3 retries transparent // resource: 1 retry with backoff // unknown: 1 retry (conservative - was retryable in old system) // others: never auto-retry func ShouldAutoRetry(class FailureClass, retryCount int) bool { switch class { case FailureInfrastructure: return retryCount < 3 case FailureResource: return retryCount < 1 case FailureUnknown: // Unknown failures get 1 retry attempt (conservative, matches old behavior) return retryCount < 1 default: // code, data failures never auto-retry return false } } // RetryDelayForClass returns appropriate backoff for the failure class func RetryDelayForClass(class FailureClass, retryCount int) int { switch class { case FailureInfrastructure: // Immediate retry for infrastructure return 0 case FailureResource: // Short backoff for resource issues return 30 default: return 0 } } // ErrorCategory represents the type of error encountered (DEPRECATED: use FailureClass) type ErrorCategory string // Error categories for task classification and retry logic const ( ErrorNetwork ErrorCategory = "network" // Network connectivity issues ErrorResource ErrorCategory = "resource" // Resource exhaustion (OOM, disk full) ErrorRateLimit ErrorCategory = "rate_limit" // Rate limiting or throttling ErrorAuth ErrorCategory = "auth" // Authentication/authorization failures ErrorValidation ErrorCategory = "validation" // Input validation errors ErrorTimeout ErrorCategory = "timeout" // Operation timeout ErrorPermanent ErrorCategory = "permanent" // Non-retryable errors ErrorUnknown ErrorCategory = "unknown" // Unclassified errors ) // TaskError wraps an error with category and context type TaskError struct { Category ErrorCategory Message string Cause error Context map[string]string } func (e *TaskError) Error() string { if e.Cause != nil { return fmt.Sprintf("[%s] %s: %v", e.Category, e.Message, e.Cause) } return fmt.Sprintf("[%s] %s", e.Category, e.Message) } func (e *TaskError) Unwrap() error { return e.Cause } // NewTaskError creates a new categorized error func NewTaskError(category ErrorCategory, message string, cause error) *TaskError { return &TaskError{ Category: category, Message: message, Cause: cause, Context: make(map[string]string), } } // ClassifyError categorizes an error for retry logic (DEPRECATED: use classifyFailure) // This function now delegates to the more accurate classifyFailure func ClassifyError(err error) ErrorCategory { if err == nil { return ErrorUnknown } // Check if already classified as TaskError var taskErr *TaskError if errors.As(err, &taskErr) { return taskErr.Category } // Delegate to new FailureClass classification failureClass := ClassifyFailure(0, nil, err.Error()) // Map FailureClass back to ErrorCategory for backward compatibility switch failureClass { case FailureInfrastructure: return ErrorNetwork case FailureCode: return ErrorPermanent case FailureData: return ErrorValidation case FailureResource: return ErrorResource default: return ErrorUnknown } } // IsRetryable determines if an error category should be retried // Now delegates to ShouldAutoRetry with FailureClass mapping func IsRetryable(category ErrorCategory) bool { // Map ErrorCategory to FailureClass var failureClass FailureClass switch category { case ErrorNetwork: failureClass = FailureInfrastructure case ErrorResource, ErrorTimeout: failureClass = FailureResource case ErrorAuth, ErrorValidation, ErrorPermanent: failureClass = FailureCode default: failureClass = FailureUnknown } return ShouldAutoRetry(failureClass, 0) } // GetUserMessage returns a user-friendly error message with suggestions func GetUserMessage(category ErrorCategory, err error) string { messages := map[ErrorCategory]string{ ErrorNetwork: "Network connectivity issue. Please check your network " + "connection and try again.", ErrorResource: "System resource exhausted. The system may be under heavy load. " + "Try again later or contact support.", ErrorRateLimit: "Rate limit exceeded. Please wait a moment before retrying.", ErrorAuth: "Authentication failed. Please check your API key or credentials.", ErrorValidation: "Invalid input. Please review your request and correct any errors.", ErrorTimeout: "Operation timed out. The task may be too complex or the system is slow. " + "Try again or simplify the request.", ErrorPermanent: "A permanent error occurred. This task cannot be retried automatically.", ErrorUnknown: "An unexpected error occurred. If this persists, please contact support.", } baseMsg := messages[category] if err != nil { return fmt.Sprintf("%s (Details: %v)", baseMsg, err) } return baseMsg } // RetryDelay calculates the retry delay based on error category and retry count // Now delegates to RetryDelayForClass with FailureClass mapping func RetryDelay(category ErrorCategory, retryCount int) int { // Map ErrorCategory to FailureClass var failureClass FailureClass switch category { case ErrorNetwork: failureClass = FailureInfrastructure case ErrorResource, ErrorTimeout: failureClass = FailureResource default: failureClass = FailureUnknown } return RetryDelayForClass(failureClass, retryCount) }