- Add native_queue.go with CGO bindings for queue operations - Add native_queue_stub.go for non-CGO builds - Add hash_selector to choose between Go and native implementations - Add native_bridge_libs.go for CGO builds with native_libs tag - Add native_bridge_nocgo.go stub for non-CGO builds - Update queue errors and task handling for native integration - Update worker config and runloop for native library support
285 lines
9.4 KiB
Go
285 lines
9.4 KiB
Go
// Package queue provides task queue functionality.
|
|
package queue
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
"syscall"
|
|
)
|
|
|
|
// FailureClass labels the kind of failure a task ended with.
// The label is what retry policy and user guidance are selected from.
type FailureClass string
|
|
|
|
const (
|
|
FailureInfrastructure FailureClass = "infrastructure" // OOM kill, SIGKILL, node failure
|
|
FailureCode FailureClass = "code" // non-zero exit, exception, assertion
|
|
FailureData FailureClass = "data" // hash mismatch, dataset unreachable
|
|
FailureResource FailureClass = "resource" // GPU OOM, disk full, timeout
|
|
FailureUnknown FailureClass = "unknown" // cannot classify
|
|
)
|
|
|
|
// ClassifyFailure determines the failure class from exit signals, codes, and log output
|
|
func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass {
|
|
logLower := strings.ToLower(logTail)
|
|
|
|
// Killed by OS — infrastructure failure
|
|
if signal == syscall.SIGKILL {
|
|
return FailureInfrastructure
|
|
}
|
|
|
|
// CUDA OOM or GPU resource issues
|
|
if strings.Contains(logLower, "cuda out of memory") ||
|
|
strings.Contains(logLower, "cuda error") ||
|
|
strings.Contains(logLower, "gpu oom") {
|
|
return FailureResource
|
|
}
|
|
|
|
// General OOM (non-GPU) — infrastructure
|
|
if strings.Contains(logLower, "out of memory") ||
|
|
strings.Contains(logLower, "oom") ||
|
|
strings.Contains(logLower, "cannot allocate memory") {
|
|
return FailureInfrastructure
|
|
}
|
|
|
|
// Dataset hash check failed — data failure
|
|
if strings.Contains(logLower, "hash mismatch") ||
|
|
strings.Contains(logLower, "checksum failed") ||
|
|
strings.Contains(logLower, "dataset not found") ||
|
|
strings.Contains(logLower, "dataset unreachable") {
|
|
return FailureData
|
|
}
|
|
|
|
// Disk/resource exhaustion
|
|
if strings.Contains(logLower, "no space left") ||
|
|
strings.Contains(logLower, "disk full") ||
|
|
strings.Contains(logLower, "disk quota exceeded") {
|
|
return FailureResource
|
|
}
|
|
|
|
// Timeout — resource (time budget exceeded)
|
|
if strings.Contains(logLower, "timeout") ||
|
|
strings.Contains(logLower, "deadline exceeded") ||
|
|
strings.Contains(logLower, "context deadline") {
|
|
return FailureResource
|
|
}
|
|
|
|
// Network issues — infrastructure
|
|
if strings.Contains(logLower, "connection refused") ||
|
|
strings.Contains(logLower, "connection reset") ||
|
|
strings.Contains(logLower, "no route to host") ||
|
|
strings.Contains(logLower, "network unreachable") {
|
|
return FailureInfrastructure
|
|
}
|
|
|
|
// Non-zero exit without specific signal — code failure
|
|
if exitCode != 0 {
|
|
return FailureCode
|
|
}
|
|
|
|
return FailureUnknown
|
|
}
|
|
|
|
// FailureInfo contains complete failure context for the manifest
|
|
type FailureInfo struct {
|
|
Class FailureClass `json:"class"`
|
|
ExitCode int `json:"exit_code,omitempty"`
|
|
Signal string `json:"signal,omitempty"`
|
|
LogTail string `json:"log_tail,omitempty"`
|
|
Suggestion string `json:"suggestion,omitempty"`
|
|
AutoRetried bool `json:"auto_retried,omitempty"`
|
|
RetryCount int `json:"retry_count,omitempty"`
|
|
RetryCap int `json:"retry_cap,omitempty"`
|
|
ClassifiedAt string `json:"classified_at,omitempty"`
|
|
Context map[string]string `json:"context,omitempty"`
|
|
}
|
|
|
|
// GetFailureSuggestion returns user guidance based on failure class
|
|
func GetFailureSuggestion(class FailureClass, logTail string) string {
|
|
switch class {
|
|
case FailureInfrastructure:
|
|
return "Infrastructure failure (node died, OOM kill). Auto-retry in progress."
|
|
case FailureCode:
|
|
return "Code error in training script. Fix before resubmitting."
|
|
case FailureData:
|
|
return "Data verification failed. Check dataset accessibility and hashes."
|
|
case FailureResource:
|
|
if strings.Contains(strings.ToLower(logTail), "cuda") {
|
|
return "GPU OOM. Increase --gpu-memory or use smaller batch size."
|
|
}
|
|
if strings.Contains(strings.ToLower(logTail), "disk") {
|
|
return "Disk full. Clean up storage or request more space."
|
|
}
|
|
return "Resource exhausted. Try with larger allocation or reduced load."
|
|
default:
|
|
return "Unknown failure. Review logs and contact support if persistent."
|
|
}
|
|
}
|
|
|
|
// ShouldAutoRetry determines if a failure class should auto-retry
|
|
// infrastructure: 3 retries transparent
|
|
// resource: 1 retry with backoff
|
|
// unknown: 1 retry (conservative - was retryable in old system)
|
|
// others: never auto-retry
|
|
func ShouldAutoRetry(class FailureClass, retryCount int) bool {
|
|
switch class {
|
|
case FailureInfrastructure:
|
|
return retryCount < 3
|
|
case FailureResource:
|
|
return retryCount < 1
|
|
case FailureUnknown:
|
|
// Unknown failures get 1 retry attempt (conservative, matches old behavior)
|
|
return retryCount < 1
|
|
default:
|
|
// code, data failures never auto-retry
|
|
return false
|
|
}
|
|
}
|
|
|
|
// RetryDelayForClass returns appropriate backoff for the failure class
|
|
func RetryDelayForClass(class FailureClass, retryCount int) int {
|
|
switch class {
|
|
case FailureInfrastructure:
|
|
// Immediate retry for infrastructure
|
|
return 0
|
|
case FailureResource:
|
|
// Short backoff for resource issues
|
|
return 30
|
|
default:
|
|
return 0
|
|
}
|
|
}
|
|
|
|
// ErrorCategory represents the type of error encountered.
//
// Deprecated: use FailureClass instead.
type ErrorCategory string
|
|
|
|
// Error categories for task classification and retry logic
|
|
const (
|
|
ErrorNetwork ErrorCategory = "network" // Network connectivity issues
|
|
ErrorResource ErrorCategory = "resource" // Resource exhaustion (OOM, disk full)
|
|
ErrorRateLimit ErrorCategory = "rate_limit" // Rate limiting or throttling
|
|
ErrorAuth ErrorCategory = "auth" // Authentication/authorization failures
|
|
ErrorValidation ErrorCategory = "validation" // Input validation errors
|
|
ErrorTimeout ErrorCategory = "timeout" // Operation timeout
|
|
ErrorPermanent ErrorCategory = "permanent" // Non-retryable errors
|
|
ErrorUnknown ErrorCategory = "unknown" // Unclassified errors
|
|
)
|
|
|
|
// TaskError wraps an error with category and context
|
|
type TaskError struct {
|
|
Category ErrorCategory
|
|
Message string
|
|
Cause error
|
|
Context map[string]string
|
|
}
|
|
|
|
func (e *TaskError) Error() string {
|
|
if e.Cause != nil {
|
|
return fmt.Sprintf("[%s] %s: %v", e.Category, e.Message, e.Cause)
|
|
}
|
|
return fmt.Sprintf("[%s] %s", e.Category, e.Message)
|
|
}
|
|
|
|
func (e *TaskError) Unwrap() error {
|
|
return e.Cause
|
|
}
|
|
|
|
// NewTaskError creates a new categorized error
|
|
func NewTaskError(category ErrorCategory, message string, cause error) *TaskError {
|
|
return &TaskError{
|
|
Category: category,
|
|
Message: message,
|
|
Cause: cause,
|
|
Context: make(map[string]string),
|
|
}
|
|
}
|
|
|
|
// ClassifyError categorizes an error for retry logic (DEPRECATED: use classifyFailure)
|
|
// This function now delegates to the more accurate classifyFailure
|
|
func ClassifyError(err error) ErrorCategory {
|
|
if err == nil {
|
|
return ErrorUnknown
|
|
}
|
|
|
|
// Check if already classified as TaskError
|
|
var taskErr *TaskError
|
|
if errors.As(err, &taskErr) {
|
|
return taskErr.Category
|
|
}
|
|
|
|
// Delegate to new FailureClass classification
|
|
failureClass := ClassifyFailure(0, nil, err.Error())
|
|
|
|
// Map FailureClass back to ErrorCategory for backward compatibility
|
|
switch failureClass {
|
|
case FailureInfrastructure:
|
|
return ErrorNetwork
|
|
case FailureCode:
|
|
return ErrorPermanent
|
|
case FailureData:
|
|
return ErrorValidation
|
|
case FailureResource:
|
|
return ErrorResource
|
|
default:
|
|
return ErrorUnknown
|
|
}
|
|
}
|
|
|
|
// IsRetryable determines if an error category should be retried
|
|
// Now delegates to ShouldAutoRetry with FailureClass mapping
|
|
func IsRetryable(category ErrorCategory) bool {
|
|
// Map ErrorCategory to FailureClass
|
|
var failureClass FailureClass
|
|
switch category {
|
|
case ErrorNetwork:
|
|
failureClass = FailureInfrastructure
|
|
case ErrorResource, ErrorTimeout:
|
|
failureClass = FailureResource
|
|
case ErrorAuth, ErrorValidation, ErrorPermanent:
|
|
failureClass = FailureCode
|
|
default:
|
|
failureClass = FailureUnknown
|
|
}
|
|
return ShouldAutoRetry(failureClass, 0)
|
|
}
|
|
|
|
// GetUserMessage returns a user-friendly error message with suggestions
|
|
func GetUserMessage(category ErrorCategory, err error) string {
|
|
messages := map[ErrorCategory]string{
|
|
ErrorNetwork: "Network connectivity issue. Please check your network " +
|
|
"connection and try again.",
|
|
ErrorResource: "System resource exhausted. The system may be under heavy load. " +
|
|
"Try again later or contact support.",
|
|
ErrorRateLimit: "Rate limit exceeded. Please wait a moment before retrying.",
|
|
ErrorAuth: "Authentication failed. Please check your API key or credentials.",
|
|
ErrorValidation: "Invalid input. Please review your request and correct any errors.",
|
|
ErrorTimeout: "Operation timed out. The task may be too complex or the system is slow. " +
|
|
"Try again or simplify the request.",
|
|
ErrorPermanent: "A permanent error occurred. This task cannot be retried automatically.",
|
|
ErrorUnknown: "An unexpected error occurred. If this persists, please contact support.",
|
|
}
|
|
|
|
baseMsg := messages[category]
|
|
if err != nil {
|
|
return fmt.Sprintf("%s (Details: %v)", baseMsg, err)
|
|
}
|
|
return baseMsg
|
|
}
|
|
|
|
// RetryDelay calculates the retry delay based on error category and retry count
|
|
// Now delegates to RetryDelayForClass with FailureClass mapping
|
|
func RetryDelay(category ErrorCategory, retryCount int) int {
|
|
// Map ErrorCategory to FailureClass
|
|
var failureClass FailureClass
|
|
switch category {
|
|
case ErrorNetwork:
|
|
failureClass = FailureInfrastructure
|
|
case ErrorResource, ErrorTimeout:
|
|
failureClass = FailureResource
|
|
default:
|
|
failureClass = FailureUnknown
|
|
}
|
|
return RetryDelayForClass(failureClass, retryCount)
|
|
}
|