fetch_ml/internal/queue/errors.go
Jeremie Fraeys a93b6715fd
feat: add native library bridge and queue integration
- Add native_queue.go with CGO bindings for queue operations
- Add native_queue_stub.go for non-CGO builds
- Add hash_selector to choose between Go and native implementations
- Add native_bridge_libs.go for CGO builds with native_libs tag
- Add native_bridge_nocgo.go stub for non-CGO builds
- Update queue errors and task handling for native integration
- Update worker config and runloop for native library support
2026-02-16 20:38:30 -05:00

285 lines
9.4 KiB
Go

// Package queue provides task queue functionality
package queue
import (
"errors"
"fmt"
"os"
"strings"
"syscall"
)
// FailureClass labels the cause of a task failure. The label is used to
// pick a retry policy and the guidance shown to the user.
type FailureClass string

// Failure classes recognized by the queue.
const (
	// FailureInfrastructure covers OOM kills, SIGKILL, and node failures.
	FailureInfrastructure FailureClass = "infrastructure"
	// FailureCode covers non-zero exits, exceptions, and failed assertions.
	FailureCode FailureClass = "code"
	// FailureData covers hash mismatches and unreachable datasets.
	FailureData FailureClass = "data"
	// FailureResource covers GPU OOM, full disks, and timeouts.
	FailureResource FailureClass = "resource"
	// FailureUnknown is used when the failure cannot be classified.
	FailureUnknown FailureClass = "unknown"
)
// ClassifyFailure determines the failure class from the terminating signal,
// the exit code, and the tail of the task log.
//
// Checks run in this order and the first match wins:
//
//  1. SIGKILL yields FailureInfrastructure.
//  2. CUDA / GPU memory messages yield FailureResource.
//  3. Host out-of-memory messages yield FailureInfrastructure.
//  4. Dataset hash or availability messages yield FailureData.
//  5. Disk exhaustion messages yield FailureResource.
//  6. Timeout messages yield FailureResource.
//  7. Network error messages yield FailureInfrastructure.
//  8. Any other non-zero exit code yields FailureCode.
//
// Log matching is case-insensitive. When nothing matches and exitCode is
// zero, FailureUnknown is returned.
func ClassifyFailure(exitCode int, signal os.Signal, logTail string) FailureClass {
	// Killed by OS — infrastructure failure. Checked before touching the log
	// so the log is not lower-cased when the signal already decides.
	if signal == syscall.SIGKILL {
		return FailureInfrastructure
	}

	logLower := strings.ToLower(logTail)
	switch {
	// CUDA OOM or GPU resource issues. This must precede the general OOM
	// check, which would otherwise claim "cuda out of memory".
	case logContainsAny(logLower, "cuda out of memory", "cuda error", "gpu oom"):
		return FailureResource

	// General OOM (non-GPU) — infrastructure.
	case logContainsAny(logLower, "out of memory", "cannot allocate memory") ||
		logMentionsOOM(logLower):
		return FailureInfrastructure

	// Dataset hash check failed or dataset missing — data failure.
	case logContainsAny(logLower,
		"hash mismatch", "checksum failed", "dataset not found", "dataset unreachable"):
		return FailureData

	// Disk/resource exhaustion.
	case logContainsAny(logLower, "no space left", "disk full", "disk quota exceeded"):
		return FailureResource

	// Timeout — resource (time budget exceeded).
	case logContainsAny(logLower, "timeout", "deadline exceeded", "context deadline"):
		return FailureResource

	// Network issues — infrastructure.
	case logContainsAny(logLower,
		"connection refused", "connection reset", "no route to host", "network unreachable"):
		return FailureInfrastructure

	// Non-zero exit without a recognized log pattern — code failure.
	case exitCode != 0:
		return FailureCode
	}
	return FailureUnknown
}

// logContainsAny reports whether lowerLog contains at least one of patterns.
// Both the log and the patterns are expected to be lower-case already.
func logContainsAny(lowerLog string, patterns ...string) bool {
	for _, p := range patterns {
		if strings.Contains(lowerLog, p) {
			return true
		}
	}
	return false
}

// logMentionsOOM reports whether lowerLog contains "oom" at the start of a
// word, e.g. "oom", "oom-killer", "oom_score_adj" or "oomkilled". Occurrences
// preceded by an ASCII letter ("room", "zoom", "bloom") are skipped: they are
// ordinary words, and treating them as out-of-memory markers would misclassify
// unrelated failures as infrastructure problems.
func logMentionsOOM(lowerLog string) bool {
	const marker = "oom"
	for from := 0; ; {
		rel := strings.Index(lowerLog[from:], marker)
		if rel < 0 {
			return false
		}
		at := from + rel
		if at == 0 || lowerLog[at-1] < 'a' || lowerLog[at-1] > 'z' {
			return true
		}
		// Embedded in a longer word; keep scanning after this position.
		from = at + 1
	}
}
// FailureInfo carries the failure context recorded in the manifest. Every
// field except Class is omitted from the JSON encoding when it holds its
// zero value.
type FailureInfo struct {
	// Class is the failure classification; it is always serialized.
	Class FailureClass `json:"class"`
	// ExitCode is the exit code associated with the failure.
	ExitCode int `json:"exit_code,omitempty"`
	// Signal is the textual name of the signal, if any.
	Signal string `json:"signal,omitempty"`
	// LogTail is the trailing portion of the task log.
	LogTail string `json:"log_tail,omitempty"`
	// Suggestion is the user-facing guidance for this failure.
	Suggestion string `json:"suggestion,omitempty"`
	// AutoRetried records whether the task was retried automatically.
	AutoRetried bool `json:"auto_retried,omitempty"`
	// RetryCount is the number of retries recorded for the task.
	RetryCount int `json:"retry_count,omitempty"`
	// RetryCap is the retry limit recorded for the task.
	RetryCap int `json:"retry_cap,omitempty"`
	// ClassifiedAt is a string timestamp of the classification.
	// NOTE(review): the timestamp format is not defined in this file.
	ClassifiedAt string `json:"classified_at,omitempty"`
	// Context holds free-form key/value details.
	Context map[string]string `json:"context,omitempty"`
}
// GetFailureSuggestion returns user guidance for a failure class.
//
// For FailureResource the log tail is inspected (case-insensitively) to pick
// a more specific hint. The patterns mirror the GPU and disk patterns that
// ClassifyFailure uses, so a log that was classified as a GPU or disk
// problem also receives the matching hint:
//   - "cuda" or "gpu oom" selects the GPU memory hint;
//   - "disk" or "no space left" selects the disk hint;
//   - anything else (for example a timeout) gets the generic resource hint.
//
// logTail is ignored for every other class. Classes not listed below receive
// the unknown-failure message.
func GetFailureSuggestion(class FailureClass, logTail string) string {
	switch class {
	case FailureInfrastructure:
		return "Infrastructure failure (node died, OOM kill). Auto-retry in progress."
	case FailureCode:
		return "Code error in training script. Fix before resubmitting."
	case FailureData:
		return "Data verification failed. Check dataset accessibility and hashes."
	case FailureResource:
		// Lower-case once; GPU is checked before disk, matching the original
		// priority.
		logLower := strings.ToLower(logTail)
		switch {
		case strings.Contains(logLower, "cuda") || strings.Contains(logLower, "gpu oom"):
			return "GPU OOM. Increase --gpu-memory or use smaller batch size."
		case strings.Contains(logLower, "disk") || strings.Contains(logLower, "no space left"):
			return "Disk full. Clean up storage or request more space."
		default:
			return "Resource exhausted. Try with larger allocation or reduced load."
		}
	default:
		return "Unknown failure. Review logs and contact support if persistent."
	}
}
// ShouldAutoRetry reports whether a task that has already been retried
// retryCount times may be retried automatically once more.
//
// Retry budget per class:
//   - infrastructure: 3 retries
//   - resource: 1 retry
//   - unknown: 1 retry (conservative; such failures were retryable in the
//     old system)
//   - everything else (code, data, unrecognized classes): never
func ShouldAutoRetry(class FailureClass, retryCount int) bool {
	var budget int
	switch class {
	case FailureInfrastructure:
		budget = 3
	case FailureResource, FailureUnknown:
		budget = 1
	default:
		// Code and data failures need a human fix; retrying cannot help.
		return false
	}
	return retryCount < budget
}
// RetryDelayForClass returns the backoff to apply before retrying a failure
// of the given class: 30 for FailureResource and 0 (retry immediately) for
// every other class, including FailureInfrastructure.
//
// retryCount is accepted but not used; the delay does not grow with the
// number of attempts.
// NOTE(review): the unit of the returned value is presumably seconds —
// confirm against the caller.
func RetryDelayForClass(class FailureClass, retryCount int) int {
	if class == FailureResource {
		// Short backoff so the exhausted resource has a chance to free up.
		return 30
	}
	return 0
}
// ErrorCategory represents the type of error encountered.
//
// Deprecated: use FailureClass instead.
type ErrorCategory string

// Error categories used for task classification and retry logic.
const (
	// ErrorNetwork marks network connectivity issues.
	ErrorNetwork ErrorCategory = "network"
	// ErrorResource marks resource exhaustion (OOM, disk full).
	ErrorResource ErrorCategory = "resource"
	// ErrorRateLimit marks rate limiting or throttling.
	ErrorRateLimit ErrorCategory = "rate_limit"
	// ErrorAuth marks authentication/authorization failures.
	ErrorAuth ErrorCategory = "auth"
	// ErrorValidation marks input validation errors.
	ErrorValidation ErrorCategory = "validation"
	// ErrorTimeout marks an operation timeout.
	ErrorTimeout ErrorCategory = "timeout"
	// ErrorPermanent marks non-retryable errors.
	ErrorPermanent ErrorCategory = "permanent"
	// ErrorUnknown marks unclassified errors.
	ErrorUnknown ErrorCategory = "unknown"
)
// TaskError is an error annotated with a category, a human-readable message,
// an optional underlying cause, and free-form key/value context.
type TaskError struct {
	// Category classifies the error.
	Category ErrorCategory
	// Message describes what went wrong.
	Message string
	// Cause is the wrapped error; it may be nil.
	Cause error
	// Context holds additional key/value details.
	Context map[string]string
}

// Error formats the error as "[category] message", followed by ": cause"
// when a cause is present.
func (e *TaskError) Error() string {
	head := fmt.Sprintf("[%s] %s", e.Category, e.Message)
	if e.Cause == nil {
		return head
	}
	return fmt.Sprintf("%s: %v", head, e.Cause)
}

// Unwrap returns the wrapped cause (possibly nil) so that errors.Is and
// errors.As can look through a TaskError.
func (e *TaskError) Unwrap() error {
	return e.Cause
}

// NewTaskError builds a TaskError with the given category, message, and
// cause. Context is initialized to an empty, non-nil map so callers can add
// entries without allocating it first.
func NewTaskError(category ErrorCategory, message string, cause error) *TaskError {
	te := new(TaskError)
	te.Category = category
	te.Message = message
	te.Cause = cause
	te.Context = map[string]string{}
	return te
}
// ClassifyError categorizes an error for retry logic.
//
// A nil error yields ErrorUnknown. If err is, or wraps, a *TaskError, that
// error's Category is returned unchanged. Otherwise the error text is handed
// to ClassifyFailure and the resulting FailureClass is mapped back to an
// ErrorCategory for backward compatibility:
//
//	FailureInfrastructure -> ErrorNetwork
//	FailureCode           -> ErrorPermanent
//	FailureData           -> ErrorValidation
//	FailureResource       -> ErrorResource
//	anything else         -> ErrorUnknown
//
// Deprecated: use ClassifyFailure instead.
func ClassifyError(err error) ErrorCategory {
	if err == nil {
		return ErrorUnknown
	}

	// An explicit category anywhere in the chain wins over text matching.
	var categorized *TaskError
	if errors.As(err, &categorized) {
		return categorized.Category
	}

	// No exit code or signal is available for a plain error, so only the
	// message text drives the classification.
	switch ClassifyFailure(0, nil, err.Error()) {
	case FailureInfrastructure:
		return ErrorNetwork
	case FailureCode:
		return ErrorPermanent
	case FailureData:
		return ErrorValidation
	case FailureResource:
		return ErrorResource
	}
	return ErrorUnknown
}
// IsRetryable reports whether errors of the given category should be
// retried. The category is translated to a FailureClass and the decision is
// delegated to ShouldAutoRetry with a retry count of zero, i.e. it answers
// "is a first retry allowed?".
//
// Mapping:
//   - ErrorNetwork -> FailureInfrastructure
//   - ErrorResource, ErrorTimeout -> FailureResource
//   - ErrorAuth, ErrorValidation, ErrorPermanent -> FailureCode
//   - every other category (including ErrorRateLimit) -> FailureUnknown
func IsRetryable(category ErrorCategory) bool {
	class := FailureUnknown
	switch category {
	case ErrorNetwork:
		class = FailureInfrastructure
	case ErrorResource, ErrorTimeout:
		class = FailureResource
	case ErrorAuth, ErrorValidation, ErrorPermanent:
		class = FailureCode
	}
	return ShouldAutoRetry(class, 0)
}
// GetUserMessage returns a user-friendly message with a suggestion for the
// given error category. When err is non-nil its text is appended as
// " (Details: <err>)".
//
// A category this function does not recognize gets the same message as
// ErrorUnknown, so the result is never empty.
func GetUserMessage(category ErrorCategory, err error) string {
	var baseMsg string
	switch category {
	case ErrorNetwork:
		baseMsg = "Network connectivity issue. Please check your network " +
			"connection and try again."
	case ErrorResource:
		baseMsg = "System resource exhausted. The system may be under heavy load. " +
			"Try again later or contact support."
	case ErrorRateLimit:
		baseMsg = "Rate limit exceeded. Please wait a moment before retrying."
	case ErrorAuth:
		baseMsg = "Authentication failed. Please check your API key or credentials."
	case ErrorValidation:
		baseMsg = "Invalid input. Please review your request and correct any errors."
	case ErrorTimeout:
		baseMsg = "Operation timed out. The task may be too complex or the system is slow. " +
			"Try again or simplify the request."
	case ErrorPermanent:
		baseMsg = "A permanent error occurred. This task cannot be retried automatically."
	default:
		// ErrorUnknown, plus any category value not listed above.
		baseMsg = "An unexpected error occurred. If this persists, please contact support."
	}

	if err != nil {
		return fmt.Sprintf("%s (Details: %v)", baseMsg, err)
	}
	return baseMsg
}
// RetryDelay returns the retry delay for an error category. The category is
// translated to a FailureClass and the result of RetryDelayForClass for that
// class and retryCount is returned.
//
// Mapping:
//   - ErrorNetwork -> FailureInfrastructure
//   - ErrorResource, ErrorTimeout -> FailureResource
//   - every other category -> FailureUnknown
//
// NOTE(review): ErrorRateLimit falls into the last group and so receives
// whatever delay RetryDelayForClass assigns to FailureUnknown — confirm that
// is intended for rate-limited operations.
func RetryDelay(category ErrorCategory, retryCount int) int {
	class := FailureUnknown
	if category == ErrorNetwork {
		class = FailureInfrastructure
	} else if category == ErrorResource || category == ErrorTimeout {
		class = FailureResource
	}
	return RetryDelayForClass(class, retryCount)
}