fetch_ml/internal/queue/errors.go
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

224 lines
5.9 KiB
Go

// Package queue provides task queue functionality
package queue
import (
"errors"
"fmt"
"strings"
)
// ErrorCategory is a string label identifying the kind of failure a task hit.
// ClassifyError produces these values; IsRetryable, RetryDelay and
// GetUserMessage take them as input.
type ErrorCategory string
// Error categories for task classification and retry logic.
// IsRetryable reports true for ErrorNetwork, ErrorResource, ErrorRateLimit,
// ErrorTimeout and ErrorUnknown, and false for ErrorAuth, ErrorValidation and
// ErrorPermanent.
const (
ErrorNetwork ErrorCategory = "network" // Network connectivity issues
ErrorResource ErrorCategory = "resource" // Resource exhaustion (OOM, disk full)
ErrorRateLimit ErrorCategory = "rate_limit" // Rate limiting or throttling
ErrorAuth ErrorCategory = "auth" // Authentication/authorization failures
ErrorValidation ErrorCategory = "validation" // Input validation errors
ErrorTimeout ErrorCategory = "timeout" // Operation timeout
ErrorPermanent ErrorCategory = "permanent" // Non-retryable errors
ErrorUnknown ErrorCategory = "unknown" // Unclassified errors
)
// TaskError wraps an error with category and context.
// It implements the error interface and exposes its Cause through Unwrap.
type TaskError struct {
// Category is the classification that ClassifyError returns for this error.
Category ErrorCategory
// Message is the human-readable description included in Error().
Message string
// Cause is the underlying error; it may be nil, in which case Error() omits it.
Cause error
// Context holds free-form key/value details; NewTaskError initializes it to
// an empty, writable map.
Context map[string]string
}
// Error renders the error as "[category] message", appending ": cause" when a
// Cause is present.
func (e *TaskError) Error() string {
	head := fmt.Sprintf("[%s] %s", e.Category, e.Message)
	if e.Cause == nil {
		return head
	}
	return fmt.Sprintf("%s: %v", head, e.Cause)
}
// Unwrap returns the underlying Cause (nil when none was recorded), which lets
// errors.Is and errors.As walk through a TaskError to the wrapped error.
func (e *TaskError) Unwrap() error {
return e.Cause
}
// NewTaskError builds a TaskError for the given category, message and
// underlying cause (which may be nil). The returned error carries an empty,
// ready-to-write Context map.
func NewTaskError(category ErrorCategory, message string, cause error) *TaskError {
	taskErr := new(TaskError)
	taskErr.Category = category
	taskErr.Message = message
	taskErr.Cause = cause
	taskErr.Context = map[string]string{}
	return taskErr
}
// ClassifyError categorizes an error for retry logic.
//
// A nil error yields ErrorUnknown. If the error chain contains a *TaskError,
// that error's Category is returned as-is. Otherwise the lowercased error text
// is compared against the indicator groups in classifyRules, in order, and the
// category of the first matching group is returned. Text that matches nothing
// is ErrorUnknown.
func ClassifyError(err error) ErrorCategory {
	if err == nil {
		return ErrorUnknown
	}

	// Check if already classified.
	var taskErr *TaskError
	if errors.As(err, &taskErr) {
		// errors.As also succeeds for a typed-nil *TaskError stored in the
		// chain; dereferencing it (or calling Error on it) would panic.
		if taskErr == nil {
			return ErrorUnknown
		}
		return taskErr.Category
	}

	text := strings.ToLower(err.Error())
	for _, rule := range classifyRules {
		if rule.matches(text) {
			return rule.category
		}
	}
	return ErrorUnknown
}

// classifyRule ties a group of textual indicators to the category they imply.
type classifyRule struct {
	category ErrorCategory
	// substrings match anywhere in the lowercased error text.
	substrings []string
	// wordStarts must begin at a word boundary, so "oom" matches "oom-killer"
	// and "oomkilled" but not "room" or "zoom".
	wordStarts []string
	// codes are numeric status codes that must stand alone, so "400" matches
	// "status 400" but not "4000 rows" or "port 14001".
	codes []string
}

// classifyRules lists the indicator groups in evaluation order; earlier groups
// win when a message matches more than one (e.g. "i/o timeout" is a network
// error, not a generic timeout).
var classifyRules = []classifyRule{
	{
		// Network errors (retryable).
		category: ErrorNetwork,
		substrings: []string{
			"connection refused",
			"connection reset",
			"connection timeout",
			"no route to host",
			"network unreachable",
			"temporary failure",
			"dns",
			"dial tcp",
			"i/o timeout",
		},
	},
	{
		// Resource errors (retryable after delay).
		category: ErrorResource,
		substrings: []string{
			"out of memory",
			"no space left",
			"disk full",
			"resource temporarily unavailable",
			"too many open files",
			"cannot allocate memory",
		},
		wordStarts: []string{"oom"},
	},
	{
		// Rate limiting (retryable with backoff).
		category: ErrorRateLimit,
		substrings: []string{
			"rate limit",
			"too many requests",
			"throttle",
			"quota exceeded",
		},
		codes: []string{"429"},
	},
	{
		// Timeout errors (retryable).
		category: ErrorTimeout,
		substrings: []string{
			"timeout",
			"deadline exceeded",
			"context deadline",
		},
	},
	{
		// Authentication errors (not retryable).
		category: ErrorAuth,
		substrings: []string{
			"unauthorized",
			"forbidden",
			"authentication failed",
			"invalid credentials",
			"access denied",
		},
		codes: []string{"401", "403"},
	},
	{
		// Validation errors (not retryable).
		category: ErrorValidation,
		substrings: []string{
			"invalid input",
			"validation failed",
			"bad request",
			"malformed",
		},
		codes: []string{"400"},
	},
}

// matches reports whether any indicator of the rule occurs in text, which must
// already be lowercased.
func (r classifyRule) matches(text string) bool {
	for _, s := range r.substrings {
		if strings.Contains(text, s) {
			return true
		}
	}
	for _, w := range r.wordStarts {
		if matchesToken(text, w, false) {
			return true
		}
	}
	for _, c := range r.codes {
		if matchesToken(text, c, true) {
			return true
		}
	}
	return false
}

// matchesToken reports whether token occurs in text at a position where the
// preceding byte is not an ASCII letter or digit. When boundRight is true the
// byte following the token must not be an ASCII letter or digit either.
func matchesToken(text, token string, boundRight bool) bool {
	for offset := 0; offset < len(text); {
		idx := strings.Index(text[offset:], token)
		if idx < 0 {
			return false
		}
		start := offset + idx
		end := start + len(token)
		leftOK := start == 0 || !isWordByte(text[start-1])
		rightOK := !boundRight || end == len(text) || !isWordByte(text[end])
		if leftOK && rightOK {
			return true
		}
		// Keep scanning: a later occurrence may sit on a proper boundary.
		offset = start + 1
	}
	return false
}

// isWordByte reports whether c is an ASCII letter or digit.
func isWordByte(c byte) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')
}
// IsRetryable reports whether errors of the given category are worth retrying.
// Network, resource, rate-limit, timeout and unknown errors are retryable;
// auth, validation, permanent and any unrecognized category are not.
func IsRetryable(category ErrorCategory) bool {
	retryable := category == ErrorNetwork ||
		category == ErrorResource ||
		category == ErrorRateLimit ||
		category == ErrorTimeout ||
		category == ErrorUnknown
	return retryable
}
// GetUserMessage returns a user-friendly error message with suggestions.
//
// The base text is chosen by category. A category outside the declared set
// falls back to the ErrorUnknown text, so the result is never empty. When err
// is non-nil its text is appended as " (Details: <err>)".
func GetUserMessage(category ErrorCategory, err error) string {
	var baseMsg string
	switch category {
	case ErrorNetwork:
		baseMsg = "Network connectivity issue. Please check your network connection and try again."
	case ErrorResource:
		baseMsg = "System resource exhausted. The system may be under heavy load. Try again later or contact support."
	case ErrorRateLimit:
		baseMsg = "Rate limit exceeded. Please wait a moment before retrying."
	case ErrorAuth:
		baseMsg = "Authentication failed. Please check your API key or credentials."
	case ErrorValidation:
		baseMsg = "Invalid input. Please review your request and correct any errors."
	case ErrorTimeout:
		baseMsg = "Operation timed out. The task may be too complex or the system is slow. " +
			"Try again or simplify the request."
	case ErrorPermanent:
		baseMsg = "A permanent error occurred. This task cannot be retried automatically."
	default:
		// Covers ErrorUnknown and any category value not declared in this
		// package, which previously produced an empty base message.
		baseMsg = "An unexpected error occurred. If this persists, please contact support."
	}

	if err == nil {
		return baseMsg
	}
	return fmt.Sprintf("%s (Details: %v)", baseMsg, err)
}
// RetryDelay calculates the retry delay, in seconds, based on error category
// and retry count.
//
// The delay starts at a per-category base and doubles with every retry:
//   - ErrorRateLimit: 10s, 20s, 40s, 80s, ... capped at 300s
//   - ErrorResource:  5s, 10s, 20s, 40s, ... capped at 120s
//   - ErrorAuth, ErrorValidation, ErrorPermanent: always 0 (not retried)
//   - everything else (network, timeout, unknown, unrecognized): 1s, 2s, 4s, ...
//
// A negative retryCount is treated as 0. The uncapped categories saturate at
// 1<<30 seconds, so the result never overflows or turns negative however large
// retryCount is.
func RetryDelay(category ErrorCategory, retryCount int) int {
	// saturation keeps the doubling below the int range even on 32-bit
	// platforms.
	const saturation = 1 << 30

	var base, limit int
	switch category {
	case ErrorAuth, ErrorValidation, ErrorPermanent:
		// No retry for auth, validation, or permanent errors.
		return 0
	case ErrorRateLimit:
		// Longer backoff for rate limits.
		base, limit = 10, 300
	case ErrorResource:
		// Medium backoff for resource issues.
		base, limit = 5, 120
	default:
		// Standard exponential backoff for network, timeout, unknown and any
		// unexpected category.
		base, limit = 1, saturation
	}

	// Double step by step instead of shifting by retryCount: a negative shift
	// count panics and a shift by the int width or more wraps to zero or a
	// negative value. The loop stops once the limit is reached, so delay never
	// grows past twice the limit.
	delay := base
	for i := 0; i < retryCount && delay < limit; i++ {
		delay *= 2
	}
	if delay > limit {
		delay = limit
	}
	return delay
}
// intMin returns the smaller of a and b.
func intMin(a, b int) int {
	smaller := a
	if b < smaller {
		smaller = b
	}
	return smaller
}