- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
224 lines
5.9 KiB
Go
224 lines
5.9 KiB
Go
// Package queue provides task queue functionality
|
|
package queue
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// ErrorCategory is a string label naming the kind of failure a task ran into.
// The recognized values are the Error* constants declared in this package.
type ErrorCategory string
|
|
|
|
// Error categories for task classification and retry logic
|
|
const (
|
|
ErrorNetwork ErrorCategory = "network" // Network connectivity issues
|
|
ErrorResource ErrorCategory = "resource" // Resource exhaustion (OOM, disk full)
|
|
ErrorRateLimit ErrorCategory = "rate_limit" // Rate limiting or throttling
|
|
ErrorAuth ErrorCategory = "auth" // Authentication/authorization failures
|
|
ErrorValidation ErrorCategory = "validation" // Input validation errors
|
|
ErrorTimeout ErrorCategory = "timeout" // Operation timeout
|
|
ErrorPermanent ErrorCategory = "permanent" // Non-retryable errors
|
|
ErrorUnknown ErrorCategory = "unknown" // Unclassified errors
|
|
)
|
|
|
|
// TaskError wraps an error with category and context
|
|
type TaskError struct {
|
|
Category ErrorCategory
|
|
Message string
|
|
Cause error
|
|
Context map[string]string
|
|
}
|
|
|
|
func (e *TaskError) Error() string {
|
|
if e.Cause != nil {
|
|
return fmt.Sprintf("[%s] %s: %v", e.Category, e.Message, e.Cause)
|
|
}
|
|
return fmt.Sprintf("[%s] %s", e.Category, e.Message)
|
|
}
|
|
|
|
func (e *TaskError) Unwrap() error {
|
|
return e.Cause
|
|
}
|
|
|
|
// NewTaskError creates a new categorized error
|
|
func NewTaskError(category ErrorCategory, message string, cause error) *TaskError {
|
|
return &TaskError{
|
|
Category: category,
|
|
Message: message,
|
|
Cause: cause,
|
|
Context: make(map[string]string),
|
|
}
|
|
}
|
|
|
|
// ClassifyError categorizes an error for retry logic
|
|
func ClassifyError(err error) ErrorCategory {
|
|
if err == nil {
|
|
return ErrorUnknown
|
|
}
|
|
|
|
// Check if already classified
|
|
var taskErr *TaskError
|
|
if errors.As(err, &taskErr) {
|
|
return taskErr.Category
|
|
}
|
|
|
|
errStr := strings.ToLower(err.Error())
|
|
|
|
// Network errors (retryable)
|
|
networkIndicators := []string{
|
|
"connection refused",
|
|
"connection reset",
|
|
"connection timeout",
|
|
"no route to host",
|
|
"network unreachable",
|
|
"temporary failure",
|
|
"dns",
|
|
"dial tcp",
|
|
"i/o timeout",
|
|
}
|
|
for _, indicator := range networkIndicators {
|
|
if strings.Contains(errStr, indicator) {
|
|
return ErrorNetwork
|
|
}
|
|
}
|
|
|
|
// Resource errors (retryable after delay)
|
|
resourceIndicators := []string{
|
|
"out of memory",
|
|
"oom",
|
|
"no space left",
|
|
"disk full",
|
|
"resource temporarily unavailable",
|
|
"too many open files",
|
|
"cannot allocate memory",
|
|
}
|
|
for _, indicator := range resourceIndicators {
|
|
if strings.Contains(errStr, indicator) {
|
|
return ErrorResource
|
|
}
|
|
}
|
|
|
|
// Rate limiting (retryable with backoff)
|
|
rateLimitIndicators := []string{
|
|
"rate limit",
|
|
"too many requests",
|
|
"throttle",
|
|
"quota exceeded",
|
|
"429",
|
|
}
|
|
for _, indicator := range rateLimitIndicators {
|
|
if strings.Contains(errStr, indicator) {
|
|
return ErrorRateLimit
|
|
}
|
|
}
|
|
|
|
// Timeout errors (retryable)
|
|
timeoutIndicators := []string{
|
|
"timeout",
|
|
"deadline exceeded",
|
|
"context deadline",
|
|
}
|
|
for _, indicator := range timeoutIndicators {
|
|
if strings.Contains(errStr, indicator) {
|
|
return ErrorTimeout
|
|
}
|
|
}
|
|
|
|
// Authentication errors (not retryable)
|
|
authIndicators := []string{
|
|
"unauthorized",
|
|
"forbidden",
|
|
"authentication failed",
|
|
"invalid credentials",
|
|
"access denied",
|
|
"401",
|
|
"403",
|
|
}
|
|
for _, indicator := range authIndicators {
|
|
if strings.Contains(errStr, indicator) {
|
|
return ErrorAuth
|
|
}
|
|
}
|
|
|
|
// Validation errors (not retryable)
|
|
validationIndicators := []string{
|
|
"invalid input",
|
|
"validation failed",
|
|
"bad request",
|
|
"malformed",
|
|
"400",
|
|
}
|
|
for _, indicator := range validationIndicators {
|
|
if strings.Contains(errStr, indicator) {
|
|
return ErrorValidation
|
|
}
|
|
}
|
|
|
|
// Default to unknown
|
|
return ErrorUnknown
|
|
}
|
|
|
|
// IsRetryable determines if an error category should be retried
|
|
func IsRetryable(category ErrorCategory) bool {
|
|
switch category {
|
|
case ErrorNetwork, ErrorResource, ErrorRateLimit, ErrorTimeout, ErrorUnknown:
|
|
return true
|
|
case ErrorAuth, ErrorValidation, ErrorPermanent:
|
|
return false
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// GetUserMessage returns a user-friendly error message with suggestions
|
|
func GetUserMessage(category ErrorCategory, err error) string {
|
|
messages := map[ErrorCategory]string{
|
|
ErrorNetwork: "Network connectivity issue. Please check your network connection and try again.",
|
|
ErrorResource: "System resource exhausted. The system may be under heavy load. Try again later or contact support.",
|
|
ErrorRateLimit: "Rate limit exceeded. Please wait a moment before retrying.",
|
|
ErrorAuth: "Authentication failed. Please check your API key or credentials.",
|
|
ErrorValidation: "Invalid input. Please review your request and correct any errors.",
|
|
ErrorTimeout: "Operation timed out. The task may be too complex or the system is slow. " +
|
|
"Try again or simplify the request.",
|
|
ErrorPermanent: "A permanent error occurred. This task cannot be retried automatically.",
|
|
ErrorUnknown: "An unexpected error occurred. If this persists, please contact support.",
|
|
}
|
|
|
|
baseMsg := messages[category]
|
|
if err != nil {
|
|
return fmt.Sprintf("%s (Details: %v)", baseMsg, err)
|
|
}
|
|
return baseMsg
|
|
}
|
|
|
|
// RetryDelay calculates the retry delay based on error category and retry count
|
|
func RetryDelay(category ErrorCategory, retryCount int) int {
|
|
switch category {
|
|
case ErrorRateLimit:
|
|
// Longer backoff for rate limits
|
|
return intMin(300, 10*(1<<retryCount)) // 10s, 20s, 40s, 80s, up to 300s
|
|
case ErrorResource:
|
|
// Medium backoff for resource issues
|
|
return intMin(120, 5*(1<<retryCount)) // 5s, 10s, 20s, 40s, up to 120s
|
|
case ErrorNetwork, ErrorTimeout:
|
|
// Standard exponential backoff
|
|
return 1 << retryCount // 1s, 2s, 4s, 8s, etc
|
|
case ErrorAuth, ErrorValidation, ErrorPermanent:
|
|
// No retry for auth, validation, or permanent errors
|
|
return 0
|
|
case ErrorUnknown:
|
|
// Conservative backoff for unknown errors
|
|
return 1 << retryCount
|
|
default:
|
|
// Fallback for any unexpected categories
|
|
return 1 << retryCount
|
|
}
|
|
}
|
|
|
|
// intMin returns the smaller of a and b.
func intMin(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|