fetch_ml/internal/queue/metrics.go
Jeremie Fraeys 6580917ba8
refactor: extract domain types and consolidate error system (Phases 1-2)
Phase 1: Extract Domain Types
=============================
- Create internal/domain/ package with canonical types:
  - domain/task.go: Task, Attempt structs
  - domain/tracking.go: TrackingConfig and MLflow/TensorBoard/Wandb configs
  - domain/dataset.go: DatasetSpec
  - domain/status.go: JobStatus constants
  - domain/errors.go: FailureClass system with classification functions
  - domain/doc.go: package documentation

- Update queue/task.go to re-export domain types (backward compatibility)
- Update TUI model/state.go to use domain types via type aliases
- Simplify TUI services: remove ~60 lines of conversion functions

Phase 2: Delete ErrorCategory System
====================================
- Remove deprecated ErrorCategory type and constants
- Remove TaskError struct and related functions
- Remove mapping functions: ClassifyError, IsRetryable, GetUserMessage, RetryDelay
- Update all queue implementations to use domain.FailureClass directly:
  - queue/metrics.go: RecordTaskFailure/Retry now take FailureClass
  - queue/queue.go: RetryTask uses domain.ClassifyFailure
  - queue/filesystem_queue.go: RetryTask and MoveToDeadLetterQueue updated
  - queue/sqlite_queue.go: RetryTask and MoveToDeadLetterQueue updated

Lines eliminated: ~190 lines of conversion and mapping code
Result: Single source of truth for domain types and error classification
2026-02-17 12:34:28 -05:00

125 lines
4.3 KiB
Go

package queue
import (
"github.com/jfraeys/fetch_ml/internal/domain"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var (
	// QueueDepth is a gauge of the current number of tasks waiting in the
	// queue. Updated via UpdateQueueDepth.
	QueueDepth = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "fetch_ml_queue_depth",
		Help: "Number of tasks in the queue",
	})
	// TasksQueued counts every task ever enqueued (monotonic).
	TasksQueued = promauto.NewCounter(prometheus.CounterOpts{
		Name: "fetch_ml_tasks_queued_total",
		Help: "Total number of tasks queued",
	})
	// TaskDuration is a histogram of task execution wall-clock time,
	// labeled by job name and terminal status. Buckets span 1s to 1h to
	// cover both quick data-fetch tasks and long training runs.
	TaskDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "fetch_ml_task_duration_seconds",
		Help:    "Task execution duration in seconds",
		Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600}, // 1s to 1h
	}, []string{"job_name", "status"})
	// TasksCompleted counts finished tasks by job name and terminal status.
	TasksCompleted = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "fetch_ml_tasks_completed_total",
		Help: "Total number of completed tasks",
	}, []string{"job_name", "status"})
	// TaskFailures counts failed tasks, labeled by job name and the
	// domain.FailureClass that classified the error.
	TaskFailures = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "fetch_ml_task_failures_total",
		Help: "Total number of failed tasks by failure class",
	}, []string{"job_name", "failure_class"})
	// TaskRetries counts retry attempts, labeled by job name and the
	// failure class that triggered the retry.
	TaskRetries = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "fetch_ml_task_retries_total",
		Help: "Total number of task retries",
	}, []string{"job_name", "failure_class"})
	// LeaseExpirations counts leases that expired and were reclaimed so
	// their tasks could be handed to another worker.
	LeaseExpirations = promauto.NewCounter(prometheus.CounterOpts{
		Name: "fetch_ml_lease_expirations_total",
		Help: "Total number of expired leases reclaimed",
	})
	// LeaseRenewals counts successful lease renewals per worker.
	LeaseRenewals = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "fetch_ml_lease_renewals_total",
		Help: "Total number of successful lease renewals",
	}, []string{"worker_id"})
	// DLQSize is a gauge of the number of tasks currently parked in the
	// dead letter queue. Incremented by RecordDLQAddition.
	DLQSize = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "fetch_ml_dlq_size",
		Help: "Number of tasks in dead letter queue",
	})
	// DLQAdditions counts tasks moved to the dead letter queue, labeled
	// by the reason they were parked.
	DLQAdditions = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "fetch_ml_dlq_additions_total",
		Help: "Total number of tasks moved to DLQ",
	}, []string{"reason"})
	// ActiveTasks is a gauge of tasks currently executing, per worker.
	// Incremented by RecordTaskStart, decremented by RecordTaskEnd.
	ActiveTasks = promauto.NewGaugeVec(prometheus.GaugeOpts{
		Name: "fetch_ml_active_tasks",
		Help: "Number of currently executing tasks",
	}, []string{"worker_id"})
	// WorkerHeartbeats counts heartbeat events per worker.
	WorkerHeartbeats = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "fetch_ml_worker_heartbeats_total",
		Help: "Total number of worker heartbeats",
	}, []string{"worker_id"})
)
// RecordTaskStart marks a task as having begun executing by bumping the
// active-task gauge for the owning worker. The first parameter is the
// job name; it is blank because ActiveTasks is labeled by worker_id
// only — it is accepted to keep the signature parallel with the other
// Record* helpers.
func RecordTaskStart(_, workerID string) {
	active := ActiveTasks.WithLabelValues(workerID)
	active.Inc()
}
// RecordTaskEnd records when a task completes
func RecordTaskEnd(jobName, workerID, status string, durationSeconds float64) {
ActiveTasks.WithLabelValues(workerID).Dec()
TaskDuration.WithLabelValues(jobName, status).Observe(durationSeconds)
TasksCompleted.WithLabelValues(jobName, status).Inc()
}
// RecordTaskFailure increments the failure counter for the given job,
// labeled with the string form of the domain failure class.
func RecordTaskFailure(jobName string, failureClass domain.FailureClass) {
	class := string(failureClass)
	TaskFailures.WithLabelValues(jobName, class).Inc()
}
// RecordTaskRetry increments the retry counter for the given job,
// labeled with the string form of the domain failure class that caused
// the retry.
func RecordTaskRetry(jobName string, failureClass domain.FailureClass) {
	class := string(failureClass)
	TaskRetries.WithLabelValues(jobName, class).Inc()
}
// RecordLeaseExpiration increments the counter of leases that expired
// and were reclaimed.
func RecordLeaseExpiration() {
	LeaseExpirations.Inc()
}
// RecordLeaseRenewal increments the renewal counter for the worker that
// successfully extended its lease.
func RecordLeaseRenewal(workerID string) {
	renewals := LeaseRenewals.WithLabelValues(workerID)
	renewals.Inc()
}
// RecordDLQAddition records a task being moved to DLQ
func RecordDLQAddition(reason string) {
DLQAdditions.WithLabelValues(reason).Inc()
DLQSize.Inc()
}
// UpdateQueueDepth sets the queue depth gauge to the given task count.
func UpdateQueueDepth(depth int64) {
	current := float64(depth)
	QueueDepth.Set(current)
}