- Add API server with WebSocket support and REST endpoints - Implement authentication system with API keys and permissions - Add task queue system with Redis backend and error handling - Include storage layer with database migrations and schemas - Add comprehensive logging, metrics, and telemetry - Implement security middleware and network utilities - Add experiment management and container orchestration - Include configuration management with smart defaults
118 lines
3.7 KiB
Go
118 lines
3.7 KiB
Go
package queue
|
|
|
|
import (
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
)
|
|
|
|
var (
|
|
// Queue metrics
|
|
QueueDepth = promauto.NewGauge(prometheus.GaugeOpts{
|
|
Name: "fetch_ml_queue_depth",
|
|
Help: "Number of tasks in the queue",
|
|
})
|
|
|
|
TasksQueued = promauto.NewCounter(prometheus.CounterOpts{
|
|
Name: "fetch_ml_tasks_queued_total",
|
|
Help: "Total number of tasks queued",
|
|
})
|
|
|
|
// Task execution metrics
|
|
TaskDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
|
Name: "fetch_ml_task_duration_seconds",
|
|
Help: "Task execution duration in seconds",
|
|
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600}, // 1s to 1h
|
|
}, []string{"job_name", "status"})
|
|
|
|
TasksCompleted = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "fetch_ml_tasks_completed_total",
|
|
Help: "Total number of completed tasks",
|
|
}, []string{"job_name", "status"})
|
|
|
|
// Error metrics
|
|
TaskFailures = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "fetch_ml_task_failures_total",
|
|
Help: "Total number of failed tasks by error category",
|
|
}, []string{"job_name", "error_category"})
|
|
|
|
TaskRetries = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "fetch_ml_task_retries_total",
|
|
Help: "Total number of task retries",
|
|
}, []string{"job_name", "error_category"})
|
|
|
|
// Lease metrics
|
|
LeaseExpirations = promauto.NewCounter(prometheus.CounterOpts{
|
|
Name: "fetch_ml_lease_expirations_total",
|
|
Help: "Total number of expired leases reclaimed",
|
|
})
|
|
|
|
LeaseRenewals = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "fetch_ml_lease_renewals_total",
|
|
Help: "Total number of successful lease renewals",
|
|
}, []string{"worker_id"})
|
|
|
|
// Dead letter queue metrics
|
|
DLQSize = promauto.NewGauge(prometheus.GaugeOpts{
|
|
Name: "fetch_ml_dlq_size",
|
|
Help: "Number of tasks in dead letter queue",
|
|
})
|
|
|
|
DLQAdditions = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "fetch_ml_dlq_additions_total",
|
|
Help: "Total number of tasks moved to DLQ",
|
|
}, []string{"reason"})
|
|
|
|
// Worker metrics
|
|
ActiveTasks = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: "fetch_ml_active_tasks",
|
|
Help: "Number of currently executing tasks",
|
|
}, []string{"worker_id"})
|
|
|
|
WorkerHeartbeats = promauto.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "fetch_ml_worker_heartbeats_total",
|
|
Help: "Total number of worker heartbeats",
|
|
}, []string{"worker_id"})
|
|
)
|
|
|
|
// RecordTaskStart records when a task starts
|
|
func RecordTaskStart(jobName, workerID string) {
|
|
ActiveTasks.WithLabelValues(workerID).Inc()
|
|
}
|
|
|
|
// RecordTaskEnd records when a task completes
|
|
func RecordTaskEnd(jobName, workerID, status string, durationSeconds float64) {
|
|
ActiveTasks.WithLabelValues(workerID).Dec()
|
|
TaskDuration.WithLabelValues(jobName, status).Observe(durationSeconds)
|
|
TasksCompleted.WithLabelValues(jobName, status).Inc()
|
|
}
|
|
|
|
// RecordTaskFailure records a task failure with error category
|
|
func RecordTaskFailure(jobName string, errorCategory ErrorCategory) {
|
|
TaskFailures.WithLabelValues(jobName, string(errorCategory)).Inc()
|
|
}
|
|
|
|
// RecordTaskRetry records a task retry
|
|
func RecordTaskRetry(jobName string, errorCategory ErrorCategory) {
|
|
TaskRetries.WithLabelValues(jobName, string(errorCategory)).Inc()
|
|
}
|
|
|
|
// RecordLeaseExpiration records a lease expiration
|
|
func RecordLeaseExpiration() {
|
|
LeaseExpirations.Inc()
|
|
}
|
|
|
|
// RecordLeaseRenewal records a successful lease renewal
|
|
func RecordLeaseRenewal(workerID string) {
|
|
LeaseRenewals.WithLabelValues(workerID).Inc()
|
|
}
|
|
|
|
// RecordDLQAddition records a task being moved to DLQ
|
|
func RecordDLQAddition(reason string) {
|
|
DLQAdditions.WithLabelValues(reason).Inc()
|
|
DLQSize.Inc()
|
|
}
|
|
|
|
// UpdateQueueDepth updates the current queue depth gauge
|
|
func UpdateQueueDepth(depth int64) {
|
|
QueueDepth.Set(float64(depth))
|
|
}
|