// Source: fetch_ml/internal/prommetrics/prometheus.go (Go, 295 lines, 7.2 KiB)

package prommetrics
import (
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metrics bundles every Prometheus collector used by the application together
// with the dedicated registry those collectors are registered on. Instances
// are built with New; all fields are unexported and are updated through the
// methods on *Metrics.
type Metrics struct {
	// WebSocket collectors.

	// wsConnections tracks active WebSocket connections, labelled by status.
	wsConnections *prometheus.GaugeVec
	// wsMessages counts WebSocket messages, labelled by opcode and status.
	wsMessages *prometheus.CounterVec
	// wsDuration records WebSocket message processing time, labelled by opcode.
	wsDuration *prometheus.HistogramVec
	// wsErrors counts WebSocket errors, labelled by type.
	wsErrors *prometheus.CounterVec

	// Job queue collectors.

	// jobsQueued counts jobs that have been queued.
	jobsQueued prometheus.Counter
	// jobsCompleted counts completed jobs, labelled by status.
	jobsCompleted *prometheus.CounterVec
	// jobsActive holds the number of currently active jobs.
	jobsActive prometheus.Gauge
	// jobDuration records job execution time, labelled by status.
	jobDuration *prometheus.HistogramVec
	// queueLength holds the current job queue length.
	queueLength prometheus.Gauge

	// Jupyter collectors.

	// jupyterServices holds the number of Jupyter services, labelled by status.
	jupyterServices *prometheus.GaugeVec
	// jupyterOps counts Jupyter operations, labelled by operation and status.
	jupyterOps *prometheus.CounterVec

	// HTTP collectors.

	// httpRequests counts HTTP requests, labelled by method, endpoint and status.
	httpRequests *prometheus.CounterVec
	// httpDuration records HTTP request time, labelled by method and endpoint.
	httpDuration *prometheus.HistogramVec

	// Prewarm collectors.

	// Counters for prewarmed snapshot hits, misses and builds, in that order.
	prewarmSnapshotHit, prewarmSnapshotMiss, prewarmSnapshotBuilt prometheus.Counter
	// prewarmSnapshotTime records the time spent prewarming snapshots.
	prewarmSnapshotTime prometheus.Histogram

	// registry is the registry all of the collectors above are registered on
	// and the one served by Handler.
	registry *prometheus.Registry
}
// New returns a ready-to-use *Metrics: it allocates a fresh Prometheus
// registry and then creates and registers every collector on it via
// initMetrics.
func New() *Metrics {
	metrics := new(Metrics)
	metrics.registry = prometheus.NewRegistry()
	metrics.initMetrics()
	return metrics
}
// initMetrics builds every collector held by m and then registers all of them
// on m.registry in a single MustRegister call. m.registry must already be set.
func (m *Metrics) initMetrics() {
	m.initWebSocketMetrics()
	m.initJobMetrics()
	m.initJupyterMetrics()
	m.initHTTPMetrics()
	m.initPrewarmMetrics()

	// Registration order mirrors the field order of Metrics.
	collectors := []prometheus.Collector{
		m.wsConnections,
		m.wsMessages,
		m.wsDuration,
		m.wsErrors,
		m.jobsQueued,
		m.jobsCompleted,
		m.jobsActive,
		m.jobDuration,
		m.queueLength,
		m.jupyterServices,
		m.jupyterOps,
		m.httpRequests,
		m.httpDuration,
		m.prewarmSnapshotHit,
		m.prewarmSnapshotMiss,
		m.prewarmSnapshotBuilt,
		m.prewarmSnapshotTime,
	}
	m.registry.MustRegister(collectors...)
}

// initWebSocketMetrics creates the WebSocket collectors (without registering them).
func (m *Metrics) initWebSocketMetrics() {
	connectionOpts := prometheus.GaugeOpts{
		Name: "fetchml_websocket_connections",
		Help: "Number of active WebSocket connections",
	}
	m.wsConnections = prometheus.NewGaugeVec(connectionOpts, []string{"status"})

	messageOpts := prometheus.CounterOpts{
		Name: "fetchml_websocket_messages_total",
		Help: "Total number of WebSocket messages",
	}
	m.wsMessages = prometheus.NewCounterVec(messageOpts, []string{"opcode", "status"})

	durationOpts := prometheus.HistogramOpts{
		Name:    "fetchml_websocket_duration_seconds",
		Help:    "WebSocket message processing duration",
		Buckets: prometheus.DefBuckets,
	}
	m.wsDuration = prometheus.NewHistogramVec(durationOpts, []string{"opcode"})

	errorOpts := prometheus.CounterOpts{
		Name: "fetchml_websocket_errors_total",
		Help: "Total number of WebSocket errors",
	}
	m.wsErrors = prometheus.NewCounterVec(errorOpts, []string{"type"})
}

// initJobMetrics creates the job queue collectors (without registering them).
func (m *Metrics) initJobMetrics() {
	statusLabel := []string{"status"}

	m.jobsQueued = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "fetchml_jobs_queued_total",
		Help: "Total number of jobs queued",
	})

	completedOpts := prometheus.CounterOpts{
		Name: "fetchml_jobs_completed_total",
		Help: "Total number of completed jobs",
	}
	m.jobsCompleted = prometheus.NewCounterVec(completedOpts, statusLabel)

	m.jobsActive = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "fetchml_jobs_active",
		Help: "Number of currently active jobs",
	})

	durationOpts := prometheus.HistogramOpts{
		Name:    "fetchml_job_duration_seconds",
		Help:    "Job execution duration",
		Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600},
	}
	m.jobDuration = prometheus.NewHistogramVec(durationOpts, statusLabel)

	m.queueLength = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "fetchml_queue_length",
		Help: "Current job queue length",
	})
}

// initJupyterMetrics creates the Jupyter collectors (without registering them).
func (m *Metrics) initJupyterMetrics() {
	serviceOpts := prometheus.GaugeOpts{
		Name: "fetchml_jupyter_services",
		Help: "Number of Jupyter services",
	}
	m.jupyterServices = prometheus.NewGaugeVec(serviceOpts, []string{"status"})

	operationOpts := prometheus.CounterOpts{
		Name: "fetchml_jupyter_operations_total",
		Help: "Total number of Jupyter operations",
	}
	m.jupyterOps = prometheus.NewCounterVec(operationOpts, []string{"operation", "status"})
}

// initHTTPMetrics creates the HTTP collectors (without registering them).
func (m *Metrics) initHTTPMetrics() {
	requestOpts := prometheus.CounterOpts{
		Name: "fetchml_http_requests_total",
		Help: "Total number of HTTP requests",
	}
	m.httpRequests = prometheus.NewCounterVec(requestOpts, []string{"method", "endpoint", "status"})

	durationOpts := prometheus.HistogramOpts{
		Name:    "fetchml_http_duration_seconds",
		Help:    "HTTP request duration",
		Buckets: prometheus.DefBuckets,
	}
	m.httpDuration = prometheus.NewHistogramVec(durationOpts, []string{"method", "endpoint"})
}

// initPrewarmMetrics creates the prewarm collectors (without registering them).
func (m *Metrics) initPrewarmMetrics() {
	m.prewarmSnapshotHit = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "fetchml_prewarm_snapshot_hit_total",
		Help: "Total number of prewarmed snapshot hits (snapshots found in .prewarm/)",
	})

	m.prewarmSnapshotMiss = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "fetchml_prewarm_snapshot_miss_total",
		Help: "Total number of prewarmed snapshot misses (snapshots not found in .prewarm/)",
	})

	m.prewarmSnapshotBuilt = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "fetchml_prewarm_snapshot_built_total",
		Help: "Total number of snapshots prewarmed into .prewarm/",
	})

	m.prewarmSnapshotTime = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "fetchml_prewarm_snapshot_duration_seconds",
		Help:    "Time spent prewarming snapshots",
		Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30, 60, 120},
	})
}
// Handler returns an http.Handler that serves the collectors registered on
// this instance's registry, using zero-value promhttp handler options.
func (m *Metrics) Handler() http.Handler {
	var opts promhttp.HandlerOpts
	return promhttp.HandlerFor(m.registry, opts)
}
// WebSocket metrics methods

// IncWSConnections adds one to the WebSocket connections gauge for the given
// status label.
func (m *Metrics) IncWSConnections(status string) {
	gauge := m.wsConnections.WithLabelValues(status)
	gauge.Inc()
}
// DecWSConnections subtracts one from the WebSocket connections gauge for the
// given status label.
func (m *Metrics) DecWSConnections(status string) {
	gauge := m.wsConnections.WithLabelValues(status)
	gauge.Dec()
}
// IncWSMessages adds one to the WebSocket message counter for the given
// opcode and status labels.
func (m *Metrics) IncWSMessages(opcode, status string) {
	counter := m.wsMessages.WithLabelValues(opcode, status)
	counter.Inc()
}
// ObserveWSDuration records duration, expressed in seconds, in the WebSocket
// processing-time histogram for the given opcode label.
func (m *Metrics) ObserveWSDuration(opcode string, duration time.Duration) {
	seconds := duration.Seconds()
	histogram := m.wsDuration.WithLabelValues(opcode)
	histogram.Observe(seconds)
}
// IncWSErrors adds one to the WebSocket error counter, using errType as the
// value of its "type" label.
func (m *Metrics) IncWSErrors(errType string) {
	counter := m.wsErrors.WithLabelValues(errType)
	counter.Inc()
}
// Job queue metrics methods

// IncJobsQueued adds one to the counter of queued jobs.
func (m *Metrics) IncJobsQueued() {
	queued := m.jobsQueued
	queued.Inc()
}
// IncJobsCompleted adds one to the completed-jobs counter for the given
// status label.
func (m *Metrics) IncJobsCompleted(status string) {
	counter := m.jobsCompleted.WithLabelValues(status)
	counter.Inc()
}
// SetJobsActive overwrites the active-jobs gauge with count.
func (m *Metrics) SetJobsActive(count float64) {
	active := m.jobsActive
	active.Set(count)
}
// ObserveJobDuration records duration, expressed in seconds, in the job
// execution-time histogram for the given status label.
func (m *Metrics) ObserveJobDuration(status string, duration time.Duration) {
	seconds := duration.Seconds()
	histogram := m.jobDuration.WithLabelValues(status)
	histogram.Observe(seconds)
}
// SetQueueLength overwrites the job queue length gauge with length.
func (m *Metrics) SetQueueLength(length float64) {
	gauge := m.queueLength
	gauge.Set(length)
}
// Jupyter metrics methods

// SetJupyterServices overwrites the Jupyter services gauge for the given
// status label with count.
func (m *Metrics) SetJupyterServices(status string, count float64) {
	gauge := m.jupyterServices.WithLabelValues(status)
	gauge.Set(count)
}
// IncJupyterOps adds one to the Jupyter operations counter for the given
// operation and status labels.
func (m *Metrics) IncJupyterOps(operation, status string) {
	counter := m.jupyterOps.WithLabelValues(operation, status)
	counter.Inc()
}
// HTTP metrics methods

// IncHTTPRequests adds one to the HTTP request counter for the given method,
// endpoint and status labels.
func (m *Metrics) IncHTTPRequests(method, endpoint, status string) {
	counter := m.httpRequests.WithLabelValues(method, endpoint, status)
	counter.Inc()
}
// ObserveHTTPDuration records duration, expressed in seconds, in the HTTP
// request-time histogram for the given method and endpoint labels.
func (m *Metrics) ObserveHTTPDuration(method, endpoint string, duration time.Duration) {
	seconds := duration.Seconds()
	histogram := m.httpDuration.WithLabelValues(method, endpoint)
	histogram.Observe(seconds)
}
// Prewarm metrics methods

// IncPrewarmSnapshotHit adds one to the prewarmed snapshot hit counter.
func (m *Metrics) IncPrewarmSnapshotHit() {
	hits := m.prewarmSnapshotHit
	hits.Inc()
}
// IncPrewarmSnapshotMiss adds one to the prewarmed snapshot miss counter.
func (m *Metrics) IncPrewarmSnapshotMiss() {
	misses := m.prewarmSnapshotMiss
	misses.Inc()
}
// IncPrewarmSnapshotBuilt adds one to the counter of snapshots that have been
// prewarmed.
func (m *Metrics) IncPrewarmSnapshotBuilt() {
	built := m.prewarmSnapshotBuilt
	built.Inc()
}
// ObservePrewarmSnapshotDuration records duration, expressed in seconds, in
// the snapshot prewarming-time histogram.
func (m *Metrics) ObservePrewarmSnapshotDuration(duration time.Duration) {
	seconds := duration.Seconds()
	m.prewarmSnapshotTime.Observe(seconds)
}