295 lines
7.2 KiB
Go
295 lines
7.2 KiB
Go
package prommetrics
|
|
|
|
import (
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
)
|
|
|
|
// Metrics holds all Prometheus metrics for the application
|
|
type Metrics struct {
|
|
// WebSocket metrics
|
|
wsConnections *prometheus.GaugeVec
|
|
wsMessages *prometheus.CounterVec
|
|
wsDuration *prometheus.HistogramVec
|
|
wsErrors *prometheus.CounterVec
|
|
|
|
// Job queue metrics
|
|
jobsQueued prometheus.Counter
|
|
jobsCompleted *prometheus.CounterVec
|
|
jobsActive prometheus.Gauge
|
|
jobDuration *prometheus.HistogramVec
|
|
queueLength prometheus.Gauge
|
|
|
|
// Jupyter metrics
|
|
jupyterServices *prometheus.GaugeVec
|
|
jupyterOps *prometheus.CounterVec
|
|
|
|
// HTTP metrics
|
|
httpRequests *prometheus.CounterVec
|
|
httpDuration *prometheus.HistogramVec
|
|
|
|
// Prewarm metrics
|
|
prewarmSnapshotHit prometheus.Counter
|
|
prewarmSnapshotMiss prometheus.Counter
|
|
prewarmSnapshotBuilt prometheus.Counter
|
|
prewarmSnapshotTime prometheus.Histogram
|
|
|
|
registry *prometheus.Registry
|
|
}
|
|
|
|
// New creates a new Prometheus Metrics instance
|
|
func New() *Metrics {
|
|
m := &Metrics{
|
|
registry: prometheus.NewRegistry(),
|
|
}
|
|
|
|
m.initMetrics()
|
|
return m
|
|
}
|
|
|
|
// initMetrics initializes all Prometheus metrics
|
|
func (m *Metrics) initMetrics() {
|
|
// WebSocket metrics
|
|
m.wsConnections = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "fetchml_websocket_connections",
|
|
Help: "Number of active WebSocket connections",
|
|
},
|
|
[]string{"status"},
|
|
)
|
|
|
|
m.wsMessages = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_websocket_messages_total",
|
|
Help: "Total number of WebSocket messages",
|
|
},
|
|
[]string{"opcode", "status"},
|
|
)
|
|
|
|
m.wsDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "fetchml_websocket_duration_seconds",
|
|
Help: "WebSocket message processing duration",
|
|
Buckets: prometheus.DefBuckets,
|
|
},
|
|
[]string{"opcode"},
|
|
)
|
|
|
|
m.wsErrors = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_websocket_errors_total",
|
|
Help: "Total number of WebSocket errors",
|
|
},
|
|
[]string{"type"},
|
|
)
|
|
|
|
// Job queue metrics
|
|
m.jobsQueued = prometheus.NewCounter(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_jobs_queued_total",
|
|
Help: "Total number of jobs queued",
|
|
},
|
|
)
|
|
|
|
m.jobsCompleted = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_jobs_completed_total",
|
|
Help: "Total number of completed jobs",
|
|
},
|
|
[]string{"status"},
|
|
)
|
|
|
|
m.jobsActive = prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Name: "fetchml_jobs_active",
|
|
Help: "Number of currently active jobs",
|
|
},
|
|
)
|
|
|
|
m.jobDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "fetchml_job_duration_seconds",
|
|
Help: "Job execution duration",
|
|
Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600},
|
|
},
|
|
[]string{"status"},
|
|
)
|
|
|
|
m.queueLength = prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Name: "fetchml_queue_length",
|
|
Help: "Current job queue length",
|
|
},
|
|
)
|
|
|
|
// Jupyter metrics
|
|
m.jupyterServices = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "fetchml_jupyter_services",
|
|
Help: "Number of Jupyter services",
|
|
},
|
|
[]string{"status"},
|
|
)
|
|
|
|
m.jupyterOps = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_jupyter_operations_total",
|
|
Help: "Total number of Jupyter operations",
|
|
},
|
|
[]string{"operation", "status"},
|
|
)
|
|
|
|
// HTTP metrics
|
|
m.httpRequests = prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_http_requests_total",
|
|
Help: "Total number of HTTP requests",
|
|
},
|
|
[]string{"method", "endpoint", "status"},
|
|
)
|
|
|
|
m.httpDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "fetchml_http_duration_seconds",
|
|
Help: "HTTP request duration",
|
|
Buckets: prometheus.DefBuckets,
|
|
},
|
|
[]string{"method", "endpoint"},
|
|
)
|
|
|
|
// Prewarm metrics
|
|
m.prewarmSnapshotHit = prometheus.NewCounter(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_prewarm_snapshot_hit_total",
|
|
Help: "Total number of prewarmed snapshot hits (snapshots found in .prewarm/)",
|
|
},
|
|
)
|
|
|
|
m.prewarmSnapshotMiss = prometheus.NewCounter(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_prewarm_snapshot_miss_total",
|
|
Help: "Total number of prewarmed snapshot misses (snapshots not found in .prewarm/)",
|
|
},
|
|
)
|
|
|
|
m.prewarmSnapshotBuilt = prometheus.NewCounter(
|
|
prometheus.CounterOpts{
|
|
Name: "fetchml_prewarm_snapshot_built_total",
|
|
Help: "Total number of snapshots prewarmed into .prewarm/",
|
|
},
|
|
)
|
|
|
|
m.prewarmSnapshotTime = prometheus.NewHistogram(
|
|
prometheus.HistogramOpts{
|
|
Name: "fetchml_prewarm_snapshot_duration_seconds",
|
|
Help: "Time spent prewarming snapshots",
|
|
Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30, 60, 120},
|
|
},
|
|
)
|
|
|
|
// Register all metrics
|
|
m.registry.MustRegister(
|
|
m.wsConnections,
|
|
m.wsMessages,
|
|
m.wsDuration,
|
|
m.wsErrors,
|
|
m.jobsQueued,
|
|
m.jobsCompleted,
|
|
m.jobsActive,
|
|
m.jobDuration,
|
|
m.queueLength,
|
|
m.jupyterServices,
|
|
m.jupyterOps,
|
|
m.httpRequests,
|
|
m.httpDuration,
|
|
m.prewarmSnapshotHit,
|
|
m.prewarmSnapshotMiss,
|
|
m.prewarmSnapshotBuilt,
|
|
m.prewarmSnapshotTime,
|
|
)
|
|
}
|
|
|
|
// Handler returns the Prometheus HTTP handler
|
|
func (m *Metrics) Handler() http.Handler {
|
|
return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{})
|
|
}
|
|
|
|
// WebSocket metrics methods
|
|
func (m *Metrics) IncWSConnections(status string) {
|
|
m.wsConnections.WithLabelValues(status).Inc()
|
|
}
|
|
|
|
func (m *Metrics) DecWSConnections(status string) {
|
|
m.wsConnections.WithLabelValues(status).Dec()
|
|
}
|
|
|
|
func (m *Metrics) IncWSMessages(opcode, status string) {
|
|
m.wsMessages.WithLabelValues(opcode, status).Inc()
|
|
}
|
|
|
|
func (m *Metrics) ObserveWSDuration(opcode string, duration time.Duration) {
|
|
m.wsDuration.WithLabelValues(opcode).Observe(duration.Seconds())
|
|
}
|
|
|
|
func (m *Metrics) IncWSErrors(errType string) {
|
|
m.wsErrors.WithLabelValues(errType).Inc()
|
|
}
|
|
|
|
// Job queue metrics methods
|
|
func (m *Metrics) IncJobsQueued() {
|
|
m.jobsQueued.Inc()
|
|
}
|
|
|
|
func (m *Metrics) IncJobsCompleted(status string) {
|
|
m.jobsCompleted.WithLabelValues(status).Inc()
|
|
}
|
|
|
|
func (m *Metrics) SetJobsActive(count float64) {
|
|
m.jobsActive.Set(count)
|
|
}
|
|
|
|
func (m *Metrics) ObserveJobDuration(status string, duration time.Duration) {
|
|
m.jobDuration.WithLabelValues(status).Observe(duration.Seconds())
|
|
}
|
|
|
|
func (m *Metrics) SetQueueLength(length float64) {
|
|
m.queueLength.Set(length)
|
|
}
|
|
|
|
// Jupyter metrics methods
|
|
func (m *Metrics) SetJupyterServices(status string, count float64) {
|
|
m.jupyterServices.WithLabelValues(status).Set(count)
|
|
}
|
|
|
|
func (m *Metrics) IncJupyterOps(operation, status string) {
|
|
m.jupyterOps.WithLabelValues(operation, status).Inc()
|
|
}
|
|
|
|
// HTTP metrics methods
|
|
func (m *Metrics) IncHTTPRequests(method, endpoint, status string) {
|
|
m.httpRequests.WithLabelValues(method, endpoint, status).Inc()
|
|
}
|
|
|
|
func (m *Metrics) ObserveHTTPDuration(method, endpoint string, duration time.Duration) {
|
|
m.httpDuration.WithLabelValues(method, endpoint).Observe(duration.Seconds())
|
|
}
|
|
|
|
// Prewarm metrics methods
|
|
func (m *Metrics) IncPrewarmSnapshotHit() {
|
|
m.prewarmSnapshotHit.Inc()
|
|
}
|
|
|
|
func (m *Metrics) IncPrewarmSnapshotMiss() {
|
|
m.prewarmSnapshotMiss.Inc()
|
|
}
|
|
|
|
func (m *Metrics) IncPrewarmSnapshotBuilt() {
|
|
m.prewarmSnapshotBuilt.Inc()
|
|
}
|
|
|
|
func (m *Metrics) ObservePrewarmSnapshotDuration(duration time.Duration) {
|
|
m.prewarmSnapshotTime.Observe(duration.Seconds())
|
|
}
|