refactor(phase7): Restore resource metrics in metrics.go

- Re-enabled all resource metrics (CPU, GPU, acquisition stats)
- Resource metrics are registered only when w.resources != nil
- Added a nil check to prevent panics if the resource manager is not initialized

Build status: Compiles successfully
This commit is contained in:
Jeremie Fraeys 2026-02-17 16:38:47 -05:00
parent 1ba67e419d
commit 51698d60de
No known key found for this signature in database

View file

@ -2,6 +2,7 @@ package worker
import (
"net/http"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
@ -135,8 +136,75 @@ func (w *Worker) setupMetricsExporter() {
return float64(w.config.MaxWorkers)
}))
// Note: Resource metrics temporarily disabled during migration
// These will be re-enabled once resource manager is integrated
// Resource metrics
if w.resources != nil {
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_cpu_total",
Help: "Total CPU tokens managed by the worker resource manager.",
ConstLabels: labels,
}, func() float64 {
return float64(w.resources.Snapshot().TotalCPU)
}))
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_cpu_free",
Help: "Free CPU tokens currently available in the worker resource manager.",
ConstLabels: labels,
}, func() float64 {
return float64(w.resources.Snapshot().FreeCPU)
}))
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_acquire_total",
Help: "Total resource acquisition attempts.",
ConstLabels: labels,
}, func() float64 {
return float64(w.resources.Snapshot().AcquireTotal)
}))
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_acquire_wait_total",
Help: "Total resource acquisitions that had to wait for resources.",
ConstLabels: labels,
}, func() float64 {
return float64(w.resources.Snapshot().AcquireWaitTotal)
}))
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_acquire_timeout_total",
Help: "Total resource acquisition attempts that timed out.",
ConstLabels: labels,
}, func() float64 {
return float64(w.resources.Snapshot().AcquireTimeoutTotal)
}))
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_acquire_wait_seconds_total",
Help: "Total seconds spent waiting for resources across all acquisitions.",
ConstLabels: labels,
}, func() float64 {
return w.resources.Snapshot().AcquireWaitSeconds
}))
snap := w.resources.Snapshot()
for i := range snap.GPUFree {
gpuLabels := prometheus.Labels{"worker_id": w.id, "gpu_index": strconv.Itoa(i)}
idx := i
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_gpu_slots_total",
Help: "Total GPU slots per GPU index.",
ConstLabels: gpuLabels,
}, func() float64 {
return float64(w.resources.Snapshot().SlotsPerGPU)
}))
reg.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "fetchml_resources_gpu_slots_free",
Help: "Free GPU slots per GPU index.",
ConstLabels: gpuLabels,
}, func() float64 {
s := w.resources.Snapshot()
if idx < 0 || idx >= len(s.GPUFree) {
return 0
}
return float64(s.GPUFree[idx])
}))
}
}
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))