fetch_ml/internal/metrics/metrics.go
Jeremie Fraeys 38fa017b8e
refactor: Phase 6 - Complete migration, remove legacy files
BREAKING CHANGE: Legacy worker files removed, Worker struct simplified

Changes:
1. worker.go - Simplified to 8 fields using composed dependencies:
   - runLoop, runner, metrics, health (from new packages)
   - Removed: server, queue, running, datasetCache, ctx, cancel, etc.

2. factory.go - Updated NewWorker to use new structure
   - Uses lifecycle.NewRunLoop
   - Integrates jupyter.Manager properly

3. Removed legacy files:
   - execution.go (1,016 lines)
   - data_integrity.go (929 lines)
   - runloop.go (555 lines)
   - jupyter_task.go (144 lines)
   - simplified.go (demonstration no longer needed)

4. Fixed references to use new packages:
   - hash_selector.go -> integrity.DirOverallSHA256Hex
   - snapshot_store.go -> integrity.NormalizeSHA256ChecksumHex
   - metrics.go - Removed resource-dependent metrics temporarily

5. Added RecordQueueLatency to metrics.Metrics for lifecycle.MetricsRecorder

Worker struct: 27 fields -> 8 fields (70% reduction)

Build status: Compiles successfully
2026-02-17 14:39:48 -05:00

127 lines
4 KiB
Go

// Package metrics provides performance tracking and statistics collection.
package metrics
import (
"sync/atomic"
"time"
)
// int64Max returns the larger of the two int64 arguments. When both are equal
// that shared value is returned.
func int64Max(a, b int64) int64 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
// Metrics tracks various performance counters and statistics.
type Metrics struct {
TasksProcessed atomic.Int64
TasksFailed atomic.Int64
DataFetchTime atomic.Int64 // Total nanoseconds
ExecutionTime atomic.Int64
DataTransferred atomic.Int64 // Total bytes
ActiveTasks atomic.Int64
QueuedTasks atomic.Int64
PrewarmEnvHit atomic.Int64
PrewarmEnvMiss atomic.Int64
PrewarmEnvBuilt atomic.Int64
PrewarmEnvTime atomic.Int64 // Total nanoseconds
PrewarmSnapshotHit atomic.Int64
PrewarmSnapshotMiss atomic.Int64
PrewarmSnapshotBuilt atomic.Int64
PrewarmSnapshotTime atomic.Int64 // Total nanoseconds
}
// RecordTaskSuccess counts one more successfully processed task and adds the
// task's duration, expressed in nanoseconds, to the cumulative execution time.
func (m *Metrics) RecordTaskSuccess(duration time.Duration) {
	elapsedNanos := duration.Nanoseconds()

	m.TasksProcessed.Add(1)
	m.ExecutionTime.Add(elapsedNanos)
}
// RecordTaskFailure counts one more failed task.
func (m *Metrics) RecordTaskFailure() {
	failures := &m.TasksFailed
	failures.Add(1)
}
// RecordTaskStart raises the active-task gauge by one.
func (m *Metrics) RecordTaskStart() {
	const started int64 = 1
	m.ActiveTasks.Add(started)
}
// RecordTaskCompletion lowers the active-task gauge by one.
//
// No lower bound is enforced: calling it without a matching RecordTaskStart
// does not fail, but drives the gauge below zero. Callers are expected to keep
// the two calls balanced.
func (m *Metrics) RecordTaskCompletion() {
	const finished int64 = -1
	m.ActiveTasks.Add(finished)
}
// RecordDataTransfer adds bytes to the cumulative transferred-bytes counter
// and adds duration, expressed in nanoseconds, to the cumulative fetch time.
func (m *Metrics) RecordDataTransfer(bytes int64, duration time.Duration) {
	fetchNanos := duration.Nanoseconds()

	m.DataTransferred.Add(bytes)
	m.DataFetchTime.Add(fetchNanos)
}
// RecordPrewarmEnvHit counts one more environment prewarm hit.
func (m *Metrics) RecordPrewarmEnvHit() {
	hits := &m.PrewarmEnvHit
	hits.Add(1)
}
// RecordPrewarmEnvMiss counts one more environment prewarm miss.
func (m *Metrics) RecordPrewarmEnvMiss() {
	misses := &m.PrewarmEnvMiss
	misses.Add(1)
}
// RecordPrewarmEnvBuilt counts one more environment prewarm build and adds
// duration, expressed in nanoseconds, to the cumulative environment prewarm
// time.
func (m *Metrics) RecordPrewarmEnvBuilt(duration time.Duration) {
	buildNanos := duration.Nanoseconds()

	m.PrewarmEnvBuilt.Add(1)
	m.PrewarmEnvTime.Add(buildNanos)
}
// RecordPrewarmSnapshotHit counts one more snapshot prewarm hit.
func (m *Metrics) RecordPrewarmSnapshotHit() {
	hits := &m.PrewarmSnapshotHit
	hits.Add(1)
}
// RecordPrewarmSnapshotMiss counts one more snapshot prewarm miss.
func (m *Metrics) RecordPrewarmSnapshotMiss() {
	misses := &m.PrewarmSnapshotMiss
	misses.Add(1)
}
// RecordPrewarmSnapshotBuilt counts one more snapshot prewarm build and adds
// duration, expressed in nanoseconds, to the cumulative snapshot prewarm time.
func (m *Metrics) RecordPrewarmSnapshotBuilt(duration time.Duration) {
	buildNanos := duration.Nanoseconds()

	m.PrewarmSnapshotBuilt.Add(1)
	m.PrewarmSnapshotTime.Add(buildNanos)
}
// RecordQueueLatency accepts a queue latency measurement and discards it.
//
// The method exists so that *Metrics satisfies the lifecycle.MetricsRecorder
// interface; no counter is updated.
func (m *Metrics) RecordQueueLatency(duration time.Duration) {
	// Intentionally empty: queue latency is not tracked yet. Storing the
	// value would need a dedicated counter on Metrics.
}
// SetQueuedTasks overwrites the queued-task gauge with count.
func (m *Metrics) SetQueuedTasks(count int64) {
	queued := &m.QueuedTasks
	queued.Store(count)
}
// GetStats returns the current metrics as a map keyed by metric name.
//
// Each counter is loaded individually, so the result is not a single atomic
// snapshot: values may be slightly out of step with each other while other
// goroutines are recording.
//
// Raw counters are reported as int64. Derived values are:
//   - "success_rate" (float64): successes / (successes + failures), in the
//     range [0, 1]; 0 when no task has finished yet. TasksProcessed is only
//     incremented by RecordTaskSuccess and TasksFailed only by
//     RecordTaskFailure, so finished tasks are the sum of the two.
//   - "avg_exec_time" (time.Duration): total execution time divided by the
//     number of successful tasks.
//   - "avg_fetch_time" (time.Duration): total fetch time divided by the
//     number of successful tasks (not by the number of transfers, which is
//     not tracked).
//   - "data_transferred_gb" (float64): transferred bytes divided by 1024^3.
//   - "prewarm_env_time" and "prewarm_snapshot_time" (time.Duration):
//     cumulative totals, not averages.
func (m *Metrics) GetStats() map[string]any {
	processed := m.TasksProcessed.Load()
	failed := m.TasksFailed.Load()
	dataTransferred := m.DataTransferred.Load()
	dataFetchTime := m.DataFetchTime.Load()

	// Successes and failures are counted separately, so the rate must be
	// taken over their sum. Dividing (processed - failed) by processed, as
	// was done previously, goes negative once failures exceed successes.
	var successRate float64
	if finished := processed + failed; finished > 0 {
		successRate = float64(processed) / float64(finished)
	}

	// Guard the per-task averages against division by zero before the
	// first successful task.
	perTask := int64Max(processed, 1)

	return map[string]any{
		"tasks_processed":        processed,
		"tasks_failed":           failed,
		"active_tasks":           m.ActiveTasks.Load(),
		"queued_tasks":           m.QueuedTasks.Load(),
		"success_rate":           successRate,
		"avg_exec_time":          time.Duration(m.ExecutionTime.Load() / perTask),
		"data_transferred_gb":    float64(dataTransferred) / (1024 * 1024 * 1024),
		"avg_fetch_time":         time.Duration(dataFetchTime / perTask),
		"prewarm_env_hit":        m.PrewarmEnvHit.Load(),
		"prewarm_env_miss":       m.PrewarmEnvMiss.Load(),
		"prewarm_env_built":      m.PrewarmEnvBuilt.Load(),
		"prewarm_env_time":       time.Duration(m.PrewarmEnvTime.Load()),
		"prewarm_snapshot_hit":   m.PrewarmSnapshotHit.Load(),
		"prewarm_snapshot_miss":  m.PrewarmSnapshotMiss.Load(),
		"prewarm_snapshot_built": m.PrewarmSnapshotBuilt.Load(),
		"prewarm_snapshot_time":  time.Duration(m.PrewarmSnapshotTime.Load()),
	}
}