fetch_ml/monitoring/dashboards/grafana-dashboard.json
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

147 lines
No EOL
4.3 KiB
JSON

{
"dashboard": {
"title": "ML Task Queue Monitoring",
"tags": [
"ml",
"queue",
"fetch_ml"
],
"timezone": "browser",
"panels": [
{
  "type": "graph",
  "title": "Queue Depth",
  "gridPos": {
    "x": 0,
    "y": 0,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "Queue Depth",
      "expr": "fetch_ml_queue_depth"
    }
  ]
},
{
  "type": "graph",
  "title": "Active Tasks",
  "gridPos": {
    "x": 12,
    "y": 0,
    "w": 12,
    "h": 8
  },
  "targets": [
    {
      "legendFormat": "{{worker_id}}",
      "expr": "sum(fetch_ml_active_tasks) by (worker_id)"
    }
  ]
},
{
  "title": "Task Duration (p50, p95, p99)",
  "description": "50th, 95th and 99th percentile of fetch_ml_task_duration_seconds, estimated from the histogram buckets over a 5-minute rate window. Bucket rates are summed by 'le' so each percentile is a single series across all label sets.",
  "type": "graph",
  "gridPos": {
    "h": 8,
    "w": 24,
    "x": 0,
    "y": 8
  },
  "targets": [
    {
      "expr": "histogram_quantile(0.50, sum by (le) (rate(fetch_ml_task_duration_seconds_bucket[5m])))",
      "legendFormat": "p50"
    },
    {
      "expr": "histogram_quantile(0.95, sum by (le) (rate(fetch_ml_task_duration_seconds_bucket[5m])))",
      "legendFormat": "p95"
    },
    {
      "expr": "histogram_quantile(0.99, sum by (le) (rate(fetch_ml_task_duration_seconds_bucket[5m])))",
      "legendFormat": "p99"
    }
  ]
},
{
  "title": "Task Completion Rate",
  "description": "Per-second rate of fetch_ml_tasks_completed_total over a 5-minute window, summed by 'status' so each status appears as exactly one series.",
  "type": "graph",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 16
  },
  "targets": [
    {
      "expr": "sum by (status) (rate(fetch_ml_tasks_completed_total[5m]))",
      "legendFormat": "{{status}}"
    }
  ]
},
{
  "title": "Failure Rate by Error Category",
  "description": "Per-second rate of fetch_ml_task_failures_total over a 5-minute window, summed by 'error_category' so each category appears as exactly one series.",
  "type": "graph",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 16
  },
  "targets": [
    {
      "expr": "sum by (error_category) (rate(fetch_ml_task_failures_total[5m]))",
      "legendFormat": "{{error_category}}"
    }
  ]
},
{
  "title": "Retry Rate",
  "description": "Per-second rate of fetch_ml_task_retries_total over a 5-minute window, summed by 'error_category' so each category appears as exactly one series.",
  "type": "graph",
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 24
  },
  "targets": [
    {
      "expr": "sum by (error_category) (rate(fetch_ml_task_retries_total[5m]))",
      "legendFormat": "{{error_category}}"
    }
  ]
},
{
  "type": "stat",
  "title": "Dead Letter Queue Size",
  "gridPos": {
    "x": 12,
    "y": 24,
    "w": 6,
    "h": 8
  },
  "targets": [
    {
      "expr": "fetch_ml_dlq_size"
    }
  ]
},
{
  "title": "Lease Expirations",
  "description": "Total increase of fetch_ml_lease_expirations_total within the selected dashboard time range, summed across all series. Using increase() instead of the raw counter keeps the value meaningful across counter resets.",
  "type": "stat",
  "gridPos": {
    "h": 8,
    "w": 6,
    "x": 18,
    "y": 24
  },
  "targets": [
    {
      "expr": "sum(increase(fetch_ml_lease_expirations_total[$__range]))",
      "instant": true
    }
  ]
}
]
}
}