- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
147 lines
No EOL
4.3 KiB
JSON
147 lines
No EOL
4.3 KiB
JSON
{
  "dashboard": {
    "title": "ML Task Queue Monitoring",
    "tags": [
      "ml",
      "queue",
      "fetch_ml"
    ],
    "timezone": "browser",
    "panels": [
      {
        "title": "Queue Depth",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 0
        },
        "targets": [
          {
            "expr": "fetch_ml_queue_depth",
            "legendFormat": "Queue Depth"
          }
        ]
      },
      {
        "title": "Active Tasks",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 0
        },
        "targets": [
          {
            "expr": "sum(fetch_ml_active_tasks) by (worker_id)",
            "legendFormat": "{{worker_id}}"
          }
        ]
      },
      {
        "title": "Task Duration (p50, p95, p99)",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 24,
          "x": 0,
          "y": 8
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(fetch_ml_task_duration_seconds_bucket[5m]))",
            "legendFormat": "p99"
          }
        ]
      },
      {
        "title": "Task Completion Rate",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 16
        },
        "targets": [
          {
            "expr": "rate(fetch_ml_tasks_completed_total[5m])",
            "legendFormat": "{{status}}"
          }
        ]
      },
      {
        "title": "Failure Rate by Error Category",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 16
        },
        "targets": [
          {
            "expr": "rate(fetch_ml_task_failures_total[5m])",
            "legendFormat": "{{error_category}}"
          }
        ]
      },
      {
        "title": "Retry Rate",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 24
        },
        "targets": [
          {
            "expr": "rate(fetch_ml_task_retries_total[5m])",
            "legendFormat": "{{error_category}}"
          }
        ]
      },
      {
        "title": "Dead Letter Queue Size",
        "type": "stat",
        "gridPos": {
          "h": 8,
          "w": 6,
          "x": 12,
          "y": 24
        },
        "targets": [
          {
            "expr": "fetch_ml_dlq_size"
          }
        ]
      },
      {
        "title": "Lease Expirations",
        "type": "stat",
        "gridPos": {
          "h": 8,
          "w": 6,
          "x": 18,
          "y": 24
        },
        "targets": [
          {
            "expr": "fetch_ml_lease_expirations_total"
          }
        ]
      }
    ]
  }
}