fetch_ml/monitoring/grafana-dashboard.json
Jeremie Fraeys 4aecd469a1 feat: implement comprehensive monitoring and container orchestration
- Add Prometheus, Grafana, and Loki monitoring stack
- Include pre-configured dashboards for ML metrics and logs
- Add Podman container support with security policies
- Implement ML runtime environments for multiple frameworks
- Add containerized ML project templates (PyTorch, TensorFlow, etc.)
- Include secure runner with isolation and resource limits
- Add comprehensive log aggregation and alerting
2025-12-04 16:54:49 -05:00

147 lines
No EOL
4.3 KiB
JSON

{
  "dashboard": {
    "title": "ML Task Queue Monitoring",
    "tags": [
      "ml",
      "queue",
      "fetch_ml"
    ],
    "timezone": "browser",
    "panels": [
      {
        "title": "Queue Depth",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 0
        },
        "targets": [
          {
            "expr": "fetch_ml_queue_depth",
            "legendFormat": "Queue Depth"
          }
        ]
      },
      {
        "title": "Active Tasks",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 0
        },
        "targets": [
          {
            "expr": "sum(fetch_ml_active_tasks) by (worker_id)",
            "legendFormat": "{{worker_id}}"
          }
        ]
      },
      {
        "title": "Task Duration (p50, p95, p99)",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 24,
          "x": 0,
          "y": 8
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(fetch_ml_task_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(fetch_ml_task_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(fetch_ml_task_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p99"
          }
        ]
      },
      {
        "title": "Task Completion Rate",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 16
        },
        "targets": [
          {
            "expr": "sum(rate(fetch_ml_tasks_completed_total[5m])) by (status)",
            "legendFormat": "{{status}}"
          }
        ]
      },
      {
        "title": "Failure Rate by Error Category",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 16
        },
        "targets": [
          {
            "expr": "sum(rate(fetch_ml_task_failures_total[5m])) by (error_category)",
            "legendFormat": "{{error_category}}"
          }
        ]
      },
      {
        "title": "Retry Rate",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 24
        },
        "targets": [
          {
            "expr": "sum(rate(fetch_ml_task_retries_total[5m])) by (error_category)",
            "legendFormat": "{{error_category}}"
          }
        ]
      },
      {
        "title": "Dead Letter Queue Size",
        "type": "stat",
        "gridPos": {
          "h": 8,
          "w": 6,
          "x": 12,
          "y": 24
        },
        "targets": [
          {
            "expr": "fetch_ml_dlq_size"
          }
        ]
      },
      {
        "title": "Lease Expirations",
        "type": "stat",
        "gridPos": {
          "h": 8,
          "w": 6,
          "x": 18,
          "y": 24
        },
        "targets": [
          {
            "expr": "fetch_ml_lease_expirations_total"
          }
        ]
      }
    ]
  }
}