{
  "dashboard": {
    "title": "ML Task Queue Monitoring",
    "tags": [
      "ml",
      "queue",
      "fetch_ml"
    ],
    "timezone": "browser",
    "panels": [
      {
        "title": "Queue Depth",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 0
        },
        "targets": [
          {
            "expr": "fetch_ml_queue_depth",
            "legendFormat": "Queue Depth"
          }
        ]
      },
      {
        "title": "Active Tasks",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 0
        },
        "targets": [
          {
            "expr": "sum(fetch_ml_active_tasks) by (worker_id)",
            "legendFormat": "{{worker_id}}"
          }
        ]
      },
      {
        "title": "Task Duration (p50, p95, p99)",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 24,
          "x": 0,
          "y": 8
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(fetch_ml_task_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(fetch_ml_task_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(fetch_ml_task_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p99"
          }
        ]
      },
      {
        "title": "Task Completion Rate",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 16
        },
        "targets": [
          {
            "expr": "sum(rate(fetch_ml_tasks_completed_total[5m])) by (status)",
            "legendFormat": "{{status}}"
          }
        ]
      },
      {
        "title": "Failure Rate by Error Category",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 16
        },
        "targets": [
          {
            "expr": "sum(rate(fetch_ml_task_failures_total[5m])) by (error_category)",
            "legendFormat": "{{error_category}}"
          }
        ]
      },
      {
        "title": "Retry Rate",
        "type": "graph",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 24
        },
        "targets": [
          {
            "expr": "sum(rate(fetch_ml_task_retries_total[5m])) by (error_category)",
            "legendFormat": "{{error_category}}"
          }
        ]
      },
      {
        "title": "Dead Letter Queue Size",
        "type": "stat",
        "gridPos": {
          "h": 8,
          "w": 6,
          "x": 12,
          "y": 24
        },
        "targets": [
          {
            "expr": "fetch_ml_dlq_size"
          }
        ]
      },
      {
        "title": "Lease Expirations",
        "type": "stat",
        "gridPos": {
          "h": 8,
          "w": 6,
          "x": 18,
          "y": 24
        },
        "targets": [
          {
            "expr": "sum(increase(fetch_ml_lease_expirations_total[$__range]))"
          }
        ]
      }
    ]
  }
}