fetch_ml/monitoring/dashboards/grafana-dashboard.json

{
    "dashboard": {
        "title": "ML Task Queue Monitoring",
        "tags": [
            "ml",
            "queue",
            "fetch_ml"
        ],
        "timezone": "browser",
        "panels": [
            {
                "gridPos": {
                    "h": 8,
                    "w": 12,
                    "x": 0,
                    "y": 0
                },
                "targets": [
                    {
                        "expr": "fetch_ml_queue_depth",
                        "legendFormat": "Queue Depth"
                    }
                ],
                "title": "Queue Depth",
                "type": "graph"
            },
            {
                "gridPos": {
                    "h": 8,
                    "w": 12,
                    "x": 12,
                    "y": 0
                },
                "targets": [
                    {
                        "expr": "sum(fetch_ml_active_tasks) by (worker_id)",
                        "legendFormat": "{{worker_id}}"
                    }
                ],
                "title": "Active Tasks",
                "type": "graph"
            },
            {
                "title": "Task Duration (p50, p95, p99)",
                "type": "graph",
                "gridPos": {
                    "h": 8,
                    "w": 24,
                    "x": 0,
                    "y": 8
                },
                "targets": [
                    {
                        "expr": "histogram_quantile(0.50, sum by (le) (rate(fetch_ml_task_duration_seconds_bucket[5m])))",
                        "legendFormat": "p50"
                    },
                    {
                        "expr": "histogram_quantile(0.95, sum by (le) (rate(fetch_ml_task_duration_seconds_bucket[5m])))",
                        "legendFormat": "p95"
                    },
                    {
                        "expr": "histogram_quantile(0.99, sum by (le) (rate(fetch_ml_task_duration_seconds_bucket[5m])))",
                        "legendFormat": "p99"
                    }
                ]
            },
            {
                "title": "Task Completion Rate",
                "type": "graph",
                "gridPos": {
                    "h": 8,
                    "w": 12,
                    "x": 0,
                    "y": 16
                },
                "targets": [
                    {
                        "expr": "sum by (status) (rate(fetch_ml_tasks_completed_total[5m]))",
                        "legendFormat": "{{status}}"
                    }
                ]
            },
            {
                "title": "Failure Rate by Error Category",
                "type": "graph",
                "gridPos": {
                    "h": 8,
                    "w": 12,
                    "x": 12,
                    "y": 16
                },
                "targets": [
                    {
                        "expr": "sum by (error_category) (rate(fetch_ml_task_failures_total[5m]))",
                        "legendFormat": "{{error_category}}"
                    }
                ]
            },
            {
                "title": "Retry Rate",
                "type": "graph",
                "gridPos": {
                    "h": 8,
                    "w": 12,
                    "x": 0,
                    "y": 24
                },
                "targets": [
                    {
                        "expr": "sum by (error_category) (rate(fetch_ml_task_retries_total[5m]))",
                        "legendFormat": "{{error_category}}"
                    }
                ]
            },
            {
                "gridPos": {
                    "h": 8,
                    "w": 6,
                    "x": 12,
                    "y": 24
                },
                "targets": [
                    {
                        "expr": "fetch_ml_dlq_size"
                    }
                ],
                "title": "Dead Letter Queue Size",
                "type": "stat"
            },
            {
                "title": "Lease Expirations",
                "type": "stat",
                "gridPos": {
                    "h": 8,
                    "w": 6,
                    "x": 18,
                    "y": 24
                },
                "targets": [
                    {
                        "expr": "sum(increase(fetch_ml_lease_expirations_total[$__range]))"
                    }
                ]
            }
        ]
    }
}