fetch_ml/monitoring/grafana/dashboards/worker-resources.json

280 lines
6.7 KiB
JSON

{
"id": null,
"title": "Worker Resources",
"tags": [
"worker",
"resources"
],
"panels": [
{
"id": 1,
"title": "CPU Free",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_cpu_free",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "CPU Total",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_cpu_total",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 0
}
},
{
"id": 3,
"title": "CPU Utilization (%)",
"type": "graph",
"targets": [
{
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 4,
"title": "GPU Slots Free",
"type": "graph",
"targets": [
{
"expr": "fetchml_resources_gpu_slots_free",
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 6
}
},
{
"id": 5,
"title": "Acquire Wait / Timeout (Totals)",
"type": "graph",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_total",
"legendFormat": "wait {{worker_id}}"
},
{
"expr": "fetchml_resources_acquire_timeout_total",
"legendFormat": "timeout {{worker_id}}"
},
{
"expr": "fetchml_resources_acquire_total",
"legendFormat": "total {{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 8
}
},
{
"id": 6,
"title": "Avg Acquire Wait (seconds)",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 14
}
},
{
"id": 7,
"title": "Acquire Wait Ratio",
"type": "stat",
"targets": [
{
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 14
}
},
{
"id": 8,
"title": "Environment Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 9,
"title": "Snapshot Prewarm Hit Rate (%)",
"type": "stat",
"targets": [
{
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
"legendFormat": "{{worker_id}}"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 14
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 50},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 10,
"title": "Prewarm Hits vs Misses",
"type": "graph",
"targets": [
{
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
"legendFormat": "env hits {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
"legendFormat": "env misses {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
"legendFormat": "snapshot hits {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
"legendFormat": "snapshot misses {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 20
},
"yaxes": [
{"format": "reqps"},
{"format": "short"}
]
},
{
"id": 11,
"title": "Prewarm Build Time (seconds spent per second)",
"type": "graph",
"targets": [
{
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
"legendFormat": "env build {{worker_id}}"
},
{
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
"legendFormat": "snapshot prewarm {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 28
},
"yaxes": [
{"format": "s"},
{"format": "short"}
]
},
{
"id": 12,
"title": "Prewarm Builds",
"type": "graph",
"targets": [
{
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
"legendFormat": "env built {{worker_id}}"
},
{
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
"legendFormat": "snapshots prewarmed {{worker_id}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 28
},
"yaxes": [
{"format": "short"},
{"format": "short"}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}