280 lines
6.7 KiB
JSON
280 lines
6.7 KiB
JSON
{
|
|
"id": null,
|
|
"title": "Worker Resources",
|
|
"tags": [
|
|
"worker",
|
|
"resources"
|
|
],
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"title": "CPU Free",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "fetchml_resources_cpu_free",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 6,
|
|
"x": 0,
|
|
"y": 0
|
|
}
|
|
},
|
|
{
|
|
"id": 2,
|
|
"title": "CPU Total",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "fetchml_resources_cpu_total",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 6,
|
|
"x": 6,
|
|
"y": 0
|
|
}
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "CPU Utilization (%)",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (1 - (fetchml_resources_cpu_free / clamp_min(fetchml_resources_cpu_total, 1)))",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 0
|
|
}
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "GPU Slots Free",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "fetchml_resources_gpu_slots_free",
|
|
"legendFormat": "{{worker_id}} gpu={{gpu_index}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 6
|
|
}
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Acquire Wait / Timeout (Totals)",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "fetchml_resources_acquire_wait_total",
|
|
"legendFormat": "wait {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "fetchml_resources_acquire_timeout_total",
|
|
"legendFormat": "timeout {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "fetchml_resources_acquire_total",
|
|
"legendFormat": "total {{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 8
|
|
}
|
|
},
|
|
{
|
|
"id": 6,
|
|
"title": "Avg Acquire Wait (seconds)",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "fetchml_resources_acquire_wait_seconds_total / clamp_min(fetchml_resources_acquire_wait_total, 1)",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 6,
|
|
"x": 0,
|
|
"y": 14
|
|
}
|
|
},
|
|
{
|
|
"id": 7,
|
|
"title": "Acquire Wait Ratio",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "fetchml_resources_acquire_wait_total / clamp_min(fetchml_resources_acquire_total, 1)",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 6,
|
|
"x": 6,
|
|
"y": 14
|
|
}
|
|
},
|
|
{
|
|
"id": 8,
|
|
"title": "Environment Prewarm Hit Rate (%)",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (fetchml_prewarm_env_hit_total / clamp_min(fetchml_prewarm_env_hit_total + fetchml_prewarm_env_miss_total, 1))",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 6,
|
|
"x": 12,
|
|
"y": 14
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"thresholds": {
|
|
"steps": [
|
|
{"color": "red", "value": 0},
|
|
{"color": "yellow", "value": 50},
|
|
{"color": "green", "value": 80}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"id": 9,
|
|
"title": "Snapshot Prewarm Hit Rate (%)",
|
|
"type": "stat",
|
|
"targets": [
|
|
{
|
|
"expr": "100 * (fetchml_prewarm_snapshot_hit_total / clamp_min(fetchml_prewarm_snapshot_hit_total + fetchml_prewarm_snapshot_miss_total, 1))",
|
|
"legendFormat": "{{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 6,
|
|
"w": 6,
|
|
"x": 18,
|
|
"y": 14
|
|
},
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"thresholds": {
|
|
"steps": [
|
|
{"color": "red", "value": 0},
|
|
{"color": "yellow", "value": 50},
|
|
{"color": "green", "value": 80}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"id": 10,
|
|
"title": "Prewarm Hits vs Misses",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(fetchml_prewarm_env_hit_total[5m])",
|
|
"legendFormat": "env hits {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "rate(fetchml_prewarm_env_miss_total[5m])",
|
|
"legendFormat": "env misses {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "rate(fetchml_prewarm_snapshot_hit_total[5m])",
|
|
"legendFormat": "snapshot hits {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "rate(fetchml_prewarm_snapshot_miss_total[5m])",
|
|
"legendFormat": "snapshot misses {{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 24,
|
|
"x": 0,
|
|
"y": 20
|
|
},
|
|
"fieldConfig": {"defaults": {"unit": "reqps"}}
|
|
},
|
|
{
|
|
"id": 11,
|
|
"title": "Prewarm Build Time",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(fetchml_prewarm_env_time_seconds_total[5m])",
|
|
"legendFormat": "env build {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "rate(fetchml_prewarm_snapshot_time_seconds_total[5m])",
|
|
"legendFormat": "snapshot prewarm {{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 28
|
|
},
|
|
"fieldConfig": {"defaults": {"unit": "s"}}
|
|
},
|
|
{
|
|
"id": 12,
|
|
"title": "Prewarm Builds",
|
|
"type": "graph",
|
|
"targets": [
|
|
{
|
|
"expr": "increase(fetchml_prewarm_env_built_total[1h])",
|
|
"legendFormat": "env built {{worker_id}}"
|
|
},
|
|
{
|
|
"expr": "increase(fetchml_prewarm_snapshot_built_total[1h])",
|
|
"legendFormat": "snapshots prewarmed {{worker_id}}"
|
|
}
|
|
],
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 28
|
|
},
|
|
"fieldConfig": {"defaults": {"unit": "short"}}
|
|
}
|
|
],
|
|
"time": {
|
|
"from": "now-1h",
|
|
"to": "now"
|
|
},
|
|
"refresh": "5s"
|
|
}
|