Enhance monitoring stack (Prometheus, Grafana)
- Add Prometheus alert rules configuration (alerts.yml.j2)
- Update Prometheus docker-compose and main configuration
- Add Grafana tasks for improved deployment and verification
- Integrate Alertmanager with Prometheus for alerting pipeline
This commit is contained in:
parent
7d66552482
commit
ed6101be76
5 changed files with 72 additions and 1 deletions
|
|
@ -28,6 +28,16 @@
|
|||
command: docker network create monitoring
|
||||
when: monitoring_network.rc != 0
|
||||
|
||||
# Probe for the "proxy" Docker network. The task never fails and never
# reports a change; only its return code is kept (in `proxy_network`) so the
# follow-up task can decide whether the network has to be created.
- name: Ensure proxy network exists
  command:
    argv:
      - docker
      - network
      - inspect
      - proxy
  register: proxy_network
  failed_when: false
  changed_when: false
|
||||
|
||||
# Runs only when the earlier `docker network inspect proxy` probe returned a
# non-zero code, which is treated here as "the network does not exist yet".
- name: Create proxy network if missing
  command:
    argv:
      - docker
      - network
      - create
      - proxy
  when: proxy_network.rc != 0
|
||||
|
||||
- name: Copy Docker Compose file for Grafana
|
||||
template:
|
||||
src: docker-compose.yml.j2
|
||||
|
|
@ -39,6 +49,6 @@
|
|||
dest: /opt/grafana/provisioning/datasources/datasources.yml
|
||||
|
||||
- name: Deploy Grafana
|
||||
command: docker compose up -d --force-recreate
|
||||
command: docker compose up -d
|
||||
args:
|
||||
chdir: /opt/grafana
|
||||
|
|
|
|||
|
|
@ -19,6 +19,11 @@
|
|||
src: prometheus.yml.j2
|
||||
dest: /opt/prometheus/prometheus.yml
|
||||
|
||||
# Render the alert-rules template next to prometheus.yml on the host.
# NOTE(review): presumably this is the ./alerts.yml that the Prometheus
# compose file bind-mounts into the container - confirm the compose file
# lives in /opt/prometheus.
- name: Copy Prometheus alert rules
  template:
    dest: /opt/prometheus/alerts.yml
    src: alerts.yml.j2
|
||||
|
||||
- name: Copy Docker Compose file for Prometheus
|
||||
template:
|
||||
src: docker-compose.yml.j2
|
||||
|
|
|
|||
41
roles/prometheus/templates/alerts.yml.j2
Normal file
41
roles/prometheus/templates/alerts.yml.j2
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
{% raw %}
# Prometheus alerting rules, shipped as a Jinja2 template.
# The whole file sits inside a raw block so that the Prometheus
# {{ $labels.* }} placeholders below reach the rendered file untouched
# instead of being evaluated by the template engine.
groups:
  - name: base
    rules:
      # A scrape target has reported up == 0 for two minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Target down ({{ $labels.job }})'
          description: '{{ $labels.instance }} is not responding to Prometheus scrapes.'

      # Available/size ratio below 10% for ten minutes; tmpfs, overlay and
      # squashfs filesystems and mountpoints under /var/lib/docker/ are excluded.
      - alert: HostDiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"}) < 0.10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Low disk space'
          description: '{{ $labels.instance }} mount {{ $labels.mountpoint }} has <10% free.'

      # MemAvailable/MemTotal ratio below 10% for ten minutes.
      - alert: HostMemoryPressure
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Low available memory'
          description: '{{ $labels.instance }} has <10% memory available.'

      # 1-minute load divided by the number of idle-mode CPU series
      # (one per core) stays above 2 for fifteen minutes.
      - alert: HostHighCpuLoad
        expr: (node_load1 / count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 2
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'High CPU load'
          description: '{{ $labels.instance }} has high load1 for 15m.'
{% endraw %}
|
||||
|
|
@ -7,9 +7,13 @@ services:
|
|||
- --storage.tsdb.retention.time=15d
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./alerts.yml:/etc/prometheus/alerts.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
networks:
|
||||
- monitoring
|
||||
- proxy
|
||||
ports:
|
||||
- "127.0.0.1:9090:9090"
|
||||
restart: unless-stopped
|
||||
labels:
|
||||
- com.centurylinklabs.watchtower.enable=true
|
||||
|
|
@ -21,3 +25,6 @@ networks:
|
|||
monitoring:
|
||||
external: true
|
||||
name: monitoring
|
||||
|
||||
# Pre-existing network that is created outside this compose project.
# The name is pinned explicitly, mirroring the `monitoring` entry, so the
# lookup does not rely on the mapping key doubling as the network name.
proxy:
  external: true
  name: proxy
|
||||
|
|
|
|||
|
|
@ -1,6 +1,14 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
# Alert rule files to load; the path is the one the rules file is mounted at
# in the Prometheus compose service (/etc/prometheus/alerts.yml).
rule_files:
  - /etc/prometheus/alerts.yml

# Alertmanager instance(s) that Prometheus sends alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
|
||||
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
|
|
|
|||
Loading…
Reference in a new issue