Enhance monitoring stack (Prometheus, Grafana)

- Add Prometheus alert rules configuration (alerts.yml.j2)
- Update Prometheus docker-compose and main configuration
- Add Grafana tasks for improved deployment and verification
- Integrate Alertmanager with Prometheus for alerting pipeline
This commit is contained in:
Jeremie Fraeys 2026-02-21 18:30:57 -05:00
parent 7d66552482
commit ed6101be76
No known key found for this signature in database
5 changed files with 72 additions and 1 deletions

View file

@ -28,6 +28,16 @@
command: docker network create monitoring
when: monitoring_network.rc != 0
# Probe for the "proxy" Docker network and register the result as proxy_network.
# changed_when/failed_when are both false, so this task is never reported as
# changed or failed regardless of the command's exit code.
- name: Ensure proxy network exists
command: docker network inspect proxy
register: proxy_network
changed_when: false
failed_when: false
# Create the "proxy" Docker network, but only when the registered
# proxy_network result has a non-zero return code.
- name: Create proxy network if missing
command: docker network create proxy
when: proxy_network.rc != 0
- name: Copy Docker Compose file for Grafana
template:
src: docker-compose.yml.j2
@ -39,6 +49,6 @@
dest: /opt/grafana/provisioning/datasources/datasources.yml
# Bring up the Grafana compose project in detached mode, running from /opt/grafana.
# NOTE(review): two `command` keys appear in this task as shown. This looks like a
# rendered diff in which the `--force-recreate` line is the removed version and the
# plain `up -d` line is its replacement — confirm against the real task file.
- name: Deploy Grafana
command: docker compose up -d --force-recreate
command: docker compose up -d
args:
chdir: /opt/grafana

View file

@ -19,6 +19,11 @@
src: prometheus.yml.j2
dest: /opt/prometheus/prometheus.yml
# Render the alerts.yml.j2 template to /opt/prometheus/alerts.yml on the target host.
# NOTE(review): presumably this is the file the Prometheus compose service mounts as
# ./alerts.yml — confirm the compose project directory is /opt/prometheus.
- name: Copy Prometheus alert rules
template:
src: alerts.yml.j2
dest: /opt/prometheus/alerts.yml
- name: Copy Docker Compose file for Prometheus
template:
src: docker-compose.yml.j2

View file

@ -0,0 +1,41 @@
{% raw %}
# Prometheus alerting rules (single rule group named "base").
# The whole file sits inside a Jinja raw block so that the double-brace
# $labels expressions in the annotations are left as-is by the template
# engine and reach Prometheus unchanged.
groups:
- name: base
rules:
# Fires when the `up` metric of a target has been 0 for 2 minutes.
- alert: InstanceDown
expr: up == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Target down ({{ $labels.job }})"
description: "{{ $labels.instance }} is not responding to Prometheus scrapes."
# Fires when available/size for a filesystem stays below 0.10 for 10 minutes.
# tmpfs, overlay and squashfs filesystems and mountpoints under
# /var/lib/docker/ are excluded on both sides of the division.
- alert: HostDiskSpaceLow
expr: |
(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"}) < 0.10
for: 10m
labels:
severity: warning
annotations:
summary: "Low disk space"
description: "{{ $labels.instance }} mount {{ $labels.mountpoint }} has <10% free."
# Fires when MemAvailable/MemTotal stays below 0.10 for 10 minutes.
- alert: HostMemoryPressure
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.10
for: 10m
labels:
severity: warning
annotations:
summary: "Low available memory"
description: "{{ $labels.instance }} has <10% memory available."
# Fires when load1 divided by the number of idle-mode node_cpu_seconds_total
# series (aggregated without the cpu and mode labels) stays above 2 for 15 minutes.
# NOTE(review): the series count is presumably one per logical CPU, making this
# a per-CPU load ratio — confirm against the exporter in use.
- alert: HostHighCpuLoad
expr: (node_load1 / count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 2
for: 15m
labels:
severity: warning
annotations:
summary: "High CPU load"
description: "{{ $labels.instance }} has high load1 for 15m."
{% endraw %}

View file

@ -7,9 +7,13 @@ services:
- --storage.tsdb.retention.time=15d
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./alerts.yml:/etc/prometheus/alerts.yml:ro
- prometheus_data:/prometheus
networks:
- monitoring
- proxy
ports:
- "127.0.0.1:9090:9090"
restart: unless-stopped
labels:
- com.centurylinklabs.watchtower.enable=true
@ -21,3 +25,6 @@ networks:
monitoring:
external: true
name: monitoring
proxy:
external: true

View file

@ -1,6 +1,14 @@
# Global defaults: scrape every 15 seconds.
global:
scrape_interval: 15s
# Alerting rule files to load; this path matches the in-container mount
# target used for alerts.yml in the Prometheus compose service.
rule_files:
- /etc/prometheus/alerts.yml
# Send fired alerts to Alertmanager at alertmanager:9093.
# NOTE(review): presumably "alertmanager" resolves to a container on a Docker
# network shared with Prometheus — confirm the service name and network.
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
scrape_configs:
- job_name: prometheus
static_configs: