From ed6101be766aa2c52dc0e13203e8010c2ddd9d19 Mon Sep 17 00:00:00 2001
From: Jeremie Fraeys
Date: Sat, 21 Feb 2026 18:30:57 -0500
Subject: [PATCH] Enhance monitoring stack (Prometheus, Grafana)

- Add Prometheus alert rules configuration (alerts.yml.j2)
- Update Prometheus docker-compose and main configuration
- Add Grafana tasks for improved deployment and verification
- Integrate Alertmanager with Prometheus for alerting pipeline
---
Review notes (this section sits between the three-dash marker and the
diffstat, so it is not part of the commit message):

* alerts.yml.j2 is wrapped in {% raw %} ... {% endraw %} so that the
  Prometheus "{{ $labels.* }}" annotation templates pass through Ansible's
  Jinja2 rendering untouched.
* Alert rules added: InstanceDown (up == 0 for 2m, critical);
  HostDiskSpaceLow (<10% free for 10m, skipping tmpfs/overlay/squashfs and
  mountpoints below /var/lib/docker/); HostMemoryPressure (<10%
  MemAvailable for 10m); HostHighCpuLoad (load1 divided by the number of
  idle-mode CPU series > 2 for 15m). The last three are severity warning.
* Prometheus is published on 127.0.0.1:9090 only, mounts alerts.yml
  read-only, and joins the external "proxy" network in addition to
  "monitoring".
* NOTE(review): the "proxy" network is created on demand in the Grafana
  role only; no matching task is added to the Prometheus role here.
  Confirm something creates it before the Prometheus compose project is
  started.
* NOTE(review): Prometheus is pointed at alertmanager:9093, but no
  Alertmanager service is added by this patch. Confirm one is reachable
  under that name on a network Prometheus is attached to.
* NOTE(review): Grafana is now deployed with "docker compose up -d"
  (--force-recreate dropped). Confirm Grafana still picks up changes to
  the provisioning file copied by the preceding task.

 roles/grafana/tasks/main.yml                  | 12 +++++-
 roles/prometheus/tasks/main.yml               |  5 +++
 roles/prometheus/templates/alerts.yml.j2      | 41 +++++++++++++++++++
 .../templates/docker-compose.yml.j2           |  7 ++++
 roles/prometheus/templates/prometheus.yml.j2  |  8 ++++
 5 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 roles/prometheus/templates/alerts.yml.j2

diff --git a/roles/grafana/tasks/main.yml b/roles/grafana/tasks/main.yml
index 8490720..445c0ca 100644
--- a/roles/grafana/tasks/main.yml
+++ b/roles/grafana/tasks/main.yml
@@ -28,6 +28,16 @@
   command: docker network create monitoring
   when: monitoring_network.rc != 0
 
+- name: Ensure proxy network exists
+  command: docker network inspect proxy
+  register: proxy_network
+  changed_when: false
+  failed_when: false
+
+- name: Create proxy network if missing
+  command: docker network create proxy
+  when: proxy_network.rc != 0
+
 - name: Copy Docker Compose file for Grafana
   template:
     src: docker-compose.yml.j2
@@ -39,6 +49,6 @@
     dest: /opt/grafana/provisioning/datasources/datasources.yml
 
 - name: Deploy Grafana
-  command: docker compose up -d --force-recreate
+  command: docker compose up -d
   args:
     chdir: /opt/grafana
diff --git a/roles/prometheus/tasks/main.yml b/roles/prometheus/tasks/main.yml
index 64f9169..33273cd 100644
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@@ -19,6 +19,11 @@
     src: prometheus.yml.j2
     dest: /opt/prometheus/prometheus.yml
 
+- name: Copy Prometheus alert rules
+  template:
+    src: alerts.yml.j2
+    dest: /opt/prometheus/alerts.yml
+
 - name: Copy Docker Compose file for Prometheus
   template:
     src: docker-compose.yml.j2
diff --git a/roles/prometheus/templates/alerts.yml.j2 b/roles/prometheus/templates/alerts.yml.j2
new file mode 100644
index 0000000..637fc17
--- /dev/null
+++ b/roles/prometheus/templates/alerts.yml.j2
@@ -0,0 +1,41 @@
+{% raw %}
+groups:
+  - name: base
+    rules:
+      - alert: InstanceDown
+        expr: up == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Target down ({{ $labels.job }})"
+          description: "{{ $labels.instance }} is not responding to Prometheus scrapes."
+
+      - alert: HostDiskSpaceLow
+        expr: |
+          (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"}) < 0.10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low disk space"
+          description: "{{ $labels.instance }} mount {{ $labels.mountpoint }} has <10% free."
+
+      - alert: HostMemoryPressure
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low available memory"
+          description: "{{ $labels.instance }} has <10% memory available."
+
+      - alert: HostHighCpuLoad
+        expr: (node_load1 / count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 2
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU load"
+          description: "{{ $labels.instance }} has high load1 for 15m."
+{% endraw %}
diff --git a/roles/prometheus/templates/docker-compose.yml.j2 b/roles/prometheus/templates/docker-compose.yml.j2
index c51ebf8..e479943 100644
--- a/roles/prometheus/templates/docker-compose.yml.j2
+++ b/roles/prometheus/templates/docker-compose.yml.j2
@@ -7,9 +7,13 @@ services:
       - --storage.tsdb.retention.time=15d
     volumes:
       - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
       - prometheus_data:/prometheus
     networks:
       - monitoring
+      - proxy
+    ports:
+      - "127.0.0.1:9090:9090"
     restart: unless-stopped
     labels:
       - com.centurylinklabs.watchtower.enable=true
@@ -21,3 +25,6 @@ networks:
   monitoring:
     external: true
     name: monitoring
+
+  proxy:
+    external: true
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 6d54b58..925cc82 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -1,6 +1,14 @@
 global:
   scrape_interval: 15s
 
+rule_files:
+  - /etc/prometheus/alerts.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']
+
 scrape_configs:
   - job_name: prometheus
     static_configs: