Enhance monitoring stack (Prometheus, Grafana)

- Add Prometheus alert rules configuration (alerts.yml.j2)
- Update Prometheus docker-compose and main configuration
- Add Grafana tasks for improved deployment and verification
- Integrate Alertmanager with Prometheus for alerting pipeline
2026-02-21 18:30:57 -05:00 · 2026-02-21 18:30:57 -05:00 · ed6101be76
commit ed6101be76
parent 7d66552482
5 changed files with 72 additions and 1 deletions
--- a/roles/grafana/tasks/main.yml
+++ b/roles/grafana/tasks/main.yml
@ -28,6 +28,16 @@
  command: docker network create monitoring
  when: monitoring_network.rc != 0

+- name: Ensure proxy network exists
+  command: docker network inspect proxy
+  register: proxy_network  # result is kept so the create task can read its rc
+  changed_when: false  # inspecting is a read-only probe, so never report "changed"
+  failed_when: false  # a non-zero rc must not fail the play; it only gates the create task
+
+- name: Create proxy network if missing
+  command: docker network create proxy
+  when: proxy_network.rc != 0  # run only when the inspect probe returned non-zero
+
 - name: Copy Docker Compose file for Grafana
  template:
    src: docker-compose.yml.j2
@ -39,6 +49,6 @@
    dest: /opt/grafana/provisioning/datasources/datasources.yml

 - name: Deploy Grafana
-  command: docker compose up -d --force-recreate
+  command: docker compose up -d
  args:
    chdir: /opt/grafana
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@ -19,6 +19,11 @@
    src: prometheus.yml.j2
    dest: /opt/prometheus/prometheus.yml

+- name: Copy Prometheus alert rules
+  template:
+    src: alerts.yml.j2  # alert rule groups; rendered by the template module
+    dest: /opt/prometheus/alerts.yml  # same directory that prometheus.yml is written to
+
 - name: Copy Docker Compose file for Prometheus
  template:
    src: docker-compose.yml.j2
--- a/roles/prometheus/templates/alerts.yml.j2
+++ b/roles/prometheus/templates/alerts.yml.j2
@ -0,0 +1,41 @@
+{% raw %}
+groups:  # file body sits inside a Jinja2 raw block so the $labels annotation templates pass through unrendered
+  - name: base
+    rules:
+      - alert: InstanceDown  # fires when a target's 'up' series is 0, i.e. it is not answering scrapes
+        expr: up == 0
+        for: 2m  # the expression must stay true for this long before the alert fires
+        labels:
+          severity: critical
+        annotations:
+          summary: "Target down ({{ $labels.job }})"
+          description: "{{ $labels.instance }} is not responding to Prometheus scrapes."
+
+      - alert: HostDiskSpaceLow  # available/size ratio below 0.10, ignoring tmpfs/overlay/squashfs and mounts under /var/lib/docker/
+        expr: |
+          (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/var/lib/docker/.+"}) < 0.10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low disk space"
+          description: "{{ $labels.instance }} mount {{ $labels.mountpoint }} has <10% free."
+
+      - alert: HostMemoryPressure  # MemAvailable/MemTotal ratio below 0.10
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low available memory"
+          description: "{{ $labels.instance }} has <10% memory available."
+
+      - alert: HostHighCpuLoad  # load1 divided by the count of idle-mode node_cpu_seconds_total series, above 2
+        expr: (node_load1 / count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 2
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU load"
+          description: "{{ $labels.instance }} has high load1 for 15m."
+{% endraw %}
--- a/roles/prometheus/templates/docker-compose.yml.j2
+++ b/roles/prometheus/templates/docker-compose.yml.j2
@ -7,9 +7,13 @@ services:
      - --storage.tsdb.retention.time=15d
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    networks:
      - monitoring
+      - proxy
+    ports:
+      - "127.0.0.1:9090:9090"
    restart: unless-stopped
    labels:
      - com.centurylinklabs.watchtower.enable=true
@ -21,3 +25,6 @@ networks:
  monitoring:
    external: true
    name: monitoring
+
+  proxy:
+    external: true  # not created by this compose file; the network must already exist
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@ -1,6 +1,14 @@
 global:
  scrape_interval: 15s

+rule_files:
+  - /etc/prometheus/alerts.yml  # container-side path; the compose template mounts ./alerts.yml here read-only
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']  # NOTE(review): assumes the name 'alertmanager' resolves from the Prometheus container - confirm
+
 scrape_configs:
  - job_name: prometheus
    static_configs: