- Remove unimplemented placeholder roles (airflow, spark)
- Delete cache files (__pycache__, .DS_Store) and generated inventory
- Remove outdated INFRA_GAP_ANALYSIS.md (functionality now in README)
- Standardize DISABLED comments for monitoring stack (Prometheus, Loki, Grafana)
- Add ROLLBACK.md with comprehensive recovery procedures
- Expand vault.example.yml with all backup and alerting variables
- Update README with complete vault variables documentation
176 lines
7 KiB
YAML
176 lines
7 KiB
YAML
---
# Play for the services hosts. It optionally loads vault variables, makes sure
# two base directories exist, applies the service roles, and then runs
# post-deploy checks against Traefik (router label and TLS certificate SAN)
# for the Forgejo and Authelia hostnames.
- hosts: services_hosts
  become: true

  pre_tasks:
    # The vault file is optional: the include only runs when the fileglob
    # lookup finds the file, so a missing vault does not fail the play.
    - name: Load vault vars if present
      ansible.builtin.include_vars:
        file: "{{ playbook_dir }}/../secrets/vault.yml"
      when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0
      tags:
        - vault
        - backups
        - forgejo
        - traefik
        - alertmanager
        - lldap
        - authelia
        - postfix

    - name: Ensure minimal required directories exist
      ansible.builtin.file:
        path: "{{ item }}"
        state: directory
        mode: "0755"
      loop:
        - /var/run/active-apps
        - /var/lib/infra-controller
      tags: always

  roles:
    - role: docker
      tags: [docker]
    - role: firewall
      tags: [firewall]
    - role: traefik
      tags: [traefik]
    - role: app_ssh_access
      vars:
        app_ssh_user: deployer
      tags: [app_ssh_access]
    - role: lldap
      tags: [lldap]
    - role: authelia
      tags: [authelia]
    - role: exporters
      tags: [exporters]
    - role: alertmanager
      tags: [alertmanager]
    # DISABLED: Monitoring stack (Prometheus) - uncomment to enable
    # - role: prometheus
    #   tags: [prometheus]
    # DISABLED: Monitoring stack (Loki) - uncomment to enable
    # - role: loki
    #   tags: [loki]
    # DISABLED: Monitoring stack (Grafana) - uncomment to enable
    # - role: grafana
    #   tags: [grafana]
    - role: forgejo
      tags: [forgejo]
    - role: forgejo_runner
      tags: [forgejo_runner]
    - role: watchtower
      tags: [watchtower]
    - role: postfix
      tags: [postfix]
    - role: backups
      tags: [backups]

  post_tasks:
    # DISABLED: Grafana post-tasks - uncomment when Grafana is enabled
    # Grafana post-tasks disabled (monitoring stack not deployed on 1GB node)
    # - name: Read Grafana Traefik router rule label
    #   ansible.builtin.shell:
    #     cmd: |
    #       set -euo pipefail
    #       id=$(docker compose ps -q grafana)
    #       docker inspect ${id} | python3 -c 'import json,sys; d=json.load(sys.stdin)[0]; print(d.get("Config",{}).get("Labels",{}).get("traefik.http.routers.grafana.rule",""))'
    #     chdir: /opt/grafana
    #     executable: /bin/bash
    #   register: grafana_router_rule
    #   changed_when: false
    #   tags: [grafana]
    #
    # - name: Fail if Grafana Traefik router rule label is not configured as expected
    #   ansible.builtin.assert:
    #     that:
    #       - "grafana_router_rule.stdout == 'Host(`' ~ grafana_hostname ~ '`)'"
    #     fail_msg: "Grafana Traefik router rule label mismatch. expected=Host(`{{ grafana_hostname }}`) got={{ grafana_router_rule.stdout | default('') }}. If you used --start-at-task, rerun the play without it so docker compose can recreate the container with updated labels."
    #   tags: [grafana]
    #
    # - name: Trigger Traefik certificate request for Grafana hostname
    #   ansible.builtin.command:
    #     cmd: >-
    #       curl -k -s -o /dev/null -w "%{http_code}"
    #       --resolve "{{ grafana_hostname }}:443:127.0.0.1"
    #       "https://{{ grafana_hostname }}/"
    #   register: grafana_tls_warmup
    #   changed_when: false
    #   retries: 30
    #   delay: 2
    #   until: grafana_tls_warmup.stdout != '000'
    #   tags: [grafana]
    #
    # - name: Wait for Traefik certificate SAN to include Grafana hostname
    #   ansible.builtin.shell:
    #     cmd: |
    #       set -euo pipefail
    #       echo | openssl s_client -servername "{{ grafana_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ grafana_hostname }}"
    #     executable: /bin/bash
    #   register: grafana_origin_tls
    #   changed_when: false
    #   retries: 90
    #   delay: 5
    #   until: grafana_origin_tls.rc == 0
    #   tags: [grafana]

    # Requests the hostname over HTTPS against 127.0.0.1 (via --resolve) and
    # retries until curl reports an HTTP status other than 000, i.e. until a
    # response was actually received.
    - name: Trigger Traefik certificate request for Forgejo hostname
      ansible.builtin.command:
        cmd: >-
          curl -k -s -o /dev/null -w "%{http_code}"
          --resolve "{{ forgejo_hostname }}:443:127.0.0.1"
          "https://{{ forgejo_hostname }}/"
      register: forgejo_tls_warmup
      changed_when: false
      retries: 30
      delay: 2
      until: forgejo_tls_warmup.stdout != '000'
      tags: [forgejo]

    # Prints the value of the container label
    # traefik.http.routers.forgejo.rule (empty string when it is missing).
    # `set -o pipefail` is not available in every /bin/sh, so the script is
    # run with bash explicitly.
    - name: Read Forgejo Traefik router rule label
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          id=$(docker compose ps -q forgejo)
          docker inspect ${id} | python3 -c 'import json,sys; d=json.load(sys.stdin)[0]; print(d.get("Config",{}).get("Labels",{}).get("traefik.http.routers.forgejo.rule",""))'
        chdir: /opt/forgejo
        executable: /bin/bash
      register: forgejo_router_rule
      changed_when: false
      tags: [forgejo]

    - name: Fail if Forgejo Traefik router rule label is not configured as expected
      ansible.builtin.assert:
        that:
          - "forgejo_router_rule.stdout == 'Host(`' ~ forgejo_hostname ~ '`)'"
        fail_msg: "Forgejo Traefik router rule label mismatch. expected=Host(`{{ forgejo_hostname }}`) got={{ forgejo_router_rule.stdout | default('') }}. If you used --start-at-task, rerun the play without it so docker compose can recreate the container with updated labels."
      tags: [forgejo]

    # Polls the certificate served on 127.0.0.1:443 for the hostname's SNI
    # until its text dump contains a DNS entry for the hostname
    # (up to 90 attempts, 5 seconds apart).
    - name: Wait for Traefik certificate SAN to include Forgejo hostname
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo | openssl s_client -servername "{{ forgejo_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ forgejo_hostname }}"
        executable: /bin/bash
      register: forgejo_origin_tls
      changed_when: false
      retries: 90
      delay: 5
      until: forgejo_origin_tls.rc == 0
      tags: [forgejo]

    # DISABLED: Prometheus post-tasks - uncomment when Prometheus is enabled
    # Prometheus post-tasks disabled (monitoring stack not deployed on 1GB node)
    # - name: Trigger Traefik certificate request for Prometheus hostname
    #   ansible.builtin.command:
    #     cmd: >-
    #       curl -k -s -o /dev/null -w "%{http_code}"
    #       --resolve "{{ prometheus_hostname }}:443:127.0.0.1"
    #       "https://{{ prometheus_hostname }}/"
    #   register: prometheus_tls_warmup
    #   changed_when: false
    #   retries: 30
    #   delay: 2
    #   until: prometheus_tls_warmup.stdout != '000'
    #   tags: [prometheus]
    #
    # - name: Wait for Traefik certificate SAN to include Prometheus hostname
    #   ansible.builtin.shell:
    #     cmd: |
    #       set -euo pipefail
    #       echo | openssl s_client -servername "{{ prometheus_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ prometheus_hostname }}"
    #     executable: /bin/bash
    #   register: prometheus_origin_tls
    #   changed_when: false
    #   retries: 90
    #   delay: 5
    #   until: prometheus_origin_tls.rc == 0
    #   tags: [prometheus]

    # Same warm-up request as for Forgejo, for the Authelia hostname.
    - name: Trigger Traefik certificate request for Authelia hostname
      ansible.builtin.command:
        cmd: >-
          curl -k -s -o /dev/null -w "%{http_code}"
          --resolve "{{ auth_hostname }}:443:127.0.0.1"
          "https://{{ auth_hostname }}/"
      register: authelia_tls_warmup
      changed_when: false
      retries: 30
      delay: 2
      until: authelia_tls_warmup.stdout != '000'
      tags: [authelia]

    # Same certificate SAN poll as for Forgejo, for the Authelia hostname;
    # bash is required for `set -o pipefail`.
    - name: Wait for Traefik certificate SAN to include Authelia hostname
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo | openssl s_client -servername "{{ auth_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ auth_hostname }}"
        executable: /bin/bash
      register: authelia_origin_tls
      changed_when: false
      retries: 90
      delay: 5
      until: authelia_origin_tls.rc == 0
      tags: [authelia]