infra/playbooks/services.yml
Jeremie Fraeys e2f732c0f5
infra: cleanup repository and add rollback documentation
- Remove unimplemented placeholder roles (airflow, spark)
- Delete cache files (__pycache__, .DS_Store) and generated inventory
- Remove outdated INFRA_GAP_ANALYSIS.md (functionality now in README)
- Standardize DISABLED comments for monitoring stack (Prometheus, Loki, Grafana)
- Add ROLLBACK.md with comprehensive recovery procedures
- Expand vault.example.yml with all backup and alerting variables
- Update README with complete vault variables documentation
2026-03-06 14:40:56 -05:00

176 lines
7 KiB
YAML

---
# Services playbook: applies the service roles to `services_hosts`, then runs
# post-deploy checks against the local Traefik endpoint (127.0.0.1:443) for
# the Forgejo and Authelia hostnames.
- name: Deploy and verify services
  hosts: services_hosts
  become: true

  pre_tasks:
    # The vault file is optional: the fileglob lookup yields an empty list when
    # the path does not match, so the include is skipped instead of failing.
    # NOTE(review): this task only runs for the tags listed below; a
    # tag-limited run for any other role will not load vault vars - confirm
    # that this is intended.
    - name: Load vault vars if present
      ansible.builtin.include_vars:
        file: "{{ playbook_dir }}/../secrets/vault.yml"
      when: >-
        (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True)
        | length) > 0
      tags: [vault, backups, forgejo, traefik, alertmanager, lldap, authelia, postfix]

    # Tagged `always` so the directories exist even on tag-limited runs.
    - name: Ensure minimal required directories exist
      ansible.builtin.file:
        path: "{{ item }}"
        state: directory
        mode: "0755"
      loop:
        - /var/run/active-apps
        - /var/lib/infra-controller
      tags: [always]

  roles:
    - role: docker
      tags: [docker]
    - role: firewall
      tags: [firewall]
    - role: traefik
      tags: [traefik]
    - role: app_ssh_access
      vars:
        app_ssh_user: deployer
      tags: [app_ssh_access]
    - role: lldap
      tags: [lldap]
    - role: authelia
      tags: [authelia]
    - role: exporters
      tags: [exporters]
    - role: alertmanager
      tags: [alertmanager]
    # DISABLED: Monitoring stack (Prometheus) - uncomment to enable
    # - role: prometheus
    #   tags: [prometheus]
    # DISABLED: Monitoring stack (Loki) - uncomment to enable
    # - role: loki
    #   tags: [loki]
    # DISABLED: Monitoring stack (Grafana) - uncomment to enable
    # - role: grafana
    #   tags: [grafana]
    - role: forgejo
      tags: [forgejo]
    - role: forgejo_runner
      tags: [forgejo_runner]
    - role: watchtower
      tags: [watchtower]
    - role: postfix
      tags: [postfix]
    - role: backups
      tags: [backups]

  post_tasks:
    # Notes that apply to every check below:
    # - curl uses `--resolve <host>:443:127.0.0.1` so the request is sent to
    #   the local listener under the real hostname; `-w "%{http_code}"` prints
    #   000 while no HTTP response is received, which is what `until` polls.
    # - `set -o pipefail` is a bash option; the shell module defaults to
    #   /bin/sh, which may be a shell that rejects it (e.g. dash), so the
    #   shell tasks run under /bin/bash explicitly.

    # DISABLED: Grafana post-tasks (monitoring stack not deployed on 1GB node).
    # Uncomment when Grafana is enabled.
    # - name: Read Grafana Traefik router rule label
    #   ansible.builtin.shell:
    #     cmd: |
    #       set -euo pipefail
    #       id=$(docker compose ps -q grafana)
    #       docker inspect ${id} | python3 -c 'import json,sys; d=json.load(sys.stdin)[0]; print(d.get("Config",{}).get("Labels",{}).get("traefik.http.routers.grafana.rule",""))'
    #     chdir: /opt/grafana
    #     executable: /bin/bash
    #   register: grafana_router_rule
    #   changed_when: false
    #   tags: [grafana]
    #
    # - name: Fail if Grafana Traefik router rule label is not configured as expected
    #   ansible.builtin.assert:
    #     that:
    #       - "grafana_router_rule.stdout == 'Host(`' ~ grafana_hostname ~ '`)'"
    #     fail_msg: >-
    #       Grafana Traefik router rule label mismatch.
    #       expected=Host(`{{ grafana_hostname }}`)
    #       got={{ grafana_router_rule.stdout | default('') }}.
    #       If you used --start-at-task, rerun the play without it so docker
    #       compose can recreate the container with updated labels.
    #   tags: [grafana]
    #
    # - name: Trigger Traefik certificate request for Grafana hostname
    #   ansible.builtin.command:
    #     cmd: >-
    #       curl -k -s -o /dev/null -w "%{http_code}"
    #       --resolve "{{ grafana_hostname }}:443:127.0.0.1"
    #       "https://{{ grafana_hostname }}/"
    #   register: grafana_tls_warmup
    #   changed_when: false
    #   retries: 30
    #   delay: 2
    #   until: grafana_tls_warmup.stdout != '000'
    #   tags: [grafana]
    #
    # - name: Wait for Traefik certificate SAN to include Grafana hostname
    #   ansible.builtin.shell:
    #     cmd: |
    #       set -euo pipefail
    #       echo | openssl s_client -servername "{{ grafana_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ grafana_hostname }}"
    #     executable: /bin/bash
    #   register: grafana_origin_tls
    #   changed_when: false
    #   retries: 90
    #   delay: 5
    #   until: grafana_origin_tls.rc == 0
    #   tags: [grafana]

    # Polls for up to 30 x 2s until curl reports any HTTP status.
    - name: Trigger Traefik certificate request for Forgejo hostname
      ansible.builtin.command:
        cmd: >-
          curl -k -s -o /dev/null -w "%{http_code}"
          --resolve "{{ forgejo_hostname }}:443:127.0.0.1"
          "https://{{ forgejo_hostname }}/"
      register: forgejo_tls_warmup
      changed_when: false
      retries: 30
      delay: 2
      until: forgejo_tls_warmup.stdout != '000'
      tags: [forgejo]

    # Prints the `traefik.http.routers.forgejo.rule` label of the running
    # forgejo compose container (empty string if the label is absent).
    - name: Read Forgejo Traefik router rule label
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          id=$(docker compose ps -q forgejo)
          docker inspect ${id} | python3 -c 'import json,sys; d=json.load(sys.stdin)[0]; print(d.get("Config",{}).get("Labels",{}).get("traefik.http.routers.forgejo.rule",""))'
        chdir: /opt/forgejo
        executable: /bin/bash
      register: forgejo_router_rule
      changed_when: false
      tags: [forgejo]

    # The label must be exactly Host(`<forgejo_hostname>`).
    - name: Fail if Forgejo Traefik router rule label is not configured as expected
      ansible.builtin.assert:
        that:
          - "forgejo_router_rule.stdout == 'Host(`' ~ forgejo_hostname ~ '`)'"
        fail_msg: >-
          Forgejo Traefik router rule label mismatch.
          expected=Host(`{{ forgejo_hostname }}`)
          got={{ forgejo_router_rule.stdout | default('') }}.
          If you used --start-at-task, rerun the play without it so docker
          compose can recreate the container with updated labels.
      tags: [forgejo]

    # Polls for up to 90 x 5s until the certificate served on 127.0.0.1:443
    # for this SNI name lists the hostname as a DNS SAN.
    - name: Wait for Traefik certificate SAN to include Forgejo hostname
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo | openssl s_client -servername "{{ forgejo_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ forgejo_hostname }}"
        executable: /bin/bash
      register: forgejo_origin_tls
      changed_when: false
      retries: 90
      delay: 5
      until: forgejo_origin_tls.rc == 0
      tags: [forgejo]

    # DISABLED: Prometheus post-tasks (monitoring stack not deployed on 1GB
    # node). Uncomment when Prometheus is enabled.
    # - name: Trigger Traefik certificate request for Prometheus hostname
    #   ansible.builtin.command:
    #     cmd: >-
    #       curl -k -s -o /dev/null -w "%{http_code}"
    #       --resolve "{{ prometheus_hostname }}:443:127.0.0.1"
    #       "https://{{ prometheus_hostname }}/"
    #   register: prometheus_tls_warmup
    #   changed_when: false
    #   retries: 30
    #   delay: 2
    #   until: prometheus_tls_warmup.stdout != '000'
    #   tags: [prometheus]
    #
    # - name: Wait for Traefik certificate SAN to include Prometheus hostname
    #   ansible.builtin.shell:
    #     cmd: |
    #       set -euo pipefail
    #       echo | openssl s_client -servername "{{ prometheus_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ prometheus_hostname }}"
    #     executable: /bin/bash
    #   register: prometheus_origin_tls
    #   changed_when: false
    #   retries: 90
    #   delay: 5
    #   until: prometheus_origin_tls.rc == 0
    #   tags: [prometheus]

    # Same warm-up and SAN check as for Forgejo, for the Authelia hostname.
    - name: Trigger Traefik certificate request for Authelia hostname
      ansible.builtin.command:
        cmd: >-
          curl -k -s -o /dev/null -w "%{http_code}"
          --resolve "{{ auth_hostname }}:443:127.0.0.1"
          "https://{{ auth_hostname }}/"
      register: authelia_tls_warmup
      changed_when: false
      retries: 30
      delay: 2
      until: authelia_tls_warmup.stdout != '000'
      tags: [authelia]

    - name: Wait for Traefik certificate SAN to include Authelia hostname
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          echo | openssl s_client -servername "{{ auth_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ auth_hostname }}"
        executable: /bin/bash
      register: authelia_origin_tls
      changed_when: false
      retries: 90
      delay: 5
      until: authelia_origin_tls.rc == 0
      tags: [authelia]