infra/playbooks/tests/test_config.yml
Jeremie Fraeys 4842c71cae
Add new playbooks and update inventory configuration
- Add deploy-app.yml playbook for application-specific deployments
- Add web.yml playbook for web infrastructure management
- Restructure tests/test_config.yml for better organization
- Update inventory/group_vars/all.yml with new hostnames and settings
2026-02-21 18:31:20 -05:00

498 lines
19 KiB
YAML

---
# Read-only smoke tests for a provisioned deployment.
#
# On every host: SSH, Docker, firewall, fail2ban, and logrotate basics.
# On services/web hosts: the expected Docker Compose stacks, Traefik router
# rules and origin TLS, monitoring readiness (Prometheus/Alertmanager/
# Grafana/Loki), backup timer + restic, and object-storage credentials.
# Every task is a check or an assert; nothing on the host is modified.
- name: Test Deployment Configuration
  hosts: all
  become: true
  tasks:
    # Vault vars (hostnames, S3 credentials) are optional; skip the include
    # cleanly when the vault file has not been created yet.
    - name: Load vault vars if present
      include_vars:
        file: "{{ playbook_dir }}/../secrets/vault.yml"
      no_log: true
      when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0

    - name: Check SSH service status
      command: systemctl is-active sshd
      register: ssh_status
      changed_when: false

    - name: Display SSH service status
      debug:
        msg: "SSH service is {{ ssh_status.stdout | default('') }}"

    - name: Check SSH Port Configuration
      command: sshd -T
      register: ssh_port
      changed_when: false
      failed_when: false

    - name: Display SSH port
      debug:
        # regex_search with a group argument returns a LIST of captures, or
        # none when nothing matches; default(..., true) also covers none, and
        # 'first' unwraps the single capture so a scalar is printed.
        msg: "SSH port configured as {{ ((ssh_port.stdout | default('') | regex_search('(?m)^port\\s+([0-9]+)$', '\\1')) | default(['Unknown'], true)) | first }}"

    - name: Check Docker version
      command: docker --version
      register: docker_version
      changed_when: false

    - name: Display Docker version
      debug:
        msg: "Docker Version: {{ docker_version.stdout }}"

    - name: Check Docker Compose version (hyphen)
      command: docker-compose --version
      register: docker_compose_version_hyphen
      failed_when: false
      changed_when: false

    - name: Check Docker Compose version (docker compose)
      command: docker compose version
      register: docker_compose_version_space
      failed_when: false
      changed_when: false

    - name: Display Docker Compose version
      debug:
        # When a binary is absent, the command module registers no 'stdout'
        # key at all (only rc/msg), so the probes must be read through
        # default('') or this template aborts with an undefined variable.
        msg: >
          {% if docker_compose_version_hyphen.stdout | default('') %}
          Docker Compose version (docker-compose): {{ docker_compose_version_hyphen.stdout }}
          {% elif docker_compose_version_space.stdout | default('') %}
          Docker Compose version (docker compose): {{ docker_compose_version_space.stdout }}
          {% else %}
          Docker Compose not found
          {% endif %}

    - name: Check Ansible version
      command: ansible --version
      register: ansible_version
      changed_when: false
      failed_when: false

    - name: Display Ansible version
      debug:
        msg: "Ansible Version: {{ (ansible_version.stdout | default('')).split('\n')[0] if (ansible_version.stdout | default('') | length) > 0 else 'Not installed' }}"

    - name: Check UFW status
      command: ufw status verbose
      register: ufw_status
      changed_when: false

    - name: Display UFW status
      debug:
        msg: "UFW Status: {{ ufw_status.stdout }}"

    - name: Check Fail2ban service status
      command: systemctl is-active fail2ban
      register: fail2ban_status
      changed_when: false
      failed_when: false

    - name: Display Fail2ban status
      debug:
        msg: "Fail2ban is {{ fail2ban_status.stdout | default('') }}"

    - name: Display logrotate custom config
      command: cat /etc/logrotate.d/custom
      register: logrotate_config
      changed_when: false
      failed_when: false

    - name: Show logrotate custom config
      debug:
        # default(..., true) also substitutes the fallback when stdout is an
        # empty string (file missing => cat prints nothing), not only when it
        # is undefined.
        msg: "Logrotate custom config:\n{{ logrotate_config.stdout | default('No custom logrotate config found', true) }}"

    - name: Check running Docker containers
      command: docker ps
      register: docker_ps
      changed_when: false

    - name: Display running Docker containers
      debug:
        msg: "Docker containers:\n{{ docker_ps.stdout }}"

    # Host role (from inventory group membership) drives which stack and
    # service checks run below.
    - name: Determine host role
      set_fact:
        is_services_host: "{{ 'services_hosts' in group_names }}"
        is_web_host: "{{ 'web_hosts' in group_names }}"

    - name: Define expected stacks for services host
      set_fact:
        expected_stacks:
          - { name: traefik, dir: /opt/traefik }
          - { name: lldap, dir: /opt/lldap }
          - { name: authelia, dir: /opt/authelia }
          - { name: exporters, dir: /opt/exporters }
          - { name: alertmanager, dir: /opt/alertmanager }
          - { name: prometheus, dir: /opt/prometheus }
          - { name: loki, dir: /opt/loki }
          - { name: grafana, dir: /opt/grafana }
          - { name: forgejo, dir: /opt/forgejo }
          - { name: watchtower, dir: /opt/watchtower }
      when: is_services_host

    - name: Define expected stacks for web host
      set_fact:
        expected_stacks:
          - { name: traefik, dir: /opt/traefik }
          - { name: app_core, dir: /opt/app }
          - { name: forgejo_runner, dir: /opt/forgejo-runner }
      when: is_web_host

    - name: Check minimal infra-controller directories exist on services host
      stat:
        path: "{{ item }}"
      register: infra_dirs
      loop:
        - /var/run/active-apps
        - /var/lib/infra-controller
      changed_when: false
      when: is_services_host

    - name: Fail if any minimal infra-controller directory is missing on services host
      assert:
        that:
          - item.stat.exists
          - item.stat.isdir
        fail_msg: "Missing required directory on services host: {{ item.item }}. This typically means the services playbook has not been applied yet. Run ./setup.sh (or ansible-playbook playbooks/services.yml) and re-run this test."
      loop: "{{ infra_dirs.results | default([]) }}"
      when: is_services_host

    - name: Read deployer authorized_keys on services host
      slurp:
        src: /home/deployer/.ssh/authorized_keys
      register: deployer_authorized_keys
      changed_when: false
      when: is_services_host

    - name: Fail if deployer authorized_keys is missing forced-command restrictions
      assert:
        that:
          - (deployer_authorized_keys.content | b64decode) is search('command="/usr/local/sbin/infra-register-stdin"')
          - (deployer_authorized_keys.content | b64decode) is search('command="/usr/local/sbin/infra-deregister"')
        fail_msg: "deployer authorized_keys does not include forced-command keys for infra-register-stdin/infra-deregister"
      when: is_services_host

    - name: Check that expected compose directories exist
      stat:
        path: "{{ item.dir }}/docker-compose.yml"
      register: compose_files
      loop: "{{ expected_stacks | default([]) }}"
      changed_when: false

    - name: Fail if any compose file is missing
      assert:
        that:
          - item.stat.exists
        fail_msg: "Missing docker-compose.yml for {{ item.item.name }} at {{ item.item.dir }}/docker-compose.yml"
      loop: "{{ compose_files.results | default([]) }}"
      when: expected_stacks is defined

    - name: Read expected services per stack
      command: docker compose config --services
      args:
        chdir: "{{ item.dir }}"
      register: stack_expected
      loop: "{{ expected_stacks | default([]) }}"
      changed_when: false

    - name: Read service status/health per stack (docker inspect)
      # 'pipefail' is a bash option; force bash so 'set -euo pipefail' does
      # not itself fail when /bin/sh is dash.
      shell: |
        set -euo pipefail
        ids=$(docker compose ps -q)
        if [ -z "${ids}" ]; then
          exit 0
        fi
        {% raw %}docker inspect --format '{{ index .Config.Labels "com.docker.compose.service" }} {{ .State.Status }} {{ if .State.Health }}{{ .State.Health.Status }}{{ else }}none{{ end }}' ${ids}{% endraw %}
      args:
        chdir: "{{ item.dir }}"
        executable: /bin/bash
      register: stack_status
      loop: "{{ expected_stacks | default([]) }}"
      changed_when: false
      failed_when: false

    - name: Assert all services in each stack are running (and healthy if healthcheck exists)
      assert:
        that:
          - (expected | difference(running_services)) | length == 0
          - bad_health_services | length == 0
        fail_msg: >-
          Stack {{ stack.name }} service status unhealthy.
          Missing running={{ expected | difference(running_services) }}.
          Bad health={{ bad_health_services }}.
          Expected={{ expected }}
          Inspect={{ status_lines }}
      loop: "{{ (expected_stacks | default([])) | zip(stack_expected.results, stack_status.results) | list }}"
      vars:
        stack: "{{ item.0 }}"
        expected: "{{ item.1.stdout_lines | default([]) }}"
        # One inspect line per container: "<service> <status> <health|none>".
        status_lines: "{{ item.2.stdout_lines | default([]) }}"
        running_services: >-
          {{ status_lines
             | map('regex_findall', '^(\S+)\s+running\s+')
             | select('truthy')
             | map('first')
             | list }}
        ok_services: >-
          {{ status_lines
             | map('regex_findall', '^(\S+)\s+running\s+(?:healthy|none)\s*$')
             | select('truthy')
             | map('first')
             | list }}
        bad_health_services: >-
          {{ (running_services | default([])) | difference(ok_services | default([])) }}
      when: expected_stacks is defined

    - name: Ensure proxy network exists
      command: docker network inspect proxy
      register: proxy_network
      changed_when: false

    - name: Ensure monitoring network exists on services host
      command: docker network inspect monitoring
      register: monitoring_network
      changed_when: false
      when: is_services_host

    # Readiness endpoints can lag container start; retry like the TLS and
    # Loki checks below instead of failing on the first probe.
    - name: Check Prometheus readiness on services host
      command: docker compose exec -T prometheus wget -qO- http://127.0.0.1:9090/-/ready
      args:
        chdir: /opt/prometheus
      register: prometheus_ready
      changed_when: false
      retries: 30
      delay: 2
      until: prometheus_ready.rc == 0
      when: is_services_host

    - name: Fail if Prometheus is not ready
      assert:
        that:
          - prometheus_ready.stdout | default('') in ['Prometheus is Ready.', 'Prometheus Server is Ready.']
        fail_msg: "Prometheus readiness check failed. Output={{ prometheus_ready.stdout | default('') }}"
      when: is_services_host

    - name: Check Alertmanager readiness (from Prometheus container)
      command: docker compose exec -T prometheus wget -qO- http://alertmanager:9093/-/ready
      args:
        chdir: /opt/prometheus
      register: alertmanager_ready
      changed_when: false
      retries: 30
      delay: 2
      until: alertmanager_ready.rc == 0
      when: is_services_host

    - name: Fail if Alertmanager is not ready
      assert:
        that:
          - alertmanager_ready.stdout | default('') == 'Ready'
        fail_msg: "Alertmanager readiness check failed. Output={{ alertmanager_ready.stdout | default('') }}"
      when: is_services_host

    - name: Check Grafana health on services host
      command: docker compose exec -T grafana wget -qO- http://127.0.0.1:3000/api/health
      args:
        chdir: /opt/grafana
      register: grafana_health
      changed_when: false
      failed_when: false
      when: is_services_host

    - name: Fail if Grafana health endpoint is not reachable
      assert:
        that:
          # default(1): 'rc' is absent when the module fails before exec
          # (e.g. chdir missing); treat that as a failed check, not a
          # template error.
          - grafana_health.rc | default(1) == 0
        fail_msg: "Grafana health endpoint check failed (inside container). rc={{ grafana_health.rc | default('') }} output={{ grafana_health.stdout | default('') }}"
      when: is_services_host

    - name: Check Loki readiness on services host
      uri:
        url: http://127.0.0.1:3100/ready
        method: GET
        # Loki answers 503 while still warming up; keep polling until 200.
        status_code: [200, 503]
      register: loki_ready
      until: loki_ready.status == 200
      retries: 30
      delay: 2
      changed_when: false
      when: is_services_host

    # grep -Fq needs no shell features; the command module avoids the
    # /bin/sh pipefail issue entirely.
    - name: Check Traefik dynamic config contains Grafana router rule
      command: grep -Fq 'Host(`{{ grafana_hostname }}`)' /opt/traefik/dynamic/base.yml
      register: grafana_router_rule
      changed_when: false
      failed_when: false
      when: is_services_host

    - name: Fail if Grafana Traefik router rule is not configured as expected
      assert:
        that:
          - grafana_router_rule.rc | default(1) == 0
        fail_msg: "Grafana Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ grafana_hostname }}`)"
      when: is_services_host

    - name: Check Traefik dynamic config contains Forgejo router rule
      command: grep -Fq 'Host(`{{ forgejo_hostname }}`)' /opt/traefik/dynamic/base.yml
      register: forgejo_router_rule
      changed_when: false
      failed_when: false
      when: is_services_host

    - name: Fail if Forgejo Traefik router rule is not configured as expected
      assert:
        that:
          - forgejo_router_rule.rc | default(1) == 0
        fail_msg: "Forgejo Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ forgejo_hostname }}`)"
      when: is_services_host

    - name: Check Traefik dynamic config contains Authelia router rule
      command: grep -Fq 'Host(`{{ auth_hostname }}`)' /opt/traefik/dynamic/base.yml
      register: authelia_router_rule
      changed_when: false
      failed_when: false
      when: is_services_host

    - name: Fail if Authelia Traefik router rule is not configured as expected
      assert:
        that:
          - authelia_router_rule.rc | default(1) == 0
        fail_msg: "Authelia Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ auth_hostname }}`)"
      when: is_services_host

    # Origin TLS checks: retry while Traefik obtains/loads certificates.
    - name: Check Traefik serves a valid TLS certificate for Grafana hostname (origin)
      shell: |
        set -euo pipefail
        echo | openssl s_client -servername "{{ grafana_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)"
      args:
        executable: /bin/bash
      register: grafana_origin_tls
      changed_when: false
      retries: 30
      delay: 2
      until: grafana_origin_tls.rc == 0
      when: is_services_host

    - name: Check Traefik serves a valid TLS certificate for Forgejo hostname (origin)
      shell: |
        set -euo pipefail
        echo | openssl s_client -servername "{{ forgejo_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)"
      args:
        executable: /bin/bash
      register: forgejo_origin_tls
      changed_when: false
      retries: 30
      delay: 2
      until: forgejo_origin_tls.rc == 0
      when: is_services_host

    - name: Check Traefik serves a valid TLS certificate for Authelia hostname (origin)
      shell: |
        set -euo pipefail
        echo | openssl s_client -servername "{{ auth_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)"
      args:
        executable: /bin/bash
      register: authelia_origin_tls
      changed_when: false
      retries: 30
      delay: 2
      until: authelia_origin_tls.rc == 0
      when: is_services_host

    - name: Check Authelia OIDC discovery issuer (origin)
      # Pipeline (curl | python3) relies on pipefail, hence bash.
      shell: |
        set -euo pipefail
        curl -k -sS --resolve "{{ auth_hostname }}:443:127.0.0.1" "https://{{ auth_hostname }}/.well-known/openid-configuration" \
          | python3 -c 'import json,sys; print(json.load(sys.stdin).get("issuer",""))'
      args:
        executable: /bin/bash
      register: authelia_oidc_issuer
      changed_when: false
      retries: 30
      delay: 2
      until: authelia_oidc_issuer.stdout | default('') | length > 0
      when: is_services_host

    - name: Fail if Authelia OIDC discovery issuer is not configured as expected
      assert:
        that:
          - authelia_oidc_issuer.stdout == ("https://" ~ auth_hostname)
        fail_msg: "Authelia OIDC issuer mismatch. expected=https://{{ auth_hostname }} got={{ authelia_oidc_issuer.stdout | default('') }}"
      when: is_services_host

    - name: Check LLDAP web UI is reachable on services host
      uri:
        url: http://127.0.0.1:17170/
        method: GET
        status_code: [200, 302]
      register: lldap_web
      changed_when: false
      when: is_services_host

    - name: Check infra-backup systemd timer is enabled on services host
      command: systemctl is-enabled infra-backup.timer
      register: infra_backup_timer_enabled
      changed_when: false
      failed_when: false
      when: is_services_host

    - name: Fail if infra-backup systemd timer is not enabled on services host
      assert:
        that:
          - infra_backup_timer_enabled.stdout | default('') in ['enabled', 'enabled-runtime']
        fail_msg: "infra-backup.timer is not enabled (got={{ infra_backup_timer_enabled.stdout | default('') }})"
      when: is_services_host

    - name: Check infra-backup systemd timer is active on services host
      command: systemctl is-active infra-backup.timer
      register: infra_backup_timer_active
      changed_when: false
      failed_when: false
      when: is_services_host

    - name: Fail if infra-backup systemd timer is not active on services host
      assert:
        that:
          - infra_backup_timer_active.stdout | default('') == 'active'
        fail_msg: "infra-backup.timer is not active (got={{ infra_backup_timer_active.stdout | default('') }})"
      when: is_services_host

    - name: Check restic can list snapshots (using /etc/infra-backup.env)
      # no_log: the sourced env file contains repository credentials.
      shell: |
        set -euo pipefail
        set -a
        . /etc/infra-backup.env
        set +a
        restic snapshots
      args:
        executable: /bin/bash
      register: restic_snapshots
      changed_when: false
      no_log: true
      when: is_services_host

    - name: Check Forgejo dump command works (reduced export)
      shell: |
        set -euo pipefail
        docker exec --user 1000:1000 forgejo-forgejo-1 forgejo dump --file - --type zip --skip-log --skip-repository --skip-lfs-data --skip-attachment-data --skip-package-data >/dev/null
      args:
        executable: /bin/bash
      register: forgejo_dump_smoke
      changed_when: false
      when: is_services_host

    # env lookups always run on the controller, so the object-storage smoke
    # test reflects the controller's environment, not the managed host's.
    - name: Read object storage configuration from controller environment
      set_fact:
        s3_bucket: "{{ lookup('env', 'S3_BUCKET') | default('', true) }}"
        s3_region: "{{ lookup('env', 'S3_REGION') | default(lookup('env', 'TF_VAR_object_storage_region'), true) | default('us-east-1', true) }}"

    - name: Compute object storage endpoint from controller environment
      set_fact:
        s3_endpoint: "{{ lookup('env', 'S3_ENDPOINT') | default('https://' ~ s3_region ~ '.linodeobjects.com', true) }}"

    - name: Smoke test Linode Object Storage credentials (head-bucket)
      command: >-
        docker run --rm
        -e AWS_ACCESS_KEY_ID
        -e AWS_SECRET_ACCESS_KEY
        -e AWS_DEFAULT_REGION
        -e AWS_EC2_METADATA_DISABLED=true
        amazon/aws-cli:2.15.57
        s3api head-bucket --bucket {{ s3_bucket | quote }} --endpoint-url {{ s3_endpoint | quote }}
      environment:
        AWS_ACCESS_KEY_ID: "{{ S3_ACCESS_KEY_ID | default('') }}"
        AWS_SECRET_ACCESS_KEY: "{{ S3_SECRET_ACCESS_KEY | default('') }}"
        AWS_DEFAULT_REGION: "{{ s3_region }}"
      register: s3_head_bucket
      changed_when: false
      no_log: true
      when:
        - (s3_bucket | default('') | length) > 0
        - (S3_ACCESS_KEY_ID | default('') | length) > 0
        - (S3_SECRET_ACCESS_KEY | default('') | length) > 0

    - name: Fail if object storage smoke test failed
      assert:
        that:
          - s3_head_bucket.rc == 0
        fail_msg: "Object storage smoke test failed (head-bucket). Check S3_BUCKET/S3_REGION/S3_ENDPOINT and S3_ACCESS_KEY_ID/S3_SECRET_ACCESS_KEY in vault."
      when:
        - (s3_bucket | default('') | length) > 0
        - (S3_ACCESS_KEY_ID | default('') | length) > 0
        - (S3_SECRET_ACCESS_KEY | default('') | length) > 0
- name: Check Loki is reachable from web host (allowlist)
uri:
url: "http://{{ hostvars['services'].ansible_host }}:3100/ready"
method: GET
status_code: 200
register: loki_from_web_ready
when: is_web_host