Add new playbooks and update inventory configuration

- Add deploy-app.yml playbook for application-specific deployments
- Add web.yml playbook for web infrastructure management
- Restructure tests/test_config.yml for better organization
- Update inventory/group_vars/all.yml with new hostnames and settings
Jeremie Fraeys 2026-02-21 18:31:20 -05:00
parent b9c5cdff12
commit 4842c71cae
No known key found for this signature in database
4 changed files with 684 additions and 1 deletion

inventory/group_vars/all.yml (changed)

@@ -2,10 +2,19 @@ traefik_acme_email: "admin@jfraeys.com"
 traefik_certresolver: "cloudflare"
 ansible_port: "{{ lookup('env', 'TF_VAR_ssh_port') | default(22, true) }}"
-ansible_ssh_private_key_file: "{{ lookup('env', 'ANSIBLE_PRIVATE_KEY_FILE') | default('~/.ssh/id_ed25519', true) }}"
+ansible_ssh_private_key_file: "{{ lookup('env', 'ANSIBLE_PRIVATE_KEY_FILE') | default(lookup('env', 'HOME') ~ '/.ssh/id_ed25519', true) }}"
 grafana_hostname: "grafana.jfraeys.com"
 forgejo_hostname: "git.jfraeys.com"
+prometheus_hostname: "prometheus.jfraeys.com"
+app_hostname: "app.jfraeys.com"
+web_apps_scheme: "http"
+web_apps_port: 80
 auth_hostname: "auth.jfraeys.com"
 lldap_base_dn: "dc=jfraeys,dc=com"
+# App deployment versioning - overridden at deploy time via --extra-vars
+app_version: "latest"
+app_name: ""

playbooks/deploy-app.yml (new file, 158 lines)

@@ -0,0 +1,158 @@
---
# Generic app deployment playbook
# Deploys any app without requiring per-app playbooks
# Required extra-vars: app_name, app_version
# Optional extra-vars: env (default: prod), app_port, app_env_vars
# Run on the web host (which acts as deployment server)
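#
# Example invocation (app name/version here are illustrative):
#   ansible-playbook playbooks/deploy-app.yml \
#     --extra-vars 'app_name=myapp app_version=abc123 env=prod'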
- hosts: localhost
become: true
vars:
env: "{{ env | default('prod') }}"
app_dir: "/opt/apps/{{ app_name }}"
app_binary: "{{ app_dir }}/app"
systemd_service: "{{ app_name }}"
artifacts_dir: "/opt/artifacts"
max_artifacts_to_keep: 5 # Keep last 5 versions for rollback
pre_tasks:
- name: Load vault vars if present
include_vars:
file: "{{ playbook_dir }}/../secrets/vault.yml"
when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0
tags: [vault]
- name: Validate required variables
assert:
that:
- app_name | length > 0
- app_version | length > 0
fail_msg: "app_name and app_version are required. Use --extra-vars 'app_name=myapp app_version=abc123'"
- name: Ensure app artifact exists on deployment server
stat:
path: "{{ artifacts_dir }}/{{ app_name }}-{{ app_version }}"
register: app_artifact
delegate_to: localhost
become: false
- name: Check for artifact checksum file
stat:
path: "{{ artifacts_dir }}/{{ app_name }}-{{ app_version }}.sha256"
register: artifact_checksum_file
delegate_to: localhost
become: false
- name: Verify artifact checksum (if available)
shell: |
cd {{ artifacts_dir }} && sha256sum -c {{ app_name }}-{{ app_version }}.sha256
register: checksum_result
changed_when: false
failed_when: checksum_result.rc != 0
when: artifact_checksum_file.stat.exists
delegate_to: localhost
become: false
vars:
ansible_ssh_pipelining: false
- name: Fail if artifact checksum verification fails
fail:
msg: "Artifact checksum verification failed for {{ app_name }}-{{ app_version }}. Possible tampering or corruption."
when:
- artifact_checksum_file.stat.exists
- checksum_result.rc != 0
delegate_to: localhost
become: false
- name: Warn if no checksum available
debug:
msg: "WARNING: No checksum file found for {{ app_name }}-{{ app_version }}. Skipping integrity verification."
when: not artifact_checksum_file.stat.exists
delegate_to: localhost
become: false
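# Assumption: the .sha256 files are produced at build time next to each
# artifact, e.g. `cd /opt/artifacts && sha256sum myapp-abc123 > myapp-abc123.sha256`
# (names here are illustrative).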
tasks:
- name: Ensure deploy user exists
user:
name: "{{ app_name }}"
system: true
create_home: false
shell: /bin/false
state: present
- name: Create app directory
file:
path: "{{ app_dir }}"
state: directory
owner: "{{ app_name }}"
group: "{{ app_name }}"
mode: '0755'
- name: Copy app artifact to target host
copy:
src: "{{ artifacts_dir }}/{{ app_name }}-{{ app_version }}"
dest: "{{ app_binary }}-{{ app_version }}"
owner: "{{ app_name }}"
group: "{{ app_name }}"
mode: '0755'
- name: Create/update current symlink for rollback capability
file:
src: "{{ app_binary }}-{{ app_version }}"
dest: "{{ app_binary }}"
state: link
force: yes
owner: "{{ app_name }}"
group: "{{ app_name }}"
- name: Record deployed version for rollback tracking
copy:
content: "{{ app_version }}"
dest: "{{ app_dir }}/.current-version"
owner: "{{ app_name }}"
group: "{{ app_name }}"
mode: '0644'
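# Rollback sketch: because the symlink and .current-version track the live
# build, re-running this playbook with a previously deployed app_version
# flips the app back to that binary (assuming its artifact is still kept).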
- name: Cleanup old artifacts (keep last {{ max_artifacts_to_keep }})
shell: |
ls -t {{ artifacts_dir }}/{{ app_name }}-* 2>/dev/null | grep -v '\.sha256$' | tail -n +{{ max_artifacts_to_keep + 1 }} | xargs -r rm -f
ls -t {{ artifacts_dir }}/{{ app_name }}-*.sha256 2>/dev/null | tail -n +{{ max_artifacts_to_keep + 1 }} | xargs -r rm -f
changed_when: true
failed_when: false
delegate_to: localhost
become: false
when: app_artifact.stat.exists
- name: Write environment file if app_env_vars provided
copy:
dest: "{{ app_dir }}/.env"
owner: "{{ app_name }}"
group: "{{ app_name }}"
mode: '0600'
content: "{% for key, value in app_env_vars.items() %}{{ key }}={{ value }}\n{% endfor %}"
when: app_env_vars is defined
notify: restart app
- name: Remove environment file if not provided
file:
path: "{{ app_dir }}/.env"
state: absent
when: app_env_vars is not defined
- name: Write systemd service for app
template:
src: "/opt/deploy/templates/app.service.j2"
dest: "/etc/systemd/system/{{ systemd_service }}.service"
notify: restart app
- name: Enable and start app service
systemd:
name: "{{ systemd_service }}"
enabled: true
state: started
daemon_reload: true
handlers:
- name: restart app
systemd:
name: "{{ systemd_service }}"
state: restarted
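
A rough sketch of the app.service.j2 unit template this playbook renders from /opt/deploy/templates/app.service.j2 (the template itself is not part of this commit; field choices below are assumptions based on the variables used above):

[Unit]
Description={{ app_name }} service
After=network-online.target
Wants=network-online.target

[Service]
User={{ app_name }}
Group={{ app_name }}
WorkingDirectory={{ app_dir }}
# Leading "-" tells systemd to ignore a missing .env file
EnvironmentFile=-{{ app_dir }}/.env
ExecStart={{ app_binary }}
Restart=on-failure

[Install]
WantedBy=multi-user.target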

tests/test_config.yml (new file, 498 lines)

@@ -0,0 +1,498 @@
---
- name: Test Deployment Configuration
hosts: all
become: true
tasks:
- name: Load vault vars if present
include_vars:
file: "{{ playbook_dir }}/../secrets/vault.yml"
no_log: true
when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0
- name: Check SSH service status
command: systemctl is-active sshd
register: ssh_status
changed_when: false
- debug:
msg: "SSH service is {{ ssh_status.stdout | default('') }}"
- name: Check SSH Port Configuration
command: sshd -T
register: ssh_port
changed_when: false
failed_when: false
- debug:
msg: "SSH port configured as {{ (ssh_port.stdout | default('') | regex_search('(?m)^port\\s+([0-9]+)$', '\\1')) | default('Unknown') }}"
- name: Check Docker version
command: docker --version
register: docker_version
changed_when: false
- debug:
msg: "Docker Version: {{ docker_version.stdout }}"
- name: Check Docker Compose version (hyphen)
command: docker-compose --version
register: docker_compose_version_hyphen
failed_when: false
changed_when: false
- name: Check Docker Compose version (docker compose)
command: docker compose version
register: docker_compose_version_space
failed_when: false
changed_when: false
- name: Display Docker Compose version
debug:
msg: >
{% if docker_compose_version_hyphen.stdout | default('') %}
Docker Compose version (docker-compose): {{ docker_compose_version_hyphen.stdout }}
{% elif docker_compose_version_space.stdout | default('') %}
Docker Compose version (docker compose): {{ docker_compose_version_space.stdout }}
{% else %}
Docker Compose not found
{% endif %}
- name: Check Ansible version
command: ansible --version
register: ansible_version
changed_when: false
failed_when: false
- debug:
msg: "Ansible Version: {{ (ansible_version.stdout | default('')) .split('\n')[0] if (ansible_version.stdout | default('') | length) > 0 else 'Not installed' }}"
- name: Check UFW status
command: ufw status verbose
register: ufw_status
changed_when: false
- debug:
msg: "UFW Status: {{ ufw_status.stdout }}"
- name: Check Fail2ban service status
command: systemctl is-active fail2ban
register: fail2ban_status
changed_when: false
failed_when: false
- debug:
msg: "Fail2ban is {{ fail2ban_status.stdout }}"
- name: Display logrotate custom config
command: cat /etc/logrotate.d/custom
register: logrotate_config
changed_when: false
failed_when: false
- debug:
msg: "Logrotate custom config:\n{{ logrotate_config.stdout | default('No custom logrotate config found') }}"
- name: Check running Docker containers
command: docker ps
register: docker_ps
changed_when: false
- debug:
msg: "Docker containers:\n{{ docker_ps.stdout }}"
- name: Determine host role
set_fact:
is_services_host: "{{ 'services_hosts' in group_names }}"
is_web_host: "{{ 'web_hosts' in group_names }}"
- name: Define expected stacks for services host
set_fact:
expected_stacks:
- { name: traefik, dir: /opt/traefik }
- { name: lldap, dir: /opt/lldap }
- { name: authelia, dir: /opt/authelia }
- { name: exporters, dir: /opt/exporters }
- { name: alertmanager, dir: /opt/alertmanager }
- { name: prometheus, dir: /opt/prometheus }
- { name: loki, dir: /opt/loki }
- { name: grafana, dir: /opt/grafana }
- { name: forgejo, dir: /opt/forgejo }
- { name: watchtower, dir: /opt/watchtower }
when: is_services_host
- name: Define expected stacks for web host
set_fact:
expected_stacks:
- { name: traefik, dir: /opt/traefik }
- { name: app_core, dir: /opt/app }
- { name: forgejo_runner, dir: /opt/forgejo-runner }
when: is_web_host
- name: Check minimal infra-controller directories exist on services host
stat:
path: "{{ item }}"
register: infra_dirs
loop:
- /var/run/active-apps
- /var/lib/infra-controller
changed_when: false
when: is_services_host
- name: Fail if any minimal infra-controller directory is missing on services host
assert:
that:
- item.stat.exists
- item.stat.isdir
fail_msg: "Missing required directory on services host: {{ item.item }}. This typically means the services playbook has not been applied yet. Run ./setup.sh (or ansible-playbook playbooks/services.yml) and re-run this test."
loop: "{{ infra_dirs.results | default([]) }}"
when: is_services_host
- name: Read deployer authorized_keys on services host
slurp:
src: /home/deployer/.ssh/authorized_keys
register: deployer_authorized_keys
changed_when: false
when: is_services_host
- name: Fail if deployer authorized_keys is missing forced-command restrictions
assert:
that:
- (deployer_authorized_keys.content | b64decode) is search('command="/usr/local/sbin/infra-register-stdin"')
- (deployer_authorized_keys.content | b64decode) is search('command="/usr/local/sbin/infra-deregister"')
fail_msg: "deployer authorized_keys does not include forced-command keys for infra-register-stdin/infra-deregister"
when: is_services_host
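# A conforming authorized_keys entry looks roughly like this (key material
# elided; options beyond the forced command are assumptions):
#   command="/usr/local/sbin/infra-register-stdin",restrict ssh-ed25519 AAAA... deployer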
- name: Check that expected compose directories exist
stat:
path: "{{ item.dir }}/docker-compose.yml"
register: compose_files
loop: "{{ expected_stacks | default([]) }}"
changed_when: false
- name: Fail if any compose file is missing
assert:
that:
- item.stat.exists
fail_msg: "Missing docker-compose.yml for {{ item.item.name }} at {{ item.item.dir }}/docker-compose.yml"
loop: "{{ compose_files.results | default([]) }}"
when: expected_stacks is defined
- name: Read expected services per stack
command: docker compose config --services
args:
chdir: "{{ item.dir }}"
register: stack_expected
loop: "{{ expected_stacks | default([]) }}"
changed_when: false
- name: Read service status/health per stack (docker inspect)
shell: |
set -euo pipefail
ids=$(docker compose ps -q)
if [ -z "${ids}" ]; then
exit 0
fi
{% raw %}docker inspect --format '{{ index .Config.Labels "com.docker.compose.service" }} {{ .State.Status }} {{ if .State.Health }}{{ .State.Health.Status }}{{ else }}none{{ end }}' ${ids}{% endraw %}
args:
chdir: "{{ item.dir }}"
register: stack_status
loop: "{{ expected_stacks | default([]) }}"
changed_when: false
failed_when: false
- name: Assert all services in each stack are running (and healthy if healthcheck exists)
assert:
that:
- (expected | difference(running_services)) | length == 0
- bad_health_services | length == 0
fail_msg: >-
Stack {{ stack.name }} service status unhealthy.
Missing running={{ expected | difference(running_services) }}.
Bad health={{ bad_health_services }}.
Expected={{ expected }}
Inspect={{ status_lines }}
loop: "{{ (expected_stacks | default([])) | zip(stack_expected.results, stack_status.results) | list }}"
vars:
stack: "{{ item.0 }}"
expected: "{{ item.1.stdout_lines | default([]) }}"
status_lines: "{{ item.2.stdout_lines | default([]) }}"
running_services: >-
{{ status_lines
| map('regex_findall', '^(\S+)\s+running\s+')
| select('truthy')
| map('first')
| list }}
ok_services: >-
{{ status_lines
| map('regex_findall', '^(\S+)\s+running\s+(?:healthy|none)\s*$')
| select('truthy')
| map('first')
| list }}
bad_health_services: >-
{{ (running_services | default([])) | difference(ok_services | default([])) }}
when: expected_stacks is defined
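# The vars above parse `docker inspect` lines of the form
# "<service> <status> <health>": a service counts as running when its status
# column says running, and as healthy when the health column is "healthy" or
# "none" (i.e. no healthcheck defined).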
- name: Ensure proxy network exists
command: docker network inspect proxy
register: proxy_network
changed_when: false
- name: Ensure monitoring network exists on services host
command: docker network inspect monitoring
register: monitoring_network
changed_when: false
when: is_services_host
- name: Check Prometheus readiness on services host
command: docker compose exec -T prometheus wget -qO- http://127.0.0.1:9090/-/ready
args:
chdir: /opt/prometheus
register: prometheus_ready
changed_when: false
when: is_services_host
- name: Fail if Prometheus is not ready
assert:
that:
- prometheus_ready.stdout | default('') in ['Prometheus is Ready.', 'Prometheus Server is Ready.']
fail_msg: "Prometheus readiness check failed. Output={{ prometheus_ready.stdout | default('') }}"
when: is_services_host
- name: Check Alertmanager readiness (from Prometheus container)
command: docker compose exec -T prometheus wget -qO- http://alertmanager:9093/-/ready
args:
chdir: /opt/prometheus
register: alertmanager_ready
changed_when: false
when: is_services_host
- name: Fail if Alertmanager is not ready
assert:
that:
- alertmanager_ready.stdout | default('') == 'Ready'
fail_msg: "Alertmanager readiness check failed. Output={{ alertmanager_ready.stdout | default('') }}"
when: is_services_host
- name: Check Grafana health on services host
command: docker compose exec -T grafana wget -qO- http://127.0.0.1:3000/api/health
args:
chdir: /opt/grafana
register: grafana_health
changed_when: false
failed_when: false
when: is_services_host
- name: Fail if Grafana health endpoint is not reachable
assert:
that:
- grafana_health.rc == 0
fail_msg: "Grafana health endpoint check failed (inside container). rc={{ grafana_health.rc }} output={{ grafana_health.stdout | default('') }}"
when: is_services_host
- name: Check Loki readiness on services host
uri:
url: http://127.0.0.1:3100/ready
method: GET
status_code: [200, 503]
register: loki_ready
until: loki_ready.status == 200
retries: 30
delay: 2
changed_when: false
when: is_services_host
- name: Check Traefik dynamic config contains Grafana router rule
shell: |
set -euo pipefail
grep -Fq 'Host(`{{ grafana_hostname }}`)' /opt/traefik/dynamic/base.yml
register: grafana_router_rule
changed_when: false
failed_when: false
when: is_services_host
- name: Fail if Grafana Traefik router rule is not configured as expected
assert:
that:
- grafana_router_rule.rc == 0
fail_msg: "Grafana Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ grafana_hostname }}`)"
when: is_services_host
- name: Check Traefik dynamic config contains Forgejo router rule
shell: |
set -euo pipefail
grep -Fq 'Host(`{{ forgejo_hostname }}`)' /opt/traefik/dynamic/base.yml
register: forgejo_router_rule
changed_when: false
failed_when: false
when: is_services_host
- name: Fail if Forgejo Traefik router rule is not configured as expected
assert:
that:
- forgejo_router_rule.rc == 0
fail_msg: "Forgejo Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ forgejo_hostname }}`)"
when: is_services_host
- name: Check Traefik dynamic config contains Authelia router rule
shell: |
set -euo pipefail
grep -Fq 'Host(`{{ auth_hostname }}`)' /opt/traefik/dynamic/base.yml
register: authelia_router_rule
changed_when: false
failed_when: false
when: is_services_host
- name: Fail if Authelia Traefik router rule is not configured as expected
assert:
that:
- authelia_router_rule.rc == 0
fail_msg: "Authelia Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ auth_hostname }}`)"
when: is_services_host
- name: Check Traefik serves a valid TLS certificate for Grafana hostname (origin)
shell: |
set -euo pipefail
echo | openssl s_client -servername "{{ grafana_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)"
register: grafana_origin_tls
changed_when: false
retries: 30
delay: 2
until: grafana_origin_tls.rc == 0
when: is_services_host
- name: Check Traefik serves a valid TLS certificate for Forgejo hostname (origin)
shell: |
set -euo pipefail
echo | openssl s_client -servername "{{ forgejo_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)"
register: forgejo_origin_tls
changed_when: false
retries: 30
delay: 2
until: forgejo_origin_tls.rc == 0
when: is_services_host
- name: Check Traefik serves a valid TLS certificate for Authelia hostname (origin)
shell: |
set -euo pipefail
echo | openssl s_client -servername "{{ auth_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)"
register: authelia_origin_tls
changed_when: false
retries: 30
delay: 2
until: authelia_origin_tls.rc == 0
when: is_services_host
- name: Check Authelia OIDC discovery issuer (origin)
shell: |
set -euo pipefail
curl -k -sS --resolve "{{ auth_hostname }}:443:127.0.0.1" "https://{{ auth_hostname }}/.well-known/openid-configuration" \
| python3 -c 'import json,sys; print(json.load(sys.stdin).get("issuer",""))'
register: authelia_oidc_issuer
changed_when: false
retries: 30
delay: 2
until: authelia_oidc_issuer.stdout | default('') | length > 0
when: is_services_host
- name: Fail if Authelia OIDC discovery issuer is not configured as expected
assert:
that:
- authelia_oidc_issuer.stdout == ("https://" ~ auth_hostname)
fail_msg: "Authelia OIDC issuer mismatch. expected=https://{{ auth_hostname }} got={{ authelia_oidc_issuer.stdout | default('') }}"
when: is_services_host
- name: Check LLDAP web UI is reachable on services host
uri:
url: http://127.0.0.1:17170/
method: GET
status_code: [200, 302]
register: lldap_web
changed_when: false
when: is_services_host
- name: Check infra-backup systemd timer is enabled on services host
command: systemctl is-enabled infra-backup.timer
register: infra_backup_timer_enabled
changed_when: false
failed_when: false
when: is_services_host
- name: Fail if infra-backup systemd timer is not enabled on services host
assert:
that:
- infra_backup_timer_enabled.stdout | default('') in ['enabled', 'enabled-runtime']
fail_msg: "infra-backup.timer is not enabled (got={{ infra_backup_timer_enabled.stdout | default('') }})"
when: is_services_host
- name: Check infra-backup systemd timer is active on services host
command: systemctl is-active infra-backup.timer
register: infra_backup_timer_active
changed_when: false
failed_when: false
when: is_services_host
- name: Fail if infra-backup systemd timer is not active on services host
assert:
that:
- infra_backup_timer_active.stdout | default('') == 'active'
fail_msg: "infra-backup.timer is not active (got={{ infra_backup_timer_active.stdout | default('') }})"
when: is_services_host
- name: Check restic can list snapshots (using /etc/infra-backup.env)
shell: |
set -euo pipefail
set -a
. /etc/infra-backup.env
set +a
restic snapshots
register: restic_snapshots
changed_when: false
no_log: true
when: is_services_host
- name: Check Forgejo dump command works (reduced export)
shell: |
set -euo pipefail
docker exec --user 1000:1000 forgejo-forgejo-1 forgejo dump --file - --type zip --skip-log --skip-repository --skip-lfs-data --skip-attachment-data --skip-package-data >/dev/null
register: forgejo_dump_smoke
changed_when: false
when: is_services_host
- name: Read object storage configuration from controller environment
set_fact:
s3_bucket: "{{ lookup('env', 'S3_BUCKET') | default('', true) }}"
s3_region: "{{ lookup('env', 'S3_REGION') | default(lookup('env', 'TF_VAR_object_storage_region'), true) | default('us-east-1', true) }}"
changed_when: false
- name: Compute object storage endpoint from controller environment
set_fact:
s3_endpoint: "{{ lookup('env', 'S3_ENDPOINT') | default('https://' ~ s3_region ~ '.linodeobjects.com', true) }}"
changed_when: false
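# Example: with S3_REGION=us-east-1 and no S3_ENDPOINT override, this
# resolves to https://us-east-1.linodeobjects.com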
- name: Smoke test Linode Object Storage credentials (head-bucket)
command: >-
docker run --rm
-e AWS_ACCESS_KEY_ID
-e AWS_SECRET_ACCESS_KEY
-e AWS_DEFAULT_REGION
-e AWS_EC2_METADATA_DISABLED=true
amazon/aws-cli:2.15.57
s3api head-bucket --bucket {{ s3_bucket | quote }} --endpoint-url {{ s3_endpoint | quote }}
environment:
AWS_ACCESS_KEY_ID: "{{ S3_ACCESS_KEY_ID | default('') }}"
AWS_SECRET_ACCESS_KEY: "{{ S3_SECRET_ACCESS_KEY | default('') }}"
AWS_DEFAULT_REGION: "{{ s3_region }}"
register: s3_head_bucket
changed_when: false
no_log: true
when:
- (s3_bucket | default('') | length) > 0
- (S3_ACCESS_KEY_ID | default('') | length) > 0
- (S3_SECRET_ACCESS_KEY | default('') | length) > 0
- name: Fail if object storage smoke test failed
assert:
that:
- s3_head_bucket.rc == 0
fail_msg: "Object storage smoke test failed (head-bucket). Check S3_BUCKET/S3_REGION/S3_ENDPOINT and S3_ACCESS_KEY_ID/S3_SECRET_ACCESS_KEY in vault."
when:
- (s3_bucket | default('') | length) > 0
- (S3_ACCESS_KEY_ID | default('') | length) > 0
- (S3_SECRET_ACCESS_KEY | default('') | length) > 0
- name: Check Loki is reachable from web host (allowlist)
uri:
url: "http://{{ hostvars['services'].ansible_host }}:3100/ready"
method: GET
status_code: 200
register: loki_from_web_ready
when: is_web_host
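
To run these checks against all hosts (the inventory path shown is an assumption):

ansible-playbook -i inventory/hosts.ini tests/test_config.yml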

playbooks/web.yml (new file, 18 lines)

@@ -0,0 +1,18 @@
---
- hosts: web_hosts
become: true
pre_tasks:
- name: Load vault vars if present
include_vars:
file: "{{ playbook_dir }}/../secrets/vault.yml"
when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0
tags: [vault]
roles:
- role: docker
tags: [docker]
- role: app_deployer
tags: [app_deployer]
- role: app_core
tags: [app_core]
- role: forgejo_runner
tags: [forgejo_runner]
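
Example runs for the new web playbook (inventory path is an assumption; tags
match the roles above):

ansible-playbook -i inventory/hosts.ini playbooks/web.yml
ansible-playbook -i inventory/hosts.ini playbooks/web.yml --tags app_core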