diff --git a/playbooks/app.yml b/playbooks/app.yml deleted file mode 100644 index fba96c1..0000000 --- a/playbooks/app.yml +++ /dev/null @@ -1,18 +0,0 @@ ---- -- hosts: web_hosts -   become: true -   pre_tasks: -     - name: Load vault vars if present -       include_vars: -         file: "{{ playbook_dir }}/../secrets/vault.yml" -       when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0 -       tags: always -   roles: -     - role: docker -       tags: [docker] -     - role: traefik -       tags: [traefik] -     - role: app_core -       tags: [app_core] -     - role: forgejo_runner -       tags: [forgejo_runner] diff --git a/playbooks/deploy.yml b/playbooks/deploy.yml index 97806fb..d293dec 100644 --- a/playbooks/deploy.yml +++ b/playbooks/deploy.yml @@ -11,6 +11,7 @@ roles: - docker + - fail2ban - traefik - lldap - authelia diff --git a/playbooks/services.yml b/playbooks/services.yml index 4b3252f..8c2188b 100644 --- a/playbooks/services.yml +++ b/playbooks/services.yml @@ -18,6 +18,8 @@ roles: - role: docker tags: [docker] + - role: firewall + tags: [firewall] - role: traefik tags: [traefik] - role: app_ssh_access @@ -30,6 +32,8 @@ tags: [authelia] - role: exporters tags: [exporters] + - role: alertmanager + tags: [alertmanager] - role: prometheus tags: [prometheus] - role: loki @@ -40,6 +44,8 @@ tags: [forgejo] - role: watchtower tags: [watchtower] + - role: backups + tags: [backups] post_tasks: - name: Read Grafana Traefik router rule label @@ -137,3 +143,23 @@ delay: 5 until: authelia_origin_tls.rc == 0 tags: [authelia] + + - name: Trigger Traefik certificate request for Prometheus hostname + command: curl -k -s -o /dev/null -w "%{http_code}" --resolve "{{ prometheus_hostname 
}}:443:127.0.0.1" "https://{{ prometheus_hostname }}/" + register: prometheus_tls_warmup + changed_when: false + retries: 30 + delay: 2 + until: prometheus_tls_warmup.stdout != '000' + tags: [prometheus] + + - name: Wait for Traefik certificate SAN to include Prometheus hostname + shell: | + set -euo pipefail + echo | openssl s_client -servername "{{ prometheus_hostname }}" -connect 127.0.0.1:443 2>/dev/null | openssl x509 -noout -text | grep -q "DNS:{{ prometheus_hostname }}" + register: prometheus_origin_tls + changed_when: false + retries: 90 + delay: 5 + until: prometheus_origin_tls.rc == 0 + tags: [prometheus] diff --git a/playbooks/test_config.yml b/playbooks/test_config.yml deleted file mode 100644 index 1aa73e8..0000000 --- a/playbooks/test_config.yml +++ /dev/null @@ -1,434 +0,0 @@ ---- -- name: Test Deployment Configuration - hosts: all - become: true - tasks: - - name: Load vault vars if present - include_vars: - file: "{{ playbook_dir }}/../secrets/vault.yml" - no_log: true - when: (lookup('ansible.builtin.fileglob', playbook_dir ~ '/../secrets/vault.yml', wantlist=True) | length) > 0 - - - name: Check SSH service status - command: systemctl is-active sshd - register: ssh_status - changed_when: false - - debug: - msg: "SSH service is {{ ssh_status.stdout | default('') }}" - - name: Check SSH Port Configuration - command: sshd -T - register: ssh_port - changed_when: false - failed_when: false - - debug: - msg: "SSH port configured as {{ (ssh_port.stdout | default('') | regex_search('(?m)^port\\s+([0-9]+)$', '\\1')) | default('Unknown') }}" - - name: Check Docker version - command: docker --version - register: docker_version - changed_when: false - - debug: - msg: "Docker Version: {{ docker_version.stdout }}" - - name: Check Docker Compose version (hyphen) - command: docker-compose --version - register: docker_compose_version_hyphen - failed_when: false - changed_when: false - - name: Check Docker Compose version (docker compose) - command: docker compose 
version - register: docker_compose_version_space - failed_when: false - changed_when: false - - name: Display Docker Compose version - debug: - msg: > - {% if docker_compose_version_hyphen.stdout %} - - - Docker Compose version (docker-compose): {{ docker_compose_version_hyphen.stdout }} - {% elif docker_compose_version_space.stdout %} - - - Docker Compose version (docker compose): {{ docker_compose_version_space.stdout }} - {% else %} - - - Docker Compose not found - {% endif %} - - - name: Check Ansible version - command: ansible --version - register: ansible_version - changed_when: false - failed_when: false - - debug: - msg: "Ansible Version: {{ (ansible_version.stdout | default('')) .split('\n')[0] if (ansible_version.stdout | default('') | length) > 0 else 'Not installed' }}" - - name: Check UFW status - command: ufw status verbose - register: ufw_status - changed_when: false - - debug: - msg: "UFW Status: {{ ufw_status.stdout }}" - - name: Check Fail2ban service status - command: systemctl is-active fail2ban - register: fail2ban_status - changed_when: false - failed_when: false - - debug: - msg: "Fail2ban is {{ fail2ban_status.stdout }}" - - name: Display logrotate custom config - command: cat /etc/logrotate.d/custom - register: logrotate_config - changed_when: false - failed_when: false - - debug: - msg: "Logrotate custom config:\n{{ logrotate_config.stdout | default('No custom logrotate config found') }}" - - name: Check running Docker containers - command: docker ps - register: docker_ps - changed_when: false - - debug: - msg: "Docker containers:\n{{ docker_ps.stdout }}" - - - name: Determine host role - set_fact: - is_services_host: "{{ 'services_hosts' in group_names }}" - is_web_host: "{{ 'web_hosts' in group_names }}" - - - name: Define expected stacks for services host - set_fact: - expected_stacks: - - { name: traefik, dir: /opt/traefik } - - { name: lldap, dir: /opt/lldap } - - { name: authelia, dir: /opt/authelia } - - { name: exporters, dir: 
/opt/exporters } - - { name: prometheus, dir: /opt/prometheus } - - { name: loki, dir: /opt/loki } - - { name: grafana, dir: /opt/grafana } - - { name: forgejo, dir: /opt/forgejo } - - { name: watchtower, dir: /opt/watchtower } - when: is_services_host - - - name: Define expected stacks for web host - set_fact: - expected_stacks: - - { name: traefik, dir: /opt/traefik } - - { name: app_core, dir: /opt/app } - - { name: forgejo_runner, dir: /opt/forgejo-runner } - when: is_web_host - - - name: Check minimal infra-controller directories exist on services host - stat: - path: "{{ item }}" - register: infra_dirs - loop: - - /var/run/active-apps - - /var/lib/infra-controller - changed_when: false - when: is_services_host - - - name: Fail if any minimal infra-controller directory is missing on services host - assert: - that: - - item.stat.exists - - item.stat.isdir - fail_msg: "Missing required directory on services host: {{ item.item }}. This typically means the services playbook has not been applied yet. Run ./setup.sh (or ansible-playbook playbooks/services.yml) and re-run this test." 
- loop: "{{ infra_dirs.results | default([]) }}" - when: is_services_host - - - name: Read deployer authorized_keys on services host - slurp: - src: /home/deployer/.ssh/authorized_keys - register: deployer_authorized_keys - changed_when: false - when: is_services_host - - - name: Fail if deployer authorized_keys is missing forced-command restrictions - assert: - that: - - (deployer_authorized_keys.content | b64decode) is search('command="/usr/local/sbin/infra-register-stdin"') - - (deployer_authorized_keys.content | b64decode) is search('command="/usr/local/sbin/infra-deregister"') - fail_msg: "deployer authorized_keys does not include forced-command keys for infra-register-stdin/infra-deregister" - when: is_services_host - - - name: Check that expected compose directories exist - stat: - path: "{{ item.dir }}/docker-compose.yml" - register: compose_files - loop: "{{ expected_stacks | default([]) }}" - changed_when: false - - - name: Fail if any compose file is missing - assert: - that: - - item.stat.exists - fail_msg: "Missing docker-compose.yml for {{ item.item.name }} at {{ item.item.dir }}/docker-compose.yml" - loop: "{{ compose_files.results | default([]) }}" - when: expected_stacks is defined - - - name: Read expected services per stack - command: docker compose config --services - args: - chdir: "{{ item.dir }}" - register: stack_expected - loop: "{{ expected_stacks | default([]) }}" - changed_when: false - - - name: Read service status/health per stack (docker inspect) - shell: | - set -euo pipefail - ids=$(docker compose ps -q) - if [ -z "${ids}" ]; then - exit 0 - fi - {% raw %}docker inspect --format '{{ index .Config.Labels "com.docker.compose.service" }} {{ .State.Status }} {{ if .State.Health }}{{ .State.Health.Status }}{{ else }}none{{ end }}' ${ids}{% endraw %} - args: - chdir: "{{ item.dir }}" - register: stack_status - loop: "{{ expected_stacks | default([]) }}" - changed_when: false - failed_when: false - - - name: Assert all services in each 
stack are running (and healthy if healthcheck exists) - assert: - that: - - (expected | difference(running_services)) | length == 0 - - bad_health_services | length == 0 - fail_msg: >- - Stack {{ stack.name }} service status unhealthy. - Missing running={{ expected | difference(running_services) }}. - Bad health={{ bad_health_services }}. - Expected={{ expected }} - Inspect={{ status_lines }} - loop: "{{ (expected_stacks | default([])) | zip(stack_expected.results, stack_status.results) | list }}" - vars: - stack: "{{ item.0 }}" - expected: "{{ item.1.stdout_lines | default([]) }}" - status_lines: "{{ item.2.stdout_lines | default([]) }}" - running_services: >- - {{ status_lines - | map('regex_findall', '^(\S+)\s+running\s+') - | select('truthy') - | map('first') - | list }} - ok_services: >- - {{ status_lines - | map('regex_findall', '^(\S+)\s+running\s+(?:healthy|none)\s*$') - | select('truthy') - | map('first') - | list }} - bad_health_services: >- - {{ (running_services | default([])) | difference(ok_services | default([])) }} - when: expected_stacks is defined - - - name: Ensure proxy network exists - command: docker network inspect proxy - register: proxy_network - changed_when: false - - - name: Ensure monitoring network exists on services host - command: docker network inspect monitoring - register: monitoring_network - changed_when: false - when: is_services_host - - - name: Check Prometheus readiness on services host - command: docker compose exec -T prometheus wget -qO- http://127.0.0.1:9090/-/ready - args: - chdir: /opt/prometheus - register: prometheus_ready - changed_when: false - when: is_services_host - - - name: Fail if Prometheus is not ready - assert: - that: - - prometheus_ready.stdout | default('') in ['Prometheus is Ready.', 'Prometheus Server is Ready.'] - fail_msg: "Prometheus readiness check failed. 
Output={{ prometheus_ready.stdout | default('') }}" - when: is_services_host - - - name: Check Grafana health on services host - command: docker compose exec -T grafana wget -qO- http://127.0.0.1:3000/api/health - args: - chdir: /opt/grafana - register: grafana_health - changed_when: false - failed_when: false - when: is_services_host - - - name: Fail if Grafana health endpoint is not reachable - assert: - that: - - grafana_health.rc == 0 - fail_msg: "Grafana health endpoint check failed (inside container). rc={{ grafana_health.rc }} output={{ grafana_health.stdout | default('') }}" - when: is_services_host - - - name: Check Loki readiness on services host - uri: - url: http://127.0.0.1:3100/ready - method: GET - status_code: [200, 503] - register: loki_ready - until: loki_ready.status == 200 - retries: 30 - delay: 2 - changed_when: false - when: is_services_host - - - name: Check Traefik dynamic config contains Grafana router rule - shell: | - set -euo pipefail - grep -Fq 'Host(`{{ grafana_hostname }}`)' /opt/traefik/dynamic/base.yml - register: grafana_router_rule - changed_when: false - failed_when: false - when: is_services_host - - - name: Fail if Grafana Traefik router rule is not configured as expected - assert: - that: - - grafana_router_rule.rc == 0 - fail_msg: "Grafana Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ grafana_hostname }}`)" - when: is_services_host - - - name: Check Traefik dynamic config contains Forgejo router rule - shell: | - set -euo pipefail - grep -Fq 'Host(`{{ forgejo_hostname }}`)' /opt/traefik/dynamic/base.yml - register: forgejo_router_rule - changed_when: false - failed_when: false - when: is_services_host - - - name: Fail if Forgejo Traefik router rule is not configured as expected - assert: - that: - - forgejo_router_rule.rc == 0 - fail_msg: "Forgejo Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. 
expected=Host(`{{ forgejo_hostname }}`)" - when: is_services_host - - - name: Check Traefik dynamic config contains Authelia router rule - shell: | - set -euo pipefail - grep -Fq 'Host(`{{ auth_hostname }}`)' /opt/traefik/dynamic/base.yml - register: authelia_router_rule - changed_when: false - failed_when: false - when: is_services_host - - - name: Fail if Authelia Traefik router rule is not configured as expected - assert: - that: - - authelia_router_rule.rc == 0 - fail_msg: "Authelia Traefik router rule mismatch in /opt/traefik/dynamic/base.yml. expected=Host(`{{ auth_hostname }}`)" - when: is_services_host - - - name: Check Traefik serves a valid TLS certificate for Grafana hostname (origin) - shell: | - set -euo pipefail - echo | openssl s_client -servername "{{ grafana_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)" - register: grafana_origin_tls - changed_when: false - retries: 30 - delay: 2 - until: grafana_origin_tls.rc == 0 - when: is_services_host - - - name: Check Traefik serves a valid TLS certificate for Forgejo hostname (origin) - shell: | - set -euo pipefail - echo | openssl s_client -servername "{{ forgejo_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)" - register: forgejo_origin_tls - changed_when: false - retries: 30 - delay: 2 - until: forgejo_origin_tls.rc == 0 - when: is_services_host - - - name: Check Traefik serves a valid TLS certificate for Authelia hostname (origin) - shell: | - set -euo pipefail - echo | openssl s_client -servername "{{ auth_hostname }}" -connect 127.0.0.1:443 2>/dev/null | grep -q "Verify return code: 0 (ok)" - register: authelia_origin_tls - changed_when: false - retries: 30 - delay: 2 - until: authelia_origin_tls.rc == 0 - when: is_services_host - - - name: Check Authelia OIDC discovery issuer (origin) - shell: | - set -euo pipefail - curl -k -sS --resolve "{{ auth_hostname }}:443:127.0.0.1" "https://{{ auth_hostname 
}}/.well-known/openid-configuration" \ - | python3 -c 'import json,sys; print(json.load(sys.stdin).get("issuer",""))' - register: authelia_oidc_issuer - changed_when: false - retries: 30 - delay: 2 - until: authelia_oidc_issuer.stdout | default('') | length > 0 - when: is_services_host - - - name: Fail if Authelia OIDC discovery issuer is not configured as expected - assert: - that: - - authelia_oidc_issuer.stdout == ("https://" ~ auth_hostname) - fail_msg: "Authelia OIDC issuer mismatch. expected=https://{{ auth_hostname }} got={{ authelia_oidc_issuer.stdout | default('') }}" - when: is_services_host - - - name: Check LLDAP web UI is reachable on services host - uri: - url: http://127.0.0.1:17170/ - method: GET - status_code: [200, 302] - register: lldap_web - changed_when: false - when: is_services_host - - - name: Read object storage configuration from controller environment - set_fact: - s3_bucket: "{{ lookup('env', 'S3_BUCKET') | default('', true) }}" - s3_region: "{{ lookup('env', 'S3_REGION') | default(lookup('env', 'TF_VAR_object_storage_region'), true) | default('us-east-1', true) }}" - changed_when: false - - - name: Compute object storage endpoint from controller environment - set_fact: - s3_endpoint: "{{ lookup('env', 'S3_ENDPOINT') | default('https://' ~ s3_region ~ '.linodeobjects.com', true) }}" - changed_when: false - - - name: Smoke test Linode Object Storage credentials (head-bucket) - command: >- - docker run --rm - -e AWS_ACCESS_KEY_ID - -e AWS_SECRET_ACCESS_KEY - -e AWS_DEFAULT_REGION - -e AWS_EC2_METADATA_DISABLED=true - amazon/aws-cli:2.15.57 - s3api head-bucket --bucket {{ s3_bucket | quote }} --endpoint-url {{ s3_endpoint | quote }} - environment: - AWS_ACCESS_KEY_ID: "{{ S3_ACCESS_KEY_ID | default('') }}" - AWS_SECRET_ACCESS_KEY: "{{ S3_SECRET_ACCESS_KEY | default('') }}" - AWS_DEFAULT_REGION: "{{ s3_region }}" - register: s3_head_bucket - changed_when: false - no_log: true - when: - - (s3_bucket | default('') | length) > 0 - - 
(S3_ACCESS_KEY_ID | default('') | length) > 0 - - (S3_SECRET_ACCESS_KEY | default('') | length) > 0 - - - name: Fail if object storage smoke test failed - assert: - that: - - s3_head_bucket.rc == 0 - fail_msg: "Object storage smoke test failed (head-bucket). Check S3_BUCKET/S3_REGION/S3_ENDPOINT and S3_ACCESS_KEY_ID/S3_SECRET_ACCESS_KEY in vault." - when: - - (s3_bucket | default('') | length) > 0 - - (S3_ACCESS_KEY_ID | default('') | length) > 0 - - (S3_SECRET_ACCESS_KEY | default('') | length) > 0 - - - name: Check Loki is reachable from web host (allowlist) - uri: - url: "http://{{ hostvars['services'].ansible_host }}:3100/ready" - method: GET - status_code: 200 - register: loki_from_web_ready - when: is_web_host diff --git a/setup.sh b/setup.sh deleted file mode 100755 index 0bcac4c..0000000 --- a/setup.sh +++ /dev/null @@ -1,198 +0,0 @@ -#! /usr/bin/env bash - -set -euo pipefail - -vault_args=() -temp_vault_pass_file="" - -usage() { - cat <<'EOF' -Usage: ./setup.sh [--no-ansible] [--no-terraform|--ansible-only] [--] [terraform ] - -Defaults: - - Runs Terraform (plan/apply) in terraform/ - - Generates Ansible inventory from Terraform outputs - - Runs Ansible playbooks - -Options: - --no-ansible Run Terraform only (no Ansible). - --no-terraform Skip Terraform; requires existing inventory/hosts.yml. - --ansible-only Alias for --no-terraform. - --help Show this help. 
- -Terraform passthrough: - ./setup.sh -- terraform [args] - ./setup.sh -- [args] -EOF -} - -cleanup() { - if [[ -n "${temp_vault_pass_file}" ]] && [[ -f "${temp_vault_pass_file}" ]]; then - rm -f "${temp_vault_pass_file}" - fi -} -trap cleanup EXIT -ansible_extra_args=() -terraform_apply_args=() -terraform_passthrough=() - -run_ansible=true -run_terraform=true - -if [[ "${1:-}" == "--help" ]] || [[ "${1:-}" == "-h" ]]; then - usage - exit 0 -fi - -if [[ "${1:-}" == "--no-ansible" ]]; then - run_ansible=false - shift -fi - -if [[ "${1:-}" == "--no-terraform" ]] || [[ "${1:-}" == "--ansible-only" ]]; then - run_terraform=false - shift -fi - -if [[ "${1:-}" == "--" ]]; then - shift - if [[ "${1:-}" == "terraform" ]]; then - shift - terraform_passthrough=("$@") - else - case "${1:-}" in - output|state|workspace|providers|version|validate|fmt|taint|untaint|graph|show|console|import) - terraform_passthrough=("$@") - ;; - *) - terraform_apply_args=("$@") - ;; - esac - fi -fi - -if [[ -f ".env" ]]; then - set -a - source .env - set +a -fi - -if [[ "${run_terraform}" == "true" ]]; then - if ! command -v terraform >/dev/null 2>&1; then - echo "terraform is required (install terraform or run with --no-terraform)" >&2 - exit 2 - fi -fi - -if [[ "${run_ansible}" == "true" ]]; then - if ! command -v ansible-playbook >/dev/null 2>&1; then - echo "ansible-playbook is required (install ansible or run with --no-ansible)" >&2 - exit 2 - fi -fi - -if [[ -f "secrets/vault.yml" ]]; then - if ! 
command -v ansible-vault >/dev/null 2>&1; then - echo "ansible-vault is required to read secrets/vault.yml" >&2 - exit 2 - fi - if [[ -f "secrets/.vault_pass" ]]; then - vault_args+=(--vault-password-file "secrets/.vault_pass") - elif [[ -f ".vault_pass" ]]; then - vault_args+=(--vault-password-file ".vault_pass") - else - read -rsp "Vault password: " vault_password - echo - temp_vault_pass_file=$(mktemp) - chmod 600 "${temp_vault_pass_file}" - printf '%s' "${vault_password}" > "${temp_vault_pass_file}" - unset vault_password - vault_args+=(--vault-password-file "${temp_vault_pass_file}") - fi - - if (( ${#vault_args[@]} )); then - vault_plain=$(ansible-vault view secrets/vault.yml "${vault_args[@]}") - else - vault_plain=$(ansible-vault view secrets/vault.yml) - fi - while IFS= read -r line; do - [[ -z "${line}" ]] && continue - [[ "${line}" == "---" ]] && continue - [[ "${line}" != TF_VAR_*:* ]] && [[ "${line}" != CF_DNS_API_TOKEN:* ]] && [[ "${line}" != CF_ZONE_API_TOKEN:* ]] && [[ "${line}" != S3_ACCESS_KEY_ID:* ]] && [[ "${line}" != S3_SECRET_ACCESS_KEY:* ]] && continue - key="${line%%:*}" - value="${line#*:}" - value="${value# }" - [[ -z "${value}" ]] && continue - escaped=$(printf '%q' "${value}") - eval "export ${key}=${escaped}" - done <<< "${vault_plain}" - - if [[ -z "${TF_VAR_cloudflare_api_token:-}" ]] && [[ -n "${CF_DNS_API_TOKEN:-}" ]]; then - export TF_VAR_cloudflare_api_token="${CF_DNS_API_TOKEN}" - fi - - if [[ -z "${TF_VAR_cloudflare_zone_id:-}" ]] && [[ -n "${CF_ZONE_API_TOKEN:-}" ]]; then - export TF_VAR_cloudflare_zone_id="${CF_ZONE_API_TOKEN}" - fi -fi - -if [[ "${run_terraform}" == "true" ]]; then - terraform -chdir=terraform init - - if (( ${#terraform_passthrough[@]} )); then - terraform -chdir=terraform "${terraform_passthrough[@]}" - exit 0 - fi - - if (( ${#terraform_apply_args[@]} )); then - terraform -chdir=terraform apply "${terraform_apply_args[@]}" - else - terraform -chdir=terraform plan -out=tfplan - terraform -chdir=terraform 
apply tfplan - fi - - rm -f terraform/tfplan - - web_ipv4=$(terraform -chdir=terraform output -raw web_ip) - web_ipv6=$(terraform -chdir=terraform output -raw web_ipv6) - services_ipv4=$(terraform -chdir=terraform output -raw services_ip) - - ssh_user=${TF_VAR_user:-ansible} - - mkdir -p inventory/host_vars - - cat > inventory/hosts.yml < inventory/host_vars/web.yml <&2 - exit 2 - fi -fi - -if [[ "${run_ansible}" == "true" ]]; then - if [[ -n "${vault_args+x}" ]] && (( ${#vault_args[@]} )); then - ansible_extra_args=("${vault_args[@]}") - fi - ansible-playbook playbooks/services.yml ${ansible_extra_args[@]+"${ansible_extra_args[@]}"} - ansible-playbook playbooks/app.yml ${ansible_extra_args[@]+"${ansible_extra_args[@]}"} -fi