# fetch_ml/.forgejo/workflows/deploy-prod.yml

# Production deployment workflow. It has no push/schedule trigger and only
# starts when someone dispatches it manually.
name: Deploy to Production

on:
  workflow_dispatch:
    inputs:
      # Image tag the deploy steps pull; defaults to the staging tag.
      deploy_tag:
        default: 'staging'
        required: false
        description: 'Image tag to deploy (default: staging)'
      # Read by the HIPAA signature gate as an explicit operator confirmation
      # when cosign or its public key is unavailable on the runner.
      confirm_hipaa:
        default: 'false'
        required: false
        description: 'Confirm HIPAA compliance verification (required for HIPAA mode)'

# Serialize production deployments. The group must not depend on the ref the
# workflow was dispatched from: with the ref in the group name, two dispatches
# from different branches would fall into different groups and could deploy to
# the same production host at the same time.
concurrency:
  group: deploy-prod-${{ gitea.workflow }}
  # Never cancel a deployment that is already running; later runs wait.
  cancel-in-progress: false

# Token scope for every job in this workflow: read-only access, nothing is
# granted write permission.
permissions:
  actions: read
  contents: read

# Workflow-wide environment variables, visible to every step of every job.
env:
  # Compose file of the production stack, relative to the repository root.
  COMPOSE_FILE: deployments/docker-compose.prod.yml
  # Name of the target environment.
  DEPLOY_ENV: prod

jobs:
  # Informational pre-flight job. It prints who triggered the run, which tag
  # was requested and the checklist the operator is expected to have verified.
  # It only echoes text: nothing here pauses the run or waits for a reviewer.
  manual-approval:
    name: Manual Approval Gate
    runs-on: self-hosted
    timeout-minutes: 1

    steps:
    - name: Verify manual trigger
      # Context values are handed to the script through the environment rather
      # than spliced into the script text, so their content can never be
      # interpreted as shell syntax.
      env:
        ACTOR: ${{ gitea.actor }}
        # Fallback matches the default declared for the deploy_tag input.
        DEPLOY_TAG: ${{ gitea.event.inputs.deploy_tag || 'staging' }}
      run: |
        echo "=== Production Deployment Approval ==="
        echo "This deployment requires manual approval."
        echo "Triggered by: $ACTOR"
        echo "Deploy tag: $DEPLOY_TAG"
        echo ""
        echo "Please verify:"
        echo "  ✓ Staging deployment was successful"
        echo "  ✓ Smoke tests passed in staging"
        echo "  ✓ SLSA provenance is verified"
        echo "  ✓ HIPAA config signature is valid (if HIPAA mode)"
        echo ""
        echo "If all checks pass, this deployment will proceed."

  # Checks that run after the approval job and must all succeed before the
  # deploy job is allowed to start.
  pre-deployment-gates:
    name: Pre-Deployment Gates
    needs: manual-approval
    runs-on: self-hosted
    timeout-minutes: 15

    steps:
    # The gate steps below read config files and scripts from the checkout.
    - uses: actions/checkout@v4
      name: Checkout code

    # Placeholder gate: it only prints what a real provenance check would do
    # and therefore always succeeds.
    - name: Verify SLSA provenance
      run: |
        printf '%s\n' \
          "=== Verifying SLSA provenance ===" \
          "Provenance verification (placeholder)" \
          "In production, this would:" \
          "  - Download provenance artifact from build workflow" \
          "  - Verify signature and chain" \
          "  - Confirm build source and materials"

        # Sketch of the real check using slsa-verifier:
        # slsa-verifier verify-artifact fetchml-worker \
        #   --provenance-path fetchml-worker.intoto.jsonl \
        #   --source-uri forgejo.example.com/jfraeysd/fetch_ml \
        #   --source-tag ${{ gitea.sha }}

    # Gate: when the production worker config enables HIPAA mode, a signature
    # file for the HIPAA config must exist. It is verified with cosign when
    # cosign and a public key are available; otherwise the operator must have
    # set the confirm_hipaa input to "true".
    - name: Verify HIPAA config signature
      env:
        # The secret and the user input are passed through the environment.
        # Splicing them into the script text would break the command line for
        # multi-line PEM keys and would let input content run as shell code.
        COSIGN_PUBLIC_KEY: ${{ secrets.COSIGN_PUBLIC_KEY }}
        CONFIRM_HIPAA: ${{ gitea.event.inputs.confirm_hipaa }}
      run: |
        echo "=== Verifying HIPAA config signature ==="

        PROD_CONFIG="deployments/configs/worker/docker-prod.yaml"
        # NOTE(review): HIPAA mode is detected in docker-prod.yaml, but the
        # signature that gets verified covers docker-hipaa.yaml - confirm that
        # this pairing is intended.
        HIPAA_CONFIG="deployments/configs/worker/docker-hipaa.yaml"
        HIPAA_SIG="${HIPAA_CONFIG}.sig"

        # Without the prod config the mode cannot be determined. As before the
        # step passes in that case, but it now says so instead of staying silent.
        if [ ! -f "$PROD_CONFIG" ]; then
          echo "⚠ $PROD_CONFIG not found - cannot determine compliance mode, skipping signature verification"
          exit 0
        fi

        if ! grep -q "compliance_mode.*hipaa" "$PROD_CONFIG"; then
          echo "Not in HIPAA mode - skipping signature verification"
          exit 0
        fi

        echo "HIPAA mode detected - signature verification REQUIRED"

        if [ ! -f "$HIPAA_SIG" ]; then
          echo "✗ HIPAA config signature file NOT FOUND"
          echo "Deployment BLOCKED - HIPAA mode requires signed config"
          exit 1
        fi
        echo "✓ HIPAA config signature file exists"

        if command -v cosign > /dev/null 2>&1 && [ -n "$COSIGN_PUBLIC_KEY" ]; then
          # cosign --key takes a key reference (file path, KMS URI, ...), not
          # key material. If the secret holds PEM text, write it to a temporary
          # file; any other value is passed through as a reference unchanged.
          KEY_REF="$COSIGN_PUBLIC_KEY"
          case "$COSIGN_PUBLIC_KEY" in
            -----BEGIN*)
              KEY_REF="$(mktemp)"
              trap 'rm -f "$KEY_REF"' EXIT
              printf '%s\n' "$COSIGN_PUBLIC_KEY" > "$KEY_REF"
              ;;
          esac

          if ! cosign verify-blob \
            --key "$KEY_REF" \
            --signature "$HIPAA_SIG" \
            "$HIPAA_CONFIG"; then
            echo "✗ HIPAA config signature verification FAILED"
            exit 1
          fi
          echo "✓ HIPAA config signature verified"
        else
          echo "⚠ cosign or COSIGN_PUBLIC_KEY not available"
          echo "Manual verification required - confirm with: $CONFIRM_HIPAA"

          # Fallback: cryptographic verification is impossible here, so only an
          # explicit operator confirmation lets the deployment continue.
          if [ "$CONFIRM_HIPAA" != "true" ]; then
            echo "✗ HIPAA mode deployment requires explicit confirmation"
            exit 1
          fi
        fi

    # Gate: runs the repository's audit sink probe against prod. A failing
    # probe blocks the deployment; a missing probe script is only a warning.
    - name: Check audit sink reachability
      run: |
        echo "=== Checking audit sink reachability ==="

        SINK_CHECK="scripts/check-audit-sink.sh"

        # No probe script in the checkout: warn and let the job continue.
        if [ ! -f "$SINK_CHECK" ]; then
          echo "⚠ Audit sink check script not found"
          echo "This is a WARNING - audit logging may be unavailable"
          exit 0
        fi

        chmod +x "$SINK_CHECK"
        if ! "./$SINK_CHECK" --env prod --timeout 10s; then
          echo "✗ Audit sink check FAILED"
          echo "Deployment BLOCKED - audit sink must be reachable"
          exit 1
        fi
        echo "✓ Audit sink is reachable"

    # Placeholder gate: prints the tag that will be deployed and what a real
    # digest comparison would check. It performs no verification yet and
    # always succeeds.
    - name: Verify image digest
      env:
        # Passed through the environment so the input text is never parsed as
        # shell code; the fallback matches the declared deploy_tag default.
        DEPLOY_TAG: ${{ gitea.event.inputs.deploy_tag || 'staging' }}
      run: |
        echo "=== Verifying image digest ==="

        echo "Deploy tag: $DEPLOY_TAG"

        # In production, verify the image digest
        # This ensures we're deploying the exact image that was built and tested
        echo "Image digest verification (placeholder)"
        echo "Expected digest: (from build artifacts)"
        echo "Actual digest: (would be fetched from registry)"

        # Example:
        # EXPECTED_DIGEST=$(cat .forgejo/artifacts/image-digest.txt)
        # ACTUAL_DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' "fetchml-worker:$DEPLOY_TAG")
        # [ "$EXPECTED_DIGEST" = "$ACTUAL_DIGEST" ] || exit 1

  # Rolls out the production stack. Starts only after every pre-deployment
  # gate has succeeded.
  deploy:
    name: Deploy to Production
    needs: pre-deployment-gates
    runs-on: self-hosted
    timeout-minutes: 30

    steps:
    # The deploy steps work inside the deployments/ directory of the checkout.
    - uses: actions/checkout@v4
      name: Checkout code

    # Prints the effective deployment settings and creates a default
    # deployments/.env.prod when the checkout does not contain one.
    - name: Set up environment
      env:
        # Passed through the environment so the input text is never parsed as
        # shell code; the fallback matches the declared deploy_tag default.
        DEPLOY_TAG: ${{ gitea.event.inputs.deploy_tag || 'staging' }}
      run: |
        echo "DEPLOY_ENV=${{ env.DEPLOY_ENV }}"
        echo "COMPOSE_FILE=${{ env.COMPOSE_FILE }}"
        echo "DEPLOY_TAG=$DEPLOY_TAG"

        ENV_FILE="deployments/.env.prod"

        # Ensure environment file exists.
        # The defaults are written with printf rather than a heredoc: heredoc
        # lines placed at column 0 fall outside this YAML block scalar and make
        # the whole workflow file unparseable.
        if [ ! -f "$ENV_FILE" ]; then
          echo "Creating production environment file..."
          printf '%s\n' \
            "DATA_DIR=./data/prod" \
            "LOG_LEVEL=warn" \
            "COMPLIANCE_MODE=standard" > "$ENV_FILE"
        fi

    # Loads deployments/.env.prod, records the current compose state, pulls
    # the requested image on a best-effort basis and brings the stack up.
    - name: Deploy to production
      env:
        # Passed through the environment so the input text is never parsed as
        # shell code; the fallback matches the declared deploy_tag default.
        # Being an environment variable, DEPLOY_TAG is also visible to
        # `docker compose` for ${DEPLOY_TAG} interpolation.
        # NOTE(review): confirm docker-compose.prod.yml actually selects the
        # image by this tag - nothing in this workflow shows that it does.
        DEPLOY_TAG: ${{ gitea.event.inputs.deploy_tag || 'staging' }}
      run: |
        echo "=== Deploying to production environment ==="

        # Change to deployments directory
        cd deployments

        # Source the environment file; set -a exports every variable it defines
        set -a
        source .env.prod
        set +a

        # Record current deployment for potential rollback
        docker compose -f docker-compose.prod.yml ps > .prod-previous-state.txt 2>/dev/null || true

        # Pull specified image tag (best effort, as before).
        # REGISTRY and IMAGE_NAME are not defined in this workflow's env block,
        # so they are read from the environment here (workflow/runner env or
        # .env.prod). When they are missing the pull is skipped with a clear
        # message instead of attempting the malformed reference "/:<tag>".
        if [ -n "${REGISTRY:-}" ] && [ -n "${IMAGE_NAME:-}" ]; then
          IMAGE_REF="${REGISTRY}/${IMAGE_NAME}:${DEPLOY_TAG}"
          echo "Pulling image tag: $DEPLOY_TAG"
          docker pull "$IMAGE_REF" || {
            echo "⚠ Image pull failed - may need to build locally or use different tag"
          }
        else
          echo "⚠ REGISTRY and/or IMAGE_NAME not set - skipping image pull for tag: $DEPLOY_TAG"
        fi

        # Deploy the stack
        docker compose -f docker-compose.prod.yml up -d

        echo "✓ Production deployment initiated"

    # Waits for the stack, polls the API health endpoint with retries (the
    # step fails if it never answers), then compares the compliance mode the
    # API reports with the one in the worker config. A mismatch is only logged.
    - name: Post-deployment health check
      run: |
        echo "=== Running post-deployment health checks ==="

        HEALTH_URL="http://localhost:9101/health"
        # Relative to deployments/, which is the working directory after the
        # cd below. A path prefixed with "deployments/" would never be found
        # from there.
        WORKER_CONFIG="configs/worker/docker-prod.yaml"

        # Wait for services to start
        sleep 15

        # Check if services are running
        cd deployments
        docker compose -f docker-compose.prod.yml ps

        # Check health endpoints with retries
        MAX_RETRIES=5
        RETRY_DELAY=10

        for i in $(seq 1 "$MAX_RETRIES"); do
          echo "Health check attempt $i/$MAX_RETRIES..."

          if curl -fsS "$HEALTH_URL" > /dev/null 2>&1; then
            echo "✓ API health check passed"
            break
          fi

          if [ "$i" -eq "$MAX_RETRIES" ]; then
            echo "✗ API health check failed after $MAX_RETRIES attempts"
            exit 1
          fi

          echo "Retrying in ${RETRY_DELAY}s..."
          sleep "$RETRY_DELAY"
        done

        # Check compliance_mode
        echo "Checking compliance_mode..."
        # `|| true` keeps the step alive whatever the shell's errexit/pipefail
        # settings are; the default is applied afterwards so an empty result
        # reliably becomes "unknown".
        COMPLIANCE_MODE=$(
          curl -fsS "$HEALTH_URL" 2>/dev/null \
            | grep -o '"compliance_mode":"[^"]*"' \
            | cut -d'"' -f4 \
            || true
        )
        COMPLIANCE_MODE="${COMPLIANCE_MODE:-unknown}"
        echo "Compliance mode reported: $COMPLIANCE_MODE"

        # Verify it matches expected. The value is taken from the first
        # compliance_mode line of the worker config; a trailing comment,
        # trailing whitespace and surrounding quotes are removed so it can be
        # compared with the bare value from the health endpoint.
        EXPECTED_MODE=""
        if [ -f "$WORKER_CONFIG" ]; then
          EXPECTED_MODE=$(
            grep "compliance_mode" "$WORKER_CONFIG" | head -1 \
              | sed -e 's/.*: *//' -e 's/[[:space:]]*#.*$//' \
                    -e 's/[[:space:]]*$//' -e "s/^[\"']//" -e "s/[\"']$//" \
              || true
          )
        fi
        EXPECTED_MODE="${EXPECTED_MODE:-standard}"

        if [ "$COMPLIANCE_MODE" = "$EXPECTED_MODE" ]; then
          echo "✓ compliance_mode matches expected: $EXPECTED_MODE"
        else
          echo "⚠ compliance_mode mismatch: expected $EXPECTED_MODE, got $COMPLIANCE_MODE"
          # Don't fail here - log for monitoring
        fi

    # Smoke tests against the freshly deployed stack. The API and Redis checks
    # are mandatory and fail the step; the worker check is informational.
    - name: Run smoke tests
      run: |
        echo "=== Running production smoke tests ==="

        # Wait for services to be fully ready
        sleep 20

        # Each mandatory check is an explicit `if`: in a `cmd && echo ...` list
        # a failing cmd does not trigger errexit, so the step would carry on
        # and report success even though the check failed.

        # Basic connectivity test
        if ! curl -fsS http://localhost:9101/health; then
          echo "✗ API is not responding"
          exit 1
        fi
        echo "✓ API is responding"

        # Check Redis
        if ! docker exec ml-prod-redis redis-cli ping; then
          echo "✗ Redis is not responding"
          exit 1
        fi
        echo "✓ Redis is responding"

        # Check worker (if running) - optional, never fails the step
        if docker ps | grep -q ml-prod-worker; then
          echo "✓ Worker container is running"
        else
          echo "Worker container not running - skipping worker check"
        fi

        echo "✓ Production smoke tests passed"

    # Prints a success/failure summary to the job log. Runs whether or not the
    # earlier steps succeeded. No external notification is sent yet.
    - name: Send deployment notification
      if: always()
      env:
        # Context values are passed through the environment so their content
        # is never parsed as shell code.
        JOB_STATUS: ${{ job.status }}
        RUN_ID: ${{ gitea.run_id }}
        ACTOR: ${{ gitea.actor }}
        COMMIT_SHA: ${{ gitea.sha }}
        # Fallback matches the default declared for the deploy_tag input.
        DEPLOY_TAG: ${{ gitea.event.inputs.deploy_tag || 'staging' }}
      run: |
        echo "=== Deployment Notification ==="

        if [ "$JOB_STATUS" = "success" ]; then
          echo "✓ Production deployment $RUN_ID SUCCESSFUL"
          echo "Deployed by: $ACTOR"
          echo "Tag: $DEPLOY_TAG"
          echo "SHA: $COMMIT_SHA"
        else
          echo "✗ Production deployment $RUN_ID FAILED"
          echo "Deployed by: $ACTOR"
          echo "Check logs for details"
        fi

        # In production, integrate with notification system:
        # - Slack webhook
        # - Email notification
        # - PagerDuty (for failures)

    # Appends one line per run (timestamp, status, run id, actor, tag) to
    # deployments/.prod-audit.log. Runs whether or not earlier steps succeeded.
    - name: Write audit log
      if: always()
      env:
        # Context values are passed through the environment so their content
        # can neither run as shell code nor break out of the log line via
        # shell syntax.
        STATUS: ${{ job.status }}
        RUN_ID: ${{ gitea.run_id }}
        ACTOR: ${{ gitea.actor }}
        # Fallback matches the default declared for the deploy_tag input.
        DEPLOY_TAG: ${{ gitea.event.inputs.deploy_tag || 'staging' }}
      run: |
        echo "=== Writing Audit Log Entry ==="

        AUDIT_LOG="deployments/.prod-audit.log"
        TIMESTAMP=$(date -Iseconds)

        # This step also runs after failures (e.g. a failed checkout), so make
        # sure the target directory exists before appending.
        mkdir -p "$(dirname "$AUDIT_LOG")"

        echo "$TIMESTAMP | deployment | $STATUS | run_id=$RUN_ID | actor=$ACTOR | tag=$DEPLOY_TAG" >> "$AUDIT_LOG"

        echo "✓ Audit log entry written"

    # Runs only when an earlier step of this job failed. It does not roll
    # anything back itself: it prints the manual rollback procedure and then
    # exits non-zero.
    - name: Rollback on failure
      if: failure()
      run: |
        cat << 'EOF'
        === Production deployment failed ===
        Rollback procedure:
        1. Identify previous working image tag from .prod-audit.log
        2. Run: cd deployments && docker compose -f docker-compose.prod.yml down
        3. Deploy previous tag: docker compose -f docker-compose.prod.yml up -d
        4. Verify health endpoints

        Note: Audit log chain is NOT rolled back - chain integrity preserved
        Note: Redis queue state is NOT rolled back - may need manual cleanup
        EOF

        exit 1