diff --git a/roles/forgejo/files/update-ai-scrapers.sh b/roles/forgejo/files/update-ai-scrapers.sh
new file mode 100644
index 0000000..f36b498
--- /dev/null
+++ b/roles/forgejo/files/update-ai-scrapers.sh
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+# Auto-update AI scrapers list for robots.txt
+#
+# Builds robots.txt from the generic rules below, a static list of AI agents
+# and (best effort) agent names scraped from knownagents.com, then writes it
+# to $FORGEJO_DIR/robots.txt when it differs from the installed file.
+#
+# Environment:
+#   FORGEJO_HOST  host used in the Sitemap line (default: git.example.com)
+#
+# Exit status: 0 on success (including "no changes"); 1 if the temp file
+# or robots.txt cannot be written.
+
+set -euo pipefail
+
+readonly FORGEJO_DIR="/opt/forgejo"
+readonly ROBOTS_FILE="$FORGEJO_DIR/robots.txt"
+readonly FORGEJO_HOST="${FORGEJO_HOST:-git.example.com}"
+readonly AGENTS_URL="https://knownagents.com/agents?category=ai-data-scrapers"
+readonly MAX_DYNAMIC_AGENTS=100
+TEMP_FILE=""
+
+# Static AI agents list (always included as fallback)
+STATIC_AGENTS=(
+  "GPTBot" "ChatGPT-User" "ChatGPT-User v2.0" "ChatGPT-Browser" "OAI-SearchBot"
+  "CCBot" "anthropic-ai" "Anthropic-Claude" "Claude-Web" "ClaudeBot"
+  "PerplexityBot" "Perplexity-User" "YouBot" "AI2Bot" "Amazonbot"
+  "Applebot-Extended" "Bytespider" "DeepseekBot" "Diffbot" "DuckAssistBot"
+  "FacebookBot" "Meta-ExternalAgent" "meta-webindexer" "Gemini-Ai" "Google-Extended"
+  "GoogleAgent-Mariner" "Google-NotebookLM" "xAI-Bot" "MistralAI-User"
+  "Together-Bot" "Groq-Bot" "HuggingFace-Bot" "Cohere-Ai" "Cohere-Command"
+  "Replicate-Bot" "FirecrawlAgent" "Crawlspace" "Webzio-Extended" "TimpiBot"
+  "ImagesiftBot" "RunPod-Bot" "Devin" "Character-AI" "Brightbot"
+  "TerraCotta" "Andibot" "bigsur.ai" "IbouBot" "Kangaroo Bot"
+  "PanguBot" "Cotoyogi" "Google-CloudVertexBot" "Gemini-Deep-Research"
+  "Bard-Ai" "Ai2Bot-Dolma" "ChatGLM-Spider" "ChatGPT-Agent" "GoogleOther"
+  "ICC-Crawler" "imageSpider" "laion-huggingface-processor" "LCC" "Manus-User"
+  "NovaAct" "omgili" "SBIntuitionsBot" "Spider" "TwinAgent"
+  "VelenPublicWebCrawler" "AmazonBuyForMe" "netEstate-Imprint-Crawler"
+  "Datenbank-Crawler" "iAskBot" "iaskspider" "KunatoCrawler" "WRTNBot"
+  "Crawl4AI" "cohere-training-data-crawler" "DeepSeekBot"
+)
+readonly STATIC_AGENTS
+
+# Agents in output order, plus the lower-cased names already added.
+COMBINED_AGENTS=()
+declare -A SEEN=()
+
+# Print an error to stderr and abort with status 1.
+die() {
+  printf '[%s] ERROR: %s\n' "$(date)" "$*" >&2
+  exit 1
+}
+
+# Remove the temp file; runs from the EXIT trap on every exit path.
+cleanup() {
+  if [[ -n "$TEMP_FILE" ]]; then
+    rm -f -- "$TEMP_FILE"
+  fi
+}
+
+# Print the base robots.txt content (generic rules).
+write_base_rules() {
+  cat << 'BASEEOF'
+User-agent: *
+Disallow: /api/
+Disallow: /user/
+Disallow: /login
+Disallow: /explore
+Disallow: /admin
+Disallow: /repo/create
+Disallow: /*/*/raw/
+Disallow: /*/*/archive/
+Disallow: /*/*/commits/
+Disallow: /*/*/blame/
+Disallow: /*/*/compare/
+Disallow: /*/*/activity
+Disallow: /*/*/issues/
+Disallow: /*/*/pulls/
+Disallow: /*/*/settings/
+Disallow: /*/*/*/issues/
+Disallow: /*/*/*/pulls/
+Disallow: /*/*/*/wiki/
+Disallow: /*/*/*/actions/
+Disallow: /*/*/*/projects/
+Disallow: /*/*/*/packages/
+Disallow: /*/*/*/releases/
+Disallow: /*/*/*/milestones/
+Disallow: /*/*/*/labels/
+Disallow: /*/*/*/branches/
+Disallow: /*/*/*/tags/
+Disallow: /*/*/*/graphs/
+
+BASEEOF
+}
+
+# Append $1 to COMBINED_AGENTS unless the same name was already added.
+# robots.txt user-agent matching is case-insensitive, so names that differ
+# only in case are duplicates; the first spelling wins.
+add_agent() {
+  local -r key=${1,,}
+  if [[ -z "${SEEN[$key]:-}" ]]; then
+    SEEN[$key]=1
+    COMBINED_AGENTS+=("$1")
+  fi
+}
+
+# Succeed only for names that are safe to write into robots.txt. Scraped
+# values are untrusted; anything outside this allowlist is dropped.
+is_valid_agent() {
+  local -r allowed='^[A-Za-z0-9][A-Za-z0-9._ -]*$'
+  [[ "$1" =~ $allowed ]]
+}
+
+# Print agent names scraped from knownagents.com, one per line.
+# Best effort: prints nothing if curl/grep are missing or the fetch fails.
+fetch_dynamic_agents() {
+  local page
+  if ! command -v curl > /dev/null || ! command -v grep > /dev/null; then
+    return 0
+  fi
+  # -f turns HTTP errors into failures so an error page is never parsed.
+  page=$(curl -fsS --max-time 10 "$AGENTS_URL" 2> /dev/null) || return 0
+  # LC_ALL=C keeps the order independent of the caller's locale, so cron and
+  # Ansible runs generate the same file.
+  # "|| true": no match (grep exits 1) or head closing the pipe early must
+  # not abort the script under pipefail.
+  printf '%s\n' "$page" \
+    | grep -oE 'href="/agents/[^"]+' \
+    | sed 's|href="/agents/||' \
+    | LC_ALL=C sort -u \
+    | head -n "$MAX_DYNAMIC_AGENTS" || true
+}
+
+# Copy the generated file over robots.txt.
+install_robots() {
+  # cp overwrites in place instead of renaming.
+  # NOTE(review): presumably robots.txt is bind-mounted into the container,
+  # where a rename would leave the old file mounted - confirm.
+  cp -- "$TEMP_FILE" "$ROBOTS_FILE" || return 1
+  # mktemp creates the file with mode 0600 and cp carries that over to a
+  # newly created robots.txt, so set the mode explicitly.
+  chmod 0644 -- "$ROBOTS_FILE"
+}
+
+# Restart forgejo to pick up changes if running; a failure is logged only.
+restart_forgejo() {
+  local running
+  running=$(docker ps -q --filter name=forgejo-forgejo-1 2> /dev/null) || running=""
+  if [[ -n "$running" ]]; then
+    docker compose -f "$FORGEJO_DIR/docker-compose.yml" restart forgejo \
+      || printf '[%s] WARNING: forgejo restart failed\n' "$(date)" >&2
+  fi
+}
+
+# Build robots.txt, install it if it changed, restart forgejo on update.
+main() {
+  local agent scraped action
+
+  TEMP_FILE=$(mktemp) || die "mktemp failed"
+  trap cleanup EXIT
+
+  for agent in "${STATIC_AGENTS[@]}"; do
+    add_agent "$agent"
+  done
+
+  scraped=$(fetch_dynamic_agents)
+  while IFS= read -r agent; do
+    if is_valid_agent "$agent"; then
+      add_agent "$agent"
+    fi
+  done <<< "$scraped"
+
+  {
+    write_base_rules
+    for agent in "${COMBINED_AGENTS[@]}"; do
+      printf 'User-agent: %s\nDisallow: /\n\n' "$agent"
+    done
+    printf 'Sitemap: https://%s/sitemap.xml\n' "$FORGEJO_HOST"
+  } > "$TEMP_FILE" || die "cannot write $TEMP_FILE"
+
+  # Compare with existing file and update if changed
+  if [[ ! -f "$ROBOTS_FILE" ]]; then
+    action="created"
+  elif cmp -s -- "$TEMP_FILE" "$ROBOTS_FILE"; then
+    echo "[$(date)] No changes to robots.txt"
+    return 0
+  else
+    action="updated"
+  fi
+
+  install_robots || die "cannot write $ROBOTS_FILE"
+  echo "[$(date)] robots.txt $action with ${#COMBINED_AGENTS[@]} AI agents"
+
+  # Only replacing an existing file triggers a restart; creation does not.
+  if [[ "$action" == "updated" ]]; then
+    restart_forgejo
+  fi
+}
+
+main "$@"
diff --git a/roles/forgejo/tasks/main.yml b/roles/forgejo/tasks/main.yml
index 79e5dae..518e356 100644
--- a/roles/forgejo/tasks/main.yml
+++ b/roles/forgejo/tasks/main.yml
@@ -14,13 +14,53 @@
     path: /opt/forgejo
     state: directory
 
+- name: Ensure proxy network exists
+  command: docker network inspect proxy
+  register: proxy_network
+  changed_when: false
+  failed_when: false
+
+- name: Create proxy network if missing
+  command: docker network create proxy
+  when: proxy_network.rc != 0
+
+- name: Copy update-ai-scrapers script
+  copy:
+    src: update-ai-scrapers.sh
+    dest: /opt/forgejo/update-ai-scrapers.sh
+    mode: "0755"
+
+- name: Run AI scrapers update script (initial)
+  command: /opt/forgejo/update-ai-scrapers.sh
+  args:
+    chdir: /opt/forgejo
+  environment:
+    FORGEJO_HOST: "{{ forgejo_hostname }}"
+  register: scraper_update
+  changed_when: "'robots.txt updated' in scraper_update.stdout or 'robots.txt created' in scraper_update.stdout"
+
+- name: Set up cron job for periodic AI scrapers update
+  cron:
+    name: "Update AI scrapers robots.txt"
+    minute: "0"
+    hour: "2"
+    weekday: "6"
+    job: "cd /opt/forgejo && FORGEJO_HOST={{ forgejo_hostname | quote }} /opt/forgejo/update-ai-scrapers.sh >> /var/log/forgejo-ai-scrapers-update.log 2>&1"
+    user: root
+    state: present
+
+- name: Copy robots.txt for Forgejo (fallback)
+  template:
+    src: robots.txt.j2
+    dest: /opt/forgejo/robots.txt.backup
+
 - name: Copy Docker Compose file for Forgejo
   template:
     src: docker-compose.yml.j2
     dest: /opt/forgejo/docker-compose.yml
 
 - name: Deploy Forgejo
-  command: docker compose up -d --force-recreate
+  command: docker compose up -d
   args:
     chdir: /opt/forgejo
 
diff --git a/roles/forgejo/templates/robots.txt.j2 b/roles/forgejo/templates/robots.txt.j2
new file mode 100644
index 0000000..17cdd59
--- /dev/null
+++ b/roles/forgejo/templates/robots.txt.j2
@@ -0,0 +1,267 @@
+User-agent: *
+Disallow: /api/
+Disallow: /user/
+Disallow: /login
+Disallow: /explore
+Disallow: /admin
+Disallow: /repo/create
+Disallow: /*/*/raw/
+Disallow: /*/*/archive/
+Disallow: /*/*/commits/
+Disallow: /*/*/blame/
+Disallow: /*/*/compare/
+Disallow: /*/*/activity
+Disallow: /*/*/issues/
+Disallow: /*/*/pulls/
+Disallow: /*/*/settings/
+Disallow: /*/*/*/issues/
+Disallow: /*/*/*/pulls/
+Disallow: /*/*/*/wiki/
+Disallow: /*/*/*/actions/
+Disallow: /*/*/*/projects/
+Disallow: /*/*/*/packages/
+Disallow: /*/*/*/releases/
+Disallow: /*/*/*/milestones/
+Disallow: /*/*/*/labels/
+Disallow: /*/*/*/branches/
+Disallow: /*/*/*/tags/
+Disallow: /*/*/*/graphs/
+
+User-agent: GPTBot
+Disallow: /
+
+User-agent: ChatGPT-User
+Disallow: /
+
+User-agent: ChatGPT-User v2.0
+Disallow: /
+
+User-agent: ChatGPT-Browser
+Disallow: /
+
+User-agent: OAI-SearchBot
+Disallow: /
+
+User-agent: CCBot
+Disallow: /
+
+User-agent: anthropic-ai
+Disallow: /
+
+User-agent: Anthropic-Claude
+Disallow: /
+
+User-agent: Claude-Web
+Disallow: /
+
+User-agent: ClaudeBot
+Disallow: /
+
+User-agent: PerplexityBot
+Disallow: /
+
+User-agent: Perplexity-User
+Disallow: /
+
+User-agent: YouBot
+Disallow: /
+
+User-agent: AI2Bot
+Disallow: /
+
+User-agent: Amazonbot
+Disallow: /
+
+User-agent: Applebot-Extended
+Disallow: /
+
+User-agent: Bytespider
+Disallow: /
+
+User-agent: DeepseekBot
+Disallow: /
+
+User-agent: Diffbot
+Disallow: /
+
+User-agent: DuckAssistBot
+Disallow: /
+
+User-agent: FacebookBot
+Disallow: /
+
+User-agent: Meta-ExternalAgent
+Disallow: /
+
+User-agent: meta-webindexer
+Disallow: /
+
+User-agent: Gemini-Ai
+Disallow: /
+
+User-agent: Google-Extended
+Disallow: /
+
+User-agent: GoogleAgent-Mariner
+Disallow: /
+
+User-agent: Google-NotebookLM
+Disallow: /
+
+User-agent: xAI-Bot
+Disallow: /
+
+User-agent: MistralAI-User
+Disallow: /
+
+User-agent: Together-Bot
+Disallow: /
+
+User-agent: Groq-Bot
+Disallow: /
+
+User-agent: HuggingFace-Bot
+Disallow: /
+
+User-agent: Cohere-Ai
+Disallow: /
+
+User-agent: Cohere-Command
+Disallow: /
+
+User-agent: Replicate-Bot
+Disallow: /
+
+User-agent: FirecrawlAgent
+Disallow: /
+
+User-agent: Crawlspace
+Disallow: /
+
+User-agent: Webzio-Extended
+Disallow: /
+
+User-agent: TimpiBot
+Disallow: /
+
+User-agent: ImagesiftBot
+Disallow: /
+
+User-agent: RunPod-Bot
+Disallow: /
+
+User-agent: Devin
+Disallow: /
+
+User-agent: Character-AI
+Disallow: /
+
+User-agent: Brightbot
+Disallow: /
+
+User-agent: TerraCotta
+Disallow: /
+
+User-agent: Andibot
+Disallow: /
+
+User-agent: bigsur.ai
+Disallow: /
+
+User-agent: IbouBot
+Disallow: /
+
+User-agent: Kangaroo Bot
+Disallow: /
+
+User-agent: PanguBot
+Disallow: /
+
+User-agent: Cotoyogi
+Disallow: /
+
+User-agent: Google-CloudVertexBot
+Disallow: /
+
+User-agent: Gemini-Deep-Research
+Disallow: /
+
+User-agent: Bard-Ai
+Disallow: /
+
+User-agent: Ai2Bot-Dolma
+Disallow: /
+
+User-agent: ChatGLM-Spider
+Disallow: /
+
+User-agent: ChatGPT-Agent
+Disallow: /
+
+User-agent: GoogleOther
+Disallow: /
+
+User-agent: ICC-Crawler
+Disallow: /
+
+User-agent: imageSpider
+Disallow: /
+
+User-agent: laion-huggingface-processor
+Disallow: /
+
+User-agent: LCC
+Disallow: /
+
+User-agent: Manus-User
+Disallow: /
+
+User-agent: NovaAct
+Disallow: /
+
+User-agent: omgili
+Disallow: /
+
+User-agent: SBIntuitionsBot
+Disallow: /
+
+User-agent: Spider
+Disallow: /
+
+User-agent: TwinAgent
+Disallow: /
+
+User-agent: VelenPublicWebCrawler
+Disallow: /
+
+User-agent: AmazonBuyForMe
+Disallow: /
+
+User-agent: netEstate-Imprint-Crawler
+Disallow: /
+
+User-agent: Datenbank-Crawler
+Disallow: /
+
+User-agent: iAskBot
+Disallow: /
+
+User-agent: iaskspider
+Disallow: /
+
+User-agent: KunatoCrawler
+Disallow: /
+
+User-agent: WRTNBot
+Disallow: /
+
+User-agent: Crawl4AI
+Disallow: /
+
+User-agent: cohere-training-data-crawler
+Disallow: /
+
+User-agent: DeepSeekBot
+Disallow: /
+
+Sitemap: https://{{ forgejo_hostname }}/sitemap.xml
diff --git a/roles/forgejo_runner/tasks/main.yml b/roles/forgejo_runner/tasks/main.yml
index d4c1d3d..0c35b56 100644
--- a/roles/forgejo_runner/tasks/main.yml
+++ b/roles/forgejo_runner/tasks/main.yml
@@ -114,6 +114,6 @@
     mode: "0644"
 
 - name: Deploy Forgejo runner
-  command: docker compose up -d --force-recreate
+  command: docker compose up -d
   args:
     chdir: /opt/forgejo-runner