Update Forgejo and runner with new features

- Add Redis cache support to Forgejo for improved performance
- Add AI scrapers blocking with update script and robots.txt
- Update Forgejo runner tasks with improved caching support
- Add OIDC authentication configuration tasks
This commit is contained in:
Jeremie Fraeys 2026-02-21 18:31:06 -05:00
parent e4634484f8
commit e364538206
No known key found for this signature in database
4 changed files with 436 additions and 2 deletions

View file

@@ -0,0 +1,127 @@
#!/usr/bin/env bash
# Auto-update AI scrapers list for robots.txt.
#
# Fetches the latest AI agent names from knownagents.com, merges them with a
# static fallback list (deduplicated), regenerates robots.txt, and — only when
# the content actually changed — installs it and restarts the Forgejo
# container so the new file is served.
#
# Environment:
#   FORGEJO_HOST - public hostname used in the Sitemap line
#                  (default: git.example.com)
set -euo pipefail

readonly FORGEJO_DIR="/opt/forgejo"
readonly ROBOTS_FILE="$FORGEJO_DIR/robots.txt"
FORGEJO_HOST="${FORGEJO_HOST:-git.example.com}"

# Temp file is always removed, even on early failure (set -e), via the trap.
TEMP_FILE=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
cleanup() { rm -f -- "$TEMP_FILE"; }
trap cleanup EXIT

# Base robots.txt content (generic rules); quoted delimiter = no expansion.
cat > "$TEMP_FILE" << 'BASEEOF'
User-agent: *
Disallow: /api/
Disallow: /user/
Disallow: /login
Disallow: /explore
Disallow: /admin
Disallow: /repo/create
Disallow: /*/*/raw/
Disallow: /*/*/archive/
Disallow: /*/*/commits/
Disallow: /*/*/blame/
Disallow: /*/*/compare/
Disallow: /*/*/activity
Disallow: /*/*/issues/
Disallow: /*/*/pulls/
Disallow: /*/*/settings/
Disallow: /*/*/*/issues/
Disallow: /*/*/*/pulls/
Disallow: /*/*/*/wiki/
Disallow: /*/*/*/actions/
Disallow: /*/*/*/projects/
Disallow: /*/*/*/packages/
Disallow: /*/*/*/releases/
Disallow: /*/*/*/milestones/
Disallow: /*/*/*/labels/
Disallow: /*/*/*/branches/
Disallow: /*/*/*/tags/
Disallow: /*/*/*/graphs/
BASEEOF

# Static AI agents list (always included as fallback when the fetch fails).
STATIC_AGENTS=(
  "GPTBot" "ChatGPT-User" "ChatGPT-User v2.0" "ChatGPT-Browser" "OAI-SearchBot"
  "CCBot" "anthropic-ai" "Anthropic-Claude" "Claude-Web" "ClaudeBot"
  "PerplexityBot" "Perplexity-User" "YouBot" "AI2Bot" "Amazonbot"
  "Applebot-Extended" "Bytespider" "DeepseekBot" "Diffbot" "DuckAssistBot"
  "FacebookBot" "Meta-ExternalAgent" "meta-webindexer" "Gemini-Ai" "Google-Extended"
  "GoogleAgent-Mariner" "Google-NotebookLM" "xAI-Bot" "MistralAI-User"
  "Together-Bot" "Groq-Bot" "HuggingFace-Bot" "Cohere-Ai" "Cohere-Command"
  "Replicate-Bot" "FirecrawlAgent" "Crawlspace" "Webzio-Extended" "TimpiBot"
  "ImagesiftBot" "RunPod-Bot" "Devin" "Character-AI" "Brightbot"
  "TerraCotta" "Andibot" "bigsur.ai" "IbouBot" "Kangaroo Bot"
  "PanguBot" "Cotoyogi" "Google-CloudVertexBot" "Gemini-Deep-Research"
  "Bard-Ai" "Ai2Bot-Dolma" "ChatGLM-Spider" "ChatGPT-Agent" "GoogleOther"
  "ICC-Crawler" "imageSpider" "laion-huggingface-processor" "LCC" "Manus-User"
  "NovaAct" "omgili" "SBIntuitionsBot" "Spider" "TwinAgent"
  "VelenPublicWebCrawler" "AmazonBuyForMe" "netEstate-Imprint-Crawler"
  "Datenbank-Crawler" "iAskBot" "iaskspider" "KunatoCrawler" "WRTNBot"
  "Crawl4AI" "cohere-training-data-crawler" "DeepSeekBot"
)

# Fetch dynamic AI agents from knownagents.com (best effort; never fatal).
DYNAMIC_AGENTS=""
if command -v curl > /dev/null 2>&1; then
  RAW_AGENTS=$(curl -s --max-time 10 "https://knownagents.com/agents?category=ai-data-scrapers" 2>/dev/null || echo "")
  if [[ -n "$RAW_AGENTS" ]]; then
    # Extract agent names from the page's /agents/<name> links; capped at 100.
    # NOTE(review): this scrape depends on knownagents.com's current HTML
    # structure — re-verify the pattern if the dynamic list comes back empty.
    DYNAMIC_AGENTS=$(grep -oE 'href="/agents/[^"]+' <<< "$RAW_AGENTS" | sed 's|href="/agents/||' | sort -u | head -100 || echo "")
  fi
fi

# Combine static and dynamic agents, preserving order, removing duplicates.
COMBINED_AGENTS=()
declare -A SEEN
for agent in "${STATIC_AGENTS[@]}"; do
  if [[ -z "${SEEN[$agent]:-}" ]]; then
    SEEN[$agent]=1
    COMBINED_AGENTS+=("$agent")
  fi
done
if [[ -n "$DYNAMIC_AGENTS" ]]; then
  while IFS= read -r agent; do
    if [[ -n "$agent" && -z "${SEEN[$agent]:-}" ]]; then
      SEEN[$agent]=1
      COMBINED_AGENTS+=("$agent")
    fi
  done <<< "$DYNAMIC_AGENTS"
fi

# Append one "User-agent/Disallow" block per agent, then the sitemap line,
# with a single grouped redirect instead of reopening the file per line.
{
  for agent in "${COMBINED_AGENTS[@]}"; do
    printf 'User-agent: %s\nDisallow: /\n\n' "$agent"
  done
  printf 'Sitemap: https://%s/sitemap.xml\n' "$FORGEJO_HOST"
} >> "$TEMP_FILE"

# Install only when the generated file differs from what is already deployed;
# a no-op run leaves the container untouched.
if [[ -f "$ROBOTS_FILE" ]]; then
  if ! diff -q "$TEMP_FILE" "$ROBOTS_FILE" > /dev/null 2>&1; then
    cp "$TEMP_FILE" "$ROBOTS_FILE"
    echo "[$(date)] robots.txt updated with ${#COMBINED_AGENTS[@]} AI agents"
    # Restart forgejo to pick up changes if the container is running.
    if docker ps -q --filter name=forgejo-forgejo-1 2>/dev/null | grep -q .; then
      docker compose -f "$FORGEJO_DIR/docker-compose.yml" restart forgejo 2>/dev/null || true
    fi
  else
    echo "[$(date)] No changes to robots.txt"
  fi
else
  # First run: install the file; no restart needed before first deploy.
  cp "$TEMP_FILE" "$ROBOTS_FILE"
  echo "[$(date)] robots.txt created with ${#COMBINED_AGENTS[@]} AI agents"
fi

exit 0

View file

@@ -14,13 +14,53 @@
path: /opt/forgejo
state: directory
- name: Ensure proxy network exists
command: docker network inspect proxy
register: proxy_network
changed_when: false
failed_when: false
- name: Create proxy network if missing
command: docker network create proxy
when: proxy_network.rc != 0
- name: Copy update-ai-scrapers script
copy:
src: update-ai-scrapers.sh
dest: /opt/forgejo/update-ai-scrapers.sh
mode: "0755"
- name: Run AI scrapers update script (initial)
command: /opt/forgejo/update-ai-scrapers.sh
args:
chdir: /opt/forgejo
environment:
FORGEJO_HOST: "{{ forgejo_hostname }}"
register: scraper_update
changed_when: "'updated' in scraper_update.stdout"
- name: Set up cron job for periodic AI scrapers update
cron:
name: "Update AI scrapers robots.txt"
minute: "0"
hour: "2"
weekday: "6"
job: "cd /opt/forgejo && FORGEJO_HOST={{ forgejo_hostname }} /opt/forgejo/update-ai-scrapers.sh >> /var/log/forgejo-ai-scrapers-update.log 2>&1"
user: root
state: present
- name: Copy robots.txt for Forgejo (fallback)
template:
src: robots.txt.j2
dest: /opt/forgejo/robots.txt.backup
- name: Copy Docker Compose file for Forgejo
  template:
    src: docker-compose.yml.j2
    dest: /opt/forgejo/docker-compose.yml
- name: Deploy Forgejo
  command: docker compose up -d
  args:
    chdir: /opt/forgejo

View file

@@ -0,0 +1,267 @@
User-agent: *
Disallow: /api/
Disallow: /user/
Disallow: /login
Disallow: /explore
Disallow: /admin
Disallow: /repo/create
Disallow: /*/*/raw/
Disallow: /*/*/archive/
Disallow: /*/*/commits/
Disallow: /*/*/blame/
Disallow: /*/*/compare/
Disallow: /*/*/activity
Disallow: /*/*/issues/
Disallow: /*/*/pulls/
Disallow: /*/*/settings/
Disallow: /*/*/*/issues/
Disallow: /*/*/*/pulls/
Disallow: /*/*/*/wiki/
Disallow: /*/*/*/actions/
Disallow: /*/*/*/projects/
Disallow: /*/*/*/packages/
Disallow: /*/*/*/releases/
Disallow: /*/*/*/milestones/
Disallow: /*/*/*/labels/
Disallow: /*/*/*/branches/
Disallow: /*/*/*/tags/
Disallow: /*/*/*/graphs/
User-agent: GPTBot
Disallow: /
User-agent: ChatGPT-User
Disallow: /
User-agent: ChatGPT-User v2.0
Disallow: /
User-agent: ChatGPT-Browser
Disallow: /
User-agent: OAI-SearchBot
Disallow: /
User-agent: CCBot
Disallow: /
User-agent: anthropic-ai
Disallow: /
User-agent: Anthropic-Claude
Disallow: /
User-agent: Claude-Web
Disallow: /
User-agent: ClaudeBot
Disallow: /
User-agent: PerplexityBot
Disallow: /
User-agent: Perplexity-User
Disallow: /
User-agent: YouBot
Disallow: /
User-agent: AI2Bot
Disallow: /
User-agent: Amazonbot
Disallow: /
User-agent: Applebot-Extended
Disallow: /
User-agent: Bytespider
Disallow: /
User-agent: DeepseekBot
Disallow: /
User-agent: Diffbot
Disallow: /
User-agent: DuckAssistBot
Disallow: /
User-agent: FacebookBot
Disallow: /
User-agent: Meta-ExternalAgent
Disallow: /
User-agent: meta-webindexer
Disallow: /
User-agent: Gemini-Ai
Disallow: /
User-agent: Google-Extended
Disallow: /
User-agent: GoogleAgent-Mariner
Disallow: /
User-agent: Google-NotebookLM
Disallow: /
User-agent: xAI-Bot
Disallow: /
User-agent: MistralAI-User
Disallow: /
User-agent: Together-Bot
Disallow: /
User-agent: Groq-Bot
Disallow: /
User-agent: HuggingFace-Bot
Disallow: /
User-agent: Cohere-Ai
Disallow: /
User-agent: Cohere-Command
Disallow: /
User-agent: Replicate-Bot
Disallow: /
User-agent: FirecrawlAgent
Disallow: /
User-agent: Crawlspace
Disallow: /
User-agent: Webzio-Extended
Disallow: /
User-agent: TimpiBot
Disallow: /
User-agent: ImagesiftBot
Disallow: /
User-agent: RunPod-Bot
Disallow: /
User-agent: Devin
Disallow: /
User-agent: Character-AI
Disallow: /
User-agent: Brightbot
Disallow: /
User-agent: TerraCotta
Disallow: /
User-agent: Andibot
Disallow: /
User-agent: bigsur.ai
Disallow: /
User-agent: IbouBot
Disallow: /
User-agent: Kangaroo Bot
Disallow: /
User-agent: PanguBot
Disallow: /
User-agent: Cotoyogi
Disallow: /
User-agent: Google-CloudVertexBot
Disallow: /
User-agent: Gemini-Deep-Research
Disallow: /
User-agent: Bard-Ai
Disallow: /
User-agent: Ai2Bot-Dolma
Disallow: /
User-agent: ChatGLM-Spider
Disallow: /
User-agent: ChatGPT-Agent
Disallow: /
User-agent: GoogleOther
Disallow: /
User-agent: ICC-Crawler
Disallow: /
User-agent: imageSpider
Disallow: /
User-agent: laion-huggingface-processor
Disallow: /
User-agent: LCC
Disallow: /
User-agent: Manus-User
Disallow: /
User-agent: NovaAct
Disallow: /
User-agent: omgili
Disallow: /
User-agent: SBIntuitionsBot
Disallow: /
User-agent: Spider
Disallow: /
User-agent: TwinAgent
Disallow: /
User-agent: VelenPublicWebCrawler
Disallow: /
User-agent: AmazonBuyForMe
Disallow: /
User-agent: netEstate-Imprint-Crawler
Disallow: /
User-agent: Datenbank-Crawler
Disallow: /
User-agent: iAskBot
Disallow: /
User-agent: iaskspider
Disallow: /
User-agent: KunatoCrawler
Disallow: /
User-agent: WRTNBot
Disallow: /
User-agent: Crawl4AI
Disallow: /
User-agent: cohere-training-data-crawler
Disallow: /
User-agent: DeepSeekBot
Disallow: /
Sitemap: https://{{ forgejo_hostname }}/sitemap.xml

View file

@@ -114,6 +114,6 @@
    mode: "0644"
- name: Deploy Forgejo runner
  command: docker compose up -d
  args:
    chdir: /opt/forgejo-runner