Update Forgejo and runner with new features
- Add Redis cache support to Forgejo for improved performance
- Add AI scrapers blocking with update script and robots.txt
- Update Forgejo runner tasks with improved caching support
- Add OIDC authentication configuration tasks
This commit is contained in:
parent
e4634484f8
commit
e364538206
4 changed files with 436 additions and 2 deletions
127
roles/forgejo/files/update-ai-scrapers.sh
Normal file
127
roles/forgejo/files/update-ai-scrapers.sh
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/env bash
# Auto-update AI scrapers list for robots.txt.
# Fetches the latest AI agent names from knownagents.com, merges them with a
# static fallback list (de-duplicated, order preserved), and rewrites
# robots.txt only when the generated content actually changed — restarting
# the Forgejo container so it picks up the new file.
#
# Environment:
#   FORGEJO_DIR   Forgejo deployment directory (default: /opt/forgejo)
#   FORGEJO_HOST  hostname used in the Sitemap line (default: git.example.com)

set -euo pipefail

FORGEJO_DIR="${FORGEJO_DIR:-/opt/forgejo}"
ROBOTS_FILE="$FORGEJO_DIR/robots.txt"
FORGEJO_HOST="${FORGEJO_HOST:-git.example.com}"

# Temp file is removed on every exit path (success, error, signal).
TEMP_FILE=$(mktemp)
cleanup() { rm -f -- "$TEMP_FILE"; }
trap cleanup EXIT

# Base robots.txt content (generic rules). Quoted delimiter: no expansion.
cat > "$TEMP_FILE" << 'BASEEOF'
User-agent: *
Disallow: /api/
Disallow: /user/
Disallow: /login
Disallow: /explore
Disallow: /admin
Disallow: /repo/create
Disallow: /*/*/raw/
Disallow: /*/*/archive/
Disallow: /*/*/commits/
Disallow: /*/*/blame/
Disallow: /*/*/compare/
Disallow: /*/*/activity
Disallow: /*/*/issues/
Disallow: /*/*/pulls/
Disallow: /*/*/settings/
Disallow: /*/*/*/issues/
Disallow: /*/*/*/pulls/
Disallow: /*/*/*/wiki/
Disallow: /*/*/*/actions/
Disallow: /*/*/*/projects/
Disallow: /*/*/*/packages/
Disallow: /*/*/*/releases/
Disallow: /*/*/*/milestones/
Disallow: /*/*/*/labels/
Disallow: /*/*/*/branches/
Disallow: /*/*/*/tags/
Disallow: /*/*/*/graphs/

BASEEOF

# Static AI agents list (always included as fallback).
STATIC_AGENTS=(
  "GPTBot" "ChatGPT-User" "ChatGPT-User v2.0" "ChatGPT-Browser" "OAI-SearchBot"
  "CCBot" "anthropic-ai" "Anthropic-Claude" "Claude-Web" "ClaudeBot"
  "PerplexityBot" "Perplexity-User" "YouBot" "AI2Bot" "Amazonbot"
  "Applebot-Extended" "Bytespider" "DeepseekBot" "Diffbot" "DuckAssistBot"
  "FacebookBot" "Meta-ExternalAgent" "meta-webindexer" "Gemini-Ai" "Google-Extended"
  "GoogleAgent-Mariner" "Google-NotebookLM" "xAI-Bot" "MistralAI-User"
  "Together-Bot" "Groq-Bot" "HuggingFace-Bot" "Cohere-Ai" "Cohere-Command"
  "Replicate-Bot" "FirecrawlAgent" "Crawlspace" "Webzio-Extended" "TimpiBot"
  "ImagesiftBot" "RunPod-Bot" "Devin" "Character-AI" "Brightbot"
  "TerraCotta" "Andibot" "bigsur.ai" "IbouBot" "Kangaroo Bot"
  "PanguBot" "Cotoyogi" "Google-CloudVertexBot" "Gemini-Deep-Research"
  "Bard-Ai" "Ai2Bot-Dolma" "ChatGLM-Spider" "ChatGPT-Agent" "GoogleOther"
  "ICC-Crawler" "imageSpider" "laion-huggingface-processor" "LCC" "Manus-User"
  "NovaAct" "omgili" "SBIntuitionsBot" "Spider" "TwinAgent"
  "VelenPublicWebCrawler" "AmazonBuyForMe" "netEstate-Imprint-Crawler"
  "Datenbank-Crawler" "iAskBot" "iaskspider" "KunatoCrawler" "WRTNBot"
  "Crawl4AI" "cohere-training-data-crawler" "DeepSeekBot"
)

# Fetch dynamic AI agents from knownagents.com (best effort; offline-safe —
# every external step is guarded so a network failure leaves the list empty).
DYNAMIC_AGENTS=""
if command -v curl > /dev/null 2>&1 && command -v grep > /dev/null 2>&1; then
  RAW_AGENTS=$(curl -s --max-time 10 "https://knownagents.com/agents?category=ai-data-scrapers" 2>/dev/null || echo "")

  if [[ -n "$RAW_AGENTS" ]]; then
    # Extract agent names from HTML links; cap at 100 entries. awk 'NR<=100'
    # (instead of head) consumes all input, avoiding a SIGPIPE failure of the
    # upstream sort under pipefail when more than 100 agents are listed.
    DYNAMIC_AGENTS=$(printf '%s' "$RAW_AGENTS" \
      | grep -oE 'href="/agents/[^"]+' \
      | sed 's|href="/agents/||' \
      | sort -u \
      | awk 'NR<=100' || echo "")
  fi
fi

# Combine static and dynamic agents, de-duplicating while preserving order.
COMBINED_AGENTS=()
declare -A SEEN=()

# Static agents first.
for agent in "${STATIC_AGENTS[@]}"; do
  if [[ -z "${SEEN[$agent]:-}" ]]; then
    SEEN[$agent]=1
    COMBINED_AGENTS+=("$agent")
  fi
done

# Then any dynamic agents not already present.
if [[ -n "$DYNAMIC_AGENTS" ]]; then
  while IFS= read -r agent; do
    if [[ -n "$agent" && -z "${SEEN[$agent]:-}" ]]; then
      SEEN[$agent]=1
      COMBINED_AGENTS+=("$agent")
    fi
  done <<< "$DYNAMIC_AGENTS"
fi

# Emit one Disallow-all block per agent.
for agent in "${COMBINED_AGENTS[@]}"; do
  printf 'User-agent: %s\nDisallow: /\n\n' "$agent" >> "$TEMP_FILE"
done

# Add sitemap.
echo "Sitemap: https://${FORGEJO_HOST}/sitemap.xml" >> "$TEMP_FILE"

# Install only when the content changed, to avoid needless Forgejo restarts.
if [[ -f "$ROBOTS_FILE" ]]; then
  if ! diff -q "$TEMP_FILE" "$ROBOTS_FILE" > /dev/null 2>&1; then
    cp "$TEMP_FILE" "$ROBOTS_FILE"
    echo "[$(date)] robots.txt updated with ${#COMBINED_AGENTS[@]} AI agents"

    # Restart forgejo to pick up changes only if the container is running.
    if docker ps -q --filter name=forgejo-forgejo-1 2>/dev/null | grep -q .; then
      docker compose -f "$FORGEJO_DIR/docker-compose.yml" restart forgejo 2>/dev/null || true
    fi
  else
    echo "[$(date)] No changes to robots.txt"
  fi
else
  cp "$TEMP_FILE" "$ROBOTS_FILE"
  echo "[$(date)] robots.txt created with ${#COMBINED_AGENTS[@]} AI agents"
fi

exit 0
|
||||
|
|
@ -14,13 +14,53 @@
|
|||
path: /opt/forgejo
|
||||
state: directory
|
||||
|
||||
- name: Ensure proxy network exists
  command: docker network inspect proxy
  register: proxy_network
  changed_when: false
  failed_when: false

- name: Create proxy network if missing
  command: docker network create proxy
  when: proxy_network.rc != 0

- name: Copy update-ai-scrapers script
  copy:
    src: update-ai-scrapers.sh
    dest: /opt/forgejo/update-ai-scrapers.sh
    mode: "0755"

- name: Run AI scrapers update script (initial)
  command: /opt/forgejo/update-ai-scrapers.sh
  args:
    chdir: /opt/forgejo
  environment:
    FORGEJO_HOST: "{{ forgejo_hostname }}"
  register: scraper_update
  # The script prints "updated" when it rewrites robots.txt and "created" on
  # the very first run; both count as a change (matching only "updated" would
  # report ok=unchanged on initial provisioning).
  changed_when: "'updated' in scraper_update.stdout or 'created' in scraper_update.stdout"

- name: Set up cron job for periodic AI scrapers update
  cron:
    name: "Update AI scrapers robots.txt"
    minute: "0"
    hour: "2"
    weekday: "6"
    job: "cd /opt/forgejo && FORGEJO_HOST={{ forgejo_hostname }} /opt/forgejo/update-ai-scrapers.sh >> /var/log/forgejo-ai-scrapers-update.log 2>&1"
    user: root
    state: present

- name: Copy robots.txt for Forgejo (fallback)
  template:
    src: robots.txt.j2
    dest: /opt/forgejo/robots.txt.backup

- name: Copy Docker Compose file for Forgejo
  template:
    src: docker-compose.yml.j2
    dest: /opt/forgejo/docker-compose.yml

- name: Deploy Forgejo
  # Single command key (the duplicate "command:" entry was invalid YAML).
  # Plain "up -d" lets Compose recreate only services whose config changed,
  # preserving container-level caches; --force-recreate would defeat that.
  command: docker compose up -d
  args:
    chdir: /opt/forgejo
|
||||
|
||||
|
|
|
|||
267
roles/forgejo/templates/robots.txt.j2
Normal file
267
roles/forgejo/templates/robots.txt.j2
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
User-agent: *
|
||||
Disallow: /api/
|
||||
Disallow: /user/
|
||||
Disallow: /login
|
||||
Disallow: /explore
|
||||
Disallow: /admin
|
||||
Disallow: /repo/create
|
||||
Disallow: /*/*/raw/
|
||||
Disallow: /*/*/archive/
|
||||
Disallow: /*/*/commits/
|
||||
Disallow: /*/*/blame/
|
||||
Disallow: /*/*/compare/
|
||||
Disallow: /*/*/activity
|
||||
Disallow: /*/*/issues/
|
||||
Disallow: /*/*/pulls/
|
||||
Disallow: /*/*/settings/
|
||||
Disallow: /*/*/*/issues/
|
||||
Disallow: /*/*/*/pulls/
|
||||
Disallow: /*/*/*/wiki/
|
||||
Disallow: /*/*/*/actions/
|
||||
Disallow: /*/*/*/projects/
|
||||
Disallow: /*/*/*/packages/
|
||||
Disallow: /*/*/*/releases/
|
||||
Disallow: /*/*/*/milestones/
|
||||
Disallow: /*/*/*/labels/
|
||||
Disallow: /*/*/*/branches/
|
||||
Disallow: /*/*/*/tags/
|
||||
Disallow: /*/*/*/graphs/
|
||||
|
||||
User-agent: GPTBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: ChatGPT-User
|
||||
Disallow: /
|
||||
|
||||
User-agent: ChatGPT-User v2.0
|
||||
Disallow: /
|
||||
|
||||
User-agent: ChatGPT-Browser
|
||||
Disallow: /
|
||||
|
||||
User-agent: OAI-SearchBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: CCBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: anthropic-ai
|
||||
Disallow: /
|
||||
|
||||
User-agent: Anthropic-Claude
|
||||
Disallow: /
|
||||
|
||||
User-agent: Claude-Web
|
||||
Disallow: /
|
||||
|
||||
User-agent: ClaudeBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: PerplexityBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Perplexity-User
|
||||
Disallow: /
|
||||
|
||||
User-agent: YouBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: AI2Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Amazonbot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Applebot-Extended
|
||||
Disallow: /
|
||||
|
||||
User-agent: Bytespider
|
||||
Disallow: /
|
||||
|
||||
User-agent: DeepseekBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Diffbot
|
||||
Disallow: /
|
||||
|
||||
User-agent: DuckAssistBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: FacebookBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Meta-ExternalAgent
|
||||
Disallow: /
|
||||
|
||||
User-agent: meta-webindexer
|
||||
Disallow: /
|
||||
|
||||
User-agent: Gemini-Ai
|
||||
Disallow: /
|
||||
|
||||
User-agent: Google-Extended
|
||||
Disallow: /
|
||||
|
||||
User-agent: GoogleAgent-Mariner
|
||||
Disallow: /
|
||||
|
||||
User-agent: Google-NotebookLM
|
||||
Disallow: /
|
||||
|
||||
User-agent: xAI-Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: MistralAI-User
|
||||
Disallow: /
|
||||
|
||||
User-agent: Together-Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Groq-Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: HuggingFace-Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Cohere-Ai
|
||||
Disallow: /
|
||||
|
||||
User-agent: Cohere-Command
|
||||
Disallow: /
|
||||
|
||||
User-agent: Replicate-Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: FirecrawlAgent
|
||||
Disallow: /
|
||||
|
||||
User-agent: Crawlspace
|
||||
Disallow: /
|
||||
|
||||
User-agent: Webzio-Extended
|
||||
Disallow: /
|
||||
|
||||
User-agent: TimpiBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: ImagesiftBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: RunPod-Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Devin
|
||||
Disallow: /
|
||||
|
||||
User-agent: Character-AI
|
||||
Disallow: /
|
||||
|
||||
User-agent: Brightbot
|
||||
Disallow: /
|
||||
|
||||
User-agent: TerraCotta
|
||||
Disallow: /
|
||||
|
||||
User-agent: Andibot
|
||||
Disallow: /
|
||||
|
||||
User-agent: bigsur.ai
|
||||
Disallow: /
|
||||
|
||||
User-agent: IbouBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Kangaroo Bot
|
||||
Disallow: /
|
||||
|
||||
User-agent: PanguBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Cotoyogi
|
||||
Disallow: /
|
||||
|
||||
User-agent: Google-CloudVertexBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Gemini-Deep-Research
|
||||
Disallow: /
|
||||
|
||||
User-agent: Bard-Ai
|
||||
Disallow: /
|
||||
|
||||
User-agent: Ai2Bot-Dolma
|
||||
Disallow: /
|
||||
|
||||
User-agent: ChatGLM-Spider
|
||||
Disallow: /
|
||||
|
||||
User-agent: ChatGPT-Agent
|
||||
Disallow: /
|
||||
|
||||
User-agent: GoogleOther
|
||||
Disallow: /
|
||||
|
||||
User-agent: ICC-Crawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: imageSpider
|
||||
Disallow: /
|
||||
|
||||
User-agent: laion-huggingface-processor
|
||||
Disallow: /
|
||||
|
||||
User-agent: LCC
|
||||
Disallow: /
|
||||
|
||||
User-agent: Manus-User
|
||||
Disallow: /
|
||||
|
||||
User-agent: NovaAct
|
||||
Disallow: /
|
||||
|
||||
User-agent: omgili
|
||||
Disallow: /
|
||||
|
||||
User-agent: SBIntuitionsBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Spider
|
||||
Disallow: /
|
||||
|
||||
User-agent: TwinAgent
|
||||
Disallow: /
|
||||
|
||||
User-agent: VelenPublicWebCrawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: AmazonBuyForMe
|
||||
Disallow: /
|
||||
|
||||
User-agent: netEstate-Imprint-Crawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: Datenbank-Crawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: iAskBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: iaskspider
|
||||
Disallow: /
|
||||
|
||||
User-agent: KunatoCrawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: WRTNBot
|
||||
Disallow: /
|
||||
|
||||
User-agent: Crawl4AI
|
||||
Disallow: /
|
||||
|
||||
User-agent: cohere-training-data-crawler
|
||||
Disallow: /
|
||||
|
||||
User-agent: DeepSeekBot
|
||||
Disallow: /
|
||||
|
||||
Sitemap: https://{{ forgejo_hostname }}/sitemap.xml
|
||||
|
|
@ -114,6 +114,6 @@
|
|||
mode: "0644"
|
||||
|
||||
- name: Deploy Forgejo runner
  # Single command key (the duplicate "command:" entry was invalid YAML).
  # Plain "up -d" recreates the runner only when its config changed, keeping
  # the runner's action caches intact; --force-recreate would discard them.
  command: docker compose up -d
  args:
    chdir: /opt/forgejo-runner
|
||||
|
|
|
|||
Loading…
Reference in a new issue