Update Forgejo and runner with new features
- Add Redis cache support to Forgejo for improved performance - Add AI scrapers blocking with update script and robots.txt - Update Forgejo runner tasks with improved caching support - Add OIDC authentication configuration tasks
This commit is contained in:
parent
e4634484f8
commit
e364538206
4 changed files with 436 additions and 2 deletions
127
roles/forgejo/files/update-ai-scrapers.sh
Normal file
127
roles/forgejo/files/update-ai-scrapers.sh
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
#!/usr/bin/env bash
# Auto-update AI scrapers list for robots.txt
# Fetches latest AI agents from knownagents.com and merges them with a
# static fallback list, then installs the result as Forgejo's robots.txt.
#
# Environment:
#   FORGEJO_HOST - hostname used in the Sitemap line (default: git.example.com)
#
# Output: logs "updated" / "created" / "No changes" lines to stdout; the
# Ansible caller keys its changed_when on the word "updated".

set -euo pipefail

FORGEJO_DIR="/opt/forgejo"
ROBOTS_FILE="$FORGEJO_DIR/robots.txt"
FORGEJO_HOST="${FORGEJO_HOST:-git.example.com}"

TEMP_FILE=$(mktemp) || { echo "mktemp failed" >&2; exit 1; }
# Clean the temp file up on every exit path (error, signal, early return),
# not just at the end of the happy path.
trap 'rm -f -- "$TEMP_FILE"' EXIT

# Base robots.txt content (generic rules)
cat > "$TEMP_FILE" << 'BASEEOF'
User-agent: *
Disallow: /api/
Disallow: /user/
Disallow: /login
Disallow: /explore
Disallow: /admin
Disallow: /repo/create
Disallow: /*/*/raw/
Disallow: /*/*/archive/
Disallow: /*/*/commits/
Disallow: /*/*/blame/
Disallow: /*/*/compare/
Disallow: /*/*/activity
Disallow: /*/*/issues/
Disallow: /*/*/pulls/
Disallow: /*/*/settings/
Disallow: /*/*/*/issues/
Disallow: /*/*/*/pulls/
Disallow: /*/*/*/wiki/
Disallow: /*/*/*/actions/
Disallow: /*/*/*/projects/
Disallow: /*/*/*/packages/
Disallow: /*/*/*/releases/
Disallow: /*/*/*/milestones/
Disallow: /*/*/*/labels/
Disallow: /*/*/*/branches/
Disallow: /*/*/*/tags/
Disallow: /*/*/*/graphs/

BASEEOF

# Static AI agents list (always included as fallback)
STATIC_AGENTS=(
    "GPTBot" "ChatGPT-User" "ChatGPT-User v2.0" "ChatGPT-Browser" "OAI-SearchBot"
    "CCBot" "anthropic-ai" "Anthropic-Claude" "Claude-Web" "ClaudeBot"
    "PerplexityBot" "Perplexity-User" "YouBot" "AI2Bot" "Amazonbot"
    "Applebot-Extended" "Bytespider" "DeepseekBot" "Diffbot" "DuckAssistBot"
    "FacebookBot" "Meta-ExternalAgent" "meta-webindexer" "Gemini-Ai" "Google-Extended"
    "GoogleAgent-Mariner" "Google-NotebookLM" "xAI-Bot" "MistralAI-User"
    "Together-Bot" "Groq-Bot" "HuggingFace-Bot" "Cohere-Ai" "Cohere-Command"
    "Replicate-Bot" "FirecrawlAgent" "Crawlspace" "Webzio-Extended" "TimpiBot"
    "ImagesiftBot" "RunPod-Bot" "Devin" "Character-AI" "Brightbot"
    "TerraCotta" "Andibot" "bigsur.ai" "IbouBot" "Kangaroo Bot"
    "PanguBot" "Cotoyogi" "Google-CloudVertexBot" "Gemini-Deep-Research"
    "Bard-Ai" "Ai2Bot-Dolma" "ChatGLM-Spider" "ChatGPT-Agent" "GoogleOther"
    "ICC-Crawler" "imageSpider" "laion-huggingface-processor" "LCC" "Manus-User"
    "NovaAct" "omgili" "SBIntuitionsBot" "Spider" "TwinAgent"
    "VelenPublicWebCrawler" "AmazonBuyForMe" "netEstate-Imprint-Crawler"
    "Datenbank-Crawler" "iAskBot" "iaskspider" "KunatoCrawler" "WRTNBot"
    "Crawl4AI" "cohere-training-data-crawler" "DeepSeekBot"
)

# Fetch dynamic AI agents from knownagents.com (best-effort: any failure
# leaves DYNAMIC_AGENTS empty and we fall back to the static list alone).
DYNAMIC_AGENTS=""
if command -v curl > /dev/null 2>&1; then
    RAW_AGENTS=$(curl -s --max-time 10 "https://knownagents.com/agents?category=ai-data-scrapers" 2>/dev/null || true)

    if [[ -n "$RAW_AGENTS" ]]; then
        # Extract agent names from the HTML (/agents/<name> links), dedupe,
        # and cap at 100 entries. '|| true' (not '|| echo "") so a pipeline
        # failure doesn't inject a stray empty line into the result.
        DYNAMIC_AGENTS=$(grep -oE 'href="/agents/[^"]+' <<< "$RAW_AGENTS" \
            | sed 's|href="/agents/||' | sort -u | head -100 || true)
    fi
fi

# Combine static and dynamic agents, removing duplicates
# (first occurrence wins; static entries take precedence).
COMBINED_AGENTS=()
declare -A SEEN=()

# add_agent NAME - append NAME to COMBINED_AGENTS unless empty or already seen.
add_agent() {
    local agent=$1
    if [[ -n "$agent" && -z "${SEEN[$agent]:-}" ]]; then
        SEEN["$agent"]=1
        COMBINED_AGENTS+=("$agent")
    fi
}

# Add static agents
for agent in "${STATIC_AGENTS[@]}"; do
    add_agent "$agent"
done

# Add dynamic agents
if [[ -n "$DYNAMIC_AGENTS" ]]; then
    while IFS= read -r agent; do
        add_agent "$agent"
    done <<< "$DYNAMIC_AGENTS"
fi

# Generate one User-agent block per agent, then the sitemap line,
# in a single grouped redirection instead of reopening the file per echo.
{
    for agent in "${COMBINED_AGENTS[@]}"; do
        printf 'User-agent: %s\nDisallow: /\n\n' "$agent"
    done
    printf 'Sitemap: https://%s/sitemap.xml\n' "$FORGEJO_HOST"
} >> "$TEMP_FILE"

# Compare with existing file and update only if the content changed.
if [[ -f "$ROBOTS_FILE" ]]; then
    if ! diff -q "$TEMP_FILE" "$ROBOTS_FILE" > /dev/null 2>&1; then
        cp "$TEMP_FILE" "$ROBOTS_FILE"
        echo "[$(date)] robots.txt updated with ${#COMBINED_AGENTS[@]} AI agents"

        # Restart forgejo to pick up changes, but only if the container is
        # running; failures here are non-fatal.
        if docker ps -q --filter name=forgejo-forgejo-1 2>/dev/null | grep -q .; then
            docker compose -f "$FORGEJO_DIR/docker-compose.yml" restart forgejo 2>/dev/null || true
        fi
    else
        echo "[$(date)] No changes to robots.txt"
    fi
else
    cp "$TEMP_FILE" "$ROBOTS_FILE"
    echo "[$(date)] robots.txt created with ${#COMBINED_AGENTS[@]} AI agents"
fi

exit 0
|
|
@ -14,13 +14,53 @@
|
||||||
path: /opt/forgejo
|
path: /opt/forgejo
|
||||||
state: directory
|
state: directory
|
||||||
|
|
||||||
|
- name: Ensure proxy network exists
|
||||||
|
command: docker network inspect proxy
|
||||||
|
register: proxy_network
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
|
|
||||||
|
- name: Create proxy network if missing
|
||||||
|
command: docker network create proxy
|
||||||
|
when: proxy_network.rc != 0
|
||||||
|
|
||||||
|
- name: Copy update-ai-scrapers script
|
||||||
|
copy:
|
||||||
|
src: update-ai-scrapers.sh
|
||||||
|
dest: /opt/forgejo/update-ai-scrapers.sh
|
||||||
|
mode: "0755"
|
||||||
|
|
||||||
|
- name: Run AI scrapers update script (initial)
|
||||||
|
command: /opt/forgejo/update-ai-scrapers.sh
|
||||||
|
args:
|
||||||
|
chdir: /opt/forgejo
|
||||||
|
environment:
|
||||||
|
FORGEJO_HOST: "{{ forgejo_hostname }}"
|
||||||
|
register: scraper_update
|
||||||
|
changed_when: "'updated' in scraper_update.stdout"
|
||||||
|
|
||||||
|
- name: Set up cron job for periodic AI scrapers update
|
||||||
|
cron:
|
||||||
|
name: "Update AI scrapers robots.txt"
|
||||||
|
minute: "0"
|
||||||
|
hour: "2"
|
||||||
|
weekday: "6"
|
||||||
|
job: "cd /opt/forgejo && FORGEJO_HOST={{ forgejo_hostname }} /opt/forgejo/update-ai-scrapers.sh >> /var/log/forgejo-ai-scrapers-update.log 2>&1"
|
||||||
|
user: root
|
||||||
|
state: present
|
||||||
|
|
||||||
|
- name: Copy robots.txt for Forgejo (fallback)
|
||||||
|
template:
|
||||||
|
src: robots.txt.j2
|
||||||
|
dest: /opt/forgejo/robots.txt.backup
|
||||||
|
|
||||||
- name: Copy Docker Compose file for Forgejo
|
- name: Copy Docker Compose file for Forgejo
|
||||||
template:
|
template:
|
||||||
src: docker-compose.yml.j2
|
src: docker-compose.yml.j2
|
||||||
dest: /opt/forgejo/docker-compose.yml
|
dest: /opt/forgejo/docker-compose.yml
|
||||||
|
|
||||||
- name: Deploy Forgejo
|
- name: Deploy Forgejo
|
||||||
command: docker compose up -d --force-recreate
|
command: docker compose up -d
|
||||||
args:
|
args:
|
||||||
chdir: /opt/forgejo
|
chdir: /opt/forgejo
|
||||||
|
|
||||||
|
|
|
||||||
267
roles/forgejo/templates/robots.txt.j2
Normal file
267
roles/forgejo/templates/robots.txt.j2
Normal file
|
|
@ -0,0 +1,267 @@
|
||||||
|
User-agent: *
|
||||||
|
Disallow: /api/
|
||||||
|
Disallow: /user/
|
||||||
|
Disallow: /login
|
||||||
|
Disallow: /explore
|
||||||
|
Disallow: /admin
|
||||||
|
Disallow: /repo/create
|
||||||
|
Disallow: /*/*/raw/
|
||||||
|
Disallow: /*/*/archive/
|
||||||
|
Disallow: /*/*/commits/
|
||||||
|
Disallow: /*/*/blame/
|
||||||
|
Disallow: /*/*/compare/
|
||||||
|
Disallow: /*/*/activity
|
||||||
|
Disallow: /*/*/issues/
|
||||||
|
Disallow: /*/*/pulls/
|
||||||
|
Disallow: /*/*/settings/
|
||||||
|
Disallow: /*/*/*/issues/
|
||||||
|
Disallow: /*/*/*/pulls/
|
||||||
|
Disallow: /*/*/*/wiki/
|
||||||
|
Disallow: /*/*/*/actions/
|
||||||
|
Disallow: /*/*/*/projects/
|
||||||
|
Disallow: /*/*/*/packages/
|
||||||
|
Disallow: /*/*/*/releases/
|
||||||
|
Disallow: /*/*/*/milestones/
|
||||||
|
Disallow: /*/*/*/labels/
|
||||||
|
Disallow: /*/*/*/branches/
|
||||||
|
Disallow: /*/*/*/tags/
|
||||||
|
Disallow: /*/*/*/graphs/
|
||||||
|
|
||||||
|
User-agent: GPTBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ChatGPT-User
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ChatGPT-User v2.0
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ChatGPT-Browser
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: OAI-SearchBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: CCBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: anthropic-ai
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Anthropic-Claude
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Claude-Web
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ClaudeBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: PerplexityBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Perplexity-User
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: YouBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: AI2Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Amazonbot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Applebot-Extended
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Bytespider
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: DeepseekBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Diffbot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: DuckAssistBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: FacebookBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Meta-ExternalAgent
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: meta-webindexer
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Gemini-Ai
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Google-Extended
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: GoogleAgent-Mariner
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Google-NotebookLM
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: xAI-Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: MistralAI-User
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Together-Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Groq-Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: HuggingFace-Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Cohere-Ai
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Cohere-Command
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Replicate-Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: FirecrawlAgent
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Crawlspace
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Webzio-Extended
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: TimpiBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ImagesiftBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: RunPod-Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Devin
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Character-AI
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Brightbot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: TerraCotta
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Andibot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: bigsur.ai
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: IbouBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Kangaroo Bot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: PanguBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Cotoyogi
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Google-CloudVertexBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Gemini-Deep-Research
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Bard-Ai
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Ai2Bot-Dolma
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ChatGLM-Spider
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ChatGPT-Agent
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: GoogleOther
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: ICC-Crawler
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: imageSpider
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: laion-huggingface-processor
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: LCC
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Manus-User
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: NovaAct
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: omgili
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: SBIntuitionsBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Spider
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: TwinAgent
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: VelenPublicWebCrawler
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: AmazonBuyForMe
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: netEstate-Imprint-Crawler
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Datenbank-Crawler
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: iAskBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: iaskspider
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: KunatoCrawler
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: WRTNBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: Crawl4AI
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: cohere-training-data-crawler
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
User-agent: DeepSeekBot
|
||||||
|
Disallow: /
|
||||||
|
|
||||||
|
Sitemap: https://{{ forgejo_hostname }}/sitemap.xml
|
||||||
|
|
@ -114,6 +114,6 @@
|
||||||
mode: "0644"
|
mode: "0644"
|
||||||
|
|
||||||
- name: Deploy Forgejo runner
|
- name: Deploy Forgejo runner
|
||||||
command: docker compose up -d --force-recreate
|
command: docker compose up -d
|
||||||
args:
|
args:
|
||||||
chdir: /opt/forgejo-runner
|
chdir: /opt/forgejo-runner
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue