# site-b — citation-only AI bot policy.
#
# Allow traditional search and answer-engine bots that link back; deny crawlers
# whose primary purpose is harvesting content for model training. The default
# `User-agent: *` group at the end is `Disallow: /`, so any bot that has not
# been explicitly allow-listed above it is denied — the previous default of
# `Allow: /` undermined the citation-only intent (architecture-review-2026-04-28
# finding #17).

# --- Traditional search engines ---

User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Applebot
Allow: /

# --- Citation / answer-engine bots (link back to source) ---

User-agent: PerplexityBot
Allow: /

User-agent: OAI-SearchBot
Allow: /

User-agent: ChatGPT-User
Allow: /

User-agent: Claude-User
Allow: /

User-agent: Claude-SearchBot
Allow: /

User-agent: Google-CloudVertexBot
Allow: /

User-agent: Meta-ExternalFetcher
Allow: /

# --- Training crawlers: BLOCKED on site-b (per-site policy) ---
# ClaudeBot is Anthropic's training crawler; Claude-User / Claude-SearchBot
# (allowed above) are the user-facing fetch and search agents. Likewise,
# Meta documents Meta-ExternalAgent as a crawler used for AI training, so it
# is blocked here, while Meta-ExternalFetcher (allowed above) performs
# user-initiated fetches.

User-agent: ClaudeBot
Disallow: /

User-agent: Meta-ExternalAgent
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: Anthropic-AI
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: FacebookBot
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: PanguBot
Disallow: /

User-agent: Cohere-AI
Disallow: /

User-agent: Diffbot
Disallow: /

User-agent: ImagesiftBot
Disallow: /

User-agent: Omgili
Disallow: /

User-agent: Omgilibot
Disallow: /

User-agent: Amazonbot
Disallow: /

# --- Default: citation-only intent — deny anything not explicitly allowed ---

User-agent: *
Disallow: /

Sitemap: https://ryansullivan.example.com/sitemap-index.xml
Sitemap: https://ryansullivan.example.com/sitemap-image.xml