# ────────────────────────────────────────────────────────────────────
# robots.txt for arcounselingandwellness.com
# Last updated: 2026-04-29 (Tier 3 ACM expansion)
#
# Allow policy: citation-friendly AI crawlers and search engines.
# Disallow: training-only or non-citation crawlers where distinguishable.
# Disallow: /admin/ (operational stubs, not for crawl).
#
# Brand-language-feed: /brand-language-feed.json
# AI context: /llms.txt, /llms-full.txt, /ai.txt
# Image sitemap: /image-sitemap.xml
# ────────────────────────────────────────────────────────────────────

# Default policy for any crawler that is not named in a group below.
# Disallow rules are listed before "Allow: /" on purpose: parsers that apply
# the first matching rule (instead of the longest match required by RFC 9309)
# would otherwise stop at "Allow: /" and never reach the exclusions.
# A crawler that matches a named group obeys only that group and inherits
# nothing from this one.
User-agent: *
Disallow: /private/
Disallow: /admin/
Allow: /

# ─── Search engine crawlers (allowed, except non-public paths) ──────
# One shared group for all three bots. A crawler that matches a named group
# ignores the "User-agent: *" group entirely, so the /private/ and /admin/
# exclusions must be restated here or these bots may crawl those paths.
# Keep the User-agent lines contiguous (comment lines are fine, blank lines
# are not): some legacy parsers end a group at the first blank line.
# Disallow rules precede "Allow: /" for first-match parsers.
User-agent: Googlebot
User-agent: Bingbot
User-agent: DuckDuckBot
Disallow: /private/
Disallow: /admin/
Allow: /

# ─── Citation-friendly AI crawlers (allowed, except non-public paths) ─
# These bots cite the source URL in AI responses, providing user-visible
# attribution back to the practice. Allowing improves discoverability in
# AI assistants and answer engines.
#
# OpenAI and Anthropic agents share one group. A crawler that matches a
# named group ignores the "User-agent: *" group, so the /private/ and
# /admin/ exclusions are restated here. Keep the User-agent lines contiguous
# (comment lines are fine, blank lines are not): some legacy parsers end a
# group at the first blank line. Disallow rules precede "Allow: /" for
# first-match parsers.
#
# NOTE(review): GPTBot and ClaudeBot collect model-training data and do not
# themselves produce citations; they are allowed here as a site policy
# choice — confirm this matches the "training-only" rule in the file header.
#
# OpenAI — ChatGPT search index (cited in ChatGPT search results)
User-agent: OAI-SearchBot
# OpenAI — ChatGPT direct user fetches (when a user asks ChatGPT to read a URL)
User-agent: ChatGPT-User
# OpenAI — model-training crawler
User-agent: GPTBot
# Anthropic — model-training crawler
User-agent: ClaudeBot
# Anthropic — search indexing for Claude
User-agent: Claude-SearchBot
# Anthropic — Claude direct user fetches
User-agent: Claude-User
# Anthropic — legacy agent names, kept for older crawler builds
User-agent: Claude-Web
User-agent: anthropic-ai
Disallow: /private/
Disallow: /admin/
Allow: /

# Perplexity and Meta AI agents share one group. A crawler that matches a
# named group ignores the "User-agent: *" group, so the /private/ and
# /admin/ exclusions are restated here. Keep the User-agent lines contiguous
# (comment lines are fine, blank lines are not): some legacy parsers end a
# group at the first blank line. Disallow rules precede "Allow: /" for
# first-match parsers.
#
# Perplexity — answer engine that cites sources prominently
User-agent: PerplexityBot
# Perplexity — direct user fetches
User-agent: Perplexity-User
# Meta AI external agent (LLM training and search)
User-agent: Meta-ExternalAgent
Disallow: /private/
Disallow: /admin/
Allow: /

# Remaining allowed AI agents share one group. A crawler that matches a
# named group ignores the "User-agent: *" group, so the /private/ and
# /admin/ exclusions are restated here. Keep the User-agent lines contiguous
# (comment lines are fine, blank lines are not): some legacy parsers end a
# group at the first blank line. Disallow rules precede "Allow: /" for
# first-match parsers.
#
# Google — control token for Gemini training and grounding. It is not a
# separate crawler and does not affect Google Search or AI Overviews, which
# follow the Googlebot rules.
User-agent: Google-Extended
# Apple — control token for use of Applebot-crawled content in training
# Apple's generative models. It is not a separate crawler.
User-agent: Applebot-Extended
# Common Crawl — public training corpus, widely cited downstream
User-agent: CCBot
# Cohere — citation-friendly indexer. Agent names match case-insensitively,
# so this single entry also covers "Cohere-AI".
User-agent: cohere-ai
# Amazon — Amazonbot (Alexa and other Amazon services)
User-agent: Amazonbot
# DuckDuckGo AI Assist
User-agent: DuckAssistBot
# Mistral direct user fetches
User-agent: MistralAI-User
# Diffbot — structured data indexer cited by AI engines
User-agent: Diffbot
# Meta — FacebookBot
# NOTE(review): Meta appears to document this agent as collecting data for
# language models, with link previews served by facebookexternalhit —
# confirm it belongs in the citation-friendly list.
User-agent: FacebookBot
Disallow: /private/
Disallow: /admin/
Allow: /

# ─── Disallowed: non-citation training crawlers ─────────────────────
# Healthcare YMYL site: prefer citable surfaces over uncited training scrapes.

# Bytespider (TikTok/ByteDance) — not citation friendly for healthcare.
# "Disallow: /" excludes the whole site for this agent. Because a named
# group replaces the "User-agent: *" group, this single rule is the bot's
# entire policy.
User-agent: Bytespider
Disallow: /

# Sitemaps — absolute URLs. Sitemap lines are independent of the User-agent
# groups above and are visible to every crawler.
Sitemap: https://arcounselingandwellness.com/sitemap.xml
Sitemap: https://arcounselingandwellness.com/image-sitemap.xml