Skip to content

Commit a74ec3b

Browse files
feat: search_engine_indexing usage category — free for allowed crawlers
- New UsageCategory.SEARCH_ENGINE_INDEXING (price 0 when allowed) - Config: FAIRFETCH_SEARCH_ENGINES_ALLOWED, FAIRFETCH_SEARCH_ENGINES_BLOCKED - Default allowlist: Googlebot, Bingbot, DuckDuckBot, Slurp, Baidu, Yandex, Sogou, Exabot, etc. - core/search_engine: is_allowed_search_engine(); middleware free pass for allowed UA - Non-allowed crawlers pay base (1x) for search_engine_indexing - Docs: README, PUBLISHER_GUIDE, CONCEPTS, AI_AGENT_GUIDE - Tests: test_search_engine_indexing.py (category, parse_list_env, allow/block, 200 free, 402)
1 parent 61db1d6 commit a74ec3b

11 files changed

Lines changed: 306 additions & 15 deletions

File tree

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ You get back a `402 Payment Required` response — this is not an error, it's a
305305
},
306306
"available_tiers": {
307307
"summary": { "price": "1000", "compliance_level": "standard" },
308+
"search_engine_indexing": { "price": "0", "compliance_level": "standard" },
308309
"rag": { "price": "2000", "compliance_level": "standard" },
309310
"research": { "price": "3000", "compliance_level": "elevated" },
310311
"training": { "price": "5000", "compliance_level": "strict" },
@@ -412,6 +413,7 @@ Not all content usage is equal. Fairfetch defines **usage categories** that cont
412413
| Category | Compliance | Price Multiplier | Use Case |
413414
|----------|-----------|-----------------|----------|
414415
| `summary` | Standard | 1x | Display a short summary or snippet |
416+
| `search_engine_indexing` | Standard | 0x (free) | Search engine crawling for indexing; free for crawlers on the publisher's allowlist, base price (1x) for all other crawlers (see [config](#-configuration)) |
415417
| `rag` | Standard | 2x | Retrieval-Augmented Generation / search grounding |
416418
| `research` | Elevated | 3x | Academic or internal research use |
417419
| `training` | Strict | 5x | Model fine-tuning or pre-training |
@@ -689,6 +691,8 @@ fairfetch/
689691
| `FAIRFETCH_SIGNING_KEY` | *(generated)* | Ed25519 private key (b64) |
690692
| `FAIRFETCH_LICENSE_TYPE` | `publisher-terms` | Default license |
691693
| `FAIRFETCH_DEFAULT_USAGE_CATEGORY` | `summary` | Default usage tier for pricing |
694+
| `FAIRFETCH_SEARCH_ENGINES_ALLOWED` | *(built-in list)* | Comma-separated User-Agent substrings for search engines allowed **free** indexing (e.g. Googlebot, Bingbot, DuckDuckBot). Overrides default. |
695+
| `FAIRFETCH_SEARCH_ENGINES_BLOCKED` | *(empty)* | Comma-separated User-Agent substrings never given free indexing (takes precedence over allowed). |
692696
| `FAIRFETCH_ENABLE_GRANTS` | `true` | Issue Usage Grants |
693697
| `FAIRFETCH_PREFERRED_ACCESS` | `true` | Inject bot-steering headers |
694698
| `LITELLM_MODEL` | `gpt-4o-mini` | LLM for summarization |

api/dependencies.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,20 @@
1919
from payments.mock_facilitator import MockFacilitator
2020
from payments.mock_license_facilitator import MockLicenseFacilitator, MockLicenseProvider
2121

22+
# Built-in allowlist of User-Agent substrings that receive free
# search-engine indexing. Used only when the publisher does not supply
# FAIRFETCH_SEARCH_ENGINES_ALLOWED; kept as a tuple so the shared default
# cannot be mutated in place.
_DEFAULT_SEARCH_ENGINES_ALLOWED: tuple[str, ...] = (
    "Googlebot",
    "Bingbot",
    "DuckDuckBot",
    "Slurp",  # Yahoo
    "Baiduspider",
    "YandexBot",
    "Sogou",
    "Exabot",
    "facebookexternalhit",
    "ia_archiver",  # Alexa
)
35+
2236

2337
class FairFetchConfig(BaseModel):
2438
"""Centralized runtime configuration — loaded entirely from env vars.
@@ -47,6 +61,15 @@ class FairFetchConfig(BaseModel):
4761
enable_usage_grants: bool = Field(default=True)
4862
enable_preferred_access: bool = Field(default=True)
4963

64+
search_engines_allowed: list[str] = Field(
65+
default_factory=lambda: list(_DEFAULT_SEARCH_ENGINES_ALLOWED),
66+
description="User-Agent substrings for search engines allowed free indexing",
67+
)
68+
search_engines_blocked: list[str] = Field(
69+
default_factory=list,
70+
description="User-Agent substrings for search engines never given free indexing",
71+
)
72+
5073
@classmethod
5174
def from_env(cls) -> FairFetchConfig:
5275
return cls(
@@ -72,6 +95,10 @@ def from_env(cls) -> FairFetchConfig:
7295
enable_preferred_access=(
7396
os.getenv("FAIRFETCH_PREFERRED_ACCESS", "true").lower() == "true"
7497
),
98+
search_engines_allowed=_parse_list_env(
99+
"FAIRFETCH_SEARCH_ENGINES_ALLOWED", _DEFAULT_SEARCH_ENGINES_ALLOWED
100+
),
101+
search_engines_blocked=_parse_list_env("FAIRFETCH_SEARCH_ENGINES_BLOCKED", ()),
75102
)
76103

77104

@@ -113,6 +140,17 @@ def _is_valid_price_string(s: str) -> bool:
113140
return isinstance(s, str) and len(s) <= 20 and s.isdigit()
114141

115142

143+
def _parse_list_env(
    key: str,
    default: tuple[str, ...] | list[str],
) -> list[str]:
    """Read a comma-separated environment variable as a list of strings.

    Args:
        key: Name of the environment variable to read.
        default: Values to use when the variable is unset or contains only
            whitespace.

    Returns:
        A new list. Each comma-separated item is stripped of surrounding
        whitespace and empty items are dropped. When the variable is unset
        or blank, a copy of ``default`` is returned instead.
    """
    value = os.getenv(key, "").strip()
    if value:
        pieces = (piece.strip() for piece in value.split(","))
        return [piece for piece in pieces if piece]
    # Copy so callers never share (or mutate) the default container.
    return list(default)
152+
153+
116154
def _parse_price_by_route(raw: str) -> dict[str, str]:
117155
"""Parse FAIRFETCH_PRICE_BY_ROUTE JSON (path prefix -> price). Only numeric prices kept."""
118156
if not raw or not raw.strip():

api/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ def get_requirement(url: str) -> PaymentRequirement:
9191
get_requirement=get_requirement,
9292
license_provider=license_provider,
9393
wallet_ledger=wallet_ledger,
94+
search_engines_allowed=config.search_engines_allowed,
95+
search_engines_blocked=config.search_engines_blocked,
9496
paid_path_prefixes=["/content/"],
9597
exempt_paths=[
9698
"/health",

core/search_engine.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""Search engine allow/block list for free indexing (usage category search_engine_indexing)."""
2+
3+
from __future__ import annotations
4+
5+
6+
def is_allowed_search_engine(
    user_agent: str,
    allowed: list[str],
    blocked: list[str],
) -> bool:
    """Return True if the User-Agent qualifies for free search-engine indexing.

    Matching is a case-insensitive substring test of each configured entry
    against the User-Agent. The blocklist is checked first and always wins
    over the allowlist.

    Args:
        user_agent: Raw ``User-Agent`` header value; may be empty or None.
        allowed: Substrings identifying crawlers that get free indexing.
        blocked: Substrings identifying crawlers that never get free indexing.

    Returns:
        True when the User-Agent contains at least one allowed substring and
        no blocked substring. False for an empty User-Agent.

    NOTE: this inspects only the header text, which the client controls; it
    does not verify that the request really comes from the named crawler.
    """
    ua_lower = (user_agent or "").strip().lower()
    if not ua_lower:
        return False

    def _matches(patterns: list[str]) -> bool:
        for raw in patterns or ():
            needle = raw.strip().lower()
            # Skip blank entries: "" is a substring of every string, so a
            # stray empty allow entry would grant free access to every
            # client, and an empty block entry would block every crawler.
            if needle and needle in ua_lower:
                return True
        return False

    if _matches(blocked):
        return False
    return _matches(allowed)

docs/AI_AGENT_GUIDE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ curl -s "http://localhost:8402/content/fetch?url=https://example.com"
180180
},
181181
"available_tiers": {
182182
"summary": { "price": "1000", "compliance_level": "standard" },
183+
"search_engine_indexing": { "price": "0", "compliance_level": "standard" },
183184
"rag": { "price": "2000", "compliance_level": "standard" },
184185
"research": { "price": "3000", "compliance_level": "elevated" },
185186
"training": { "price": "5000", "compliance_level": "strict" },

docs/CONCEPTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ what it costs. Here's what each field means:
246246
},
247247
"available_tiers": {
248248
"summary": { "price": "1000", "compliance_level": "standard" },
249+
"search_engine_indexing": { "price": "0", "compliance_level": "standard" },
249250
"rag": { "price": "2000", "compliance_level": "standard" },
250251
"research": { "price": "3000", "compliance_level": "elevated" },
251252
"training": { "price": "5000", "compliance_level": "strict" },

docs/PUBLISHER_GUIDE.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ LITELLM_MODEL=gpt-4o-mini
163163
| `FAIRFETCH_PUBLISHER_DOMAIN` | Your website’s domain (no `https://`) | `newstoday.com` |
164164
| `FAIRFETCH_CONTENT_PRICE` | Default price per request in smallest USDC unit (1000 ≈ $0.001). Used when no route rule matches. | `1000` |
165165
| `FAIRFETCH_PRICE_BY_ROUTE` | *(Optional)* JSON map of **path prefix → price** so different sections have different prices. Longest matching path wins. E.g. `{"": "1000", "/business": "2000", "/sports": "500"}` makes `/business` cost 2000, `/sports` 500, and everything else 1000. | (omit for one price site-wide) |
166+
| `FAIRFETCH_SEARCH_ENGINES_ALLOWED` | *(Optional)* Comma-separated User-Agent substrings for search engines allowed **free** indexing (e.g. `Googlebot,Bingbot,DuckDuckBot`). Default includes Google, Bing, DuckDuckGo, Yahoo, Baidu, Yandex, Sogou, Exabot, and others. | (omit to use default) |
167+
| `FAIRFETCH_SEARCH_ENGINES_BLOCKED` | *(Optional)* Comma-separated User-Agent substrings never given free indexing (overrides allowed list). | (omit for none) |
166168
| `FAIRFETCH_LICENSE_TYPE` | Legal terms you offer: `publisher-terms`, `commercial`, or `research-only` | `publisher-terms` |
167169
| `FAIRFETCH_SIGNING_KEY` | Leave empty at first; we’ll generate a key next. | (empty) |
168170
| `LITELLM_MODEL` | Model used to generate summaries (needs an API key in production) | `gpt-4o-mini` |
@@ -182,6 +184,9 @@ Here, `/business` (and `/business/...`) is 2000, `/sports` is 500, and all other
182184

183185
**Behavior and limits:** Prices must be numeric (digits only); non-numeric values are ignored. The content URL path is normalized (percent-encoding decoded, `.` and `..` segments collapsed) so route matching cannot be bypassed. At most 256 route entries are used; extra entries are ignored.
184186

187+
**Search engine indexing (free for allowed crawlers)**
188+
The usage category `search_engine_indexing` lets search engines (Google, Bing, DuckDuckGo, etc.) index your site for **free** when you allow them. Set `FAIRFETCH_SEARCH_ENGINES_ALLOWED` to a comma-separated list of User-Agent substrings (e.g. `Googlebot,Bingbot,DuckDuckBot`). The default allowlist includes Googlebot, Bingbot, DuckDuckBot, Slurp, Baiduspider, YandexBot, Sogou, Exabot, and a few others. Set `FAIRFETCH_SEARCH_ENGINES_BLOCKED` to block specific crawlers from free access (takes precedence over the allowlist). Crawlers not on the allowlist that request `usage=search_engine_indexing` pay the base price (1x).
189+
185190
### 3.2 Generate a signing key (recommended for production)
186191

187192
This key lets AI agents (and you) verify that content really came from your server.

interfaces/facilitator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class PaymentRequirement(BaseModel):
4444
description: str = Field(default="Content access fee")
4545
usage_category: str = Field(
4646
default=UsageCategory.SUMMARY,
47-
description="Intended usage: summary, rag, research, training, commercial",
47+
description="Intended usage: summary, search_engine_indexing, rag, research, training, commercial", # noqa: E501
4848
)
4949
extra: dict[str, str] = Field(default_factory=dict)
5050

interfaces/license_provider.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,15 @@ class UsageCategory(StrEnum):
3030
3131
Categories are ordered by escalating compliance requirements and pricing:
3232
- SUMMARY: Display a short summary or snippet (lowest tier)
33+
- SEARCH_ENGINE_INDEXING: Search engine crawling for indexing (free when allowed by publisher)
3334
- RAG: Retrieval-Augmented Generation / search grounding
3435
- RESEARCH: Academic or internal research use
3536
- TRAINING: Model fine-tuning or pre-training (highest tier)
3637
- COMMERCIAL: Redistribution or commercial derivative works
3738
"""
3839

3940
SUMMARY = "summary"
41+
SEARCH_ENGINE_INDEXING = "search_engine_indexing"
4042
RAG = "rag"
4143
RESEARCH = "research"
4244
TRAINING = "training"
@@ -58,6 +60,12 @@ class ComplianceLevel(StrEnum):
5860
"requires_audit_trail": False,
5961
"description": "Short summary or snippet display",
6062
},
63+
UsageCategory.SEARCH_ENGINE_INDEXING: {
64+
"compliance_level": ComplianceLevel.STANDARD,
65+
"price_multiplier": 0,
66+
"requires_audit_trail": False,
67+
"description": "Search engine crawling for indexing (free when publisher allows)",
68+
},
6169
UsageCategory.RAG: {
6270
"compliance_level": ComplianceLevel.STANDARD,
6371
"price_multiplier": 2,
@@ -110,7 +118,7 @@ class UsageGrant(BaseModel):
110118
license_type: str = Field(default="publisher-terms")
111119
usage_category: str = Field(
112120
default=UsageCategory.SUMMARY,
113-
description="Permitted use: summary, rag, research, training, commercial",
121+
description="Permitted use: summary, search_engine_indexing, rag, research, training, commercial", # noqa: E501
114122
)
115123
granted_to: str = Field(default="", description="Payer wallet or agent identifier")
116124
granted_at: str = Field(default_factory=lambda: datetime.now(UTC).isoformat())

payments/x402.py

Lines changed: 76 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from fastapi.responses import JSONResponse
2424
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
2525

26+
from core.search_engine import is_allowed_search_engine
2627
from interfaces.facilitator import BaseFacilitator, PaymentRequirement
2728
from interfaces.license_provider import BaseLicenseProvider, UsageCategory
2829
from payments.wallet_ledger import WalletLedger
@@ -52,6 +53,8 @@ def __init__(
5253
get_requirement: Callable[[str], PaymentRequirement],
5354
license_provider: BaseLicenseProvider | None = None,
5455
wallet_ledger: WalletLedger | None = None,
56+
search_engines_allowed: list[str] | None = None,
57+
search_engines_blocked: list[str] | None = None,
5558
paid_path_prefixes: list[str] | None = None,
5659
exempt_paths: list[str] | None = None,
5760
) -> None:
@@ -60,6 +63,8 @@ def __init__(
6063
self._get_requirement = get_requirement
6164
self._license_provider = license_provider
6265
self._wallet_ledger = wallet_ledger
66+
self._search_engines_allowed = search_engines_allowed or []
67+
self._search_engines_blocked = search_engines_blocked or []
6368
self._paid_prefixes = paid_path_prefixes or ["/content/"]
6469
self._exempt = set(exempt_paths or ["/health", "/openapi.json", "/docs", "/redoc"])
6570

@@ -80,6 +85,30 @@ def _resolve_usage_category(
8085
pass
8186
return default_requirement.usage_category
8287

88+
def _402_body_with_price(
    self,
    req_for_category: PaymentRequirement,
    usage_cat: str,
    effective_price: int,
    **extra: object,
) -> dict[str, object]:
    """Assemble a 402 response body, correcting the price for paid indexing.

    The body starts as ``req_for_category.to_402_body()`` merged with any
    ``extra`` keyword fields. When the usage category is
    ``search_engine_indexing`` and the effective price is non-zero, the
    advertised price is replaced with ``effective_price`` in both the
    ``accepts`` entry and the ``search_engine_indexing`` tier, where those
    entries are present as dicts.

    Args:
        req_for_category: Requirement whose ``to_402_body()`` supplies the base body.
        usage_cat: Usage category resolved for the request.
        effective_price: Price the caller will actually be charged.
        **extra: Additional top-level fields merged into the body.

    Returns:
        The response body dict.
    """
    body = {**req_for_category.to_402_body(), **extra}

    charged_indexing = (
        usage_cat == UsageCategory.SEARCH_ENGINE_INDEXING.value and effective_price != 0
    )
    if not charged_indexing:
        return body

    price_text = str(effective_price)

    accepts = body.get("accepts")
    if isinstance(accepts, dict):
        body["accepts"] = {**accepts, "price": price_text}

    current_tiers = body.get("available_tiers")
    if isinstance(current_tiers, dict):
        # Work on copies so the dicts produced by to_402_body() are not mutated.
        tiers = dict(current_tiers)
        indexing_tier = tiers.get("search_engine_indexing")
        if isinstance(indexing_tier, dict):
            tiers["search_engine_indexing"] = {**indexing_tier, "price": price_text}
        body["available_tiers"] = tiers

    return body
111+
83112
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
84113
if not self._is_paid_route(request.url.path):
85114
return await call_next(request)
@@ -91,6 +120,30 @@ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -
91120
request.state.payment_requirement = req_for_category
92121
effective_price = int(req_for_category.effective_price())
93122

123+
# --- Search engine indexing: free for allowed crawlers, 1x base for others ---
124+
if usage_cat == UsageCategory.SEARCH_ENGINE_INDEXING.value:
125+
user_agent = request.headers.get("user-agent", "") or ""
126+
if is_allowed_search_engine(
127+
user_agent,
128+
self._search_engines_allowed,
129+
self._search_engines_blocked,
130+
):
131+
effective_price = 0
132+
else:
133+
effective_price = int(requirement.price)
134+
135+
# --- Free pass: no payment required (e.g. allowed search engine) ---
136+
if effective_price == 0:
137+
payer = request.headers.get("user-agent", "search_engine") or "search_engine"
138+
request.state.payment_result = None
139+
request.state.payment_payer = payer
140+
request.state.usage_category = usage_cat
141+
response = await call_next(request)
142+
response.headers["X-FairFetch-Payment-Method"] = "free"
143+
response.headers[RECEIPT_HEADER] = "free"
144+
await self._issue_grant(request, response, usage_cat, payer)
145+
return response
146+
94147
wallet_token = request.headers.get(WALLET_TOKEN_HEADER)
95148
payment_header = request.headers.get(PAYMENT_HEADER)
96149

@@ -140,18 +193,20 @@ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -
140193
)
141194
return JSONResponse(
142195
status_code=402,
143-
content={
144-
**req_for_category.to_402_body(),
145-
"wallet_error": "insufficient_balance",
146-
"wallet_balance": balance,
147-
"amount_required": effective_price,
148-
"shortfall": effective_price - balance,
149-
},
196+
content=self._402_body_with_price(
197+
req_for_category,
198+
usage_cat,
199+
effective_price,
200+
wallet_error="insufficient_balance",
201+
wallet_balance=balance,
202+
amount_required=effective_price,
203+
shortfall=effective_price - balance,
204+
),
150205
)
151206

152207
# --- Standard path: x402 one-time payment ---
153208
if not payment_header:
154-
body = req_for_category.to_402_body()
209+
body = self._402_body_with_price(req_for_category, usage_cat, effective_price)
155210
body["hint"] = (
156211
"For faster access without 402 round-trips, use a pre-funded wallet. "
157212
"Send X-WALLET-TOKEN header instead of X-PAYMENT. "
@@ -164,16 +219,24 @@ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -
164219
headers={"X-Payment-Required": "true"},
165220
)
166221

167-
result = await self._facilitator.settle(payment_header, req_for_category)
222+
# When search_engine_indexing but not free, settle at base price (1x)
223+
req_for_settlement = req_for_category
224+
if usage_cat == UsageCategory.SEARCH_ENGINE_INDEXING.value and effective_price != 0:
225+
req_for_settlement = requirement.model_copy(
226+
update={"usage_category": UsageCategory.SUMMARY.value}
227+
)
228+
result = await self._facilitator.settle(payment_header, req_for_settlement)
168229

169230
if not result.valid:
170231
logger.warning("Payment rejected for %s: %s", request.url.path, result.error)
171232
return JSONResponse(
172233
status_code=402,
173-
content={
174-
**req_for_category.to_402_body(),
175-
"verification_error": result.error,
176-
},
234+
content=self._402_body_with_price(
235+
req_for_category,
236+
usage_cat,
237+
effective_price,
238+
verification_error=result.error,
239+
),
177240
)
178241

179242
logger.info(

0 commit comments

Comments
 (0)