diff --git a/AGENTS.md b/AGENTS.md index eb3b3d07..7a3d3368 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,7 +41,8 @@ https://deadends.dev/country/{cc}/ for each country's entries. MCP tools: lookup_error, get_error_detail, search_errors, batch_lookup, get_error_chain, list_error_domains, list_errors_by_domain, -get_domain_stats, report_outcome. +get_domain_stats, list_errors_by_country, get_country_summary, +report_outcome. MCP config: ```json diff --git a/CLAUDE.md b/CLAUDE.md index badb3bed..f012782f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -62,7 +62,7 @@ generator/ validate.py # Validation: schema, business rules, HTML, cross-refs, staleness mcp/ - server.py # MCP server (JSON-RPC over stdio) - 8 tools for AI agents + server.py # MCP server (JSON-RPC over stdio) - 11 tools for AI agents api/ mcp.py # Vercel serverless MCP endpoint @@ -209,7 +209,8 @@ Optional overrides for `generator/build_site.py` (defaults work out of the box): ## MCP Server -The MCP server exposes 8 read-only tools over stdio (JSON-RPC): +The MCP server exposes 11 tools over stdio (JSON-RPC). All are read-only +except `report_outcome`, which appends feedback to `data/outcomes/`: 1. `lookup_error` - Match error message against regex patterns 2. `get_error_detail` - Full canon by ID @@ -219,6 +220,9 @@ The MCP server exposes 8 read-only tools over stdio (JSON-RPC): 6. `batch_lookup` - Look up multiple errors at once (max 10) 7. `get_domain_stats` - Domain statistics and confidence levels 8. `get_error_chain` - Traverse error transition graph +9. `list_errors_by_country` - Country-scoped dead ends (ISO alpha-2 code) +10. `get_country_summary` - Country-level coverage summary +11. 
# Canonical MCP tool list - the single source of truth for every AI
# discovery surface this builder emits (llms.txt, ai-plugin.json,
# agent.json, mcp.json, server-card.json, CLAUDE.md, .cursorrules,
# homepage ai-summary). Must match mcp/server.py TOOLS, including order
# (the check is a plain list == comparison); enforced by
# tests/test_build.py::TestMcpToolNames::test_mcp_tool_names_match_server.
MCP_TOOL_NAMES = [
    "lookup_error",
    "get_error_detail",
    "list_error_domains",
    "search_errors",
    "list_errors_by_domain",
    "batch_lookup",
    "get_domain_stats",
    "list_errors_by_country",
    "get_country_summary",
    "get_error_chain",
    "report_outcome",
]
"", f"- [Complete Database]({BASE_URL}/llms-full.txt): " "All errors in plaintext (load into context window)", + f"- Per-domain slices: `{BASE_URL}/llms-full-{{domain}}.txt` " + "(bounded size - load only the domain you need, " + "e.g. `llms-full-python.txt`, `llms-full-docker.txt`)", "", "## How to Use", "", @@ -3876,7 +3897,8 @@ def build_well_known(canons: list[dict]) -> None: "(leads_to, preceded_by, frequently_confused_with). " "Alt: GET /llms.txt for text summary, " "GET /api/v1/errors.ndjson for streaming, " - "or use MCP server (8 tools). No auth required." + f"or use MCP server ({len(MCP_TOOL_NAMES)} tools). " + "No auth required." ), "auth": {"type": "none"}, "api": { @@ -4019,6 +4041,43 @@ def build_well_known(canons: list[dict]) -> None: "inputModes": ["text"], "outputModes": ["text"], }, + { + "id": "list-errors-by-country", + "name": "List Errors By Country", + "description": ( + "List country-scoped dead ends by ISO alpha-2 code: " + "visa, banking, legal, cultural, medical, food-safety, " + "emergency - jurisdiction knowledge generic LLM " + "training data gets wrong." + ), + "tags": ["country", "visa", "legal", "travel", "jurisdiction"], + "examples": ["kr", "jp", "us", "de", "th"], + "inputModes": ["text"], + "outputModes": ["text"], + }, + { + "id": "get-country-summary", + "name": "Country Coverage Summary", + "description": ( + "Country-level summary: total entries, domain " + "breakdown, average fix rate, latest update. Use to " + "assess coverage before relying on country data." + ), + "tags": ["country", "stats", "coverage"], + "inputModes": ["text"], + "outputModes": ["text"], + }, + { + "id": "report-outcome", + "name": "Report Workaround Outcome", + "description": ( + "Report whether a workaround worked or failed. " + "Feedback improves fix_success_rate for future agents." 
+ ), + "tags": ["feedback", "outcomes", "write"], + "inputModes": ["text"], + "outputModes": ["text"], + }, ], "authentication": {"schemes": ["none"]}, "documentationUrl": f"{BASE_URL}/api/v1/openapi.json", @@ -4056,12 +4115,7 @@ def build_well_known(canons: list[dict]) -> None: "args": ["-m", "mcp.server"], "transport": "stdio", }, - "tools": [ - "lookup_error", "get_error_detail", "search_errors", - "batch_lookup", "get_error_chain", "list_error_domains", - "list_errors_by_domain", "get_domain_stats", - "list_errors_by_country", "get_country_summary", - ], + "tools": list(MCP_TOOL_NAMES), "domains": domains, "homepage": BASE_URL, "repository": "https://github.com/dbwls99706/deadends.dev", @@ -4083,7 +4137,7 @@ def build_well_known(canons: list[dict]) -> None: f"workarounds, error chains. {len(canons)} error entries across " f"{len(domains)} domains." ), - "version": "1.5.0", + "version": "1.6.0", "homepage": BASE_URL, "repository": "https://github.com/dbwls99706/deadends.dev", "license": "MIT", @@ -4158,6 +4212,13 @@ def build_well_known(canons: list[dict]) -> None: "average fix rate, latest update." ), }, + { + "name": "report_outcome", + "description": ( + "Report whether a workaround worked or failed - feedback " + "improves fix_success_rate for future agents." 
+ ), + }, ], "domains": domains, } @@ -4441,10 +4502,13 @@ def build_ai_config_files(canons: list[dict]) -> None: | `/api/v1/index.json` | Complete error index | | `/api/v1/stats.json` | Dataset quality metrics | | `/api/v1/errors.ndjson` | Streaming format | +| `/api/v1/countries.json` | Country index (country-scoped dead ends) | +| `/api/v1/country/{{cc}}.json` | Per-country aggregate (ISO alpha-2) | | `/llms.txt` | LLM-optimized summary | | `/llms-full.txt` | Complete plaintext dump | +| `/llms-full-{{domain}}.txt` | Per-domain plaintext slice | -## MCP Server (8 tools) +## MCP Server ({len(MCP_TOOL_NAMES)} tools) ```json {{ @@ -4458,9 +4522,7 @@ def build_ai_config_files(canons: list[dict]) -> None: }} ``` -Tools: `lookup_error`, `get_error_detail`, `search_errors`, -`batch_lookup`, `get_error_chain`, `list_error_domains`, -`list_errors_by_domain`, `get_domain_stats` +Tools: {", ".join(f"`{t}`" for t in MCP_TOOL_NAMES)} """ (SITE_DIR / "CLAUDE.md").write_text(claude_md, encoding="utf-8") @@ -4488,9 +4550,11 @@ def build_ai_config_files(canons: list[dict]) -> None: → dead_ends: "pip install X" fails 85% when the issue is a venv mismatch → workaround: "python -m pip install X" in the correct venv works 90% -MCP server available with 8 tools: lookup_error, get_error_detail, -search_errors, batch_lookup, get_error_chain, list_error_domains, -list_errors_by_domain, get_domain_stats +Country-specific dead ends (visa, banking, legal, cultural, medical, +emergency) are also covered: {BASE_URL}/api/v1/country/{{cc}}.json +(ISO alpha-2 code, e.g. kr, jp, us). + +MCP server available with {len(MCP_TOOL_NAMES)} tools: {", ".join(MCP_TOOL_NAMES)} Full API docs: {BASE_URL}/api/v1/openapi.json """ @@ -4525,9 +4589,18 @@ def build_ai_config_files(canons: list[dict]) -> None: | Full error data | `/api/v1/{{id}}.json` | | All errors | `/api/v1/index.json` | | By domain | `/api/v1/stats.json` | +| By country (visa/legal/etc.) 
def _get_compiled_regexes(canons: list[dict]) -> list["re.Pattern | None"]:
    """Return compiled regexes index-aligned with ``canons``, compiling lazily.

    The compiled list is cached in the module globals ``_REGEX_CACHE`` /
    ``_REGEX_CACHE_SOURCE`` and reused while the same list object is passed
    in again with an unchanged length. Passing a different list object, or
    the same list after items were appended/removed in place, rebuilds the
    cache so callers that ``zip(canons, patterns)`` never silently drop or
    misalign entries.

    Args:
        canons: Canon dicts; each is expected to carry ``canon["error"]["regex"]``.

    Returns:
        A list the same length as ``canons``. Entries are None for canons
        with missing or invalid regexes (warned once on stderr at compile
        time instead of on every lookup).
    """
    global _REGEX_CACHE, _REGEX_CACHE_SOURCE
    # Identity alone would miss in-place mutation of the cached list, so the
    # length is compared too. In-place replacement of an element that keeps
    # the length unchanged is still not detected.
    if _REGEX_CACHE_SOURCE is canons and len(_REGEX_CACHE) == len(canons):
        return _REGEX_CACHE

    compiled: list[re.Pattern | None] = []
    for canon in canons:
        try:
            pattern = re.compile(canon["error"]["regex"], re.IGNORECASE)
        except (re.error, KeyError, TypeError) as e:
            # A TypeError can mean the canon itself is not a dict; do not let
            # the warning raise AttributeError while reporting it.
            canon_id = canon.get("id", "?") if isinstance(canon, dict) else "?"
            print(
                f"[lookup] skipping invalid regex in canon "
                f"{canon_id}: {e}",
                file=sys.stderr,
            )
            pattern = None
        compiled.append(pattern)
    _REGEX_CACHE = compiled
    _REGEX_CACHE_SOURCE = canons
    return compiled

deadends.dev

class TestMcpToolNames:
    """Guard against drift between the builder's advertised MCP tool list
    and the tools the server actually registers.

    MCP_TOOL_NAMES feeds the AI discovery files the builder emits, so it
    has to equal the server registry's tool names exactly (same names,
    same order)."""

    def test_mcp_tool_names_match_server(self):
        from generator.build_site import MCP_TOOL_NAMES
        from mcp.server import TOOLS

        registered_names = [tool["name"] for tool in TOOLS]
        drift_message = (
            "generator/build_site.py MCP_TOOL_NAMES is out of sync with "
            "mcp/server.py TOOLS - update MCP_TOOL_NAMES so AI discovery "
            "files advertise the real tool set."
        )
        assert MCP_TOOL_NAMES == registered_names, drift_message
test_invalid_regex_skipped(self): + from generator import lookup + lookup._CANONS_CACHE = [ + _make_canon("python/bad/env1", "unmatchable-xyz", r"[invalid("), + _make_canon("python/good/env1", "GoodError: ok", r"GoodError: .+"), + ] + try: + results = lookup.lookup_all("GoodError: ok") + assert [r["id"] for r in results] == ["python/good/env1"] + # Invalid regex cached as None, not recompiled per call + assert lookup._REGEX_CACHE[0] is None + assert lookup._REGEX_CACHE[1] is not None + finally: + _reset(lookup)