|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Build Context7 filesystem corpus as one JSON tree file.""" |
| 3 | + |
| 4 | +from __future__ import annotations |
| 5 | + |
| 6 | +import json |
| 7 | +import os |
| 8 | +import re |
| 9 | +import urllib.request |
| 10 | +from datetime import datetime, timezone |
| 11 | +from pathlib import Path |
| 12 | +from urllib.parse import urlparse |
| 13 | + |
# Output directory for the generated corpus, located next to this script.
BASE_DIR = Path(__file__).resolve().parent / "filesystem" / "context7"
# Index document that is scanned for links to the individual doc pages.
LLMS_TXT_URL = "https://context7.com/docs/llms.txt"
# Name of the single JSON file written inside BASE_DIR.
TREE_FILENAME = "tree.json"
# 0 means "all docs found in llms.txt".
# NOTE(review): a non-integer CONTEXT7_MAX_DOCS value raises ValueError at
# import time, because int() is applied directly to the environment string.
MAX_DOCS = int(os.environ.get("CONTEXT7_MAX_DOCS", "0"))
# Timeout (seconds) passed to urlopen for every HTTP request.
TIMEOUT = 15
| 20 | + |
| 21 | + |
def fetch_text(url: str) -> str:
    """Download *url* and return the response body as text.

    The request is sent with a fixed ``User-Agent`` header and the
    module-level ``TIMEOUT``. The body is decoded as UTF-8; byte sequences
    that are not valid UTF-8 are replaced rather than raising.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "ConDB-context7-builder/1.0")
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        raw_body = response.read()
    return raw_body.decode("utf-8", errors="replace")
| 28 | + |
| 29 | + |
def parse_doc_urls(llms_txt: str) -> list[str]:
    """Return the unique Context7 doc URLs linked from *llms_txt*.

    Only parenthesised targets starting with ``https://context7.com/docs/``
    are collected. Duplicates are dropped and the order of first appearance
    is kept.
    """
    link_targets = re.findall(r"\((https://context7\.com/docs/[^)]+)\)", llms_txt)
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(link_targets))
| 40 | + |
| 41 | + |
def logical_path_for_url(url: str) -> str:
    """Map a doc URL to the relative path used as its key in the tree.

    The URL path is taken without its leading slashes. An empty path or a
    path ending in ``/`` gets ``index.md`` appended; a path whose final
    component contains no ``.`` gets ``.md`` appended.
    """
    relative = urlparse(url).path.lstrip("/")
    if not relative or relative.endswith("/"):
        # Root and directory-style URLs both resolve to an index document.
        relative = f"{relative}index.md"
    has_extension = "." in Path(relative).name
    return relative if has_extension else f"{relative}.md"
| 51 | + |
| 52 | + |
def write_tree_file(out_path: Path, docs: list[dict[str, str]]) -> None:
    """Write *docs* and run metadata to *out_path* as one JSON document.

    The payload records the current UTC timestamp, the index URL the docs
    were discovered from, the number of docs, and the docs themselves. The
    file is written as UTF-8 with non-ASCII characters left unescaped.
    """
    generated_at = datetime.now(timezone.utc).isoformat()
    payload = {
        "generated_at": generated_at,
        "source_index": LLMS_TXT_URL,
        "docs_count": len(docs),
        "docs": docs,
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    out_path.write_text(serialized, encoding="utf-8")
| 61 | + |
| 62 | + |
def main() -> None:
    """Fetch every markdown doc listed in the index and write the tree file.

    Steps: create ``BASE_DIR``, download the index, keep the linked URLs
    that end in ``.md`` (truncated to ``MAX_DOCS`` when that is positive),
    fetch each one, and write the successfully fetched docs to
    ``BASE_DIR / TREE_FILENAME``. Per-URL failures are collected and printed
    in a summary instead of aborting the run.

    Raises:
        RuntimeError: if the index contains no ``.md`` URLs, or if no doc
            could be fetched at all.
    """
    BASE_DIR.mkdir(parents=True, exist_ok=True)

    index_text = fetch_text(LLMS_TXT_URL)
    markdown_urls = [
        candidate
        for candidate in parse_doc_urls(index_text)
        if candidate.endswith(".md")
    ]
    if not markdown_urls:
        raise RuntimeError(f"no markdown doc urls found from {LLMS_TXT_URL}")
    selected_urls = markdown_urls[:MAX_DOCS] if MAX_DOCS > 0 else markdown_urls

    fetched: list[dict[str, str]] = []
    failures: list[dict[str, str]] = []

    for doc_url in selected_urls:
        # Best-effort: one failing document must not abort the whole build,
        # so any exception is recorded and the loop moves on.
        try:
            body = fetch_text(doc_url)
            logical_path = logical_path_for_url(doc_url)
            fetched.append({"url": doc_url, "path": logical_path, "content": body})
            print(f"+ {doc_url} -> {TREE_FILENAME}::{logical_path}")
        except Exception as exc:  # noqa: BLE001
            failures.append({"url": doc_url, "error": str(exc)})
            print(f"! failed: {doc_url} ({exc})")

    if not fetched:
        raise RuntimeError(f"failed to fetch any context7 docs; errors={len(failures)}")

    tree_path = BASE_DIR / TREE_FILENAME
    write_tree_file(tree_path, fetched)

    print(f"done: docs={len(fetched)} errors={len(failures)} tree={tree_path}")
    if failures:
        print("error_summary:")
        for failure in failures:
            print(f"- {failure['url']} :: {failure['error']}")
| 101 | + |
| 102 | + |
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments