|
1 | 1 | #!/usr/bin/env python3 |
2 | 2 | """ |
3 | | -Generate llms-full.txt from documentation sources. |
| 3 | +Generate llms.txt and llms-full.txt from documentation sources. |
4 | 4 |
|
5 | | -This script concatenates all markdown documentation into a single file |
6 | | -optimized for LLM consumption. |
| 5 | +- llms.txt: Index with links derived from mkdocs.yaml nav |
| 6 | +- llms-full.txt: Complete documentation concatenated for LLM consumption |
7 | 7 |
|
8 | | -The generated file is NOT committed to git - it's auto-generated during |
9 | | -the build process with current version metadata. |
| 8 | +Both files are auto-generated during the build process. |
10 | 9 | """ |
11 | 10 |
|
12 | 11 | import json |
| 12 | +import re |
13 | 13 | import subprocess |
14 | 14 | from datetime import datetime, timezone |
15 | 15 | from pathlib import Path |
16 | 16 |
|
| 17 | +import yaml |
| 18 | + |
# Project layout. This script lives one directory below the project root.
PROJECT_DIR = Path(__file__).parent.parent
# Documentation root
DOCS_DIR = PROJECT_DIR / "src"
# MkDocs configuration; its nav drives the llms.txt index.
MKDOCS_FILE = PROJECT_DIR / "mkdocs.yaml"
# Generated outputs, written into the documentation root.
OUTPUT_FILE = DOCS_DIR / "llms-full.txt"
OUTPUT_INDEX = DOCS_DIR / "llms.txt"
20 | 25 |
|
21 | 26 | # Sections in order of importance |
22 | 27 | SECTIONS = [ |
@@ -115,6 +120,114 @@ def get_doc_files(directory: Path) -> list[Path]: |
115 | 120 | return sorted(files) |
116 | 121 |
|
117 | 122 |
|
def source_path_to_url(path: str) -> str:
    """Convert a source file path to a deployed MkDocs URL.

    MkDocs with use_directory_urls=true (default) serves:
        about/whats-new-2.md -> /about/whats-new-2/
        tutorials/basics/01-first-pipeline.ipynb -> /tutorials/basics/01-first-pipeline/
        index.md -> /
        section/index.md -> /section/

    Args:
        path: Page path as written in the mkdocs nav, relative to the docs
            directory. A leading "./" or "/" is tolerated.

    Returns:
        A root-relative URL that starts and ends with exactly one "/".
    """
    # Drop a leading "./" or "/" so the result can never start with "//",
    # which browsers interpret as a protocol-relative URL to another host.
    url = path.removeprefix("./").lstrip("/")
    # Strip file extension
    url = re.sub(r"\.(md|ipynb)$", "", url)
    # index pages -> parent directory (matches "index" and ".../index",
    # but not names that merely end in "index" such as "reindex")
    url = re.sub(r"(^|/)index$", "", url)
    # Trim remaining slashes (e.g. "api/") so exactly one is added per side;
    # an empty remainder is the site root.
    url = url.strip("/")
    return f"/{url}/" if url else "/"
| 142 | + |
| 143 | + |
def extract_nav_entries(nav, section_path=""):
    """Recursively extract (title, url) pairs from mkdocs nav structure.

    Args:
        nav: A nav node: a list of nodes, a ``{title: path-or-children}``
            dict, or a bare path string. Any other value yields no entries.
        section_path: Title of the enclosing section. It is passed down the
            recursion and kept for backward compatibility; it does not
            influence the returned entries.

    Returns:
        A list of ``(title, url)`` tuples in nav order. ``title`` is ``None``
        for bare path entries that have no title in the nav. External
        http(s) links are omitted.
    """
    # Match real URL schemes only: a plain startswith("http") would also
    # drop local pages such as "http-api.md".
    external_prefixes = ("http://", "https://")

    if isinstance(nav, str):
        # Bare path without title (e.g., index pages)
        if nav.startswith(external_prefixes):
            return []
        return [(None, source_path_to_url(nav))]

    entries = []
    if isinstance(nav, list):
        for item in nav:
            entries.extend(extract_nav_entries(item, section_path))
    elif isinstance(nav, dict):
        for title, value in nav.items():
            if isinstance(value, str):
                # Leaf node: "Title: path.md"; external links are skipped
                if not value.startswith(external_prefixes):
                    entries.append((title, source_path_to_url(value)))
            elif isinstance(value, list):
                # Section with children
                entries.extend(extract_nav_entries(value, title))
    return entries
| 167 | + |
| 168 | + |
def load_mkdocs_nav():
    """Load the mkdocs.yaml configuration so callers can read its nav.

    mkdocs.yaml contains !!python/name tags that standard YAML loaders
    can't resolve without the material theme installed. We use a private
    loader subclass whose constructor ignores these tags.

    Returns:
        The parsed configuration (the whole document, not only ``nav``).
        An empty file yields an empty dict.

    Raises:
        OSError: If MKDOCS_FILE cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """

    class _IgnorePythonTagsLoader(yaml.SafeLoader):
        """SafeLoader variant that maps python/* tags to None."""

    # Register on the subclass, not on yaml.SafeLoader itself: registering on
    # the shared class would change every later yaml.safe_load() call in the
    # process.
    _IgnorePythonTagsLoader.add_multi_constructor(
        "tag:yaml.org,2002:python/",
        lambda loader, suffix, node: None,
    )
    # The loader derives from SafeLoader, so no arbitrary objects are built.
    with open(MKDOCS_FILE, "r", encoding="utf-8") as f:
        return yaml.load(f, Loader=_IgnorePythonTagsLoader) or {}
| 184 | + |
| 185 | + |
def generate_llms_txt():
    """Generate llms.txt index from mkdocs.yaml nav.

    Writes OUTPUT_INDEX with a fixed header followed by one block per
    top-level nav item: a single link for a top-level page, or a
    ``## Section`` heading with one link per titled page for a section.
    The home page (``index.md``), external http(s) links, and untitled
    entries are left out. Prints the output path and its size when done.
    """
    # Tolerate an empty config as well as a "nav:" key that is present but null.
    mkdocs_config = load_mkdocs_nav() or {}
    nav = mkdocs_config.get("nav") or []

    # Match real URL schemes only, so local pages whose file name merely
    # starts with "http" (e.g. "http-api.md") are not dropped.
    external_prefixes = ("http://", "https://")

    lines = [
        "# DataJoint Documentation",
        "",
        "> DataJoint is a Python framework for building scientific data pipelines "
        "with automated computation, integrity constraints, and seamless integration "
        "of relational databases with object storage.",
        "",
        "> For the complete documentation in a single file, see [/llms-full.txt](/llms-full.txt)",
        "",
    ]

    # Each top-level nav item is a dict like {"Concepts": [...]}
    for nav_item in nav:
        if not isinstance(nav_item, dict):
            continue
        for section_name, section_content in nav_item.items():
            if isinstance(section_content, str):
                # Skip "Home: index.md" but keep other top-level leaves
                if section_content == "index.md" or section_content.startswith(external_prefixes):
                    continue
                url = source_path_to_url(section_content)
                lines.append(f"- [{section_name}]({url})")
                lines.append("")
            elif isinstance(section_content, list):
                lines.append(f"## {section_name}")
                lines.append("")
                for title, url in extract_nav_entries(section_content):
                    # Untitled (bare path) entries have no link text; omit them.
                    if title:
                        lines.append(f"- [{title}]({url})")
                lines.append("")

    content = "\n".join(lines) + "\n"
    with open(OUTPUT_INDEX, "w", encoding="utf-8") as f:
        f.write(content)

    # Report the real on-disk size; len(content) would count characters.
    print(f"Generated {OUTPUT_INDEX} ({OUTPUT_INDEX.stat().st_size:,} bytes)")
| 229 | + |
| 230 | + |
118 | 231 | def generate_llms_full(): |
119 | 232 | """Generate the llms-full.txt file.""" |
120 | 233 | # Get current git info for version metadata |
@@ -153,4 +266,5 @@ def generate_llms_full(): |
153 | 266 |
|
154 | 267 |
|
if __name__ == "__main__":
    # Script entry point: build the llms.txt index first, then the
    # concatenated llms-full.txt.
    generate_llms_txt()
    generate_llms_full()
0 commit comments