Skip to content

Commit c7eb390

Browse files
Merge pull request #160 from datajoint/fix/generate-llms-txt
fix: generate llms.txt from mkdocs nav to fix dead links
2 parents b675478 + 75f5d6f commit c7eb390

File tree

4 files changed

+123
-94
lines changed

4 files changed

+123
-94
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ temp*
77
.secrets/
88

99
# Generated documentation files
10+
src/llms.txt
1011
src/llms-full.txt
1112
site/llms-full.txt
1213
dj_local_conf.json

docker-compose.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,13 @@ services:
8484
# LIVE mode: install datajoint and notebook dependencies for interactive development
8585
pip install -e /datajoint-python
8686
pip install scikit-image pooch
87+
python scripts/gen_llms_full.py
8788
mkdocs serve --config-file ./mkdocs.yaml -a 0.0.0.0:8000
8889
elif echo "$${MODE}" | grep -i build &>/dev/null; then
8990
# BUILD mode: build static site from pre-executed notebooks
9091
# Install datajoint-python for mkdocstrings (needs to import for API docs)
9192
pip install -e /datajoint-python
92-
# Generate llms-full.txt with current git info
93+
# Generate llms.txt and llms-full.txt
9394
python scripts/gen_llms_full.py
9495
mkdocs build --config-file ./mkdocs.yaml
9596
elif echo "$${MODE}" | grep -i execute_pg &>/dev/null; then

scripts/gen_llms_full.py

Lines changed: 120 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,27 @@
11
#!/usr/bin/env python3
22
"""
3-
Generate llms-full.txt from documentation sources.
3+
Generate llms.txt and llms-full.txt from documentation sources.
44
5-
This script concatenates all markdown documentation into a single file
6-
optimized for LLM consumption.
5+
- llms.txt: Index with links derived from mkdocs.yaml nav
6+
- llms-full.txt: Complete documentation concatenated for LLM consumption
77
8-
The generated file is NOT committed to git - it's auto-generated during
9-
the build process with current version metadata.
8+
Both files are auto-generated during the build process.
109
"""
1110

1211
import json
12+
import re
1313
import subprocess
1414
from datetime import datetime, timezone
1515
from pathlib import Path
1616

17+
import yaml
18+
1719
# Documentation root
18-
DOCS_DIR = Path(__file__).parent.parent / "src"
20+
PROJECT_DIR = Path(__file__).parent.parent
21+
DOCS_DIR = PROJECT_DIR / "src"
22+
MKDOCS_FILE = PROJECT_DIR / "mkdocs.yaml"
1923
OUTPUT_FILE = DOCS_DIR / "llms-full.txt"
24+
OUTPUT_INDEX = DOCS_DIR / "llms.txt"
2025

2126
# Sections in order of importance
2227
SECTIONS = [
@@ -115,6 +120,114 @@ def get_doc_files(directory: Path) -> list[Path]:
115120
return sorted(files)
116121

117122

123+
def source_path_to_url(path: str) -> str:
    """Convert a source file path to a deployed MkDocs URL.

    MkDocs with use_directory_urls=true (default) serves:
        about/whats-new-2.md -> /about/whats-new-2/
        tutorials/basics/01-first-pipeline.ipynb -> /tutorials/basics/01-first-pipeline/
        index.md -> /
        section/index.md -> /section/

    Args:
        path: Path of a documentation source file as written in the mkdocs
            nav (relative, forward slashes, ``.md`` or ``.ipynb`` suffix).

    Returns:
        The site-absolute URL, always starting and ending with ``/``.
    """
    # Strip file extension
    url = re.sub(r"\.(md|ipynb)$", "", path)
    # The root index page must be recognised BEFORE the "/index" suffix is
    # removed; otherwise "index/index.md" collapses to "index" and would be
    # mistaken for the site root instead of mapping to "/index/".
    if url == "index":
        return "/"
    # index pages -> parent directory
    url = re.sub(r"/index$", "", url)
    # Nothing left (e.g. empty input): fall back to the site root rather
    # than emitting "//".
    if not url:
        return "/"
    # Avoid double slash for paths like "api/"
    if url.endswith("/"):
        return f"/{url}"
    return f"/{url}/"
142+
143+
144+
def extract_nav_entries(nav, section_path=""):
    """Recursively extract (title, url) pairs from mkdocs nav structure.

    Args:
        nav: A nav fragment as parsed from YAML: a list of items, a
            ``{title: path}`` / ``{title: [children]}`` mapping, or a bare
            path string. Any other value yields no entries.
        section_path: Title of the enclosing section. It is threaded through
            the recursion but does not affect the result.

    Returns:
        A list of ``(title, url)`` tuples in nav order. ``title`` is ``None``
        for bare paths that carry no title. External links are omitted.
    """
    # Match real URL schemes only. A plain "http" prefix test would also
    # drop local pages whose file name merely starts with "http"
    # (e.g. "http-api.md").
    external_prefixes = ("http://", "https://")

    entries = []
    if isinstance(nav, str):
        # Bare path without title (e.g., index pages)
        if not nav.startswith(external_prefixes):
            entries.append((None, source_path_to_url(nav)))
    elif isinstance(nav, list):
        for item in nav:
            entries.extend(extract_nav_entries(item, section_path))
    elif isinstance(nav, dict):
        for title, target in nav.items():
            if isinstance(target, str):
                # Leaf node: "Title: path.md" or external URL
                if target.startswith(external_prefixes):
                    continue  # skip external links
                entries.append((title, source_path_to_url(target)))
            elif isinstance(target, list):
                # Section with children
                entries.extend(extract_nav_entries(target, title))
    return entries
167+
168+
169+
def load_mkdocs_nav():
    """Load mkdocs.yaml and return the parsed configuration mapping.

    Despite the name, the whole configuration is returned; callers pick the
    ``nav`` key out of it.

    mkdocs.yaml contains !!python/name tags that standard YAML loaders
    can't resolve without the material theme installed. We add a custom
    constructor that ignores these tags.

    Returns:
        The parsed configuration, or an empty dict if the file is empty.
    """

    # Register the tag handler on a private subclass. Calling
    # add_multi_constructor on yaml.SafeLoader itself would change the
    # shared class, so every later safe load in this process would silently
    # accept python/* tags as well.
    class _PythonTagIgnoringLoader(yaml.SafeLoader):
        """SafeLoader that resolves any ``python/*`` tag to ``None``."""

    # Handle !!python/name and !!python/object tags by returning None
    _PythonTagIgnoringLoader.add_multi_constructor(
        "tag:yaml.org,2002:python/",
        lambda loader, suffix, node: None,
    )

    # Explicit encoding: do not depend on the platform's locale default.
    with open(MKDOCS_FILE, "r", encoding="utf-8") as f:
        config = yaml.load(f, Loader=_PythonTagIgnoringLoader)
    # An empty YAML document parses to None; hand callers a mapping instead.
    return config if config is not None else {}
184+
185+
186+
def generate_llms_txt():
    """Generate llms.txt index from mkdocs.yaml nav.

    Top-level nav leaves become single links. Top-level sections become a
    ``## Section`` heading followed by one link per titled page found
    anywhere below that section. External links, the home page
    (``index.md``), and untitled bare paths are left out. The result is
    written to ``OUTPUT_INDEX`` as UTF-8.
    """
    # Match real URL schemes only, so a local page such as "http-api.md"
    # is not mistaken for an external link.
    external_prefixes = ("http://", "https://")

    # Tolerate an empty config file and a missing or null "nav" key.
    mkdocs_config = load_mkdocs_nav() or {}
    nav = mkdocs_config.get("nav") or []

    # Map top-level nav sections to llms.txt sections
    # Each top-level nav item is a dict like {"Concepts": [...]}
    lines = [
        "# DataJoint Documentation",
        "",
        "> DataJoint is a Python framework for building scientific data pipelines "
        "with automated computation, integrity constraints, and seamless integration "
        "of relational databases with object storage.",
        "",
        "> For the complete documentation in a single file, see [/llms-full.txt](/llms-full.txt)",
        "",
    ]

    for nav_item in nav:
        if not isinstance(nav_item, dict):
            continue
        for section_name, section_content in nav_item.items():
            if isinstance(section_content, str):
                # Skip "Home: index.md" but keep other top-level leaves
                if section_content == "index.md" or section_content.startswith(
                    external_prefixes
                ):
                    continue
                url = source_path_to_url(section_content)
                lines.append(f"- [{section_name}]({url})")
                lines.append("")
            elif isinstance(section_content, list):
                lines.append(f"## {section_name}")
                lines.append("")
                # Untitled entries (bare paths) have no link text, so skip them.
                lines.extend(
                    f"- [{title}]({url})"
                    for title, url in extract_nav_entries(section_content)
                    if title
                )
                lines.append("")

    content = "\n".join(lines) + "\n"
    with open(OUTPUT_INDEX, "w", encoding="utf-8") as f:
        f.write(content)

    # Report the encoded size: len(content) counts characters, not bytes.
    size_in_bytes = len(content.encode("utf-8"))
    print(f"Generated {OUTPUT_INDEX} ({size_in_bytes:,} bytes)")
229+
230+
118231
def generate_llms_full():
119232
"""Generate the llms-full.txt file."""
120233
# Get current git info for version metadata
@@ -153,4 +266,5 @@ def generate_llms_full():
153266

154267

155268
def main() -> None:
    """Generate llms.txt first, then llms-full.txt."""
    generate_llms_txt()
    generate_llms_full()


if __name__ == "__main__":
    main()

src/llms.txt

Lines changed: 0 additions & 87 deletions
This file was deleted.

0 commit comments

Comments
 (0)