Skip to content

Commit ac07c8c

Browse files
committed
docs: update llms file generator
1 parent abebd62 commit ac07c8c

4 files changed

Lines changed: 2539 additions & 42 deletions

File tree

website/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
"docusaurus": "docusaurus",
77
"dev": "docusaurus start",
88
"build": "docusaurus build",
9+
"compile-llm": "TS_NODE_SKIP_PROJECT=1 TS_NODE_COMPILER_OPTIONS='{\"module\":\"CommonJS\"}' node -r ts-node/register/transpile-only -e \"require('./scripts/compile-llm-docs.ts')\"",
10+
"postbuild": "TS_NODE_SKIP_PROJECT=1 TS_NODE_COMPILER_OPTIONS='{\"module\":\"CommonJS\"}' node -r ts-node/register/transpile-only -e \"require('./scripts/compile-llm-docs.ts')\"",
911
"swizzle": "docusaurus swizzle",
1012
"deploy": "docusaurus deploy",
1113
"clear": "docusaurus clear",
Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
#!/usr/bin/env ts-node
2+
/**
3+
* Generate llms.txt (title/URL index) and llms-full.txt (merged content) by collecting Docusaurus content.
4+
* - Scans markdown sources: website/docs, website/src/pages (website/blog is not scanned)
5+
* - Strips frontmatter and MDX import/export statements
6+
* - Flattens most MDX JSX elements while preserving inner text
7+
* - Keeps code fences, headings, links and images as Markdown
8+
* - Writes the title index to website/static/llms.txt and the merged content to website/static/llms-full.txt
9+
*
10+
* Inspired by Mantine's compiler:
11+
* https://raw.githubusercontent.com/mantinedev/mantine/ddae40ed58986b1260e8422771ac04c1ed95e58b/scripts/llm/compile-llm-doc.ts
12+
*/
13+
14+
import * as fs from 'fs';
15+
import * as path from 'path';
16+
17+
// Anchor every path to this script's parent directory so nothing depends on the cwd
const WEBSITE_ROOT = path.resolve(__dirname, '..');

// Source directories that are scanned for Markdown/MDX files
const TARGETS = ['docs', path.join('src', 'pages')].map((sub) =>
  path.join(WEBSITE_ROOT, sub)
);

// Output locations: the title/URL index and the merged full-content export
const OUTPUT_TITLES_FILE = path.join(WEBSITE_ROOT, 'static', 'llms.txt');
const OUTPUT_FULL_FILE = path.join(WEBSITE_ROOT, 'static', 'llms-full.txt');

// Lower-cased file extensions that count as documentation sources
const ALLOWED_EXTENSIONS = new Set<string>(['.md', '.mdx']);
26+
27+
/**
 * Report whether `filePath` can be stat'ed.
 *
 * @param filePath - Path to probe.
 * @returns A promise resolving to `true` when the stat call succeeds and to
 *   `false` on any stat error; it never rejects.
 */
async function fileExists(filePath: string): Promise<boolean> {
  return fs.promises.stat(filePath).then(
    () => true,
    () => false
  );
}
32+
33+
/**
 * Synchronously check whether `p` refers to a directory.
 *
 * @param p - Path to inspect.
 * @returns `true` only when the path can be stat'ed and is a directory;
 *   any stat failure is reported as `false`.
 */
function isDirectory(p: string): boolean {
  let info: fs.Stats;
  try {
    info = fs.statSync(p);
  } catch {
    // Missing or unreadable path: treat as "not a directory"
    return false;
  }
  return info.isDirectory();
}
40+
41+
/**
 * Recursively gather files under `dir` whose extension is in ALLOWED_EXTENSIONS.
 *
 * @param dir - Directory to walk; a non-directory yields no additions.
 * @param out - Accumulator the matches are appended to (also the return value).
 * @returns `out`, containing the full path of every matching file found.
 */
function collectFiles(dir: string, out: string[] = []): string[] {
  if (isDirectory(dir)) {
    fs.readdirSync(dir).forEach((name) => {
      const candidate = path.join(dir, name);
      try {
        const info = fs.statSync(candidate);
        if (info.isDirectory()) {
          collectFiles(candidate, out);
          return;
        }
        const wanted =
          info.isFile() &&
          ALLOWED_EXTENSIONS.has(path.extname(candidate).toLowerCase());
        if (wanted) out.push(candidate);
      } catch {
        // Best effort: an entry (or subtree) that fails is skipped
      }
    });
  }
  return out;
}
59+
60+
/**
 * Remove a YAML frontmatter block from the very beginning of a document.
 *
 * Handles an optional leading UTF-8 BOM, CRLF or LF line endings, an empty
 * frontmatter block, and a closing `---` that is the last line of the file
 * without a trailing newline. Text without leading frontmatter is returned
 * unchanged.
 *
 * @param src - Raw Markdown/MDX source.
 * @returns The source with the leading frontmatter block (and BOM) removed.
 */
function stripFrontmatter(src: string): string {
  // No `m` flag: `^` anchors to the start of the input and `$` to its end.
  return src.replace(
    /^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/,
    ''
  );
}
64+
65+
/**
 * Remove MDX ESM `import` / `export` statements while leaving fenced code
 * blocks untouched.
 *
 * The text is processed line by line:
 * - Lines inside a fenced code block (``` or ~~~) are always kept, so code
 *   samples that contain import/export statements survive.
 * - Outside a fence, a line starting with `import` or `export` opens a
 *   statement. The statement ends at the first line that ends with `;` once
 *   brackets are balanced, or at the next blank line / fence opener once
 *   brackets are balanced. Only those lines are dropped, so a statement
 *   written without a semicolon cannot swallow the prose that follows it.
 *
 * Bracket balance is a plain character count and does not understand string
 * or template literals.
 *
 * @param src - Markdown/MDX source.
 * @returns The source with top-level ESM statements removed.
 */
function stripMdxEsm(src: string): string {
  const FENCE_MARKER = /^\s*(`{3,}|~{3,})/;
  const ESM_START = /^[ \t]*(?:import|export)(?:\s|$)/;
  const ENDS_WITH_SEMICOLON = /;\s*$/;

  // Net count of opening minus closing brackets on a single line
  const bracketDelta = (line: string): number => {
    let delta = 0;
    for (const ch of line) {
      if (ch === '{' || ch === '(' || ch === '[') delta += 1;
      else if (ch === '}' || ch === ')' || ch === ']') delta -= 1;
    }
    return delta;
  };

  const kept: string[] = [];
  let openFence = ''; // marker of the fence we are inside, '' when outside
  let inStatement = false; // currently dropping lines of an ESM statement
  let depth = 0; // bracket depth of the statement being dropped

  for (const line of src.split('\n')) {
    // Compare against the line without a trailing CR, but keep the CR in output
    const text = line.endsWith('\r') ? line.slice(0, -1) : line;
    const marker = FENCE_MARKER.exec(text)?.[1] ?? '';

    if (openFence) {
      kept.push(line);
      const closes =
        marker !== '' &&
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        text.trim() === marker;
      if (closes) openFence = '';
      continue;
    }

    if (inStatement) {
      const atBoundary = text.trim() === '' || marker !== '';
      if (!(atBoundary && depth <= 0)) {
        depth += bracketDelta(text);
        if (depth <= 0 && ENDS_WITH_SEMICOLON.test(text)) inStatement = false;
        continue;
      }
      // Balanced statement hit a blank line or a fence: stop dropping and
      // let this line be handled normally below.
      inStatement = false;
    }

    if (marker !== '') {
      openFence = marker;
      kept.push(line);
      continue;
    }

    if (ESM_START.test(text)) {
      depth = bracketDelta(text);
      inStatement = !(depth <= 0 && ENDS_WITH_SEMICOLON.test(text));
      continue;
    }

    kept.push(line);
  }

  return kept.join('\n');
}
69+
70+
/**
 * Flatten MDX JSX components (capitalised tag names) to their inner text.
 *
 * - `<Component .../>` is removed.
 * - `<Component ...>inner</Component>` becomes `inner`; the replacement is
 *   repeated until nothing changes so nested components are flattened too.
 * - Fenced code blocks (``` or ~~~) are masked before the rewrite and
 *   restored afterwards, so JSX shown in code samples is preserved while a
 *   component that wraps a code block is still unwrapped.
 *
 * Tags whose attributes contain a literal `>` are not recognised.
 *
 * @param src - Markdown/MDX source.
 * @returns The source with component tags removed.
 */
function flattenMdxJsx(src: string): string {
  // Swap each fenced block for a NUL-delimited index token
  const fenced: string[] = [];
  const masked = src.replace(
    /^[ \t]*(`{3,}|~{3,})[^\n]*(?:\n[\s\S]*?)?\n[ \t]*\1[`~]*[ \t]*\r?$/gm,
    (block) => `\u0000${fenced.push(block) - 1}\u0000`
  );

  // Self-closing tags go first so they are never mistaken for an opening tag
  let out = masked.replace(/<([A-Z][\w.-]*)([^>]*)\/>/g, '');

  // Each pass unwraps one nesting level; every replacement shortens the text,
  // so the loop terminates.
  let previous: string;
  do {
    previous = out;
    out = out.replace(/<([A-Z][\w.-]*)([^>]*)>([\s\S]*?)<\/\1>/g, '$3');
  } while (out !== previous);

  // Put the untouched code blocks back (function form avoids `$` expansion)
  return out.replace(
    /\u0000(\d+)\u0000/g,
    (_token, index: string) => fenced[Number(index)] ?? ''
  );
}
78+
79+
/**
 * Turn raw Markdown/MDX into the plain form used in the merged export.
 *
 * Applies, in order: frontmatter removal, MDX ESM removal, JSX flattening,
 * CRLF -> LF normalisation, and collapsing of runs of three or more newlines
 * down to two. The result is trimmed and ends with exactly one newline.
 *
 * @param src - Raw file contents.
 * @returns The cleaned text.
 */
function sanitizeContent(src: string): string {
  const stripped = [stripFrontmatter, stripMdxEsm, flattenMdxJsx].reduce(
    (text, step) => step(text),
    src
  );
  const normalized = stripped
    .replace(/\r\n/g, '\n') // Windows newlines, just in case
    .replace(/\n{3,}/g, '\n\n'); // at most one empty line in a row
  return `${normalized.trim()}\n`;
}
90+
91+
/**
 * Try to read the Docusaurus config to get the site `url` and `baseUrl`.
 *
 * The config module is loaded with `require`. When the module exposes a
 * `default` export (a config written as `export default ...`) that export is
 * used; otherwise the module object itself is used. Any value that is missing,
 * empty, or not a string is replaced by the built-in fallback, and a load
 * failure returns the fallback for both fields.
 *
 * @returns The resolved `url` and `baseUrl`.
 */
function readSiteConfig(): { url: string; baseUrl: string } {
  const fallback = { url: 'https://tianji.dev', baseUrl: '/' };
  try {
    // eslint-disable-next-line @typescript-eslint/no-var-requires
    const loaded = require(path.join(WEBSITE_ROOT, 'docusaurus.config'));
    // `export default config` is exposed as `{ default: config }` under CommonJS
    const cfg = loaded?.default ?? loaded;
    const pick = (value: unknown, otherwise: string): string =>
      typeof value === 'string' && value !== '' ? value : otherwise;
    return {
      url: pick(cfg?.url, fallback.url),
      baseUrl: pick(cfg?.baseUrl, fallback.baseUrl),
    };
  } catch {
    // Config missing or failed to evaluate: use the defaults
    return fallback;
  }
}
106+
107+
/**
 * Extract the `title` value from leading YAML frontmatter, if present.
 *
 * The value is read from the `title:` line only (it never spills onto the
 * next line), surrounding whitespace is trimmed, and one pair of matching
 * quotes (`'`, `"` or a backtick) wrapping the whole value is removed.
 *
 * @param src - Raw Markdown/MDX source, frontmatter included.
 * @returns The title, or `''` when there is no frontmatter, no `title` key,
 *   or the key is empty.
 */
function extractTitleFromFrontmatter(src: string): string | '' {
  const fmMatch = src.match(/^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/);
  if (!fmMatch) return '';
  const fm = fmMatch[1] ?? '';
  // [ \t]* rather than \s* so an empty title cannot capture the following key
  const titleMatch = fm.match(/^title:[ \t]*(.*)$/m);
  if (!titleMatch) return '';
  const raw = (titleMatch[1] ?? '').trim();
  // Strip quotes only when the same quote character opens and closes the value
  const quoted = raw.match(/^(['"`])(.*)\1$/);
  return (quoted ? quoted[2] ?? '' : raw).trim();
}
118+
119+
/**
 * Extract the first Markdown H1 (`# Heading`) as the title.
 *
 * Leading YAML frontmatter and the contents of fenced code blocks
 * (``` or ~~~) are skipped, so `# comment` lines in YAML or in shell
 * snippets are not mistaken for a heading.
 *
 * @param src - Raw Markdown/MDX source.
 * @returns The heading text, or `''` when no H1 is found.
 */
function extractH1Title(src: string): string | '' {
  const body = src.replace(
    /^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/,
    ''
  );
  let openFence = ''; // marker of the fence we are inside, '' when outside

  for (const line of body.split(/\r?\n/)) {
    const marker = /^\s*(`{3,}|~{3,})/.exec(line)?.[1] ?? '';

    if (openFence) {
      const closes =
        marker !== '' &&
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        line.trim() === marker;
      if (closes) openFence = '';
      continue;
    }
    if (marker !== '') {
      openFence = marker;
      continue;
    }

    const heading = /^\s*#[ \t]+(.+?)\s*$/.exec(line);
    if (heading) return (heading[1] ?? '').trim();
  }
  return '';
}
126+
127+
/**
 * Fallback title: build a readable name from the file's location.
 *
 * Uses the last path segment (without `.md` / `.mdx`) relative to `baseDir`,
 * or the parent folder's name when the file is an `index` inside a
 * subfolder. Dashes and underscores become spaces and each word is
 * capitalised. An empty relative path yields `'Untitled'`.
 *
 * @param absPath - Path of the source file.
 * @param baseDir - Directory the path is made relative to.
 * @returns The derived title.
 */
function deriveTitleFromPath(absPath: string, baseDir: string): string {
  const segments = path
    .relative(baseDir, absPath)
    .replace(/\.(md|mdx)$/i, '')
    .split(path.sep)
    .filter((segment) => segment.length > 0);

  const lastIndex = segments.length - 1;
  const leaf = segments[lastIndex] || 'Untitled';
  const useParent = leaf.toLowerCase() === 'index' && segments.length > 1;
  const stem = useParent ? segments[lastIndex - 1] : leaf;

  return stem
    .replace(/[-_]/g, ' ')
    .replace(/\b\w/g, (letter) => letter.toUpperCase());
}
140+
141+
/**
 * Compute the public route of a docs or pages source file.
 *
 * - Files under `roots.docs` map to `<baseUrl>/docs/<relative path>`.
 * - Files under `roots.pages` map to `<baseUrl>/<relative path>`.
 * - In both cases the `.md` / `.mdx` extension and a trailing `index`
 *   segment are dropped.
 * - Anything else maps to `baseUrl` with a trailing slash.
 *
 * @param absPath - Path of the source file.
 * @param roots - The docs and pages root directories.
 * @param baseUrl - Site base URL; a trailing slash is added when missing.
 * @returns The route path.
 */
function computeRouteForFile(
  absPath: string,
  roots: { docs: string; pages: string },
  baseUrl: string
): string {
  const prefix = baseUrl.endsWith('/') ? baseUrl : `${baseUrl}/`;

  // True when absPath is the root itself or lives somewhere beneath it
  const isUnder = (root: string): boolean =>
    absPath === root || absPath.startsWith(root + path.sep);

  // Relative path split into segments, extension and trailing "index" removed
  const routeSegments = (root: string): string[] => {
    const parts = path
      .relative(root, absPath)
      .replace(/\.(md|mdx)$/i, '')
      .split(path.sep)
      .filter(Boolean);
    if (parts.length > 0 && parts[parts.length - 1].toLowerCase() === 'index') {
      parts.pop();
    }
    return parts;
  };

  if (isUnder(roots.docs)) {
    const route = `${prefix}docs/${routeSegments(roots.docs).join('/')}`;
    return route.replace(/\/$/, '') || `${prefix}docs`;
  }

  if (isUnder(roots.pages)) {
    const tail = routeSegments(roots.pages).join('/');
    return tail === '' ? prefix : prefix + tail;
  }

  return prefix;
}
181+
182+
/**
 * Build both output files from the Markdown/MDX sources.
 *
 * Steps:
 * 1. Keep the TARGETS that exist and collect their `.md` / `.mdx` files,
 *    sorted by path.
 * 2. For each readable file, append its sanitized content to the full export
 *    under a `===== FILE: <relative path> =====` header, and record a
 *    title + absolute URL entry for the index.
 * 3. Write the index to OUTPUT_TITLES_FILE and the full export to
 *    OUTPUT_FULL_FILE, creating the `static` directory when needed.
 *
 * Files that cannot be read are skipped. Write errors propagate to the caller.
 */
async function main(): Promise<void> {
  const existingTargets = (
    await Promise.all(
      TARGETS.map(async (p) => ((await fileExists(p)) ? p : ''))
    )
  ).filter(Boolean) as string[];

  const files = existingTargets.flatMap((dir) => collectFiles(dir)).sort();

  const headerFull = [
    '# Tianji Documentation Export (Full)',
    '',
    'This file aggregates documentation for LLM consumption.',
    'It preserves Markdown headings, links, and code fences; frontmatter and MDX ESM are removed.',
    '',
    '='.repeat(80),
    '',
  ].join('\n');

  const chunksFull: string[] = [headerFull];

  const { url: siteUrl, baseUrl } = readSiteConfig();
  const docsRoot = path.join(WEBSITE_ROOT, 'docs');
  const pagesRoot = path.join(WEBSITE_ROOT, 'src', 'pages');
  const routes: Array<{ title: string; url: string }> = [];

  for (const absPath of files) {
    let content = '';
    try {
      content = fs.readFileSync(absPath, 'utf-8');
    } catch {
      // Unreadable file: leave it out of both outputs
      continue;
    }

    const relPath = path.relative(WEBSITE_ROOT, absPath);
    const sanitized = sanitizeContent(content);
    const sectionHeader = [`===== FILE: ${relPath} =====`, ''].join('\n');
    chunksFull.push(sectionHeader, sanitized, '');

    // Build titles list: frontmatter title, then first H1, then a name
    // derived from the path.
    const titleFromFm = extractTitleFromFrontmatter(content);
    const titleFromH1 = titleFromFm || extractH1Title(content);
    // Decide the root with a separator-aware prefix check so it works with
    // either path separator and cannot match "/src/pages" elsewhere in a path.
    const underPages =
      absPath === pagesRoot || absPath.startsWith(pagesRoot + path.sep);
    const title =
      titleFromH1 ||
      deriveTitleFromPath(absPath, underPages ? pagesRoot : docsRoot);
    const routePath = computeRouteForFile(
      absPath,
      { docs: docsRoot, pages: pagesRoot },
      baseUrl
    );
    // Join site URL and route with exactly one slash between them
    const absoluteUrl =
      siteUrl.replace(/\/+$/, '') + '/' + routePath.replace(/^\//, '');
    routes.push({ title, url: absoluteUrl });
  }

  const outputFull = chunksFull.join('\n');
  const headerTitles = [
    '# Tianji Documentation Index',
    '',
    'This file lists documentation titles with absolute URLs.',
    '',
    '='.repeat(80),
    '',
  ].join('\n');
  const outputTitles =
    [headerTitles, ...routes.map((r) => `${r.title} - ${r.url}`)].join('\n') +
    '\n';

  // Ensure output directory exists
  const outDir = path.join(WEBSITE_ROOT, 'static');
  if (!fs.existsSync(outDir)) {
    fs.mkdirSync(outDir, { recursive: true });
  }
  fs.writeFileSync(OUTPUT_TITLES_FILE, outputTitles, 'utf-8');
  fs.writeFileSync(OUTPUT_FULL_FILE, outputFull, 'utf-8');

  // eslint-disable-next-line no-console
  console.log(
    `Generated ${path.relative(WEBSITE_ROOT, OUTPUT_TITLES_FILE)} (titles) and ${path.relative(WEBSITE_ROOT, OUTPUT_FULL_FILE)} (full) from ${files.length} files`
  );
}
266+
267+
// Script entry point: run the generator; on failure log the error and exit with status 1.
main().then(undefined, (failure) => {
  // eslint-disable-next-line no-console
  console.error('Failed to generate llms.txt', failure);
  process.exit(1);
});

0 commit comments

Comments
 (0)