Skip to content

Commit eca3205

Browse files
authored
Merge pull request #497 from takaebato/add-llms-txt
feat: add llms.txt and LLM-friendly markdown docs generation
2 parents 6ef8a13 + 8d32be0 commit eca3205

5 files changed

Lines changed: 436 additions & 0 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ slides/webgl
181181
/public/zh/coding-standard-content.html
182182
/public/en/coding-standard-content.html
183183
/public/*.html
184+
/public/en/llms.txt
185+
/public/en/llms-documents
186+
/public/zh/llms.txt
187+
/public/zh/llms-documents
184188

185189

186190
# Editor temporal files

build/build-doc.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,15 @@ async function run() {
184184
console.log('Error happens when copying to dest folders.');
185185
console.log(e);
186186
}
187+
188+
try {
189+
const buildLlms = require('./build-llms');
190+
buildLlms();
191+
}
192+
catch (e) {
193+
console.log('Error happens when building llms documents.');
194+
console.log(e);
195+
}
187196
}
188197

189198
console.log('All done.');

build/build-llms.js

Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
/**
2+
* Converts built part JSONs (HTML desc) to Markdown using turndown,
3+
* and generates llms.txt + individual .md files.
4+
*
5+
* Mechanically converts documents/*-parts/*.json to llms-documents/ (.md files).
6+
* Root files (e.g. option.md) are placed at llms-documents/, while part files
7+
* (e.g. option.title.md) are placed at llms-documents/*-parts/.
8+
* Type information is extracted from documents/*.json (full schema) via traverse.
9+
*
10+
* Prerequisites: JSON must be built first (node build.js --env dev)
11+
* Usage: node build/build-llms.js --env dev
12+
*/
13+
const fs = require('fs');
14+
const fse = require('fs-extra');
15+
const path = require('path');
16+
const globby = require('globby');
17+
const TurndownService = require('turndown');
18+
const {gfm} = require('turndown-plugin-gfm');
19+
const {traverse} = require('../tool/schemaHelper');
20+
const {readConfigEnvFile} = require('./helper');
21+
22+
// --- Constants ---
23+
24+
// The lookup tables below are shared by every function in this module, so they
// are frozen to make accidental mutation fail loudly instead of leaking state.

/** Language codes to build; each one is used as a sub-directory of the release destination. */
const LANGUAGES = Object.freeze(['en', 'zh']);

/** Name of the per-language directory that receives the generated .md files. */
const OUTPUT_DIR_NAME = 'llms-documents';

/** Upper bound for generated heading levels ("######" is the deepest one emitted). */
const MAX_HEADING_DEPTH = 6;

/** Section titles used in llms.txt, keyed by language and then by *-parts directory name. */
const SECTION_LABELS = Object.freeze({
    en: Object.freeze({
        'option-parts': 'Option',
        'option-gl-parts': 'Option GL',
        'api-parts': 'API',
        'tutorial-parts': 'Tutorial'
    }),
    zh: Object.freeze({
        'option-parts': '配置项 (Option)',
        'option-gl-parts': 'GL配置项 (Option GL)',
        'api-parts': 'API',
        'tutorial-parts': '教程 (Tutorial)'
    })
});

/** Fixed preamble of llms.txt: a title line and a quoted summary, each followed by a blank line. */
const LLMS_TXT_HEADER = [
    '# Apache ECharts Documentation',
    '',
    '> Apache ECharts is a free, powerful charting and visualization library offering easy ways to add intuitive, interactive, and highly customizable charts to your commercial products.',
    ''
].join('\n');
39+
40+
// --- Config ---
41+
42+
// The target environment comes from the command line: `--dev`, `--debug` and
// `--env dev` all select 'dev'; otherwise the raw `--env` value is used as-is.
const argv = require('yargs').argv;
const isDevRequested = argv.dev != null || argv.debug != null || argv.env === 'dev';
const envType = isDevRequested ? 'dev' : argv.env;
if (!envType) {
    // Fail at load time: nothing below can run without an environment config.
    throw new Error('--env MUST be specified');
}
const config = readConfigEnvFile(envType);
46+
47+
// --- Turndown ---
48+
49+
// Single shared HTML -> Markdown converter used by htmlToMd().
const td = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
});
td.use(gfm);
// <iframe> elements are replaced with an empty string, i.e. dropped from the output.
td.addRule('iframe', {
    filter: 'iframe',
    replacement() {
        return '';
    }
});
52+
53+
/**
 * Convert an HTML fragment to Markdown with the shared turndown instance.
 * Runs of three or more newlines are collapsed to one blank line and the
 * result is trimmed on both ends.
 *
 * @param {string} html - HTML fragment; any falsy value yields ''
 * @returns {string} Markdown text
 */
function htmlToMd(html) {
    if (!html) {
        return '';
    }
    const markdown = td.turndown(html);
    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
56+
57+
// --- Extract type info from full schema JSON ---
58+
59+
/**
 * Collect type and default-value info for every node of a full schema JSON.
 * The schema is walked with `traverse`; a node is recorded only when it has a
 * truthy `type` or a non-null `default`.
 *
 * @param {string} schemaJsonPath - path to schema JSON (e.g. "documents/option.json")
 * @param {string} docName - e.g. "option", "api"; "tutorial" is always skipped
 * @returns {Object<string, {type: string|null, default: string|null}>}
 *   e.g. { "option.title.show": {type: "boolean", default: "true"} };
 *   an empty object when the doc is skipped or the file does not exist
 */
function buildTypeMap(schemaJsonPath, docName) {
    const typeMap = {};
    if (docName === 'tutorial' || !fs.existsSync(schemaJsonPath)) {
        return typeMap;
    }

    const schema = JSON.parse(fs.readFileSync(schemaJsonPath, 'utf-8'));
    traverse(schema, docName, (schemaPath, node) => {
        const hasType = Boolean(node.type);
        const hasDefault = node.default != null;
        if (!hasType && !hasDefault) {
            return;
        }

        // An array of types is flattened to a "a|b" union string.
        let type = null;
        if (hasType) {
            type = Array.isArray(node.type) ? node.type.join('|') : node.type;
        }

        // Object/array defaults are serialized as JSON, everything else via String().
        let defaultText = null;
        if (hasDefault) {
            defaultText = typeof node.default === 'object'
                ? JSON.stringify(node.default)
                : String(node.default);
        }

        typeMap[schemaPath] = {type, default: defaultText};
    });
    return typeMap;
}
84+
85+
// --- Resolve links in HTML ---
86+
// Best-effort rewriting of <a href="#path"> and <a href="api.html#path"> in HTML
87+
// so that turndown produces markdown links pointing to the correct .md files.
88+
// Some source links have non-standard formats (e.g. missing "#", no dot separator)
89+
// that cannot be resolved; these are left as-is or linked to the root file.
90+
91+
/**
 * Split linkPath into a part key (first dot-separated segment) and a fragment
 * (the rest), matching the key against partKeys.
 *
 * Matching precedence:
 *   1. exact match
 *   2. case-insensitive match
 *   3. singular/plural match (one side equals the other plus a trailing "s"),
 *      also case-insensitive; the first such key in iteration order wins
 *
 * @param {string} linkPath - e.g. "title.show", "echarts.init"
 * @param {Set<string>} partKeys - e.g. Set{'title','series-bar','geo',...}
 * @returns {{key: string, frag: string|null}|null}
 *   e.g. "title.show" -> {key: "title", frag: "show"}
 *        "angleAxis.axisLabel.interval" -> {key: "angleAxis", frag: "axisLabel.interval"}
 *        "geo" -> {key: "geo", frag: null}
 *        "unknown" -> null
 *        "" / ".show" / non-string -> null
 */
function tryResolvePartKey(linkPath, partKeys) {
    if (typeof linkPath !== 'string') {
        return null;
    }

    const [seg, ...rest] = linkPath.split('.');
    // An empty first segment names no part. Without this guard the plural
    // fallback below would resolve it to a part key that is literally "s".
    if (seg === '') {
        return null;
    }
    const frag = rest.length > 0 ? rest.join('.') : null;

    if (partKeys.has(seg)) {
        return {key: seg, frag};
    }

    // Single pass over the keys: a case-insensitive hit returns immediately,
    // while the first plural/singular candidate is only remembered, so that a
    // later case-insensitive hit still takes precedence over it.
    const segL = seg.toLowerCase();
    let pluralMatch = null;
    for (const k of partKeys) {
        const kl = k.toLowerCase();
        if (kl === segL) {
            return {key: k, frag};
        }
        if (pluralMatch === null && (kl === segL + 's' || kl + 's' === segL)) {
            pluralMatch = k;
        }
    }
    return pluralMatch === null ? null : {key: pluralMatch, frag};
}
120+
121+
/**
 * Build the href attribute for a link path.
 * When the first segment of linkPath resolves to a part key, the href points
 * at that part's .md file, with the remaining segments (if any) as the anchor.
 * Otherwise it points at the root file, with the whole linkPath as the anchor.
 *
 * @param {string} linkPath - e.g. "title.show", "visualMap"
 * @param {Set<string>} partKeys - keys of individual part files
 * @param {string} pathPrefix - path prefix for part files
 *   from part: "option"              -> "option.title.md"
 *   from root: "option-parts/option" -> "option-parts/option.title.md"
 *   cross-doc: "../api-parts/api"    -> "../api-parts/api.echarts.md"
 * @param {string} rootPath - path prefix for the root file fallback
 *   from part: "../option" -> "../option.md#visualMap"
 *   from root: "option"    -> "option.md#visualMap"
 *   cross-doc: "../api"    -> "../api.md#events"
 * @returns {string} complete attribute text of the form href="..."
 */
function resolveLink(linkPath, partKeys, pathPrefix, rootPath) {
    const part = tryResolvePartKey(linkPath, partKeys);
    if (part) {
        const anchor = part.frag ? '#' + part.frag : '';
        return `href="${pathPrefix}.${part.key}.md${anchor}"`;
    }
    return `href="${rootPath}.md#${linkPath}"`;
}
145+
146+
/**
 * Rewrite internal links and image paths in an HTML string so that the
 * Markdown produced from it points at the generated .md files.
 *
 * Three rewrites are applied, in this order:
 *   1. Same-doc links:  href="#title.show"           -> href="option.title.md#show"
 *   2. Cross-doc links: href="api.html#echarts.init" -> href="../api-parts/api.echarts.md#init"
 *   3. Image sources:   src="documents/asset/img/..." -> src="../../documents/asset/img/..."
 * Links whose first segment matches no part file fall back to the root file.
 * Links to a doc that has no entry in partKeysByDoc are left untouched.
 * Only double-quoted attributes are matched.
 *
 * @param {string} html - HTML string containing <a href="..."> links
 * @param {Object<string, Set<string>>} partKeysByDoc - part keys for all docs
 * @param {string} docName - current doc name (e.g. "option")
 * @param {boolean} isRoot - whether the current file is a root file
 * @returns {string} HTML with rewritten href attributes and image paths
 */
function tryResolveHtmlLinks(html, partKeysByDoc, docName, isRoot) {
    // Relative prefixes depend on where the current file is written:
    //   root (llms-documents/option.md)                -> part: "option-parts/option", root: "option"
    //   part (llms-documents/option-parts/option.*.md) -> part: "option",              root: "../option"
    const ownKeys = partKeysByDoc[docName];
    const ownPartPrefix = isRoot ? `${docName}-parts/${docName}` : docName;
    const ownRootPath = isRoot ? docName : `../${docName}`;
    const otherDocBase = isRoot ? '' : '../';
    // Images: relative path back to public/{lang}/documents/asset/
    const assetPrefix = isRoot ? '../' : '../../';

    const rewriteSameDoc = (match, linkPath) => {
        if (!ownKeys) {
            return match;
        }
        return resolveLink(linkPath, ownKeys, ownPartPrefix, ownRootPath);
    };

    const rewriteCrossDoc = (match, targetDoc, fragment) => {
        const targetKeys = partKeysByDoc[targetDoc];
        if (!targetKeys) {
            return match;
        }
        return resolveLink(
            fragment,
            targetKeys,
            `${otherDocBase}${targetDoc}-parts/${targetDoc}`,
            `${otherDocBase}${targetDoc}`
        );
    };

    const rewriteImage = (_, src) => `src="${assetPrefix}${src}"`;

    return html
        .replace(/href="#([^"]+)"/g, rewriteSameDoc)
        .replace(/href="(option-gl|option|api|tutorial)\.html#([^"]+)"/g, rewriteCrossDoc)
        .replace(/src="(documents\/asset\/[^"]*)"/g, rewriteImage);
}
191+
192+
// --- Convert part JSON to Markdown ---
193+
194+
/**
 * Render one property entry as a list of Markdown lines.
 * The heading level is the number of dot-separated segments in `key` plus one,
 * capped at MAX_HEADING_DEPTH. Type/default bullets are emitted only when
 * present, and the description only when it converts to non-empty Markdown.
 *
 * @param {string} key - property key within the part file (e.g. "title.show")
 * @param {{desc: string}} entry - part JSON entry; `desc` holds HTML
 * @param {{type: string|null, default: string|null}|undefined} typeInfo
 * @param {function(string): string} linkResolver - rewrites links in the HTML before conversion
 * @returns {string[]} lines for this entry, always ending with an empty line
 */
function formatPropertyEntry(key, entry, typeInfo, linkResolver) {
    const depth = Math.min(key.split('.').length + 1, MAX_HEADING_DEPTH);
    const lines = ['#'.repeat(depth) + ' ' + key];

    if (typeInfo && typeInfo.type) {
        lines.push(`- **Type**: \`${typeInfo.type}\``);
    }
    if (typeInfo && typeInfo.default != null) {
        lines.push(`- **Default**: \`${typeInfo.default}\``);
    }

    const body = entry.desc ? htmlToMd(linkResolver(entry.desc)) : '';
    if (body) {
        lines.push('', body);
    }

    lines.push('');
    return lines;
}
203+
204+
/**
 * Convert the content of one part JSON into a Markdown document body.
 * Each key is looked up in typeMap under "<baseName>.<key>" (or just the key
 * when baseName is empty). Blank-line runs are collapsed and the result ends
 * with exactly one newline.
 *
 * @param {Object<string, {desc: string}>} data - parsed part JSON
 * @param {Object} typeMap - property path -> {type, default} map
 * @param {string} baseName - part file base name (e.g. "option.title")
 * @param {function(string): string} linkResolver - rewrites links in HTML descriptions
 * @returns {string} Markdown text
 */
function jsonToMd(data, typeMap, baseName, linkResolver) {
    const lines = [];
    for (const [key, entry] of Object.entries(data)) {
        const fullKey = baseName ? `${baseName}.${key}` : key;
        lines.push(...formatPropertyEntry(key, entry, typeMap[fullKey], linkResolver));
    }
    const joined = lines.join('\n').replace(/\n{3,}/g, '\n\n');
    return joined.trimEnd() + '\n';
}
211+
212+
// --- Collect part JSON files ---
213+
214+
/**
 * List the *.json files directly inside each *-parts/ directory, skipping
 * any file whose name contains "-outline".
 *
 * @param {string[]} partsDirs - paths to *-parts/ directories
 * @returns {Object<string, string[]>} dir path -> absolute JSON file paths
 */
function collectPartJsonFiles(partsDirs) {
    const isNotOutline = filePath => !path.basename(filePath).includes('-outline');
    return partsDirs.reduce((filesByDir, dir) => {
        filesByDir[dir] = globby
            .sync('*.json', {cwd: dir, absolute: true})
            .filter(isNotOutline);
        return filesByDir;
    }, {});
}
228+
229+
// --- Collect file keys for link resolution across docs ---
230+
231+
/**
 * Build a map of doc name -> Set of part keys for all *-parts/ directories.
 * The doc name is the directory name without its "-parts" suffix. A part key
 * is the JSON file name with the "<docName>." prefix stripped
 * (e.g. "option.title.json" -> "title"); names without that prefix are kept
 * unchanged. The root file ("<docName>.json") is excluded since it is not an
 * individual part file.
 *
 * A directory with no entry in jsonFilesByDir gets an empty Set instead of
 * raising a TypeError, so callers can look up any listed doc safely.
 *
 * @param {string[]} partsDirs - paths to *-parts/ directories
 * @param {Object<string, string[]>} jsonFilesByDir - pre-collected JSON file paths
 * @returns {Object<string, Set<string>>} partKeysByDoc -
 *   e.g. { option: Set{'title','geo',...}, api: Set{'echarts',...} }
 */
function buildPartKeysByDoc(partsDirs, jsonFilesByDir) {
    const partKeysByDoc = {};
    for (const dir of partsDirs) {
        const docName = path.basename(dir).replace(/-parts$/, '');
        const prefix = docName + '.';
        const keys = new Set();
        for (const filePath of jsonFilesByDir[dir] || []) {
            const name = path.basename(filePath, '.json');
            if (name === docName) {
                // Root file of the doc, not a part.
                continue;
            }
            keys.add(name.startsWith(prefix) ? name.slice(prefix.length) : name);
        }
        partKeysByDoc[docName] = keys;
    }
    return partKeysByDoc;
}
252+
253+
// --- Process a single *-parts/ directory ---
254+
255+
/**
 * Convert the part JSON files of a single *-parts/ directory to Markdown files.
 * Every JSON file becomes one .md file that starts with a "# <baseName>" title,
 * followed by its entries with resolved links and type info. The root file
 * (base name equal to the doc name, e.g. option.json) is written directly into
 * outDir; all other files go into outDir/<dirName>/.
 *
 * @param {string} partsDir - path to a *-parts/ directory (e.g. "documents/option-parts")
 * @param {string} outDir - output base directory (e.g. "llms-documents")
 * @param {Object} typeMap - property path -> {type, default} map
 * @param {Object<string, Set<string>>} partKeysByDoc - part keys for all docs
 * @param {string[]} jsonFiles - pre-collected JSON file paths for this directory
 * @returns {{name: string, path: string, section: string}[]} output file descriptors:
 *   `name` is the path relative to outDir, `path` the absolute path,
 *   `section` the *-parts directory name
 */
function processPartsDir(partsDir, outDir, typeMap, partKeysByDoc, jsonFiles) {
    const dirName = path.basename(partsDir);
    const docName = dirName.replace(/-parts$/, '');
    const descriptors = [];

    for (const filePath of jsonFiles) {
        const baseName = path.basename(filePath, '.json');
        const isRoot = baseName === docName;
        const data = JSON.parse(fs.readFileSync(filePath, 'utf-8'));

        // Relative link targets depend on whether this file lands in outDir or a sub-directory.
        const linkResolver = html => tryResolveHtmlLinks(html, partKeysByDoc, docName, isRoot);
        const markdown = `# ${baseName}\n\n` + jsonToMd(data, typeMap, baseName, linkResolver);

        const fileName = isRoot ? `${baseName}.md` : `${dirName}/${baseName}.md`;
        const fullPath = path.resolve(outDir, fileName);
        fse.ensureDirSync(path.dirname(fullPath));
        fs.writeFileSync(fullPath, markdown, 'utf-8');

        descriptors.push({name: fileName, path: fullPath, section: dirName});
    }

    return descriptors;
}
284+
285+
// --- Generate docs for a single language ---
286+
287+
/**
 * Generate all Markdown docs for one language.
 * Reads from <releaseDestDir>/<lang>/documents and writes to
 * <releaseDestDir>/<lang>/llms-documents.
 *
 * @param {string} lang - language code (e.g. "en")
 * @returns {{name: string, path: string, section: string}[]} descriptors of the
 *   written files, sorted by name
 */
function generateDocsForLang(lang) {
    const docsDir = path.resolve(config.releaseDestDir, lang, 'documents');
    const outDir = path.resolve(config.releaseDestDir, lang, OUTPUT_DIR_NAME);
    fse.ensureDirSync(outDir);

    // Step 1: Merge the type maps of all full schema JSONs (option.json, api.json, etc.)
    //         into one lookup keyed by property path
    //         (e.g. "option.title.show" -> {type: "boolean", default: "true"}).
    const typeMap = globby
        .sync('*.json', {cwd: docsDir, absolute: true})
        .reduce(
            (merged, filePath) =>
                Object.assign(merged, buildTypeMap(filePath, path.basename(filePath, '.json'))),
            {}
        );

    // Step 2: Collect the part JSON files and part keys of every *-parts/ directory
    //         before converting anything, so cross-doc links can be checked
    //         against the files that actually exist.
    const partsDirs = globby.sync('*-parts', {
        cwd: docsDir,
        absolute: true,
        onlyDirectories: true
    });
    const jsonFilesByDir = collectPartJsonFiles(partsDirs);
    const partKeysByDoc = buildPartKeysByDoc(partsDirs, jsonFilesByDir);

    // Step 3: Convert each directory's part JSONs (e.g. option.title.json) to .md files.
    const files = [];
    for (const dir of partsDirs) {
        files.push(...processPartsDir(dir, outDir, typeMap, partKeysByDoc, jsonFilesByDir[dir]));
    }
    files.sort((a, b) => a.name.localeCompare(b.name));

    console.log(`Generated ${files.length} docs for ${lang}`);
    return files;
}
321+
322+
// --- llms.txt ---
323+
324+
/**
 * Write <releaseDestDir>/<lang>/llms.txt, an index of the generated .md files.
 * Files are grouped by section (their *-parts directory name); sections are
 * sorted by key, and within a section root files (names without "/") come
 * first, followed by the rest ordered by name.
 *
 * @param {string} lang - language code; falls back to the English section labels
 *   when the language has none
 * @param {{name: string, path: string, section: string}[]} files - output file descriptors
 */
function writeLlmsTxt(lang, files) {
    const langDir = path.resolve(config.releaseDestDir, lang);
    fse.ensureDirSync(langDir);
    const labels = SECTION_LABELS[lang] || SECTION_LABELS.en;

    const groups = {};
    files.forEach(file => {
        const bucket = groups[file.section] || (groups[file.section] = []);
        bucket.push(file);
    });

    const rootFirstThenByName = (a, b) => {
        const aIsRoot = !a.name.includes('/');
        const bIsRoot = !b.name.includes('/');
        if (aIsRoot !== bIsRoot) {
            return aIsRoot ? -1 : 1;
        }
        return a.name.localeCompare(b.name);
    };

    const lines = [LLMS_TXT_HEADER];
    for (const sectionKey of Object.keys(groups).sort()) {
        // Unknown sections are titled with their raw directory name.
        lines.push(`## ${labels[sectionKey] || sectionKey}`, '');
        for (const file of groups[sectionKey].sort(rootFirstThenByName)) {
            lines.push(`- [${path.basename(file.name, '.md')}](${OUTPUT_DIR_NAME}/${file.name})`);
        }
        lines.push('');
    }

    const content = lines.join('\n').trimEnd() + '\n';
    fs.writeFileSync(path.join(langDir, 'llms.txt'), content, 'utf-8');
    console.log(`Generated ${lang}/llms.txt`);
}
355+
356+
// --- Main ---
357+
358+
/**
 * Entry point: for every configured language, generate the individual .md
 * files and then the llms.txt index that lists them. The index is skipped for
 * a language that produced no files.
 */
function buildLlms() {
    console.log('Building llms documents ...');
    LANGUAGES.forEach(lang => {
        // Convert the part JSONs of this language to .md files.
        const files = generateDocsForLang(lang);
        if (files.length === 0) {
            return;
        }
        // Write the llms.txt index listing all generated .md files.
        writeLlmsTxt(lang, files);
    });
    console.log('Build llms documents done.');
}

module.exports = buildLlms;

// Run immediately when invoked directly (node build/build-llms.js --env dev).
if (require.main === module) {
    buildLlms();
}

0 commit comments

Comments
 (0)