Skip to content

Commit 20d8156

Browse files
cscheid and claude committed
feat(website): add llms.txt support for LLM-friendly content
Add support for generating llms.txt and .llms.md files for Quarto websites, providing LLM-friendly markdown versions of HTML pages.

Features:
- New `llms-txt: true` option in website config
- Generates .llms.md companion files alongside HTML output
- Creates llms.txt index file linking to all markdown pages
- Converts HTML to clean markdown using Pandoc with Lua filter
- Handles callouts (blockquotes with bold type markers)
- Converts images to markdown syntax
- Converts internal links from .html to .llms.md
- Respects draft settings (excludes drafts from output)
- Cleans listing pages (removes empty links, category badges)
- Matches sitemap behavior for incremental builds

New files:
- src/project/types/website/website-llms.ts
- src/resources/filters/llms/llms.lua

Test coverage:
- Basic file generation
- Content conversion (callouts, code, tables, links)
- Draft handling
- Listing page cleanup

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent a7d8b6a commit 20d8156

28 files changed

Lines changed: 863 additions & 56 deletions

src/project/types/website/website-config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ type WebsiteConfigKey =
7777
| "other-links"
7878
| "code-links"
7979
| "reader-mode"
80+
| "llms-txt"
8081
| "announcement"
8182
| "draft-mode"
8283
| "drafts";

src/project/types/website/website-constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export const kSiteRepoLinkRel = "repo-link-rel";
1818
export const kSiteIssueUrl = "issue-url";
1919
export const kSiteRepoActions = "repo-actions";
2020
export const kSiteReaderMode = "reader-mode";
21+
export const kLlmsTxt = "llms-txt";
2122

2223
export const kSiteNavbar = "navbar";
2324
export const kSiteSidebar = "sidebar";
src/project/types/website/website-llms.ts

Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
/*
2+
* website-llms.ts
3+
*
4+
* Copyright (C) 2020-2024 Posit Software, PBC
5+
*/
6+
7+
import { basename, dirname, join, relative } from "../../../deno_ral/path.ts";
8+
import { existsSync } from "../../../deno_ral/fs.ts";
9+
10+
import { Document, Element } from "../../../core/deno-dom.ts";
11+
import { execProcess } from "../../../core/process.ts";
12+
import { dirAndStem, pathWithForwardSlashes } from "../../../core/path.ts";
13+
import { pandocBinaryPath, resourcePath } from "../../../core/resources.ts";
14+
15+
import { kProject404File, ProjectContext } from "../../types.ts";
16+
import { projectOutputDir } from "../../project-shared.ts";
17+
import { ProjectOutputFile } from "../types.ts";
18+
19+
import { kLlmsTxt } from "./website-constants.ts";
20+
import {
21+
websiteConfigBoolean,
22+
websiteDescription,
23+
websiteTitle,
24+
} from "./website-config.ts";
25+
import { isDraftVisible, isProjectDraft, projectDraftMode } from "./website-utils.ts";
26+
import { resolveInputTargetForOutputFile } from "../../project-index.ts";
27+
import { Format } from "../../../config/types.ts";
28+
29+
/**
 * Work out where the rendered HTML for a source document lives in the
 * project output directory.
 *
 * The source path is taken relative to the project directory, split with
 * `dirAndStem`, given an `.html` file name, and joined onto the project
 * output directory.
 *
 * @param source Path of the input document.
 * @param project Project context providing `dir` and the output directory.
 * @returns Path of the corresponding `.html` file under the output directory.
 */
function computeOutputFilePath(source: string, project: ProjectContext): string {
  const siteRoot = projectOutputDir(project);
  const [relDir, stem] = dirAndStem(relative(project.dir, source));
  return join(siteRoot, join(relDir, `${stem}.html`));
}
39+
40+
/**
 * Build an HTML finalizer that writes a `.llms.md` companion file for a
 * rendered page.
 *
 * The returned callback does nothing when the `llms-txt` website option is
 * off, or when the page is a draft and the project draft mode does not make
 * drafts visible. Otherwise it extracts the main content of the document and
 * hands it to Pandoc for conversion to markdown.
 *
 * @param source Path of the input document being rendered.
 * @param project Project context for the website.
 * @param _format Render format (unused).
 * @returns Async callback taking the rendered document.
 */
export function llmsHtmlFinalizer(
  source: string,
  project: ProjectContext,
  _format: Format,
) {
  return async (doc: Document): Promise<void> => {
    // Nothing to do unless the feature is switched on
    const enabled = websiteConfigBoolean(kLlmsTxt, false, project.config);
    if (!enabled) {
      return;
    }

    const mode = projectDraftMode(project);

    // A page counts as a draft if either the quarto:status meta tag says so...
    const statusValue = doc
      .querySelector("meta[name='quarto:status']")
      ?.getAttribute("content");
    const flaggedByMeta = statusValue === "draft" ||
      statusValue === "draft-remove";

    // ...or the project configuration lists it as a draft
    const flaggedByConfig = isProjectDraft(
      relative(project.dir, source),
      project,
    );

    if ((flaggedByMeta || flaggedByConfig) && !isDraftVisible(mode)) {
      return; // Skip draft pages
    }

    // Reduce the page to its main content
    const mainHtml = extractMainContent(doc);

    // The markdown companion sits next to the HTML output file
    const htmlTarget = computeOutputFilePath(source, project);
    const markdownTarget = htmlTarget.replace(/\.html$/, ".llms.md");

    // Pandoc (with the llms.lua filter) does the actual conversion
    await convertHtmlToLlmsMarkdown(mainHtml, markdownTarget);
  };
}
84+
85+
/**
 * Reduce a rendered page to a minimal HTML document holding only its main
 * content.
 *
 * Works on a deep clone so the caller's document is left untouched. Site
 * chrome (header, footer, sidebars, search, navbar) and non-content elements
 * (scripts, styles, stylesheet links, meta, noscript) are removed first; the
 * content root is then the first match of `main`,
 * `#quarto-document-content`, `article`, falling back to the body.
 *
 * @param doc The rendered HTML document.
 * @returns A minimal HTML document string, or `""` when no content root exists.
 */
function extractMainContent(doc: Document): string {
  // Work on a copy; the original DOM must not change
  const clone = doc.cloneNode(true) as Document;

  // Everything that should not reach the llms output
  const unwanted = [
    "#quarto-header",
    ".nav-footer",
    "#quarto-sidebar",
    "#quarto-margin-sidebar",
    "#quarto-search-results",
    ".sidebar",
    ".quarto-search",
    "nav.navbar",
    "script",
    "style",
    "link[rel='stylesheet']",
    "meta",
    "noscript",
  ];
  for (const selector of unwanted) {
    for (const node of clone.querySelectorAll(selector)) {
      (node as Element).remove();
    }
  }

  // Locate the content root, most specific candidate first
  let content: Element | null = null;
  for (const candidate of ["main", "#quarto-document-content", "article"]) {
    content = clone.querySelector(candidate);
    if (content) {
      break;
    }
  }
  if (!content) {
    content = clone.body;
  }
  if (!content) {
    return "";
  }

  // Wrap the content in a bare-bones HTML document
  return [
    "<!DOCTYPE html>",
    "<html>",
    '<head><meta charset="utf-8"></head>',
    "<body>",
    `${content.innerHTML}`,
    "</body>",
    "</html>",
  ].join("\n");
}
136+
137+
/**
 * Convert HTML content to markdown using Pandoc with the llms.lua filter.
 *
 * The HTML is written to a temporary file, converted by Pandoc, and the
 * result is written to `outputPath`. A failed conversion is reported on
 * stderr and does not throw, so one bad page does not abort the render.
 *
 * @param htmlContent Complete HTML document to convert.
 * @param outputPath Destination path of the generated `.llms.md` file.
 */
async function convertHtmlToLlmsMarkdown(
  htmlContent: string,
  outputPath: string,
): Promise<void> {
  const filterPath = resourcePath("filters/llms/llms.lua");

  // Create the temp directory first and do everything else inside the
  // try block, so it is removed even if writing the input file fails.
  const tempDir = Deno.makeTempDirSync();

  try {
    const tempHtml = join(tempDir, "input.html");
    Deno.writeTextFileSync(tempHtml, htmlContent);

    // Pandoc does not create missing parent directories for `-o`; make sure
    // the destination directory exists so the page is not silently dropped.
    Deno.mkdirSync(dirname(outputPath), { recursive: true });

    // Run Pandoc to convert HTML to markdown
    // Use gfm-raw_html for clean markdown output:
    // - gfm gives us proper table and code block handling
    // - -raw_html strips remaining HTML tags, converting figures to markdown images
    // Note: We use plain "html" input format (not html-native_divs-native_spans)
    // because native_divs interferes with the Lua filter's callout processing
    const args = [
      tempHtml,
      "-f",
      "html",
      "-t",
      "gfm-raw_html",
      "--lua-filter",
      filterPath,
      "-o",
      outputPath,
      "--wrap=none",
    ];

    const result = await execProcess({
      cmd: pandocBinaryPath(),
      args,
      stdout: "piped",
      stderr: "piped",
    });

    if (!result.success) {
      console.error(`Error converting HTML to markdown: ${result.stderr}`);
    }
  } finally {
    // Cleanup temp files
    try {
      Deno.removeSync(tempDir, { recursive: true });
    } catch {
      // Ignore cleanup errors
    }
  }
}
185+
186+
/**
 * Generate the llms.txt index file after all HTML files have been rendered.
 * This is called during the post-render phase.
 *
 * The index lists every rendered HTML page that has a `.llms.md` companion on
 * disk, excluding the 404 page and (unless the draft mode makes drafts
 * visible) draft pages. Link targets are relative to the output directory and
 * always use forward slashes.
 *
 * @param context Project context for the website.
 * @param outputFiles Output files produced by this render.
 * @param incremental Whether this is an incremental render; an existing
 *   llms.txt is then left untouched.
 */
export async function updateLlmsTxt(
  context: ProjectContext,
  outputFiles: ProjectOutputFile[],
  incremental: boolean,
): Promise<void> {
  // Check if llms-txt is enabled
  if (!websiteConfigBoolean(kLlmsTxt, false, context.config)) {
    return;
  }

  const outputDir = projectOutputDir(context);
  const llmsTxtPath = join(outputDir, "llms.txt");

  // Match sitemap behavior: only regenerate on full render
  if (incremental && existsSync(llmsTxtPath)) {
    return;
  }

  // `||` is deliberate: an empty title/description falls back as well
  const siteTitle = websiteTitle(context.config) || "Untitled";
  const siteDesc = websiteDescription(context.config) || "";
  const draftsVisible = isDraftVisible(projectDraftMode(context));

  // Helper to check if output file is a draft
  const isDraft = async (outputFile: ProjectOutputFile): Promise<boolean> => {
    const index = await resolveInputTargetForOutputFile(
      context,
      relative(outputDir, outputFile.file),
    );
    return index?.draft ?? false;
  };

  // Filter out 404 page
  const doc404 = join(outputDir, kProject404File);

  // Collect all .llms.md files, excluding drafts and 404
  const llmsFiles: Array<{ path: string; title: string }> = [];

  for (const file of outputFiles) {
    // Skip 404 page
    if (file.file === doc404) {
      continue;
    }

    // Only HTML outputs have a .llms.md companion. Without this guard the
    // replace() below is a no-op for other outputs, existsSync() then matches
    // the output file itself, and it would be listed as a markdown page.
    if (!file.file.endsWith(".html")) {
      continue;
    }

    // Skip drafts unless in visible draft mode
    const draft = await isDraft(file);
    if (draft && !draftsVisible) {
      continue;
    }

    const llmsPath = file.file.replace(/\.html$/, ".llms.md");
    if (existsSync(llmsPath)) {
      // Extract title from the format metadata or use filename
      const title = (file.format.metadata?.title as string) ||
        basename(file.file, ".html");
      llmsFiles.push({
        // Links must use forward slashes regardless of the host OS
        path: pathWithForwardSlashes(relative(outputDir, llmsPath)),
        title,
      });
    }
  }

  // Generate llms.txt content
  const lines: string[] = [];
  lines.push(`# ${siteTitle}`);
  lines.push("");
  if (siteDesc) {
    lines.push(`> ${siteDesc}`);
    lines.push("");
  }
  lines.push("## Pages");
  lines.push("");
  for (const f of llmsFiles) {
    lines.push(`- [${f.title}](${f.path})`);
  }
  lines.push("");

  Deno.writeTextFileSync(llmsTxtPath, lines.join("\n"));
}

src/project/types/website/website.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,13 @@ import { updateSearchIndex } from "./website-search.ts";
5151
import {
5252
kDraftMode,
5353
kDrafts,
54+
kLlmsTxt,
5455
kSiteFavicon,
5556
kWebsite,
5657
} from "./website-constants.ts";
5758
import {
5859
websiteConfigArray,
60+
websiteConfigBoolean,
5961
websiteConfigString,
6062
websiteMetadataFields,
6163
websiteProjectConfig,
@@ -86,6 +88,7 @@ import { formatDate } from "../../../core/date.ts";
8688
import { projectExtensionPathResolver } from "../../../extension/extension.ts";
8789
import { websiteDraftPostProcessor } from "./website-draft.ts";
8890
import { projectDraftMode } from "./website-utils.ts";
91+
import { llmsHtmlFinalizer, updateLlmsTxt } from "./website-llms.ts";
8992
import { kFieldCategories } from "./listing/website-listing-shared.ts";
9093
import { pandocNativeStr } from "../../../core/pandoc/codegen.ts";
9194
import { asArray } from "../../../core/array.ts";
@@ -353,6 +356,13 @@ export const websiteProjectType: ProjectType = {
353356
extras.html[kHtmlPostprocessors].push(cookieDep.htmlPostProcessor);
354357
}
355358

359+
// Add llms.txt finalizer if enabled
360+
if (websiteConfigBoolean(kLlmsTxt, false, project.config)) {
361+
extras.html[kHtmlFinalizers]?.push(
362+
llmsHtmlFinalizer(source, project, format),
363+
);
364+
}
365+
356366
return Promise.resolve(extras);
357367
},
358368

@@ -425,6 +435,9 @@ export async function websitePostRender(
425435
// generate any page aliases
426436
await updateAliases(context, outputFiles, incremental);
427437

438+
// generate llms.txt index
439+
await updateLlmsTxt(context, outputFiles, incremental);
440+
428441
// write redirecting index.html if there is none
429442
await ensureIndexPage(context);
430443
}

0 commit comments

Comments
 (0)