Skip to content

Commit cc4e8f6

Browse files
cscheid and claude authored
feat(website): add llms.txt support for LLM-friendly content (#13932)
* feat(website): add llms.txt support for LLM-friendly content Add support for generating llms.txt and .llms.md files for Quarto websites, providing LLM-friendly markdown versions of HTML pages. Features: - New `llms-txt: true` option in website config - Generates .llms.md companion files alongside HTML output - Creates llms.txt index file linking to all markdown pages - Converts HTML to clean markdown using Pandoc with Lua filter - Handles callouts (blockquotes with bold type markers) - Converts images to markdown syntax - Converts internal links from .html to .llms.md - Respects draft settings (excludes drafts from output) - Cleans listing pages (removes empty links, category badges) - Matches sitemap behavior for incremental builds New files: - src/project/types/website/website-llms.ts - src/resources/filters/llms/llms.lua Test coverage: - Basic file generation - Content conversion (callouts, code, tables, links) - Draft handling - Listing page cleanup Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(website): exclude .llms.md files from rendering and fix llms-txt tests - Add **/*.llms.md to projectHiddenIgnoreGlob() to prevent cascading renders of llms.txt companion files - Fix ensureLlmsTxt* test functions to use dirname(htmlFile) instead of treating file path as directory - Update llms-txt test files to use correct two-element array format for regex matches [matches, no_matches] - Add render-project: true where needed for llms.txt generation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(website): normalize path separators in llms.txt for Windows compatibility Use pathWithForwardSlashes() to ensure paths in llms.txt use forward slashes on all platforms. Also adds changelog entry for the llms-txt feature. 
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * Use absolute URLs in llms.txt when site-url is configured When a website has site-url set, the llms.txt index file now generates absolute URLs for page links (e.g., https://example.com/about.llms.md) instead of relative paths. Falls back to relative paths when site-url is not configured. Follows the same pattern as sitemap.xml generation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent fd74e00 commit cc4e8f6

30 files changed

Lines changed: 854 additions & 28 deletions

news/changelog-1.9.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ All changes included in 1.9:
106106
- ([#13716](https://github.com/quarto-dev/quarto-cli/issues/13716)): Fix draft pages showing blank during preview when pre-render scripts are configured.
107107
- ([#13847](https://github.com/quarto-dev/quarto-cli/pull/13847)): Open graph title with markdown is now processed correctly. (author: @mcanouil)
108108
- ([#13910](https://github.com/quarto-dev/quarto-cli/issues/13910)): Add support for `logo: false` to disable sidebar and navbar logos when using `_brand.yml`. Works in website projects (`sidebar.logo: false`, `navbar.logo: false`) and book projects (`book.sidebar.logo: false`, `book.navbar.logo: false`).
109+
- ([#13932](https://github.com/quarto-dev/quarto-cli/pull/13932)): Add `llms-txt: true` option to generate LLM-friendly content for websites. Creates `.llms.md` markdown files alongside HTML pages and a root `llms.txt` index file following the [llms.txt](https://llmstxt.org/) specification.
109110
- ([#13951](https://github.com/quarto-dev/quarto-cli/issues/13951)): Fix `image-lazy-loading` not applying `loading="lazy"` attribute to auto-detected listing images.
110111
- ([#14003](https://github.com/quarto-dev/quarto-cli/pull/14003)): Add text fragments to search result links so browsers scroll to and highlight the matched text on the target page.
111112

src/project/project-context.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,11 +877,12 @@ export function projectYamlFiles(dir: string): string[] {
877877

878878
function projectHiddenIgnoreGlob(dir: string) {
879879
return projectIgnoreGlobs(dir) // standard ignores for all projects
880-
.concat(["**/_*", "**/_*/**"]) // underscore prefx
880+
.concat(["**/_*", "**/_*/**"]) // underscore prefix
881881
.concat(["**/.*", "**/.*/**"]) // hidden (dot prefix)
882882
.concat(["**/README.?([Rrq])md"]) // README
883883
.concat(["**/CLAUDE.md"]) // Anthropic claude code file
884-
.concat(["**/AGENTS.md"]); // https://agents.md/
884+
.concat(["**/AGENTS.md"]) // https://agents.md/
885+
.concat(["**/*.llms.md"]); // llms.txt companion markdown files
885886
}
886887

887888
export const projectInputFiles = makeTimedFunctionAsync(

src/project/types/website/website-config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ type WebsiteConfigKey =
7777
| "other-links"
7878
| "code-links"
7979
| "reader-mode"
80+
| "llms-txt"
8081
| "announcement"
8182
| "draft-mode"
8283
| "drafts";

src/project/types/website/website-constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export const kSiteRepoLinkRel = "repo-link-rel";
1818
export const kSiteIssueUrl = "issue-url";
1919
export const kSiteRepoActions = "repo-actions";
2020
export const kSiteReaderMode = "reader-mode";
21+
export const kLlmsTxt = "llms-txt";
2122

2223
export const kSiteNavbar = "navbar";
2324
export const kSiteSidebar = "sidebar";
src/project/types/website/website-llms.ts

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
/*
2+
* website-llms.ts
3+
*
4+
* Copyright (C) 2020-2024 Posit Software, PBC
5+
*/
6+
7+
import { basename, join, relative } from "../../../deno_ral/path.ts";
8+
import { existsSync } from "../../../deno_ral/fs.ts";
9+
import { pathWithForwardSlashes } from "../../../core/path.ts";
10+
11+
import { Document, Element } from "../../../core/deno-dom.ts";
12+
import { execProcess } from "../../../core/process.ts";
13+
import { pandocBinaryPath, resourcePath } from "../../../core/resources.ts";
14+
15+
import { kProject404File, ProjectContext } from "../../types.ts";
16+
import { projectOutputDir } from "../../project-shared.ts";
17+
import { ProjectOutputFile } from "../types.ts";
18+
19+
import { kLlmsTxt } from "./website-constants.ts";
20+
import {
21+
websiteBaseurl,
22+
websiteConfigBoolean,
23+
websiteDescription,
24+
websiteTitle,
25+
} from "./website-config.ts";
26+
import { inputFileHref } from "./website-shared.ts";
27+
import { isDraftVisible, isProjectDraft, projectDraftMode } from "./website-utils.ts";
28+
import { resolveInputTargetForOutputFile } from "../../project-index.ts";
29+
import { Format } from "../../../config/types.ts";
30+
31+
/**
 * Map a source document to the HTML file it renders to inside the project's
 * output directory.
 *
 * The source path is made relative to the project directory, converted to a
 * site href with inputFileHref, and that href (minus its first character) is
 * joined onto the output directory.
 *
 * @param source Path of the input document.
 * @param project Project the document belongs to.
 * @returns Path of the corresponding HTML file under the output directory.
 */
function computeOutputFilePath(source: string, project: ProjectContext): string {
  const siteRoot = projectOutputDir(project);
  const href = inputFileHref(relative(project.dir, source));
  // the href is site-absolute ("/path/to/file.html"); drop the leading "/" so
  // join() places it beneath the output directory
  const hrefBelowRoot = href.slice(1);
  return join(siteRoot, hrefBelowRoot);
}
43+
44+
/**
 * Build the HTML finalizer that writes a `.llms.md` companion file for a
 * rendered page.
 *
 * The returned callback is a no-op when `llms-txt` is not enabled in the
 * website config, or when the page is a draft and the project's draft mode
 * does not make drafts visible. Otherwise it extracts the page's main content
 * and converts it to markdown at the page's output path with `.html` replaced
 * by `.llms.md`.
 *
 * @param source Path of the input document being rendered.
 * @param project Project the document belongs to.
 * @param _format Not used.
 * @returns Async callback that receives the page's HTML document.
 */
export function llmsHtmlFinalizer(
  source: string,
  project: ProjectContext,
  _format: Format,
) {
  const finalize = async (doc: Document): Promise<void> => {
    // Nothing to do unless the feature is switched on
    const enabled = websiteConfigBoolean(kLlmsTxt, false, project.config);
    if (!enabled) {
      return;
    }

    const draftMode = projectDraftMode(project);

    // A page counts as a draft if its quarto:status meta tag says so...
    const pageStatus = doc
      .querySelector("meta[name='quarto:status']")
      ?.getAttribute("content");
    const flaggedByStatus = pageStatus === "draft" ||
      pageStatus === "draft-remove";

    // ...or if the project config lists it among its drafts
    const listedInConfig = isProjectDraft(
      relative(project.dir, source),
      project,
    );

    // Drafts are skipped unless the draft mode makes them visible
    if ((flaggedByStatus || listedInConfig) && !isDraftVisible(draftMode)) {
      return;
    }

    // Reduce the page to its main content
    const mainHtml = extractMainContent(doc);

    // The companion file sits next to the page's HTML output
    const htmlPath = computeOutputFilePath(source, project);
    const companionPath = htmlPath.replace(/\.html$/, ".llms.md");

    // Pandoc plus the llms.lua filter does the HTML -> markdown conversion
    await convertHtmlToLlmsMarkdown(mainHtml, companionPath);
  };

  return finalize;
}
88+
89+
/**
 * Produce a minimal standalone HTML document containing only a page's main
 * content.
 *
 * Works on a deep clone so the caller's document is left untouched. Site
 * chrome (header, sidebars, navbar, footer, search UI) and non-content
 * elements (script, style, stylesheet links, meta, noscript) are removed from
 * the clone first. The content root is then the first match of `main`,
 * `#quarto-document-content`, `article`, falling back to the body.
 *
 * @param doc Parsed HTML document of the rendered page.
 * @returns A small HTML document wrapping the content root's inner HTML, or
 *   an empty string when no content root can be found.
 */
function extractMainContent(doc: Document): string {
  // Operate on a copy; the original document is still needed by the renderer
  const copy = doc.cloneNode(true) as Document;

  // Everything matching one of these selectors is dropped from the llms output
  const unwanted = [
    "#quarto-header",
    ".nav-footer",
    "#quarto-sidebar",
    "#quarto-margin-sidebar",
    "#quarto-search-results",
    ".sidebar",
    ".quarto-search",
    "nav.navbar",
    "script",
    "style",
    "link[rel='stylesheet']",
    "meta",
    "noscript",
  ];
  unwanted.forEach((selector) => {
    Array.from(copy.querySelectorAll(selector)).forEach((node) => {
      (node as Element).remove();
    });
  });

  // Preferred content roots, most specific first; stop at the first hit
  const rootCandidates = ["main", "#quarto-document-content", "article"];
  let contentRoot = null;
  for (const candidate of rootCandidates) {
    contentRoot = copy.querySelector(candidate);
    if (contentRoot) {
      break;
    }
  }
  const root = contentRoot || copy.body;

  if (!root) {
    return "";
  }

  // Wrap the content in a bare-bones HTML shell
  return `<!DOCTYPE html>
<html>
<head><meta charset="utf-8"></head>
<body>
${root.innerHTML}
</body>
</html>`;
}
140+
141+
/**
 * Convert HTML content to markdown using Pandoc with the llms.lua filter.
 *
 * The HTML is staged as `input.html` in a fresh temporary directory, Pandoc
 * writes the markdown to `outputPath`, and the temporary directory is removed
 * afterwards whether or not staging or conversion succeeded.
 *
 * A failed Pandoc run is reported with console.error and does not throw.
 *
 * @param htmlContent HTML document to convert.
 * @param outputPath Path of the markdown file Pandoc should write.
 */
async function convertHtmlToLlmsMarkdown(
  htmlContent: string,
  outputPath: string,
): Promise<void> {
  const filterPath = resourcePath("filters/llms/llms.lua");

  // Create a temporary directory for the HTML content
  const tempDir = Deno.makeTempDirSync();

  try {
    // Stage the input inside the try block so the temporary directory is
    // still removed if the write itself fails
    const tempHtml = join(tempDir, "input.html");
    Deno.writeTextFileSync(tempHtml, htmlContent);

    // Run Pandoc to convert HTML to markdown
    // Use gfm-raw_html for clean markdown output:
    // - gfm gives us proper table and code block handling
    // - -raw_html strips remaining HTML tags, converting figures to markdown images
    // Note: We use plain "html" input format (not html-native_divs-native_spans)
    // because native_divs interferes with the Lua filter's callout processing
    const result = await execProcess({
      cmd: pandocBinaryPath(),
      args: [
        tempHtml,
        "-f",
        "html",
        "-t",
        "gfm-raw_html",
        "--lua-filter",
        filterPath,
        "-o",
        outputPath,
        "--wrap=none",
      ],
      stdout: "piped",
      stderr: "piped",
    });

    if (!result.success) {
      console.error(`Error converting HTML to markdown: ${result.stderr}`);
    }
  } finally {
    // Cleanup temp files
    try {
      Deno.removeSync(tempDir, { recursive: true });
    } catch {
      // Ignore cleanup errors: a leftover temp directory must not fail the render
    }
  }
}
189+
190+
/**
 * Generate the llms.txt index file after all HTML files have been rendered.
 * This is called during the post-render phase.
 *
 * Does nothing when `llms-txt` is not enabled, or when this is an incremental
 * render and llms.txt already exists (matching sitemap behavior: the index is
 * only regenerated on a full render).
 *
 * Only HTML outputs that have a `.llms.md` companion file on disk are listed.
 * The 404 page is always skipped, and drafts are skipped unless the draft mode
 * makes them visible. Links are absolute when the website has a base URL and
 * relative to the output directory otherwise.
 *
 * @param context Project being rendered.
 * @param outputFiles Output files produced by the render.
 * @param incremental Whether this is an incremental (partial) render.
 */
export async function updateLlmsTxt(
  context: ProjectContext,
  outputFiles: ProjectOutputFile[],
  incremental: boolean,
): Promise<void> {
  // Check if llms-txt is enabled
  if (!websiteConfigBoolean(kLlmsTxt, false, context.config)) {
    return;
  }

  const outputDir = projectOutputDir(context);
  const llmsTxtPath = join(outputDir, "llms.txt");

  // Match sitemap behavior: only regenerate on full render
  if (incremental && existsSync(llmsTxtPath)) {
    return;
  }

  const siteTitle = websiteTitle(context.config) || "Untitled";
  const siteDesc = websiteDescription(context.config) || "";
  const baseUrl = websiteBaseurl(context.config);

  // Loop-invariant values, computed once
  const urlPrefix = baseUrl
    ? (baseUrl.endsWith("/") ? baseUrl : baseUrl + "/")
    : "";
  const draftsVisible = isDraftVisible(projectDraftMode(context));
  const htmlExtension = /\.html$/;

  // Helper to check if output file is a draft
  const isDraft = async (outputFile: ProjectOutputFile): Promise<boolean> => {
    const index = await resolveInputTargetForOutputFile(
      context,
      relative(outputDir, outputFile.file),
    );
    return index?.draft ?? false;
  };

  // Filter out 404 page
  const doc404 = join(outputDir, kProject404File);

  // Collect all .llms.md files, excluding drafts and 404
  const llmsFiles: Array<{ path: string; title: string }> = [];

  for (const file of outputFiles) {
    // Only HTML outputs have a companion file. Without this guard the
    // extension replacement below leaves a non-HTML path unchanged, and the
    // output file itself would pass the existence check and be listed.
    if (!htmlExtension.test(file.file)) {
      continue;
    }

    // Skip 404 page
    if (file.file === doc404) {
      continue;
    }

    // Skip drafts unless in visible draft mode (no lookup needed when visible)
    if (!draftsVisible && await isDraft(file)) {
      continue;
    }

    const llmsPath = file.file.replace(htmlExtension, ".llms.md");
    if (!existsSync(llmsPath)) {
      continue;
    }

    // Use the title from the format metadata when it is usable text,
    // otherwise fall back to the file name
    const metaTitle: unknown = file.format.metadata?.title;
    let title = basename(file.file, ".html");
    if (typeof metaTitle === "string" && metaTitle.trim() !== "") {
      title = metaTitle;
    } else if (typeof metaTitle === "number") {
      title = String(metaTitle);
    }

    const relativePath = pathWithForwardSlashes(relative(outputDir, llmsPath));
    llmsFiles.push({
      path: urlPrefix + relativePath,
      title,
    });
  }

  // Generate llms.txt content
  const lines: string[] = [];
  lines.push(`# ${siteTitle}`);
  lines.push("");
  if (siteDesc) {
    lines.push(`> ${siteDesc}`);
    lines.push("");
  }
  lines.push("## Pages");
  lines.push("");
  for (const f of llmsFiles) {
    lines.push(`- [${f.title}](${f.path})`);
  }
  lines.push("");

  Deno.writeTextFileSync(llmsTxtPath, lines.join("\n"));
}

src/project/types/website/website.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,13 @@ import { updateSearchIndex } from "./website-search.ts";
5151
import {
5252
kDraftMode,
5353
kDrafts,
54+
kLlmsTxt,
5455
kSiteFavicon,
5556
kWebsite,
5657
} from "./website-constants.ts";
5758
import {
5859
websiteConfigArray,
60+
websiteConfigBoolean,
5961
websiteConfigString,
6062
websiteMetadataFields,
6163
websiteProjectConfig,
@@ -86,6 +88,7 @@ import { formatDate } from "../../../core/date.ts";
8688
import { projectExtensionPathResolver } from "../../../extension/extension.ts";
8789
import { websiteDraftPostProcessor } from "./website-draft.ts";
8890
import { projectDraftMode } from "./website-utils.ts";
91+
import { llmsHtmlFinalizer, updateLlmsTxt } from "./website-llms.ts";
8992
import { kFieldCategories } from "./listing/website-listing-shared.ts";
9093
import { pandocNativeStr } from "../../../core/pandoc/codegen.ts";
9194
import { asArray } from "../../../core/array.ts";
@@ -353,6 +356,13 @@ export const websiteProjectType: ProjectType = {
353356
extras.html[kHtmlPostprocessors].push(cookieDep.htmlPostProcessor);
354357
}
355358

359+
// Add llms.txt finalizer if enabled
360+
if (websiteConfigBoolean(kLlmsTxt, false, project.config)) {
361+
extras.html[kHtmlFinalizers]?.push(
362+
llmsHtmlFinalizer(source, project, format),
363+
);
364+
}
365+
356366
return Promise.resolve(extras);
357367
},
358368

@@ -425,6 +435,9 @@ export async function websitePostRender(
425435
// generate any page aliases
426436
await updateAliases(context, outputFiles, incremental);
427437

438+
// generate llms.txt index
439+
await updateLlmsTxt(context, outputFiles, incremental);
440+
428441
// write redirecting index.html if there is none
429442
await ensureIndexPage(context);
430443
}

0 commit comments

Comments
 (0)