|
| 1 | +/* |
| 2 | + * website-llms.ts |
| 3 | + * |
| 4 | + * Copyright (C) 2020-2024 Posit Software, PBC |
| 5 | + */ |
| 6 | + |
| 7 | +import { basename, dirname, join, relative } from "../../../deno_ral/path.ts"; |
| 8 | +import { existsSync } from "../../../deno_ral/fs.ts"; |
| 9 | + |
| 10 | +import { Document, Element } from "../../../core/deno-dom.ts"; |
| 11 | +import { execProcess } from "../../../core/process.ts"; |
| 12 | +import { dirAndStem, pathWithForwardSlashes } from "../../../core/path.ts"; |
| 13 | +import { pandocBinaryPath, resourcePath } from "../../../core/resources.ts"; |
| 14 | + |
| 15 | +import { kProject404File, ProjectContext } from "../../types.ts"; |
| 16 | +import { projectOutputDir } from "../../project-shared.ts"; |
| 17 | +import { ProjectOutputFile } from "../types.ts"; |
| 18 | + |
| 19 | +import { kLlmsTxt } from "./website-constants.ts"; |
| 20 | +import { |
| 21 | + websiteConfigBoolean, |
| 22 | + websiteDescription, |
| 23 | + websiteTitle, |
| 24 | +} from "./website-config.ts"; |
| 25 | +import { isDraftVisible, isProjectDraft, projectDraftMode } from "./website-utils.ts"; |
| 26 | +import { resolveInputTargetForOutputFile } from "../../project-index.ts"; |
| 27 | +import { Format } from "../../../config/types.ts"; |
| 28 | + |
/**
 * Map a source document to the location of its rendered HTML file.
 *
 * The source path is made relative to the project directory, its
 * extension is swapped for `.html`, and the result is re-rooted under
 * the project's output directory.
 *
 * @param source Path of the input document.
 * @param project Project the document belongs to.
 * @returns Path of the `.html` file inside the project output directory.
 */
function computeOutputFilePath(source: string, project: ProjectContext): string {
  const siteDir = projectOutputDir(project);
  const [relDir, stem] = dirAndStem(relative(project.dir, source));
  return join(siteDir, join(relDir, `${stem}.html`));
}
| 39 | + |
/**
 * Build the HTML finalizer that writes a `.llms.md` companion next to a
 * rendered page. Per the surrounding pipeline this is expected to run once
 * the HTML postprocessors are done.
 *
 * The returned callback does nothing when the `llms-txt` website option is
 * off, or when the page is a draft and the project's draft mode does not
 * make drafts visible.
 *
 * @param source Path of the input document being rendered.
 * @param project Project the document belongs to.
 * @param _format Render format (unused).
 * @returns Async callback that receives the rendered document.
 */
export function llmsHtmlFinalizer(
  source: string,
  project: ProjectContext,
  _format: Format,
) {
  return async (doc: Document): Promise<void> => {
    // Feature switch: nothing to do unless llms-txt is turned on
    if (!websiteConfigBoolean(kLlmsTxt, false, project.config)) {
      return;
    }

    const mode = projectDraftMode(project);

    // A page counts as a draft when either the rendered document carries a
    // quarto:status meta tag saying so, or the project config lists it.
    const pageStatus = doc
      .querySelector("meta[name='quarto:status']")
      ?.getAttribute("content");
    const markedInDocument = pageStatus === "draft" ||
      pageStatus === "draft-remove";
    const listedInProject = isProjectDraft(
      relative(project.dir, source),
      project,
    );

    if ((markedInDocument || listedInProject) && !isDraftVisible(mode)) {
      // Hidden draft: emit no markdown companion
      return;
    }

    // Reduce the page to its main content ...
    const pageHtml = extractMainContent(doc);

    // ... work out where the markdown sits (same place as the HTML,
    // with .llms.md in place of .html) ...
    const markdownPath = computeOutputFilePath(source, project)
      .replace(/\.html$/, ".llms.md");

    // ... and let Pandoc (with the llms.lua filter) do the conversion.
    await convertHtmlToLlmsMarkdown(pageHtml, markdownPath);
  };
}
| 84 | + |
/**
 * Produce a stripped-down HTML document holding only the page's main
 * content.
 *
 * Works on a deep clone so the caller's document is left untouched.
 * Navigation, sidebars, search UI, footer, scripts, styles, stylesheet
 * links, meta tags and noscript elements are dropped first; the content
 * root is then the first match of `main`, `#quarto-document-content`,
 * `article`, falling back to the body.
 *
 * @param doc Rendered HTML document.
 * @returns A minimal HTML document wrapping the content root's inner HTML,
 *          or the empty string when no content root is found.
 */
function extractMainContent(doc: Document): string {
  const copy = doc.cloneNode(true) as Document;

  // Page chrome and non-content elements that must not reach the output
  const unwanted = [
    "#quarto-header",
    ".nav-footer",
    "#quarto-sidebar",
    "#quarto-margin-sidebar",
    "#quarto-search-results",
    ".sidebar",
    ".quarto-search",
    "nav.navbar",
    "script",
    "style",
    "link[rel='stylesheet']",
    "meta",
    "noscript",
  ];
  for (const pattern of unwanted) {
    for (const node of copy.querySelectorAll(pattern)) {
      (node as Element).remove();
    }
  }

  // Probe the preferred content containers in order, stopping at the
  // first hit; the body is the last resort.
  let root: Element | null = null;
  for (const candidate of ["main", "#quarto-document-content", "article"]) {
    root = copy.querySelector(candidate);
    if (root) {
      break;
    }
  }
  const content = root ?? copy.body;
  if (!content) {
    return "";
  }

  // Wrap the content in a bare-bones HTML shell
  return [
    "<!DOCTYPE html>",
    "<html>",
    '<head><meta charset="utf-8"></head>',
    "<body>",
    content.innerHTML,
    "</body>",
    "</html>",
  ].join("\n");
}
| 136 | + |
/**
 * Convert HTML content to markdown using Pandoc with the llms.lua filter.
 *
 * The HTML is staged in a fresh temporary directory, converted, and the
 * temporary directory is removed afterwards regardless of the outcome.
 * A failed Pandoc run is reported on stderr via `console.error`; it does
 * not throw, so one bad page does not abort the caller.
 *
 * @param htmlContent Complete HTML document to convert.
 * @param outputPath Destination of the generated markdown file. Missing
 *                   parent directories are created.
 */
async function convertHtmlToLlmsMarkdown(
  htmlContent: string,
  outputPath: string,
): Promise<void> {
  const filterPath = resourcePath("filters/llms/llms.lua");

  // Pandoc does not create missing parent directories for its -o target,
  // and this may run before the matching output subdirectory exists.
  // `recursive: true` makes this a no-op when the directory is already there.
  Deno.mkdirSync(dirname(outputPath), { recursive: true });

  const tempDir = Deno.makeTempDirSync();
  try {
    // Stage the HTML inside the try block so that a failed write still
    // goes through the cleanup below instead of leaking the directory.
    const tempHtml = join(tempDir, "input.html");
    Deno.writeTextFileSync(tempHtml, htmlContent);

    // Run Pandoc to convert HTML to markdown
    // Use gfm-raw_html for clean markdown output:
    // - gfm gives us proper table and code block handling
    // - -raw_html strips remaining HTML tags, converting figures to markdown images
    // Note: We use plain "html" input format (not html-native_divs-native_spans)
    // because native_divs interferes with the Lua filter's callout processing
    const args = [
      tempHtml,
      "-f",
      "html",
      "-t",
      "gfm-raw_html",
      "--lua-filter",
      filterPath,
      "-o",
      outputPath,
      "--wrap=none",
    ];

    const result = await execProcess({
      cmd: pandocBinaryPath(),
      args,
      stdout: "piped",
      stderr: "piped",
    });

    if (!result.success) {
      console.error(`Error converting HTML to markdown: ${result.stderr}`);
    }
  } finally {
    // Best-effort cleanup of the staging directory
    try {
      Deno.removeSync(tempDir, { recursive: true });
    } catch {
      // Ignore cleanup errors
    }
  }
}
| 185 | + |
/**
 * Generate the llms.txt index file after all HTML files have been rendered.
 * This is called during the post-render phase.
 *
 * The index is written to `llms.txt` in the project output directory and
 * contains the site title, the site description (when set) as a blockquote,
 * and a "Pages" list with one markdown link per `.llms.md` file found next
 * to a rendered HTML output.
 *
 * Skipped entirely when the `llms-txt` website option is off, and on
 * incremental renders when an `llms.txt` already exists.
 *
 * @param context Project being rendered.
 * @param outputFiles Output files produced by the render.
 * @param incremental Whether this is an incremental (partial) render.
 */
export async function updateLlmsTxt(
  context: ProjectContext,
  outputFiles: ProjectOutputFile[],
  incremental: boolean,
): Promise<void> {
  // Check if llms-txt is enabled
  if (!websiteConfigBoolean(kLlmsTxt, false, context.config)) {
    return;
  }

  const outputDir = projectOutputDir(context);
  const llmsTxtPath = join(outputDir, "llms.txt");

  // Match sitemap behavior: only regenerate on full render
  if (incremental && existsSync(llmsTxtPath)) {
    return;
  }

  // Titles and descriptions end up on a single markdown line each, so
  // embedded line breaks / runs of whitespace are folded into one space.
  const singleLine = (text: string): string =>
    text.replace(/\s+/g, " ").trim();

  const siteTitle = singleLine(websiteTitle(context.config) || "") ||
    "Untitled";
  const siteDesc = singleLine(websiteDescription(context.config) || "");

  // Draft visibility is a project-wide setting; resolve it once
  const draftsVisible = isDraftVisible(projectDraftMode(context));

  // Helper to check if output file is a draft
  const isDraft = async (outputFile: ProjectOutputFile): Promise<boolean> => {
    const index = await resolveInputTargetForOutputFile(
      context,
      relative(outputDir, outputFile.file),
    );
    return index?.draft ?? false;
  };

  const htmlExtension = /\.html$/;
  const doc404 = join(outputDir, kProject404File);

  // Collect all .llms.md files, excluding non-HTML outputs, drafts and 404
  const llmsFiles: Array<{ path: string; title: string }> = [];

  for (const file of outputFiles) {
    // Only HTML outputs have a .llms.md companion. Without this guard the
    // extension swap below is a no-op for e.g. a PDF, the existence check
    // then succeeds on the PDF itself and it is listed as a page.
    if (!htmlExtension.test(file.file)) {
      continue;
    }

    // Skip 404 page
    if (file.file === doc404) {
      continue;
    }

    // Skip drafts unless in visible draft mode
    if (!draftsVisible && await isDraft(file)) {
      continue;
    }

    const llmsPath = file.file.replace(htmlExtension, ".llms.md");
    if (!existsSync(llmsPath)) {
      continue;
    }

    // Prefer the document title from the format metadata; anything that is
    // not a usable string or number falls back to the file name.
    const rawTitle: unknown = file.format.metadata?.title;
    const metaTitle =
      typeof rawTitle === "string" || typeof rawTitle === "number"
        ? singleLine(String(rawTitle))
        : "";

    llmsFiles.push({
      // Links must use forward slashes even when rendered on Windows
      path: pathWithForwardSlashes(relative(outputDir, llmsPath)),
      title: metaTitle || basename(file.file, ".html"),
    });
  }

  // Generate llms.txt content
  const lines: string[] = [`# ${siteTitle}`, ""];
  if (siteDesc) {
    lines.push(`> ${siteDesc}`, "");
  }
  lines.push("## Pages", "");
  for (const entry of llmsFiles) {
    lines.push(`- [${entry.title}](${entry.path})`);
  }
  lines.push("");

  Deno.writeTextFileSync(llmsTxtPath, lines.join("\n"));
}
0 commit comments