Skip to content

Commit a2e3409

Browse files
committed
Enable markdown content requests
Allows agents to request markdown instead of HTML.
1 parent b5d965f commit a2e3409

2 files changed

Lines changed: 289 additions & 0 deletions

File tree

netlify.toml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,3 +17,7 @@
1717
for = "/.well-known/api-catalog"
1818
[headers.values]
1919
Content-Type = "application/linkset+json; profile=\"https://www.rfc-editor.org/info/rfc9727\""
20+
21+
# Route every request path ("/*") through the "markdown-negotiation" edge function.
[[edge_functions]]
22+
path = "/*"
23+
function = "markdown-negotiation"
Lines changed: 285 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,285 @@
1+
/** `Content-Type` header value set on responses whose body has been converted to Markdown. */
const MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8";
2+
3+
/**
 * Edge-function entry point: returns a Markdown rendition of an HTML response
 * when the client asks for `text/markdown` in its `Accept` header.
 *
 * The downstream response is passed through untouched when the client did not
 * ask for Markdown, when the response is not `text/html`, or when the response
 * has no body to convert (HEAD requests and null-body statuses).
 *
 * @param request - Incoming request; its `Accept` header and URL are read.
 * @param context - Provides `next()`, which resolves to the downstream response.
 * @returns The original response, or a new response carrying the Markdown body.
 */
export default async (request: Request, context: { next: () => Promise<Response> }) => {
  // Statuses that the Fetch standard defines as never carrying a body.
  const nullBodyStatuses = [101, 103, 204, 205, 304];

  const response = await context.next();

  if (!wantsMarkdown(request)) {
    return response;
  }

  const contentType = response.headers.get("content-type") || "";
  if (!contentType.toLowerCase().includes("text/html")) {
    return response;
  }

  // Nothing to convert, and constructing a Response with a body for a
  // null-body status throws a TypeError.
  if (
    request.method === "HEAD" ||
    response.body === null ||
    nullBodyStatuses.includes(response.status)
  ) {
    return response;
  }

  const html = await response.text();
  const markdown = htmlToMarkdown(html, new URL(request.url));

  const headers = new Headers(response.headers);
  headers.set("content-type", MARKDOWN_CONTENT_TYPE);
  headers.set("x-markdown-tokens", estimateTokenCount(markdown).toString());
  // These describe the original HTML payload, not the Markdown body that
  // replaces it, so they must not be forwarded.
  headers.delete("content-length");
  headers.delete("content-encoding");
  setVaryAccept(headers);

  return new Response(markdown, {
    status: response.status,
    statusText: response.statusText,
    headers,
  });
};
29+
30+
/**
 * Reports whether the request's `Accept` header lists `text/markdown` as an
 * acceptable media type.
 *
 * Each comma-separated media range is compared by exact (case-insensitive)
 * type, so lookalikes such as `text/markdown-extra` do not match, and a range
 * with `q=0` is treated as "not acceptable". An unparsable `q` value is
 * treated leniently as acceptable.
 *
 * @param request - Request whose `Accept` header is inspected.
 * @returns `true` when Markdown is acceptable to the client.
 */
function wantsMarkdown(request: Request): boolean {
  const accept = request.headers.get("accept");
  if (!accept) {
    return false;
  }

  return accept.split(",").some((range) => {
    const [mediaType = "", ...params] = range
      .split(";")
      .map((part) => part.trim().toLowerCase());
    if (mediaType !== "text/markdown") {
      return false;
    }

    const weight = params.find((param) => param.startsWith("q="));
    if (weight === undefined) {
      return true;
    }

    const q = Number.parseFloat(weight.slice(2));
    return Number.isNaN(q) || q > 0;
  });
}
38+
39+
/**
 * Ensures the `Vary` header names `Accept`, mutating `headers` in place.
 *
 * - No `Vary` header (or one with no members): set to `Accept`.
 * - `Accept` already listed (any letter case): left untouched.
 * - `*` listed: left untouched, since it already covers every request header.
 * - Otherwise `Accept` is appended; empty list members are dropped so a
 *   trailing comma does not yield `Origin,, Accept`.
 *
 * @param headers - Response headers to update.
 */
function setVaryAccept(headers: Headers): void {
  const members = (headers.get("vary") ?? "")
    .split(",")
    .map((member) => member.trim())
    .filter(Boolean);

  if (members.length === 0) {
    headers.set("vary", "Accept");
    return;
  }

  const lowered = members.map((member) => member.toLowerCase());
  if (lowered.includes("*") || lowered.includes("accept")) {
    return;
  }

  headers.set("vary", [...members, "Accept"].join(", "));
}
55+
56+
/**
 * Estimates how many tokens a Markdown string occupies, using the rough
 * "four characters per token" heuristic commonly used for GPT-style budgeting.
 *
 * @param markdown - Text to measure; its UTF-16 length is used.
 * @returns The estimate, rounded up and never less than 1.
 */
function estimateTokenCount(markdown: string): number {
  const charsPerToken = 4;
  const estimate = Math.ceil(markdown.length / charsPerToken);
  return Math.max(1, estimate);
}
60+
61+
/**
 * Converts an HTML document into Markdown.
 *
 * `<script>`, `<style>` and `<noscript>` elements are removed before
 * rendering. When the document has a non-empty `<title>`, the output starts
 * with YAML front matter carrying the title, followed by an H1 with the same
 * text, followed by the rendered body.
 *
 * @param html - Raw HTML source.
 * @param baseUrl - URL used to resolve relative links and image sources.
 * @returns Markdown ending in exactly one newline, or `""` if parsing yields no document.
 */
function htmlToMarkdown(html: string, baseUrl: URL): string {
  // NOTE(review): this relies on a global DOMParser; confirm the deployment
  // runtime provides one (non-browser runtimes may need a DOM library).
  const doc = new DOMParser().parseFromString(html, "text/html");
  if (!doc) {
    return "";
  }

  doc.querySelectorAll("script, style, noscript").forEach((node) => node.remove());

  const title = normalizeWhitespace(doc.querySelector("title")?.textContent || "");
  const bodyMarkdown = doc.body ? renderChildren(doc.body, baseUrl) : "";

  const parts: string[] = [];
  if (title) {
    // A JSON string literal is a valid YAML double-quoted scalar, so titles
    // containing ':', '#', quotes or a leading '-' cannot corrupt the front matter.
    parts.push(`---\ntitle: ${JSON.stringify(title)}\n---`);
    parts.push(`# ${title}`);
  }

  if (bodyMarkdown) {
    parts.push(bodyMarkdown.trim());
  }

  return parts.join("\n\n").trim() + "\n";
}
87+
88+
/**
 * Renders every child node of `parent` as a separate block and joins the
 * non-empty results with a blank line.
 *
 * Each child is rendered with a list depth of 0.
 *
 * @param parent - Element whose child nodes are rendered.
 * @param baseUrl - URL used to resolve relative links and image sources.
 * @returns The joined Markdown, or `""` when no child produced output.
 */
function renderChildren(parent: Element, baseUrl: URL): string {
  const chunks: string[] = [];
  for (const child of Array.from(parent.childNodes)) {
    // Trim before testing for emptiness: a whitespace-only result such as the
    // "\n" produced for <br> would otherwise add an empty chunk and create a
    // run of extra blank lines in the output.
    const rendered = renderNode(child, baseUrl, 0).trim();
    if (rendered) {
      chunks.push(rendered);
    }
  }
  return chunks.join("\n\n");
}
98+
99+
/**
 * Renders a single DOM node as a block of Markdown.
 *
 * Text nodes yield their whitespace-normalized text; nodes that are neither
 * text nor elements yield `""`. Elements are dispatched by tag name:
 * headings, paragraphs, links, images, preformatted and inline code,
 * emphasis, blockquotes, lists, list items, line breaks and horizontal rules
 * have dedicated handling. Any other element is rendered as the blocks of its
 * children.
 *
 * @param node - Node to render.
 * @param baseUrl - URL used to resolve relative links and image sources.
 * @param listDepth - Nesting depth passed on when the node is a list.
 * @returns Markdown for the node; may be `""`.
 */
function renderNode(node: Node, baseUrl: URL, listDepth: number): string {
  // Numeric values of Node.TEXT_NODE / Node.ELEMENT_NODE per the DOM standard.
  // Using literals avoids depending on a global `Node` constructor, which is
  // not guaranteed to exist outside browsers.
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;

  // Length of the longest run of consecutive backticks in `value`.
  const longestBacktickRun = (value: string): number =>
    (value.match(/`+/g) ?? []).reduce((max, run) => Math.max(max, run.length), 0);

  if (node.nodeType === TEXT_NODE) {
    return normalizeWhitespace(node.textContent || "");
  }

  if (node.nodeType !== ELEMENT_NODE) {
    return "";
  }

  const el = node as Element;
  const tag = el.tagName.toLowerCase();

  if (/^h[1-6]$/.test(tag)) {
    const level = Number(tag.charAt(1));
    // A heading must stay on one line, so any line break inside it becomes a space.
    const text = renderInlineChildren(el, baseUrl).replace(/\s*\n\s*/g, " ");
    return text ? `${"#".repeat(level)} ${text}` : "";
  }

  if (tag === "p") {
    return renderInlineChildren(el, baseUrl);
  }

  if (tag === "a") {
    const text = renderInlineChildren(el, baseUrl) || normalizeWhitespace(el.getAttribute("href") || "");
    const href = toAbsoluteUrl(el.getAttribute("href"), baseUrl);
    return href ? `[${text}](${href})` : text;
  }

  if (tag === "img") {
    const alt = normalizeWhitespace(el.getAttribute("alt") || "image");
    const src = toAbsoluteUrl(el.getAttribute("src"), baseUrl);
    return src ? `![${alt}](${src})` : "";
  }

  if (tag === "pre") {
    // Drop leading blank lines and trailing whitespace only; a plain trim()
    // would also strip the indentation of the first code line.
    const code = (el.textContent || "").replace(/^(?:[^\S\n]*\n)+/, "").replace(/\s+$/, "");
    if (!code) {
      return "";
    }
    // The fence must be longer than any backtick run inside the code,
    // otherwise the content would close the block early.
    const fence = "`".repeat(Math.max(3, longestBacktickRun(code) + 1));
    return `${fence}\n${code}\n${fence}`;
  }

  if (tag === "code") {
    const code = normalizeWhitespace(el.textContent || "");
    if (!code) {
      return "";
    }
    // Same rule for code spans: use a longer delimiter than any run inside,
    // and pad with a space when the content itself touches a backtick edge.
    const ticks = "`".repeat(longestBacktickRun(code) + 1);
    const pad = code.startsWith("`") || code.endsWith("`") ? " " : "";
    return `${ticks}${pad}${code}${pad}${ticks}`;
  }

  // Emphasis that appears directly in a block context keeps its markers,
  // matching what renderInlineChildren emits for the same tags.
  if (tag === "strong" || tag === "b") {
    const text = renderInlineChildren(el, baseUrl);
    return text ? `**${text}**` : "";
  }

  if (tag === "em" || tag === "i") {
    const text = renderInlineChildren(el, baseUrl);
    return text ? `*${text}*` : "";
  }

  if (tag === "blockquote") {
    const inner = renderChildren(el, baseUrl);
    if (!inner) {
      // An empty blockquote would otherwise produce a lone ">".
      return "";
    }
    return inner
      .split("\n")
      .map((line) => (line ? `> ${line}` : ">"))
      .join("\n");
  }

  if (tag === "ul" || tag === "ol") {
    return renderList(el, baseUrl, tag === "ol", listDepth);
  }

  if (tag === "li") {
    return renderInlineChildren(el, baseUrl);
  }

  if (tag === "br") {
    return "\n";
  }

  if (tag === "hr") {
    return "---";
  }

  return renderChildren(el, baseUrl);
}
169+
170+
/**
 * Renders a `<ul>` or `<ol>` element as a Markdown list.
 *
 * Only direct `<li>` children are rendered. Each item's own content is
 * rendered from a copy of the item with its directly nested lists removed, so
 * those lists appear exactly once: underneath the item, one level deeper.
 * Items that have neither content nor nested lists are skipped and do not
 * consume a number in ordered lists.
 *
 * @param list - The list element.
 * @param baseUrl - URL used to resolve relative links and image sources.
 * @param ordered - `true` for numbered items, `false` for `- ` bullets.
 * @param listDepth - Nesting depth of this list; 0 for a top-level list.
 * @returns The rendered list, or `""` when no item produced output.
 */
function renderList(list: Element, baseUrl: URL, ordered: boolean, listDepth: number): string {
  const isListElement = (el: Element): boolean => {
    const name = el.tagName.toLowerCase();
    return name === "ul" || name === "ol";
  };

  // Four spaces per level puts a nested marker inside the parent item's
  // content for both "- " and "N. " prefixes, which CommonMark requires for
  // the list to actually nest.
  const indent = "    ".repeat(listDepth);
  const lines: string[] = [];

  let index = 1;
  for (const item of Array.from(list.children)) {
    if (item.tagName.toLowerCase() !== "li") {
      continue;
    }

    // Render the item's own content without its nested lists; they are
    // rendered separately below.
    const own = item.cloneNode(true) as Element;
    for (const child of Array.from(own.children)) {
      if (isListElement(child)) {
        child.remove();
      }
    }
    const content = renderChildren(own, baseUrl) || renderInlineChildren(own, baseUrl);

    const nestedLists = Array.from(item.children)
      .filter(isListElement)
      .map((nested) => renderList(nested, baseUrl, nested.tagName.toLowerCase() === "ol", listDepth + 1))
      .filter(Boolean);

    if (!content && nestedLists.length === 0) {
      continue;
    }

    const prefix = ordered ? `${index}. ` : "- ";
    // Continuation lines line up with the text after the marker; blank lines
    // stay empty so no trailing whitespace is emitted.
    const hangingIndent = indent + " ".repeat(prefix.length);
    const rendered = content
      .split("\n")
      .map((line, i) => {
        if (i === 0) {
          return `${indent}${prefix}${line}`;
        }
        return line ? `${hangingIndent}${line}` : "";
      })
      .join("\n")
      .trimEnd();

    lines.push(rendered);
    index += 1;
    lines.push(...nestedLists);
  }

  return lines.join("\n");
}
207+
208+
/**
 * Renders the children of `parent` as a single run of inline Markdown.
 *
 * Text is whitespace-collapsed but keeps the spacing found in the source, so
 * no space is invented between an inline element and adjacent punctuation.
 * Links, images, code spans, bold and italic get Markdown syntax; `<br>`
 * becomes a newline. Any other element is flattened to its inline content and
 * separated from its neighbours by spaces.
 *
 * @param parent - Element whose child nodes are rendered.
 * @param baseUrl - URL used to resolve relative links and image sources.
 * @returns Trimmed inline Markdown; may contain newlines from `<br>`.
 */
function renderInlineChildren(parent: Element, baseUrl: URL): string {
  // Numeric values of Node.TEXT_NODE / Node.ELEMENT_NODE per the DOM standard.
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;

  // Whitespace just inside an inline element's edges is dropped when its
  // content is rendered; re-attach it outside the Markdown markers so that
  // "a<b> b </b>c" still reads "a **b** c".
  const withEdgeSpace = (el: Element, rendered: string): string => {
    const raw = el.textContent || "";
    const lead = /^\s/.test(raw) ? " " : "";
    const trail = /\s$/.test(raw) ? " " : "";
    return `${lead}${rendered}${trail}`;
  };

  let out = "";
  for (const child of Array.from(parent.childNodes)) {
    if (child.nodeType === TEXT_NODE) {
      out += (child.textContent || "").replace(/\s+/g, " ");
      continue;
    }

    if (child.nodeType !== ELEMENT_NODE) {
      continue;
    }

    const el = child as Element;
    const tag = el.tagName.toLowerCase();

    if (tag === "br") {
      out += "\n";
      continue;
    }

    if (tag === "img") {
      const alt = normalizeWhitespace(el.getAttribute("alt") || "image");
      const src = toAbsoluteUrl(el.getAttribute("src"), baseUrl);
      if (src) {
        out += `![${alt}](${src})`;
      }
      continue;
    }

    if (tag === "a") {
      const text = renderInlineChildren(el, baseUrl) || normalizeWhitespace(el.getAttribute("href") || "");
      const href = toAbsoluteUrl(el.getAttribute("href"), baseUrl);
      out += withEdgeSpace(el, href ? `[${text}](${href})` : text);
      continue;
    }

    if (tag === "code") {
      const code = normalizeWhitespace(el.textContent || "");
      if (code) {
        // The delimiter must be longer than any backtick run inside the span.
        const longest = (code.match(/`+/g) ?? []).reduce((max, run) => Math.max(max, run.length), 0);
        const ticks = "`".repeat(longest + 1);
        const pad = code.startsWith("`") || code.endsWith("`") ? " " : "";
        out += withEdgeSpace(el, `${ticks}${pad}${code}${pad}${ticks}`);
      } else {
        out += withEdgeSpace(el, "");
      }
      continue;
    }

    if (tag === "strong" || tag === "b") {
      const text = renderInlineChildren(el, baseUrl);
      out += withEdgeSpace(el, text ? `**${text}**` : "");
      continue;
    }

    if (tag === "em" || tag === "i") {
      const text = renderInlineChildren(el, baseUrl);
      out += withEdgeSpace(el, text ? `*${text}*` : "");
      continue;
    }

    // Unrecognized elements may be block-level, so keep them separated from
    // their neighbours by spaces.
    const text = renderInlineChildren(el, baseUrl);
    if (text) {
      out += ` ${text} `;
    }
  }

  // Collapse runs of spaces but leave the newlines produced by <br> intact,
  // then remove the spaces hugging each newline.
  return out
    .replace(/[^\S\n]+/g, " ")
    .replace(/ ?\n ?/g, "\n")
    .trim();
}
270+
271+
/**
 * Resolves a possibly relative URL reference against `baseUrl`.
 *
 * @param href - Attribute value to resolve; `null` or `""` yields `""`.
 * @param baseUrl - Base URL for relative references.
 * @returns The absolute URL as a string, or `href` unchanged when it cannot be parsed.
 */
function toAbsoluteUrl(href: string | null, baseUrl: URL): string {
  if (!href) {
    return "";
  }

  try {
    return new URL(href, baseUrl).toString();
  } catch {
    // Deliberate best effort: keep the raw value instead of dropping the link.
    return href;
  }
}
282+
283+
/**
 * Collapses every run of whitespace (including newlines and tabs) into a
 * single space and strips leading and trailing whitespace.
 *
 * @param value - Text to normalize.
 * @returns The normalized text; `""` when `value` is empty or all whitespace.
 */
function normalizeWhitespace(value: string): string {
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}

0 commit comments

Comments
 (0)