Skip to content

Commit e81e116

Browse files
committed
feat(md): spa shell handling
1 parent fa06d7c commit e81e116

3 files changed

Lines changed: 164 additions & 16 deletions

File tree

src/md/mod.test.ts

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { expect, test } from 'vitest'
2-
import { create } from './mod.ts'
2+
import { create, defineTransport } from './mod.ts'
33
import * as profiles from './profiles.ts'
44

55
test('requests markdown directly for exdoc docs after profile detection', async () => {
@@ -245,6 +245,63 @@ test('fetches markdown from a text/markdown alternate link before converting htm
245245
expect(result.extras.source_tokens_method).toBe('html')
246246
})
247247

248+
// Verifies the SPA-shell fallback: the plain fetch returns an empty `#root`
// mount node, so `md.fetch` is expected to call the transport a second time
// with `render: true` and build the markdown from the rendered HTML.
test('browser renders spa shells when html extraction is empty', async () => {
  // Records every transport invocation so the retry sequence can be asserted.
  const requests: Array<{ previous: number | undefined; render: boolean; url: string }> = []
  const md = create({
    // The plain fetch always answers with an empty mount node plus a module script.
    fetch: async () =>
      new Response(
        '<!doctype html><html><head><title>SPA</title></head><body><div id="root"></div><script type="module" src="/assets/app.js"></script></body></html>',
        { headers: { 'content-type': 'text/html; charset=utf-8' }, status: 200 },
      ),
    profiles,
    transport: defineTransport(async (url, init, context) => {
      requests.push({
        previous: context.previous?.status,
        render: context.render ?? false,
        url: url.href,
      })
      // Stand-in for a browser-rendering transport: only the render retry gets real content.
      if (context.render)
        return new Response(
          '<!doctype html><html><head><title>SPA</title></head><body><main><h1>Rendered App</h1><p>This client-rendered paragraph is long enough to prove browser rendering produced useful page content instead of the empty shell.</p></main></body></html>',
          { headers: { 'content-type': 'text/html; charset=utf-8' }, status: 200 },
        )
      return context.fetch(url, init)
    })(),
  })

  const result = await md.fetch('https://example.com')
  expect(result.ok).toBe(true)
  if (!result.ok) return

  // First the plain request, then the render retry that carries the 200 shell response as `previous`.
  expect(requests).toEqual([
    { previous: undefined, render: false, url: 'https://example.com/' },
    { previous: 200, render: true, url: 'https://example.com/' },
  ])
  expect(result.content).toContain('# Rendered App')
  expect(result.content).toContain('client-rendered paragraph')
  expect(result.meta.title).toBe('SPA')
})
284+
285+
// Verifies the embedded-markdown fallback: no transport is configured, the page
// body is an empty `#root` mount node, and the only content is a hidden
// `data-agent-instructions` element whose entity-encoded text should be decoded
// and returned as the markdown content.
test('uses embedded agent instructions when spa html has no rendered content', async () => {
  const md = create({
    fetch: async () =>
      new Response(
        '<!doctype html><html><head><title>Anscribe</title></head><body><div style="display:none" aria-hidden="true" data-agent-instructions="true"># Anscribe\n\n&gt; Setup instructions with &quot;quotes&quot;.</div><div id="root"></div></body></html>',
        { headers: { 'content-type': 'text/html; charset=utf-8' }, status: 200 },
      ),
    profiles,
  })

  const result = await md.fetch('https://anscribe.dev')
  expect(result.ok).toBe(true)
  if (!result.ok) return

  expect(result.content).toContain('# Anscribe')
  // `&gt;` and `&quot;` must come back as literal `>` and `"`.
  expect(result.content).toContain('> Setup instructions with "quotes".')
  // The title still comes from the HTML `<title>`.
  expect(result.meta.title).toBe('Anscribe')
  expect(result.extras.source_tokens_method).toBe('html')
})
304+
248305
test('keeps html extraction when vitepress markdown path returns html', async () => {
249306
const body = 'HTML body survives.'
250307
const requests: string[] = []

src/md/mod.ts

Lines changed: 103 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,16 @@ export function create(options: create.Options = {}): create.ReturnType {
140140
baseUrl: inputURL.href,
141141
profile,
142142
})
143-
if (shouldRetryMarkdownUrl(profile?.markdownUrl, htmlResult.content)) {
143+
144+
const shouldUseFallbackContent = (() => {
145+
// Only pay fallback costs when generic HTML extraction produced almost nothing.
146+
const trimmed = htmlResult.content.trim()
147+
if (trimmed === '') return true
148+
const lines = trimmed.split('\n').filter(Boolean)
149+
return trimmed.length < 120 && lines.length <= 3
150+
})()
151+
152+
if (profile?.markdownUrl && shouldUseFallbackContent) {
144153
try {
145154
const url = new URL(profile.markdownUrl)
146155
const markdownResponse = await fetchResponse(url)
@@ -154,6 +163,80 @@ export function create(options: create.Options = {}): create.ReturnType {
154163
}
155164
} catch {}
156165
}
166+
167+
const isSpaShell = (() => {
168+
// Client-only apps usually ship an empty mount node plus a JS entrypoint.
169+
if (!/<script\b[^>]*(?:\bsrc=|\btype=["']module["'])/iu.test(text)) return false
170+
return spaMountElementPattern.test(text)
171+
})()
172+
if (isSpaShell && shouldUseFallbackContent) {
173+
try {
174+
const renderedResponse = await (async () => {
175+
// Ask the transport chain for an expensive browser-rendered retry only on SPA shells.
176+
if (!options.transport) return
177+
return (
178+
(await options.transport(inputURL, requestInit, {
179+
...context,
180+
previous: response,
181+
render: true,
182+
})) ?? undefined
183+
)
184+
})()
185+
if (renderedResponse?.ok) {
186+
const renderedText = await renderedResponse.text()
187+
const renderedProfile = detectPageProfile(renderedText, inputURL, profiles) ?? profile
188+
const renderedHtmlResult = await fromHtml(renderedText, {
189+
baseUrl: inputURL.href,
190+
profile: renderedProfile,
191+
})
192+
const renderedContentIsUseful = (() => {
193+
// Rendered HTML must beat the same thin-content threshold to replace source HTML.
194+
const trimmed = renderedHtmlResult.content.trim()
195+
if (trimmed === '') return false
196+
const lines = trimmed.split('\n').filter(Boolean)
197+
return trimmed.length >= 120 || lines.length > 3
198+
})()
199+
if (renderedContentIsUseful) {
200+
profile = renderedProfile
201+
return renderedHtmlResult
202+
}
203+
}
204+
} catch {}
205+
}
206+
207+
const embeddedMarkdown = (() => {
  // Some SPAs embed LLM-ready markdown in hidden agent instruction blocks.
  // Yields the decoded text of the first such block that is non-empty, or
  // `undefined` when the page has none.

  // Named character references we decode. A Map is used instead of an object
  // literal so inherited keys (e.g. `&constructor;`) are never treated as entities.
  const namedEntities = new Map<string, string>([
    ['amp', '&'],
    ['apos', "'"],
    ['gt', '>'],
    ['lt', '<'],
    ['nbsp', ' '],
    ['quot', '"'],
  ])

  // Decodes one numeric character reference. `String.fromCodePoint` throws a
  // RangeError above U+10FFFF, which would abort the whole conversion on a
  // single malformed reference; like HTML parsers, substitute U+FFFD for NUL,
  // surrogates and out-of-range values instead.
  const decodeCodePoint = (codePoint: number): string => {
    const isValid =
      Number.isInteger(codePoint) &&
      codePoint > 0 &&
      codePoint <= 0x10ffff &&
      (codePoint < 0xd800 || codePoint > 0xdfff)
    return isValid ? String.fromCodePoint(codePoint) : '\uFFFD'
  }

  for (const match of text.matchAll(embeddedMarkdownElementPattern)) {
    const content = (match[2] ?? '')
      // Keep explicit line breaks before dropping the remaining markup.
      .replace(/<br\s*\/?>/giu, '\n')
      .replace(/<[^>]+>/gu, '')
      // Decode entities last so escaped markup (`&lt;div&gt;`) survives tag stripping.
      .replace(
        /&(#x[\da-f]+|#\d+|[a-z]+);/giu,
        (entityMatch: string, entity: string): string => {
          const key = entity.toLowerCase()
          if (key.startsWith('#x')) return decodeCodePoint(Number.parseInt(key.slice(2), 16))
          if (key.startsWith('#')) return decodeCodePoint(Number.parseInt(key.slice(1), 10))
          // Unknown named references are left untouched.
          return namedEntities.get(key) ?? entityMatch
        },
      )
      .trim()
    // The first block with non-empty text wins.
    if (content) return content
  }
  return undefined
})()
233+
if (embeddedMarkdown && shouldUseFallbackContent) {
234+
const split = splitFrontmatter(embeddedMarkdown)
235+
return {
236+
content: split.body,
237+
meta: { ...filterFrontmatterKeys(split.meta), ...htmlResult.meta },
238+
}
239+
}
157240
return htmlResult
158241
})()
159242

@@ -269,7 +352,7 @@ type CheckCase = {
269352
export type Transport = (
270353
url: URL,
271354
init: RequestInit | undefined,
272-
context: FetchContext & { previous: Response | undefined },
355+
context: FetchContext & { previous: Response | undefined; render?: boolean | undefined },
273356
) => Promise<Response | null>
274357

275358
export function defineTransport<options = void>(
@@ -279,6 +362,7 @@ export function defineTransport<options = void>(
279362
context: FetchContext & {
280363
options: options
281364
previous: Response | undefined
365+
render?: boolean | undefined
282366
},
283367
) => Promise<Response | null>,
284368
): (options?: options) => Transport {
@@ -388,6 +472,23 @@ const metaKeyPriority: Record<string, number> = {
388472
publish_date: 7,
389473
}
390474

475+
/**
 * `id` values recognised as a client-side app mount node. The names suggest
 * Next.js, Nuxt and Svelte plus the generic `app`/`root` conventions.
 */
const spaMountIds = ['__next', '__nuxt', 'app', 'root', 'svelte'] as const

/**
 * Matches an empty mount element: a `<div>` or `<main>` opening tag that has
 * either a quoted `id` from {@link spaMountIds} or a `data-reactroot`
 * attribute, followed by nothing but whitespace and then a closing tag.
 * Case-insensitive; not global, so it is safe to reuse with `.test()`.
 */
const spaMountElementPattern = new RegExp(
  `<(?:div|main)\\b[^>]*(?:\\bid=["'](?:${spaMountIds.map(escapeRegExp).join('|')})["']|\\bdata-reactroot\\b)[^>]*>\\s*<\\/\\w+>`,
  'iu',
)

/** Attribute names that mark an element as carrying embedded markdown. */
const embeddedMarkdownAttributes = ['data-agent-instructions'] as const

/**
 * Matches any element carrying one of {@link embeddedMarkdownAttributes}
 * (with or without a value). Capture group 1 is the tag name and group 2 is
 * the inner content, taken lazily up to the first closing tag of the same name.
 * Global and case-insensitive; intended for `String.prototype.matchAll`.
 */
const embeddedMarkdownElementPattern = new RegExp(
  '<([a-z][\\w:-]*)\\b[^>]*\\b(?:' +
    embeddedMarkdownAttributes.map(escapeRegExp).join('|') +
    ')(?:\\s*=\\s*(?:"[^"]*"|\'[^\']*\'|[^\\s"\'=<>`]+))?[^>]*>([\\s\\S]*?)<\\/\\1>',
  'giu',
)
487+
488+
/**
 * Escapes every regular-expression syntax character in `value` so the result
 * can be embedded in a `RegExp` source and match the text literally.
 *
 * `-` is not escaped, so the output is only literal outside a character class.
 *
 * @param value - Raw text to embed in a pattern.
 * @returns `value` with `. * + ? ^ $ { } ( ) | [ ] \` each prefixed by a backslash.
 */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}
491+
391492
function sortMeta(meta: Meta): Meta {
392493
return Object.fromEntries(
393494
Object.entries(meta).sort(
@@ -488,17 +589,6 @@ function normalizeFencedCodeBlockIndentation(content: string): string {
488589
.join('\n')
489590
}
490591

491-
function shouldRetryMarkdownUrl(
492-
markdownUrl: string | undefined,
493-
content: string,
494-
): markdownUrl is string {
495-
if (!markdownUrl) return false
496-
const trimmed = content.trim()
497-
if (trimmed === '') return true
498-
const lines = trimmed.split('\n').filter(Boolean)
499-
return trimmed.length < 120 && lines.length <= 3
500-
}
501-
502592
async function extractMarkdownResponse(
503593
response: Response,
504594
url: URL,

src/md/transports.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ export const cfBrowserRendering = defineTransport<{
55
accountId: string
66
apiToken: string
77
}>(async (url, init, context) => {
8-
if (context.previous?.status !== 403 || !context.options) return null
8+
if ((!context.render && context.previous?.status !== 403) || !context.options) return null
99
const signal = AbortSignal.timeout(20_000)
1010
const res = await context.fetch(
1111
`https://api.cloudflare.com/client/v4/accounts/${context.options.accountId}/browser-rendering/content`,
@@ -36,7 +36,8 @@ export const fetch = defineTransport<
3636
headers?: HeadersInit
3737
}
3838
| undefined
39-
>((url, init, context) => {
39+
>(async (url, init, context) => {
40+
if (context.render) return null
4041
return context.fetch(url, {
4142
...init,
4243
...(context.options && {

0 commit comments

Comments
 (0)