@@ -140,7 +140,16 @@ export function create(options: create.Options = {}): create.ReturnType {
140140 baseUrl : inputURL . href ,
141141 profile,
142142 } )
143- if ( shouldRetryMarkdownUrl ( profile ?. markdownUrl , htmlResult . content ) ) {
143+
// Every fallback below (markdown URL, rendered retry, embedded markdown) is gated on this
// flag, so it is true only when generic HTML extraction came back empty or nearly empty:
// fewer than 120 characters spread over at most 3 non-empty lines.
const shouldUseFallbackContent = (() => {
  const extracted = htmlResult.content.trim()
  if (extracted.length === 0) return true
  if (extracted.length >= 120) return false
  const nonEmptyLineCount = extracted.split('\n').filter((line) => line !== '').length
  return nonEmptyLineCount <= 3
})()
151+
152+ if ( profile ?. markdownUrl && shouldUseFallbackContent ) {
144153 try {
145154 const url = new URL ( profile . markdownUrl )
146155 const markdownResponse = await fetchResponse ( url )
@@ -154,6 +163,80 @@ export function create(options: create.Options = {}): create.ReturnType {
154163 }
155164 } catch { }
156165 }
166+
// Heuristic for a client-rendered app shell: the document references a script entrypoint
// (a `src` attribute or `type="module"`) and contains an empty, well-known mount element.
const isSpaShell = (() => {
  // The literal must be contiguous: whitespace inside it is matched verbatim and would
  // prevent it from ever matching a real `<script ...>` tag.
  const hasScriptEntrypoint = /<script\b[^>]*(?:\bsrc=|\btype=["']module["'])/iu.test(text)
  if (!hasScriptEntrypoint) return false
  return spaMountElementPattern.test(text)
})()
172+ if ( isSpaShell && shouldUseFallbackContent ) {
173+ try {
174+ const renderedResponse = await ( async ( ) => {
175+ // Ask the transport chain for an expensive browser-rendered retry only on SPA shells.
176+ if ( ! options . transport ) return
177+ return (
178+ ( await options . transport ( inputURL , requestInit , {
179+ ...context ,
180+ previous : response ,
181+ render : true ,
182+ } ) ) ?? undefined
183+ )
184+ } ) ( )
185+ if ( renderedResponse ?. ok ) {
186+ const renderedText = await renderedResponse . text ( )
187+ const renderedProfile = detectPageProfile ( renderedText , inputURL , profiles ) ?? profile
188+ const renderedHtmlResult = await fromHtml ( renderedText , {
189+ baseUrl : inputURL . href ,
190+ profile : renderedProfile ,
191+ } )
192+ const renderedContentIsUseful = ( ( ) => {
193+ // Rendered HTML must beat the same thin-content threshold to replace source HTML.
194+ const trimmed = renderedHtmlResult . content . trim ( )
195+ if ( trimmed === '' ) return false
196+ const lines = trimmed . split ( '\n' ) . filter ( Boolean )
197+ return trimmed . length >= 120 || lines . length > 3
198+ } ) ( )
199+ if ( renderedContentIsUseful ) {
200+ profile = renderedProfile
201+ return renderedHtmlResult
202+ }
203+ }
204+ } catch { }
205+ }
206+
// Some SPAs embed LLM-ready markdown in hidden agent-instruction elements. The result is
// only consumed when the extracted HTML was thin, so the document is not scanned otherwise.
const embeddedMarkdown = shouldUseFallbackContent
  ? (() => {
      // A Map (rather than a plain object) so that names such as `constructor` cannot
      // resolve to inherited Object.prototype members.
      // NOTE(review): `nbsp` maps to the character shown in the reviewed source (a plain
      // space) — confirm whether U+00A0 was intended.
      const namedEntities = new Map<string, string>([
        ['amp', '&'],
        ['apos', "'"],
        ['gt', '>'],
        ['lt', '<'],
        ['nbsp', ' '],
        ['quot', '"'],
      ])

      // Decodes one `&...;` reference; anything unrecognised is returned verbatim.
      const decodeEntity = (entityMatch: string, entity: string): string => {
        const key = entity.toLowerCase()
        if (key.startsWith('#')) {
          const isHex = key.startsWith('#x')
          const codePoint = Number.parseInt(key.slice(isHex ? 2 : 1), isHex ? 16 : 10)
          // String.fromCodePoint throws RangeError above U+10FFFF; keep such references as-is
          // instead of letting malformed markup abort the extraction.
          return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entityMatch
        }
        return namedEntities.get(key) ?? entityMatch
      }

      for (const match of text.matchAll(embeddedMarkdownElementPattern)) {
        // Group 2 is the element's inner markup: turn <br> into newlines, strip remaining
        // tags, then decode entities (after stripping, so `&lt;b&gt;` survives as text).
        const content = (match[2] ?? '')
          .replace(/<br\s*\/?>/giu, '\n')
          .replace(/<[^>]+>/gu, '')
          .replace(/&(#x[\da-f]+|#\d+|[a-z]+);/giu, decodeEntity)
          .trim()
        // First element with non-empty text wins.
        if (content) return content
      }
      return undefined
    })()
  : undefined
if (embeddedMarkdown) {
  const split = splitFrontmatter(embeddedMarkdown)
  return {
    content: split.body,
    // Metadata extracted from the HTML takes precedence over embedded frontmatter keys.
    meta: { ...filterFrontmatterKeys(split.meta), ...htmlResult.meta },
  }
}
157240 return htmlResult
158241 } ) ( )
159242
@@ -269,7 +352,7 @@ type CheckCase = {
/**
 * Function that performs the fetch for a URL.
 *
 * @param url - Target URL.
 * @param init - Request options, if any.
 * @param context - Fetch context plus:
 *   - `previous`: a response obtained earlier for the same request, or `undefined`.
 *   - `render`: optional; passed as `true` when the caller retries an SPA shell and asks
 *     for an expensive browser-rendered response.
 * @returns A response, or `null` (the rendered-retry caller treats `null` as "no response").
 */
export type Transport = (
  url: URL,
  init: RequestInit | undefined,
  context: FetchContext & { previous: Response | undefined; render?: boolean | undefined },
) => Promise<Response | null>
274357
275358export function defineTransport < options = void > (
@@ -279,6 +362,7 @@ export function defineTransport<options = void>(
279362 context : FetchContext & {
280363 options : options
281364 previous : Response | undefined
365+ render ?: boolean | undefined
282366 } ,
283367 ) => Promise < Response | null > ,
284368) : ( options ?: options ) => Transport {
@@ -388,6 +472,23 @@ const metaKeyPriority: Record<string, number> = {
388472 publish_date : 7 ,
389473}
390474
// Element ids that common client-side frameworks mount into.
const spaMountIds = ['__next', '__nuxt', 'app', 'root', 'svelte'] as const

// Matches a <div>/<main> that carries one of the mount ids (or a `data-reactroot`
// attribute) and is followed by nothing but optional whitespace and a closing tag.
const spaMountElementPattern = (() => {
  const idAlternation = spaMountIds.map((id) => escapeRegExp(id)).join('|')
  const source = [
    String.raw`<(?:div|main)\b[^>]*`,
    String.raw`(?:\bid=["'](?:${idAlternation})["']|\bdata-reactroot\b)`,
    String.raw`[^>]*>\s*<\/\w+>`,
  ].join('')
  return new RegExp(source, 'iu')
})()

// Attributes that mark an element as carrying embedded markdown.
const embeddedMarkdownAttributes = ['data-agent-instructions'] as const

// Matches any element bearing one of the attributes above (bare, quoted or unquoted value).
// Group 1 is the tag name (reused to find the matching close tag); group 2 is the inner markup.
const embeddedMarkdownElementPattern = (() => {
  const attributeName = `(?:${embeddedMarkdownAttributes.map((name) => escapeRegExp(name)).join('|')})`
  const openingTagStart = '<([a-z][\\w:-]*)\\b[^>]*\\b'
  const optionalAttributeValue = '(?:\\s*=\\s*(?:"[^"]*"|\'[^\']*\'|[^\\s"\'=<>`]+))?'
  const restOfTagAndBody = '[^>]*>([\\s\\S]*?)<\\/\\1>'
  return new RegExp(
    openingTagStart + attributeName + optionalAttributeValue + restOfTagAndBody,
    'giu',
  )
})()
487+
/**
 * Escapes every regular-expression metacharacter in `value` so the text can be embedded
 * in a `RegExp` source string and matched literally.
 *
 * @param value - Literal text to embed in a pattern.
 * @returns `value` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(value: string): string {
  // `-` is intentionally not escaped: outside a character class it is literal, and `\-`
  // is a syntax error in unicode-mode patterns.
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}
491+
391492function sortMeta ( meta : Meta ) : Meta {
392493 return Object . fromEntries (
393494 Object . entries ( meta ) . sort (
@@ -488,17 +589,6 @@ function normalizeFencedCodeBlockIndentation(content: string): string {
488589 . join ( '\n' )
489590}
490591
491- function shouldRetryMarkdownUrl (
492- markdownUrl : string | undefined ,
493- content : string ,
494- ) : markdownUrl is string {
495- if ( ! markdownUrl ) return false
496- const trimmed = content . trim ( )
497- if ( trimmed === '' ) return true
498- const lines = trimmed . split ( '\n' ) . filter ( Boolean )
499- return trimmed . length < 120 && lines . length <= 3
500- }
501-
502592async function extractMarkdownResponse (
503593 response : Response ,
504594 url : URL ,
0 commit comments