@@ -255,37 +255,55 @@ export async function getRemoteDocument(
255255 contentType === "application/xhtml+xml" ||
256256 contentType ?. startsWith ( "application/xhtml+xml;" ) )
257257 ) {
258- const p =
259- / < ( a | l i n k ) ( ( \s + [ a - z ] [ a - z : _ - ] * = ( " [ ^ " ] * " | ' [ ^ ' ] * ' | [ ^ \s > ] + ) ) + ) \s * \/ ? > / ig;
260- const p2 = / \s + ( [ a - z ] [ a - z : _ - ] * ) = ( " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / ig;
258+ // Security: Limit HTML response size to mitigate ReDoS attacks
259+ const MAX_HTML_SIZE = 1024 * 1024 ; // 1MB
261260 const html = await response . text ( ) ;
262- let m : RegExpExecArray | null ;
263- const rawAttribs : string [ ] = [ ] ;
264- while ( ( m = p . exec ( html ) ) !== null ) rawAttribs . push ( m [ 2 ] ) ;
265- for ( const rawAttrs of rawAttribs ) {
266- let m2 : RegExpExecArray | null ;
267- const attribs : Record < string , string > = { } ;
268- while ( ( m2 = p2 . exec ( rawAttrs ) ) !== null ) {
269- const key = m2 [ 1 ] . toLowerCase ( ) ;
270- const value = m2 [ 3 ] ?? m2 [ 4 ] ?? m2 [ 5 ] ?? "" ;
271- attribs [ key ] = value ;
272- }
273- if (
274- attribs . rel === "alternate" && "type" in attribs && (
275- attribs . type === "application/activity+json" ||
276- attribs . type === "application/ld+json" ||
277- attribs . type . startsWith ( "application/ld+json;" )
278- ) && "href" in attribs &&
279- new URL ( attribs . href , docUrl ) . href !== docUrl . href
280- ) {
281- logger . debug (
282- "Found alternate document: {alternateUrl} from {url}" ,
283- { alternateUrl : attribs . href , url : documentUrl } ,
284- ) ;
285- return await fetch ( new URL ( attribs . href , docUrl ) . href ) ;
261+ if ( html . length > MAX_HTML_SIZE ) {
262+ logger . warn (
263+ "HTML response too large, skipping alternate link discovery: {url}" ,
264+ { url : documentUrl , size : html . length } ,
265+ ) ;
266+ document = JSON . parse ( html ) ;
267+ } else {
268+ // Safe regex patterns without nested quantifiers to prevent ReDoS
269+ // (CVE-2025-68475)
270+ // Step 1: Extract <a ...> or <link ...> tags
271+ const tagPattern = / < ( a | l i n k ) \s + ( [ ^ > ] * ?) \s * \/ ? > / gi;
272+ // Step 2: Parse attributes
273+ const attrPattern =
274+ / ( [ a - z ] [ a - z : _ - ] * ) = (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / gi;
275+
276+ let tagMatch : RegExpExecArray | null ;
277+ while ( ( tagMatch = tagPattern . exec ( html ) ) !== null ) {
278+ const tagContent = tagMatch [ 2 ] ;
279+ let attrMatch : RegExpExecArray | null ;
280+ const attribs : Record < string , string > = { } ;
281+
282+ // Reset regex state for attribute parsing
283+ attrPattern . lastIndex = 0 ;
284+ while ( ( attrMatch = attrPattern . exec ( tagContent ) ) !== null ) {
285+ const key = attrMatch [ 1 ] . toLowerCase ( ) ;
286+ const value = attrMatch [ 2 ] ?? attrMatch [ 3 ] ?? attrMatch [ 4 ] ?? "" ;
287+ attribs [ key ] = value ;
288+ }
289+
290+ if (
291+ attribs . rel === "alternate" && "type" in attribs && (
292+ attribs . type === "application/activity+json" ||
293+ attribs . type === "application/ld+json" ||
294+ attribs . type . startsWith ( "application/ld+json;" )
295+ ) && "href" in attribs &&
296+ new URL ( attribs . href , docUrl ) . href !== docUrl . href
297+ ) {
298+ logger . debug (
299+ "Found alternate document: {alternateUrl} from {url}" ,
300+ { alternateUrl : attribs . href , url : documentUrl } ,
301+ ) ;
302+ return await fetch ( new URL ( attribs . href , docUrl ) . href ) ;
303+ }
286304 }
305+ document = JSON . parse ( html ) ;
287306 }
288- document = JSON . parse ( html ) ;
289307 } else {
290308 document = await response . json ( ) ;
291309 }
0 commit comments