@@ -191,37 +191,55 @@ export async function getRemoteDocument(
191191 contentType === "application/xhtml+xml" ||
192192 contentType ?. startsWith ( "application/xhtml+xml;" ) )
193193 ) {
194- const p =
195- / < ( a | l i n k ) ( ( \s + [ a - z ] [ a - z : _ - ] * = ( " [ ^ " ] * " | ' [ ^ ' ] * ' | [ ^ \s > ] + ) ) + ) \s * \/ ? > / ig;
196- const p2 = / \s + ( [ a - z ] [ a - z : _ - ] * ) = ( " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / ig;
194+ // Security: Limit HTML response size to mitigate ReDoS attacks
195+ const MAX_HTML_SIZE = 1024 * 1024 ; // 1MB
197196 const html = await response . text ( ) ;
198- let m : RegExpExecArray | null ;
199- const rawAttribs : string [ ] = [ ] ;
200- while ( ( m = p . exec ( html ) ) !== null ) rawAttribs . push ( m [ 2 ] ) ;
201- for ( const rawAttrs of rawAttribs ) {
202- let m2 : RegExpExecArray | null ;
203- const attribs : Record < string , string > = { } ;
204- while ( ( m2 = p2 . exec ( rawAttrs ) ) !== null ) {
205- const key = m2 [ 1 ] . toLowerCase ( ) ;
206- const value = m2 [ 3 ] ?? m2 [ 4 ] ?? m2 [ 5 ] ?? "" ;
207- attribs [ key ] = value ;
208- }
209- if (
210- attribs . rel === "alternate" && "type" in attribs && (
211- attribs . type === "application/activity+json" ||
212- attribs . type === "application/ld+json" ||
213- attribs . type . startsWith ( "application/ld+json;" )
214- ) && "href" in attribs &&
215- new URL ( attribs . href , docUrl ) . href !== docUrl . href
216- ) {
217- logger . debug (
218- "Found alternate document: {alternateUrl} from {url}" ,
219- { alternateUrl : attribs . href , url : documentUrl } ,
220- ) ;
221- return await fetch ( new URL ( attribs . href , docUrl ) . href ) ;
197+ if ( html . length > MAX_HTML_SIZE ) {
198+ logger . warn (
199+ "HTML response too large, skipping alternate link discovery: {url}" ,
200+ { url : documentUrl , size : html . length } ,
201+ ) ;
202+ document = JSON . parse ( html ) ;
203+ } else {
204+ // Safe regex patterns without nested quantifiers to prevent ReDoS
205+ // (CVE-2025-68475)
206+ // Step 1: Extract <a ...> or <link ...> tags
207+ const tagPattern = / < ( a | l i n k ) \s + ( [ ^ > ] * ?) \s * \/ ? > / gi;
208+ // Step 2: Parse attributes
209+ const attrPattern =
210+ / ( [ a - z ] [ a - z : _ - ] * ) = (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( [ ^ \s > ] + ) ) / gi;
211+
212+ let tagMatch : RegExpExecArray | null ;
213+ while ( ( tagMatch = tagPattern . exec ( html ) ) !== null ) {
214+ const tagContent = tagMatch [ 2 ] ;
215+ let attrMatch : RegExpExecArray | null ;
216+ const attribs : Record < string , string > = { } ;
217+
218+ // Reset regex state for attribute parsing
219+ attrPattern . lastIndex = 0 ;
220+ while ( ( attrMatch = attrPattern . exec ( tagContent ) ) !== null ) {
221+ const key = attrMatch [ 1 ] . toLowerCase ( ) ;
222+ const value = attrMatch [ 2 ] ?? attrMatch [ 3 ] ?? attrMatch [ 4 ] ?? "" ;
223+ attribs [ key ] = value ;
224+ }
225+
226+ if (
227+ attribs . rel === "alternate" && "type" in attribs && (
228+ attribs . type === "application/activity+json" ||
229+ attribs . type === "application/ld+json" ||
230+ attribs . type . startsWith ( "application/ld+json;" )
231+ ) && "href" in attribs &&
232+ new URL ( attribs . href , docUrl ) . href !== docUrl . href
233+ ) {
234+ logger . debug (
235+ "Found alternate document: {alternateUrl} from {url}" ,
236+ { alternateUrl : attribs . href , url : documentUrl } ,
237+ ) ;
238+ return await fetch ( new URL ( attribs . href , docUrl ) . href ) ;
239+ }
222240 }
241+ document = JSON . parse ( html ) ;
223242 }
224- document = JSON . parse ( html ) ;
225243 } else {
226244 document = await response . json ( ) ;
227245 }
0 commit comments