@@ -211,29 +211,148 @@ function stripTagBlocks(html: string, tagNames: ReadonlyArray<string>, preserveN
211211}
212212
/**
 * Locate the closing `>` of the markup construct that starts at `startIndex`.
 *
 * - For an HTML comment (`<!-- ... -->`) the result is the `>` of the terminating `-->`;
 *   quotes and `>` characters inside the comment body are plain text and are ignored.
 * - For any other construct, quoted attribute values are skipped so that a `>` inside
 *   `"..."` or `'...'` is not treated as the tag end.
 *
 * @param html - The markup being scanned.
 * @param startIndex - Index of the `<` that opens the construct.
 * @returns The index of the closing `>`, or -1 when the construct is unterminated.
 */
function findTagEnd(html: string, startIndex: number): number {
  if (html.startsWith('<!--', startIndex)) {
    // Searching from the first dash also accepts the abruptly closed forms `<!-->` and `<!--->`.
    const terminator = html.indexOf('-->', startIndex + 2);
    return terminator === -1 ? -1 : terminator + 2;
  }

  let quote: '"' | "'" | null = null;
  for (let i = startIndex + 1; i < html.length; i++) {
    const ch = html[i];
    if (quote !== null) {
      // Inside a quoted value only the matching quote character is significant.
      if (ch === quote) quote = null;
      continue;
    }
    if (ch === '"' || ch === "'") {
      quote = ch;
      continue;
    }
    if (ch === '>') return i;
  }
  return -1;
}
234+
/** Result of parsing one `<...>` construct out of an HTML string. */
interface ParsedTag {
  /** The exact source text of the construct, from its `<` through its closing `>`. */
  rawTag: string;
  /** Index (in the scanned string) of the last character belonging to the construct. */
  tagEnd: number;
  /** Lower-cased element name, or null for comments, DOCTYPE, processing instructions and nameless tags. */
  tagName: string | null;
  /** True for an end tag such as `</span>`. */
  isClosing: boolean;
  /** True for a start tag written with a trailing slash, such as `<br />`. */
  isSelfClosing: boolean;
}
242+
/**
 * Parse the markup construct that starts at `startIndex`.
 *
 * - Returns null when `html[startIndex]` is not `<`, or when the construct is unterminated
 *   (no closing `>` is found).
 * - A `<` that is not followed by an ASCII letter, `/`, `!` or `?` cannot open a tag and is
 *   ordinary text (e.g. `3 < 5`). It is reported as a one-character token with
 *   `tagName: null` and `tagEnd === startIndex`, so callers that resume at `tagEnd + 1`
 *   step over it without swallowing any real tag that follows.
 * - Comments, DOCTYPE and processing instructions are reported with `tagName: null`.
 * - Element names are lower-cased and include hyphens and colons, so custom elements
 *   (`my-quote`) and namespaced names keep their full name.
 *
 * @param html - The markup being scanned.
 * @param startIndex - Index at which a `<` is expected.
 */
function parseTagAt(html: string, startIndex: number): ParsedTag | null {
  if (html[startIndex] !== '<') return null;

  // A literal `<` in text: treat it as a single character, not as the start of a tag.
  if (!/^[a-zA-Z\/!?]/.test(html.charAt(startIndex + 1))) {
    return { tagEnd: startIndex, tagName: null, isClosing: false, isSelfClosing: false, rawTag: '<' };
  }

  const tagEnd = findTagEnd(html, startIndex);
  if (tagEnd === -1) return null;

  const rawTag = html.slice(startIndex, tagEnd + 1);
  const inner = rawTag.slice(1, -1).trim();
  // Skip comments, DOCTYPE, and processing instructions.
  if (inner.startsWith('!') || inner.startsWith('?')) {
    return { tagEnd, tagName: null, isClosing: false, isSelfClosing: false, rawTag };
  }

  const isClosing = inner.startsWith('/');
  const nameSource = isClosing ? inner.slice(1).trimStart() : inner;
  // A tag name starts with a letter and runs until whitespace, `/` or the end of the tag.
  const nameMatch = /^[a-zA-Z][^\s\/>]*/.exec(nameSource);
  const tagName = nameMatch ? nameMatch[0].toLowerCase() : null;
  const isSelfClosing = !isClosing && /\/\s*>$/.test(rawTag);

  return { tagEnd, tagName, isClosing, isSelfClosing, rawTag };
}
264+
/**
 * Extract the value of the `lang` (or `xml:lang`) attribute from a raw tag string.
 *
 * The attribute list is walked one attribute at a time and whole attribute names are
 * compared, so look-alikes such as `data-lang="sv"` or `hreflang="sv"` and text like
 * `lang=sv` inside another attribute's quoted value are not mistaken for the attribute.
 * Attribute names are matched case-insensitively; the first matching attribute wins.
 *
 * @param rawTag - Source text of a single tag, e.g. `<span lang="sv">`.
 * @returns The attribute value (double-quoted, single-quoted or unquoted), or null when the
 *          tag has no such attribute or the attribute carries no value.
 */
function getLangAttributeValue(rawTag: string): string | null {
  // Step over `<`, an optional `/` and the element name so the name is never read as an attribute.
  const head = /^<\s*\/?\s*[^\s\/>]*/.exec(rawTag);
  // One attribute per match: name, then an optional `= value` in any of the three HTML forms.
  const attribute = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  attribute.lastIndex = head ? head[0].length : 0;

  let m: RegExpExecArray | null;
  while ((m = attribute.exec(rawTag)) !== null) {
    const name = m[1].toLowerCase();
    if (name === 'lang' || name === 'xml:lang') {
      return m[2] ?? m[3] ?? m[4] ?? null;
    }
  }
  return null;
}
270+
/**
 * Decide whether a `lang` attribute value designates the language `langCode`.
 *
 * The comparison is case-insensitive. A value matches when it equals the code or extends it
 * with a hyphen-separated subtag, so `sv-SE` and `sv-FI` both match `sv`, while `sva` does not.
 * A null or empty value, or an empty code, never matches.
 */
function langMatches(langValue: string | null, langCode: string): boolean {
  if (langValue && langCode) {
    const actual = langValue.toLowerCase();
    const wanted = langCode.toLowerCase();
    if (actual === wanted) return true;
    return actual.startsWith(wanted + '-');
  }
  return false;
}
281+
/**
 * Find the end of the element whose opening tag starts at `openingTagStart`.
 *
 * Walks forward from the end of the opening tag, one tag at a time, keeping a nesting depth
 * for elements named `tagName`: a further non-self-closing start tag of that name increases
 * the depth, an end tag of that name decreases it, and the end tag that brings the depth
 * back to zero is the match. Tags with any other name are stepped over.
 *
 * @param html - The markup being scanned.
 * @param openingTagStart - Index of the `<` of the element's opening tag.
 * @param tagName - Element name to balance; compared case-insensitively.
 * @returns The index of the final `>` of the matching closing tag, or -1 when the opening tag
 *          cannot be parsed, an unterminated tag is encountered, or no matching closing tag exists.
 */
function findMatchingTaggedBlockEnd(html: string, openingTagStart: number, tagName: string): number {
  const opening = parseTagAt(html, openingTagStart);
  if (!opening) return -1;

  // parseTagAt reports lower-cased names, so normalise the caller's spelling before comparing.
  const wanted = tagName.toLowerCase();

  let depth = 1;
  let i = html.indexOf('<', opening.tagEnd + 1);
  while (i !== -1) {
    const parsed = parseTagAt(html, i);
    // An unterminated tag means there is nothing further that can be parsed.
    if (!parsed) break;

    if (parsed.tagName === wanted) {
      if (parsed.isClosing) {
        depth--;
        if (depth === 0) return parsed.tagEnd;
      } else if (!parsed.isSelfClosing) {
        depth++;
      }
    }
    i = html.indexOf('<', parsed.tagEnd + 1);
  }
  return -1;
}
310+
/**
 * Blank out every HTML element whose `lang` attribute matches `langCode`, including
 * BCP-47 subtags (`lang="sv-SE"` matches `langCode="sv"`).
 *
 * The document is scanned tag by tag. When a non-closing, non-self-closing start tag with a
 * matching `lang` value is found, its matching end tag is located with depth tracking, and
 * the whole block (opening tag, contents and closing tag) is replaced by spaces. `\n`
 * characters are kept, so the result has the same length and the same line structure as the
 * input and downstream line numbering stays stable. An element with no matching end tag is
 * left untouched.
 *
 * This enables `detectSwedishLeakage` to correctly ignore deliberately quoted Swedish source
 * material (e.g. verbatim summaries) embedded inside `<span lang="sv">...</span>` wrappers.
 *
 * @param html - The markup to process.
 * @param langCode - Language code to strip; characters other than ASCII letters, digits and
 *                   `-` are discarded, and an empty result leaves `html` unchanged.
 */
function stripLangTaggedBlocks(html: string, langCode: string): string {
  const wanted = langCode.replace(/[^a-zA-Z0-9-]/g, '').toLowerCase();
  if (wanted.length === 0) return html;

  const pieces: string[] = [];
  let copiedUpTo = 0;

  let cursor = html.indexOf('<');
  while (cursor !== -1) {
    const tag = parseTagAt(html, cursor);
    if (tag === null) break;

    let resumeAt = tag.tagEnd + 1;
    const name = tag.tagName;
    if (
      name !== null &&
      !tag.isClosing &&
      !tag.isSelfClosing &&
      langMatches(getLangAttributeValue(tag.rawTag), wanted)
    ) {
      const lastIndex = findMatchingTaggedBlockEnd(html, cursor, name);
      if (lastIndex !== -1) {
        // Copy the untouched text before the block, then the block itself with everything but newlines blanked.
        pieces.push(html.slice(copiedUpTo, cursor));
        pieces.push(html.slice(cursor, lastIndex + 1).replace(/[^\n]/g, ' '));
        copiedUpTo = lastIndex + 1;
        resumeAt = lastIndex + 1;
      }
    }

    cursor = html.indexOf('<', resumeAt);
  }

  pieces.push(html.slice(copiedUpTo));
  return pieces.join('');
}
238357
239358/** Remove all remaining HTML tags using an index-based state machine. */
0 commit comments