Skip to content

Commit b5d9646

Browse files
Copilot and pethers authored
fix(leakage): BCP-47 subtag + nested-tag support in stripLangTaggedBlocks
- Replace the regex-based matcher with an index-based tag scanner that tracks depth so nested same-name elements (<div lang="sv"><div>...</div></div>) are handled correctly.
- Skip quoted attribute values containing '>' when locating tag ends.
- langMatches() now treats lang="sv-SE", lang="sv-FI", etc. as Swedish (BCP-47 subtag prefix match), so quoted Swedish source material in any locale variant is correctly suppressed from leakage detection.
- Add 5 regression tests: plain lang="sv" suppression, subtag match, nested elements, non-sv lang attributes still scanned, and line-number preservation across multi-line blocks (including a multi-line opening tag).

Score on the published 2026-04-20 EN article is unchanged (4 occurrences, all in legitimate English prose discussing Swedish terms) — no behavioural regression, only improved robustness and correctness.

Agent-Logs-Url: https://github.com/Hack23/riksdagsmonitor/sessions/60f52112-0470-4b44-a28d-2292d19280fb
Co-authored-by: pethers <1726836+pethers@users.noreply.github.com>
1 parent 6c3261f commit b5d9646

2 files changed

Lines changed: 184 additions & 17 deletions

File tree

scripts/detect-swedish-leakage.ts

Lines changed: 136 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -211,29 +211,148 @@ function stripTagBlocks(html: string, tagNames: ReadonlyArray<string>, preserveN
211211
}
212212

213213
/**
 * Locate the index of the `>` that ends the markup construct starting at `startIndex`.
 *
 * - For an HTML comment (`<!-- ... -->`) the end is the `>` of the first `-->`; quotes and
 *   `>` characters inside the comment are plain text and are ignored.
 * - For any other construct, `>` characters inside `"..."` or `'...'` attribute values are
 *   skipped so they are not mistaken for the tag end.
 *
 * @param html - The HTML text being scanned.
 * @param startIndex - Index of the `<` that opens the construct.
 * @returns The index of the closing `>`, or -1 when none is found.
 */
function findTagEnd(html: string, startIndex: number): number {
  // Comments do not follow attribute-quoting rules. Without this branch an apostrophe
  // in a comment (e.g. "<!-- it's -->") opens a "quoted value" that never closes, and a
  // ">" inside the comment would end it too early.
  if (html.startsWith('<!--', startIndex)) {
    // Search from the first dash so the degenerate forms "<!-->" and "<!--->" terminate.
    const commentClose = html.indexOf('-->', startIndex + 2);
    if (commentClose !== -1) return commentClose + 2;
    // Unterminated comment: fall through to the generic scan below.
  }

  let quote: '"' | "'" | null = null;
  for (let i = startIndex + 1; i < html.length; i++) {
    const ch = html[i];
    if (quote !== null) {
      // Inside a quoted value: only the matching quote character is significant.
      if (ch === quote) quote = null;
      continue;
    }
    if (ch === '"' || ch === "'") {
      quote = ch;
      continue;
    }
    if (ch === '>') return i;
  }
  return -1;
}
234+
235+
/** Description of a single `<...>` construct located in an HTML string. */
interface ParsedTag {
  /** The exact source text of the construct, from its `<` through its final character. */
  rawTag: string;
  /** Index in the scanned HTML of the construct's final character (the closing `>` of a tag). */
  tagEnd: number;
  /** Lower-cased element name, or null when the construct has no element name (e.g. a comment). */
  tagName: string | null;
  /** True for a closing tag such as `</div>`. */
  isClosing: boolean;
  /** True for a non-closing tag written with a trailing `/>`. */
  isSelfClosing: boolean;
}
242+
243+
/**
 * Parse the markup construct starting at `startIndex`.
 *
 * @param html - The HTML text being scanned.
 * @param startIndex - Index at which a `<` is expected.
 * @returns
 * - `null` when there is no `<` at `startIndex`, or when the construct has no closing `>`.
 * - A `ParsedTag` with `tagName: null` for comments, DOCTYPE, processing instructions,
 *   and for a literal `<` that does not open markup (reported as the one-character
 *   construct `"<"` so callers simply step over it).
 * - Otherwise a `ParsedTag` describing the opening, closing or self-closing tag.
 */
function parseTagAt(html: string, startIndex: number): ParsedTag | null {
  if (html[startIndex] !== '<') return null;

  // In HTML, "<" only opens markup when immediately followed by a letter, "/", "!" or "?".
  // Anything else (e.g. "a < b") is literal text. Treating it as a tag would scan ahead to
  // the next ">" and swallow whatever real tag sits in between.
  const next = html[startIndex + 1];
  const opensMarkup =
    next !== undefined && (next === '/' || next === '!' || next === '?' || /[a-zA-Z]/.test(next));
  if (!opensMarkup) {
    return { tagEnd: startIndex, tagName: null, isClosing: false, isSelfClosing: false, rawTag: '<' };
  }

  const tagEnd = findTagEnd(html, startIndex);
  if (tagEnd === -1) return null;

  const rawTag = html.slice(startIndex, tagEnd + 1);
  const inner = rawTag.slice(1, -1).trim();
  // Skip comments, DOCTYPE, and processing instructions.
  if (inner.length === 0 || inner.startsWith('!') || inner.startsWith('?')) {
    return { tagEnd, tagName: null, isClosing: false, isSelfClosing: false, rawTag };
  }

  const isClosing = inner.startsWith('/');
  const nameSource = isClosing ? inner.slice(1).trimStart() : inner;
  const nameMatch = /^([a-zA-Z][a-zA-Z0-9]*)/.exec(nameSource);
  // Names are lower-cased so callers can compare them case-insensitively.
  const tagName = nameMatch ? nameMatch[1].toLowerCase() : null;
  const isSelfClosing = !isClosing && /\/\s*>$/.test(rawTag);

  return { tagEnd, tagName, isClosing, isSelfClosing, rawTag };
}
264+
265+
/**
 * Extract the value of the `lang` (or `xml:lang`) attribute from a raw tag string.
 *
 * The tag is tokenized attribute by attribute, so only an attribute actually named
 * `lang` / `xml:lang` is considered. Look-alikes such as `data-lang="sv"`, or text that
 * merely appears inside another attribute's value (`href="?lang=sv"`), are ignored.
 *
 * @param rawTag - Full tag text, from `<` through `>`.
 * @returns The attribute value (which may be the empty string), or null when the tag
 *   carries no `lang` attribute with a value.
 */
function getLangAttributeValue(rawTag: string): string | null {
  // Step over "<", an optional "/", and the element name so the name is never read as an attribute.
  const head = /^<\s*\/?\s*[^\s\/>]*/.exec(rawTag);

  // One attribute: name, then optionally "=" and a double-quoted, single-quoted or unquoted value.
  // Quoted values are consumed whole, so their contents can never be matched as attribute names.
  const attribute = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
  attribute.lastIndex = head ? head[0].length : 0;

  let match: RegExpExecArray | null;
  while ((match = attribute.exec(rawTag)) !== null) {
    const name = (match[1] ?? '').toLowerCase();
    if (name !== 'lang' && name !== 'xml:lang') continue;
    const value = match[2] ?? match[3] ?? match[4];
    // A bare `lang` with no value carries no language; keep looking.
    if (value !== undefined) return value;
  }
  return null;
}
270+
271+
/**
 * Decide whether a `lang` attribute value denotes the language `langCode`.
 *
 * The comparison is case-insensitive and accepts either an exact match or a value that
 * extends the code with a BCP-47 subtag (`sv-SE` and `sv-FI` both match `sv`).
 * A missing or empty value, or an empty code, never matches.
 */
function langMatches(langValue: string | null, langCode: string): boolean {
  if (langValue === null || langValue === '' || langCode === '') {
    return false;
  }

  const candidate = langValue.toLowerCase();
  const wanted = langCode.toLowerCase();
  if (candidate === wanted) {
    return true;
  }
  // Require the hyphen so that an unrelated code sharing a prefix ("svx") does not match.
  return candidate.startsWith(wanted + '-');
}
281+
282+
/**
 * Find the end of the element whose opening tag starts at `openingTagStart`.
 *
 * Walks forward tag by tag, keeping a depth counter for elements named `tagName`:
 * each further opening tag of that name increments it, each closing tag decrements it,
 * and self-closing tags are ignored. The element ends where the counter reaches zero.
 *
 * @param html - The HTML text being scanned.
 * @param openingTagStart - Index of the `<` of the element's opening tag.
 * @param tagName - Element name to match (compared case-insensitively).
 * @returns The index of the final `>` of the matching closing tag, or -1 when the opening
 *   tag cannot be parsed or no matching closing tag exists.
 */
function findMatchingTaggedBlockEnd(html: string, openingTagStart: number, tagName: string): number {
  const opening = parseTagAt(html, openingTagStart);
  if (!opening) return -1;

  // Parsed tag names are lower-cased, so normalise the requested name the same way.
  const wanted = tagName.toLowerCase();

  let depth = 1;
  let i = opening.tagEnd + 1;
  while (i < html.length) {
    const lt = html.indexOf('<', i);
    if (lt === -1) break;

    const parsed = parseTagAt(html, lt);
    if (!parsed) {
      // With no ">" anywhere ahead there cannot be another tag. Otherwise this "<" was
      // unparseable on its own (e.g. an unbalanced quote after it), so step over it and
      // keep looking rather than abandoning the whole search.
      if (html.indexOf('>', lt + 1) === -1) break;
      i = lt + 1;
      continue;
    }

    if (parsed.tagName === wanted) {
      if (parsed.isClosing) {
        depth--;
        if (depth === 0) return parsed.tagEnd;
      } else if (!parsed.isSelfClosing) {
        depth++;
      }
    }
    i = parsed.tagEnd + 1;
  }
  return -1;
}
310+
311+
/**
 * Blank out every HTML element whose `lang` attribute matches `langCode`
 * (including BCP-47 subtags — `lang="sv-SE"` matches `langCode="sv"`).
 *
 * Uses an index-based tag scanner with depth tracking so nested elements of the same tag
 * name are handled correctly, and quoted attribute values containing `>` do not break parsing.
 *
 * The full matched block (opening tag, contents and closing tag) is replaced with spaces
 * while `\n` characters are preserved, so the result has the same length and line
 * structure as the input and downstream line numbering stays stable.
 *
 * This enables `detectSwedishLeakage` to correctly ignore deliberately quoted Swedish source
 * material (e.g. verbatim summaries) embedded inside `<span lang="sv">...</span>` wrappers.
 *
 * @param html - The HTML text to process.
 * @param langCode - Language code to strip; characters other than letters, digits and `-`
 *   are discarded before matching.
 * @returns The HTML with matching blocks blanked, or `html` unchanged when the sanitized
 *   language code is empty.
 */
function stripLangTaggedBlocks(html: string, langCode: string): string {
  const safeLang = langCode.replace(/[^a-zA-Z0-9-]/g, '').toLowerCase();
  if (safeLang.length === 0) return html;

  // split('') yields one entry per UTF-16 code unit, so indices into `chars` line up
  // exactly with indices into `html`.
  const chars = html.split('');

  let i = 0;
  while (i < html.length) {
    const lt = html.indexOf('<', i);
    if (lt === -1) break;

    const parsed = parseTagAt(html, lt);
    if (!parsed) {
      // With no ">" anywhere ahead there cannot be another tag. Otherwise only this "<"
      // was unparseable (e.g. an unbalanced quote after it); step over it so later
      // lang-tagged blocks are still stripped.
      if (html.indexOf('>', lt + 1) === -1) break;
      i = lt + 1;
      continue;
    }

    if (
      !parsed.isClosing &&
      !parsed.isSelfClosing &&
      parsed.tagName !== null &&
      langMatches(getLangAttributeValue(parsed.rawTag), safeLang)
    ) {
      const blockEnd = findMatchingTaggedBlockEnd(html, lt, parsed.tagName);
      if (blockEnd !== -1) {
        for (let j = lt; j <= blockEnd; j++) {
          if (chars[j] !== '\n') chars[j] = ' ';
        }
        // Resume after the blanked block; nothing inside it needs scanning.
        i = blockEnd + 1;
        continue;
      }
      // No matching closing tag: leave the element untouched and scan on.
    }

    i = parsed.tagEnd + 1;
  }

  return chars.join('');
}
238357

239358
/** Remove all remaining HTML tags using an index-based state machine. */

tests/detect-swedish-leakage.test.ts

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,5 +289,53 @@ describe('Swedish Leakage Detector', () => {
289289
expect(terms).toContain('arbetsmarknadsdepartementet');
290290
expect(terms).toContain('näringsdepartementet');
291291
});
292+
293+
it('should ignore Swedish text inside lang="sv" blocks when scanning an English article', () => {
294+
const html = '<p>The committee report states: <span lang="sv">riksdagen antog propositionen</span>.</p>';
295+
const report = detectSwedishLeakage(html, 'en');
296+
expect(report.leakedTerms).toEqual([]);
297+
expect(report.score).toBe(0);
298+
});
299+
300+
it('should ignore Swedish text inside lang="sv-SE" (BCP-47 subtag) blocks', () => {
301+
const html = '<p>Quote: <blockquote lang="sv-SE">riksdagen antog propositionen och utskottet.</blockquote></p>';
302+
const report = detectSwedishLeakage(html, 'en');
303+
expect(report.leakedTerms).toEqual([]);
304+
expect(report.score).toBe(0);
305+
});
306+
307+
it('should handle nested same-name elements inside a lang="sv" block', () => {
308+
// Outer <div lang="sv"> contains nested <div>s; the depth-tracking scanner must not
309+
// stop at the first </div> but continue to the matching closing tag.
310+
const html =
311+
'<div lang="sv"><div>propositionen</div><div>riksdagen antog utskottet</div></div>' +
312+
'<p>English text follows.</p>';
313+
const report = detectSwedishLeakage(html, 'en');
314+
expect(report.leakedTerms).toEqual([]);
315+
expect(report.score).toBe(0);
316+
});
317+
318+
it('should preserve line numbers when stripping lang="sv" blocks spanning multiple lines', () => {
319+
const html =
320+
'<p>Line 1 content</p>\n' +
321+
'<p>Line 2 content</p>\n' +
322+
'<span\nlang="sv">\nriksdagen\n</span>\n' +
323+
'<p>propositionen on line 7</p>';
324+
const report = detectSwedishLeakage(html, 'en');
325+
// The Swedish tokens inside the lang="sv" block are suppressed; the one in the plain
326+
// <p> on line 7 is detected and reported at its true line number.
327+
const terms = report.leakedTerms.map((t) => t.term);
328+
expect(terms).toContain('propositionen');
329+
const proposition = report.leakedTerms.find((t) => t.term === 'propositionen');
330+
expect(proposition?.line).toBe(7);
331+
});
332+
333+
it('should still scan text inside elements whose lang attribute is NOT sv', () => {
334+
const html = '<p lang="en">utskottet antog propositionen</p>';
335+
const report = detectSwedishLeakage(html, 'en');
336+
const terms = report.leakedTerms.map((t) => t.term);
337+
expect(terms).toContain('utskottet');
338+
expect(terms).toContain('propositionen');
339+
});
292340
});
293341
});

0 commit comments

Comments
 (0)