Skip to content

Commit fbd413f

Browse files
Copilot and pethers authored
fix: address review 4139627117 — lang attribute boundary + resilient strip pass
- getLangAttributeValue: require `lang` to be preceded by `<` (tag start) or whitespace, so attributes like `data-lang` / `xml:lang` are no longer mistakenly treated as `lang`.
- stripLangTaggedBlocks: when parseTagAt returns null (stray `<` in text / malformed markup), advance one character and continue instead of breaking, so later lang-tagged blocks are still stripped.
- Added regression tests for both cases. 41/41 leakage tests pass.

Agent-Logs-Url: https://github.com/Hack23/riksdagsmonitor/sessions/5c8ca773-8b4f-480a-9557-3db1ca666cc1
Co-authored-by: pethers <1726836+pethers@users.noreply.github.com>
1 parent fed497f commit fbd413f

2 files changed

Lines changed: 30 additions & 2 deletions

File tree

scripts/detect-swedish-leakage.ts

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -264,7 +264,10 @@ function parseTagAt(html: string, startIndex: number): ParsedTag | null {
264264

265265
/** Extract the value of the `lang` attribute from a raw tag string, or null if absent. */
266266
function getLangAttributeValue(rawTag: string): string | null {
267-
const m = /\blang\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/i.exec(rawTag);
267+
// Require `lang` to be a standalone attribute name — preceded by `<` (tag-start) or
268+
// whitespace — so attributes like `data-lang` or `xml:lang` are not mistakenly treated
269+
// as `lang`.
270+
const m = /(?:<|\s)lang\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/i.exec(rawTag);
268271
return m?.[1] ?? m?.[2] ?? m?.[3] ?? null;
269272
}
270273

@@ -331,7 +334,12 @@ function stripLangTaggedBlocks(html: string, langCode: string): string {
331334
if (html[i] !== '<') { i++; continue; }
332335

333336
const parsed = parseTagAt(html, i);
334-
if (!parsed) break;
337+
if (!parsed) {
338+
// Malformed markup (stray `<` in text, etc.) — skip this character and keep
339+
// scanning so later lang-tagged blocks are still stripped.
340+
i++;
341+
continue;
342+
}
335343

336344
if (
337345
!parsed.isClosing &&

tests/detect-swedish-leakage.test.ts

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -348,5 +348,25 @@ describe('Swedish Leakage Detector', () => {
348348
expect(report.leakedTerms).toEqual([]);
349349
expect(report.score).toBe(0);
350350
});
351+
352+
it('should not treat data-lang="sv" as a real lang attribute', () => {
353+
// `data-lang` is a custom data attribute, not the standard HTML `lang`.
354+
// Swedish inside such an element must still be detected as leakage.
355+
const html = '<p data-lang="sv">utskottet antog propositionen</p>';
356+
const report = detectSwedishLeakage(html, 'en');
357+
const terms = report.leakedTerms.map((t) => t.term);
358+
expect(terms).toContain('utskottet');
359+
expect(terms).toContain('propositionen');
360+
});
361+
362+
it('should keep scanning after a stray "<" in text content', () => {
363+
// A stray `<` (e.g. from "<" in prose) should not abort the strip pass —
364+
// the later lang="sv" block must still be suppressed.
365+
const html =
366+
'<p>Value is 5 < 10 when counting seats.</p>' +
367+
'<span lang="sv">riksdagen antog propositionen</span>';
368+
const report = detectSwedishLeakage(html, 'en');
369+
expect(report.leakedTerms).toEqual([]);
370+
});
351371
});
352372
});

0 commit comments

Comments
 (0)