Skip to content

Commit 6c3261f

Browse files
Copilot and pethers authored
fix(leakage): address reviewer feedback on b20b0b7
- Preserve newlines from entire matched block (including opening/closing tags)
- Remove "kan" from comment to match actual `en` shared-word set
- Clarify that lang="sv" stripping is gated by the targetLang early-return

Agent-Logs-Url: https://github.com/Hack23/riksdagsmonitor/sessions/bb6c4781-9801-443e-89fc-f7445a905dca
Co-authored-by: pethers <1726836+pethers@users.noreply.github.com>
1 parent 494563a commit 6c3261f

1 file changed

Lines changed: 10 additions & 6 deletions

File tree

scripts/detect-swedish-leakage.ts

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -228,10 +228,11 @@ function stripLangTaggedBlocks(html: string, langCode: string): string {
228228
`<([a-zA-Z][a-zA-Z0-9]*)(?=\\s)[^>]*\\blang\\s*=\\s*["']${safeLang}["'][^>]*>([\\s\\S]*?)<\\/\\1\\s*>`,
229229
'gi'
230230
);
231-
return html.replace(re, (_match, _tagName, inner) => {
232-
// Preserve newline count so line numbers remain accurate.
233-
const newlines = (inner.match(/\n/g) ?? []).length;
234-
return ' ' + '\n'.repeat(newlines) + ' ';
231+
return html.replace(re, (match) => {
232+
// Preserve all newline positions from the entire matched block, including
233+
// line breaks that may appear in the opening or closing tags, so downstream
234+
// line numbering stays stable.
235+
return match.replace(/[^\n]/g, ' ');
235236
});
236237
}
237238

@@ -328,7 +329,10 @@ export function detectSwedishLeakage(html: string, targetLang: Language): Leakag
328329
// Strip elements explicitly tagged as Swedish content (e.g. `<span lang="sv">...</span>`).
329330
// Text inside a `lang="sv"` element is deliberately quoted Swedish source material
330331
// (e.g. verbatim summaries from riksdagen.se) and MUST NOT count as translation leakage
331-
// in a non-Swedish article. Replacement preserves '\n' so reported line numbers stay accurate.
332+
// in a non-Swedish article. This call is only reached for non-Swedish targets because
333+
// `targetLang === 'sv'` short-circuits with an empty report above, so legitimate Swedish
334+
// text in Swedish articles is never removed. Replacement preserves '\n' so reported line
335+
// numbers stay accurate.
332336
cleaned = stripLangTaggedBlocks(cleaned, 'sv');
333337

334338
const lines = cleaned.split('\n');
@@ -396,7 +400,7 @@ const SHARED_WORDS: Partial<Record<Language, ReadonlySet<string>>> = {
396400
fr: new Set([]),
397401
es: new Set([]),
398402
fi: new Set([]),
399-
// English shares a small set of these short common forms with Swedish (e.g. "under", "kan"
403+
// English shares a very small set of these short common forms with Swedish (e.g. "under"
400404
// can appear legitimately in technical or proper-noun contexts). Keep this set narrow
401405
// and limited to tokens that are genuinely ambiguous in English prose.
402406
en: new Set(['under']),

0 commit comments

Comments
 (0)