Skip to content

Commit b38e950

Browse files
heiskr and Copilot
authored
Fix critical regex backtracking and perf issues in translation corruption fixer (#60364)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 4d4128b commit b38e950

File tree

2 files changed

+868
-58
lines changed

2 files changed

+868
-58
lines changed

src/languages/lib/correct-translation-content.ts

Lines changed: 72 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,10 @@ export function correctTranslatedContentStrings(
207207
if (context.code === 'zh') {
208208
content = content.replaceAll('{% 数据variables', '{% data variables')
209209
content = content.replaceAll('{% 数据 variables', '{% data variables')
210-
content = content.replaceAll('{% 数据可重用', '{% data reusables')
210+
// Order matters: the more specific `s.` variant must run first to
211+
// avoid the broader rule producing a double-s (`reusabless`).
211212
content = content.replaceAll('{% 数据可重用s.', '{% data reusables.')
213+
content = content.replaceAll('{% 数据可重用', '{% data reusables')
212214
content = content.replaceAll('{% 其他 %}', '{% else %}')
213215
content = content.replaceAll('{% 原始 %}', '{% raw %}')
214216
// Chinese `如果` = "if": `{ 如果 X %}` → `{% if X %}`
@@ -224,6 +226,7 @@ export function correctTranslatedContentStrings(
224226

225227
if (context.code === 'ru') {
226228
content = content.replaceAll('[«AUTOTITLE»](', '[AUTOTITLE](')
229+
content = content.replaceAll('[АВТОЗАГОЛОВОК](', '[AUTOTITLE](')
227230
content = content.replaceAll('{% данных variables', '{% data variables')
228231
content = content.replaceAll('{% данных, variables', '{% data variables')
229232
content = content.replaceAll('{% данными variables', '{% data variables')
@@ -265,7 +268,15 @@ export function correctTranslatedContentStrings(
265268
content = content.replaceAll('{% конечным %}', '{% endif %}')
266269
// `{% конец %}` after `{% raw %}` means `{% endraw %}`, not `{% endif %}`.
267270
// Handle this BEFORE the generic `{% конец %}` → `{% endif %}` fallback.
268-
content = content.replace(/(\{% raw %\}[^]*?)\{% конец %\}/g, '$1{% endraw %}')
271+
// We use a split-based approach instead of `[^]*?` regex to avoid
272+
// catastrophic backtracking on large content (~20s on 150KB inputs).
273+
if (content.includes('{% конец %}') && content.includes('{% raw %}')) {
274+
const parts = content.split('{% raw %}')
275+
for (let i = 1; i < parts.length; i++) {
276+
parts[i] = parts[i].replace('{% конец %}', '{% endraw %}')
277+
}
278+
content = parts.join('{% raw %}')
279+
}
269280
content = content.replaceAll('{% конец %}', '{% endif %}')
270281
// Cyrillic transliteration of `elsif` (lossy → else, since version param is lost)
271282
content = content.replaceAll('{% Эльсиф %}', '{% else %}')
@@ -314,11 +325,21 @@ export function correctTranslatedContentStrings(
314325
// Rearranged `{% data VARIABLE_PATH %}` → `VARIABLE_PATH %данн... {% }`
315326
// The translation moved `data` (as `данных`/`данными`/`данные`) after the path
316327
// and split `%}` into `{% }` or `{% }`. Reconstruct the original tag.
317-
content = content.replace(/([\w.-]+\.[\w.-]+\.[\w_]+) %данн\w*[^{]*\{%\s+\}/g, '{% data $1 %}')
318-
// Variant where `%}` appears BEFORE `данных`: `PATH %}данных {% .`
319-
content = content.replace(/([\w.-]+\.[\w.-]+\.[\w_]+) %\}данн\w*\s*\{%\s*\./g, '{% data $1 %}.')
320-
// Variant where path precedes `%{% data }`: `PATH %{% data }.`
321-
content = content.replace(/([\w.-]+\.[\w.-]+\.[\w_]+) %\{% data\s+\}/g, '{% data $1 %}')
328+
// Guard: these regexes start with [\w.-]+ which backtracks O(n²) on large word-char strings.
329+
if (content.includes('%данн')) {
330+
content = content.replace(
331+
/([\w.-]+\.[\w.-]+\.[\w_]+) %данн\w*[^{]*\{%\s+\}/g,
332+
'{% data $1 %}',
333+
)
334+
content = content.replace(
335+
/([\w.-]+\.[\w.-]+\.[\w_]+) %\}данн\w*\s*\{%\s*\./g,
336+
'{% data $1 %}.',
337+
)
338+
}
339+
if (content.includes('%{% data')) {
340+
// Variant where path precedes `%{% data }`: `PATH %{% data }.`
341+
content = content.replace(/([\w.-]+\.[\w.-]+\.[\w_]+) %\{% data\s+\}/g, '{% data $1 %}')
342+
}
322343

323344
// Translated octicon names
324345
content = content.replaceAll(
@@ -400,6 +421,10 @@ export function correctTranslatedContentStrings(
400421
content = content.replaceAll('{% Tipp %}', '{% tip %}')
401422
content = content.replaceAll('{%- Tipp %}', '{%- tip %}')
402423
content = content.replaceAll('{%- Tipp -%}', '{%- tip -%}')
424+
// Translated for-loop keywords: `für VARNAME in COLLECTION`
425+
content = content.replace(/\{%-? für (\w+) in /g, (match) => {
426+
return match.replace('für', 'for')
427+
})
403428
}
404429

405430
// --- Generic fixes (all languages) ---
@@ -472,7 +497,15 @@ export function correctTranslatedContentStrings(
472497
content = content.replaceAll('{% %} de dados variables.', '{% data variables.')
473498

474499
// Fix `{% %}` used as `{% endraw %}` (follows raw content with Liquid expressions).
475-
content = content.replace(/(\{% raw %\}[^]*?)\{% %\}/g, '$1{% endraw %}')
500+
// We use a split-based approach instead of `[^]*?` regex to avoid
501+
// catastrophic backtracking on large content (~20s on 150KB inputs).
502+
if (content.includes('{% %}') && content.includes('{% raw %}')) {
503+
const parts = content.split('{% raw %}')
504+
for (let i = 1; i < parts.length; i++) {
505+
parts[i] = parts[i].replace('{% %}', '{% endraw %}')
506+
}
507+
content = parts.join('{% raw %}')
508+
}
476509

477510
// Fix `{% %}` used as `{% else %}` when it appears between ifversion and
478511
// endif on the same line: `{% ifversion X %}A{% %}B{% endif %}`.
@@ -509,33 +542,49 @@ export function correctTranslatedContentStrings(
509542
// Recover linebreaks that translations lose after Liquid closing tags.
510543
// Compares each `{% ... %} ` in the translation against the English
511544
// to see if it should be `{% ... %}\n` instead.
512-
content = content.replace(/\{%(.+?)%\} /g, (match) => {
513-
if (match.lastIndexOf('{%') > 0) return match
514-
const withLinebreak = `${match.slice(0, -1)}\n`
515-
if (englishContent.includes(withLinebreak) && !englishContent.includes(match)) {
516-
return withLinebreak
545+
// Pre-build a Set of English Liquid-tag-with-linebreak strings so we
546+
// avoid O(tags × contentLength) repeated `String.includes()` scans.
547+
if (englishContent) {
548+
const englishLinebreaks = new Set<string>()
549+
const englishSpaces = new Set<string>()
550+
for (const m of englishContent.matchAll(/\{%.+?%\}[\n ]/g)) {
551+
if (m[0].endsWith('\n')) englishLinebreaks.add(m[0])
552+
else englishSpaces.add(m[0])
517553
}
518-
return match
519-
})
520-
// Special case: `{% endif %} | ` → `{% endif %}\n| ` when English has it.
521-
content = content.replace(/\{% endif %\} \| /g, (match) => {
522-
const potentiallyBetter = '{% endif %}\n| '
523-
if (englishContent.includes(potentiallyBetter)) {
524-
return potentiallyBetter
554+
if (englishLinebreaks.size > 0) {
555+
content = content.replace(/\{%(.+?)%\} /g, (match) => {
556+
if (match.lastIndexOf('{%') > 0) return match
557+
const withLinebreak = `${match.slice(0, -1)}\n`
558+
if (englishLinebreaks.has(withLinebreak) && !englishSpaces.has(match)) {
559+
return withLinebreak
560+
}
561+
return match
562+
})
563+
// Special case: `{% endif %} | ` → `{% endif %}\n| ` when English has it.
564+
if (englishContent.includes('{% endif %}\n| ')) {
565+
content = content.replace(/\{% endif %\} \| /g, '{% endif %}\n| ')
566+
}
525567
}
526-
return match
527-
})
568+
}
528569

529570
// Collapsed Markdown table rows — restore linebreaks between `|` cells.
530571
content = content.replaceAll(' | | ', ' |\n| ')
531572

532573
// Final catch-all: earlier normalizations (e.g. space-in-braces regex) can
533574
// recreate `{{% KEYWORD` patterns after the per-keyword fixes already ran.
534575
// Strip the extra `{` for known Liquid tag names.
576+
// Note: keywords without a trailing space (e.g. `raw`, `endif`) use `\b`
577+
// so they match only as whole words, whether or not a space follows.
535578
content = content.replace(
536-
/\{\{(%\s*(?:data |ifversion |elsif |endif|else |else\b|octicon |note|endnote|tip|endtip|raw|endraw|comment|endcomment|for |endfor|assign |vscode|endvscode|visualstudio|endvisualstudio|rowheaders|endrowheaders))/g,
579+
/\{\{(%\s*(?:data |ifversion |elsif |endif\b|else\b|octicon |note\b|endnote\b|tip\b|endtip\b|raw\b|endraw\b|comment\b|endcomment\b|for |endfor\b|assign |vscode\b|endvscode\b|visualstudio\b|endvisualstudio\b|rowheaders\b|endrowheaders\b))/g,
537580
'{$1',
538581
)
539582

583+
// After the catch-all, `{%raw` (no space) can appear. Normalize to `{% raw`.
584+
content = content.replaceAll('{%raw %}', '{% raw %}')
585+
content = content.replaceAll('{%raw -%}', '{% raw -%}')
586+
content = content.replaceAll('{%endraw %}', '{% endraw %}')
587+
content = content.replaceAll('{%endraw -%}', '{% endraw -%}')
588+
540589
return content
541590
}

0 commit comments

Comments
 (0)