Skip to content

Commit 98dfc84

Browse files
heiskr and Copilot authored
Fix translation corruptions: LLM sentinel markers and capitalized Liquid tags (#60527)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 5a55bc5 commit 98dfc84

File tree

2 files changed

+24
-0
lines changed

2 files changed

+24
-0
lines changed

src/languages/lib/correct-translation-content.ts

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -428,6 +428,16 @@ export function correctTranslatedContentStrings(
428428
}
429429

430430
// --- Generic fixes (all languages) ---
431+
432+
// Strip leaked LLM sentinel markers (e.g. `<|endoftext|>`) that
433+
// occasionally survive the translation pipeline. Replace the marker
434+
// and any surrounding whitespace with a single space so adjacent
435+
// words don't concatenate.
436+
content = content.replace(/\s*<\|endoftext\|>\s*/g, ' ')
437+
438+
// Capitalized Liquid keyword: `{% Data ` → `{% data `
439+
content = content.replaceAll('{% Data ', '{% data ')
440+
431441
// These run after per-language fixes so that e.g. `{{% данных variables`
432442
// first becomes `{{% data variables` and then gets caught here.
433443

src/languages/tests/correct-translation-content.ts

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -591,6 +591,20 @@ describe('correctTranslatedContentStrings', () => {
591591
// ─── GENERIC FIXES ────────────────────────────────────────────────
592592

593593
describe('Generic fixes (all languages)', () => {
594+
test('strips LLM sentinel markers and preserves word boundaries', () => {
595+
expect(fix('Hello<|endoftext|>World', 'es')).toBe('Hello World')
596+
expect(fix('Hello <|endoftext|> World', 'es')).toBe('Hello World')
597+
expect(fix('end of sentence.<|endoftext|>Start', 'es')).toBe('end of sentence. Start')
598+
})
599+
600+
test('fixes capitalized Data Liquid keyword', () => {
601+
expect(fix('{% Data variables.product.github %}', 'es')).toBe(
602+
'{% data variables.product.github %}',
603+
)
604+
expect(fix('{% Data reusables.foo %}', 'es')).toBe('{% data reusables.foo %}')
605+
expect(fix('{% Data ifversion ghec %}', 'es')).toBe('{% data ifversion ghec %}')
606+
})
607+
594608
test('fixes AUTOTITLE corruption patterns', () => {
595609
expect(fix('["AUTOTITLE](/path)', 'es')).toBe('"[AUTOTITLE](/path)')
596610
expect(fix('[ AUTOTITLE](/path)', 'es')).toBe('[AUTOTITLE](/path)')

0 commit comments

Comments
 (0)