Skip to content

Commit c6af007

Browse files
authored
fix: rejoin dangling markers and don't split Liquid tags in headings (#61254)
1 parent fbbe456 commit c6af007

2 files changed

Lines changed: 127 additions & 12 deletions

File tree

src/languages/lib/correct-translation-content.ts

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1914,8 +1914,13 @@ export function correctTranslatedContentStrings(
19141914
else englishSpaces.add(m[0])
19151915
}
19161916
if (englishLinebreaks.size > 0) {
1917-
content = content.replace(/\{%(.+?)%\} /g, (match) => {
1917+
content = content.replace(/\{%(.+?)%\} /g, (match, _p1, offset, string) => {
19181918
if (match.lastIndexOf('{%') > 0) return match
1919+
// Don't inject a linebreak when the tag is inside a heading line — doing
1920+
// so would split `#### {% data X %} Japanese text` into a heading with
1921+
// no content followed by a loose paragraph of Japanese text.
1922+
const lineStart = (string as string).lastIndexOf('\n', offset) + 1
1923+
if (/^[ \t]{0,3}#{1,6}/.test((string as string).slice(lineStart, offset))) return match
19191924
const withLinebreak = `${match.slice(0, -1)}\n`
19201925
if (englishLinebreaks.has(withLinebreak) && !englishSpaces.has(match)) {
19211926
return withLinebreak
@@ -2058,11 +2063,12 @@ export function correctTranslatedContentStrings(
20582063
* Rejoin marker lines that the translation pipeline split from their content.
20592064
*
20602065
* Translators sometimes leave a heading marker (`#`/`##`/...), blockquote
2061-
* marker (`>`), or the opening `**` of a bold span (immediately following a
2062-
* list/heading/blockquote/table marker) on its own line, with the rest of
2063-
* the content pushed to the next line as deeply indented text. This breaks
2064-
* rendering (empty headings, broken blockquotes, unrendered bold, unexpanded
2065-
* Liquid and `[AUTOTITLE]` links).
2066+
* marker (`>`), ordered-list marker (`1.`, `2.`, ...), or the opening `**`
2067+
* of a bold span (immediately following a list/heading/blockquote/table
2068+
* marker) on its own line, with the rest of the content pushed to the next
2069+
* line as deeply indented text. This breaks rendering (empty headings, broken
2070+
* blockquotes, broken ordered lists rendered as code blocks, unrendered bold,
2071+
* unexpanded Liquid and `[AUTOTITLE]` links).
20662072
*
20672073
* Conservative thresholds:
20682074
* - Marker line has 0–3 leading spaces (CommonMark heading/blockquote/ordered-list marker rule).
@@ -2081,11 +2087,21 @@ function joinDanglingMarkers(content: string): string {
20812087
// Marker-only line patterns (run only against non-fenced, non-frontmatter lines).
20822088
const headingOnly = /^([ \t]{0,3})(#{1,6})[ \t]*$/
20832089
const blockquoteOnly = /^([ \t]{0,3}>)[ \t]*$/
2090+
// Ordered-list marker alone on a line: `1. \n content`.
2091+
const orderedListOnly = /^([ \t]{0,3}\d+\.)[ \t]*$/
20842092
// Bold-open after a list/heading/blockquote/table marker (no other content).
20852093
const markerThenBoldOnly =
20862094
/^([ \t]{0,3}(?:[*+-]|\d+\.)[ \t]+|[ \t]{0,3}>[ \t]+|[ \t]{0,3}#{1,6}[ \t]+|\|[ \t]*)\*\*[ \t]*$/
20872095
// Continuation: 6+ leading spaces and at least one non-whitespace character.
2096+
// Used when checking whether the *next* line is a deeply-indented continuation
2097+
// after a recognised marker.
20882098
const deepIndented = /^[ \t]{6,}(\S.*)$/
2099+
// Standalone deeply-indented paragraph: 9+ leading spaces. Translation
2100+
// artifacts consistently use 14 spaces; legitimate list-continuation content
2101+
// uses at most 6 spaces (confirmed by corpus analysis). The 9+ threshold
2102+
// keeps the two populations well separated and is fence-safe after the
2103+
// improved fence detection in the loop below.
2104+
const veryDeepIndented = /^[ \t]{9,}(\S.*)$/
20892105

20902106
for (let i = 0; i < lines.length; i++) {
20912107
const line = lines[i]
@@ -2108,7 +2124,12 @@ function joinDanglingMarkers(content: string): string {
21082124
}
21092125

21102126
// CommonMark fenced code block: 0–3 leading spaces, then 3+ ` or ~.
2111-
const fenceMatch = line.match(/^[ \t]{0,3}(`{3,}|~{3,})/)
2127+
// CommonMark permits fences to be indented 0–3 spaces at the document
2128+
// level, but inside a list item a fence can appear at 4+ spaces of
2129+
// leading indentation. Use `^[ \t]*` so that code blocks nested inside
2130+
// list items (e.g. ` ```json`) are correctly recognised and their
2131+
// content is not inadvertently stripped by the selfStrip pass below.
2132+
const fenceMatch = line.match(/^[ \t]*(`{3,}|~{3,})/)
21122133
if (fenceMatch) {
21132134
const marker = fenceMatch[1]
21142135
if (!inFence) {
@@ -2129,6 +2150,21 @@ function joinDanglingMarkers(content: string): string {
21292150
continue
21302151
}
21312152

2153+
// A line that itself starts with 9+ spaces and is not inside a code fence
2154+
// is a translation-pipeline corruption artifact: the pipeline indented an
2155+
// entire paragraph line, causing CommonMark to render it as an indented
2156+
// code block (4+ spaces at the document level = code block). Strip the
2157+
// leading whitespace so the content renders as a normal paragraph.
2158+
// Marker-only lines (headings `# `, blockquotes `> `, list items `1. `)
2159+
// always have ≤3 leading spaces, so they are never misidentified here.
2160+
// The 9+ threshold (vs the 6+ used for nextDeep) ensures that legitimate
2161+
// list-continuation lines (which use ≤6 spaces) are never stripped.
2162+
const selfStrip = line.match(veryDeepIndented)
2163+
if (selfStrip) {
2164+
out.push(selfStrip[1])
2165+
continue
2166+
}
2167+
21322168
const next = i + 1 < lines.length ? lines[i + 1] : undefined
21332169
const nextDeep = next !== undefined ? next.match(deepIndented) : null
21342170
if (!nextDeep) {
@@ -2151,6 +2187,13 @@ function joinDanglingMarkers(content: string): string {
21512187
continue
21522188
}
21532189

2190+
const ol = line.match(orderedListOnly)
2191+
if (ol) {
2192+
out.push(`${ol[1]} ${nextContent}`)
2193+
i++
2194+
continue
2195+
}
2196+
21542197
const boldOpen = line.match(markerThenBoldOnly)
21552198
if (boldOpen) {
21562199
out.push(`${boldOpen[1]}**${nextContent}`)

src/languages/tests/correct-translation-content.ts

Lines changed: 77 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,6 +1556,17 @@ describe('correctTranslatedContentStrings', () => {
15561556
expect(fix(translated, 'es', en)).toBe('{% endif %}\n| Column |')
15571557
})
15581558

1559+
test('does not inject linebreak after data tag that is mid-heading', () => {
1560+
// English: tag is at end of heading line → English has tag+newline.
1561+
// Japanese: tag is mid-heading, followed by Japanese text.
1562+
// The linebreak recovery must NOT replace the space with a newline here,
1563+
// or the heading gets split into `#### TAG` + `Japanese text` paragraph.
1564+
const en = '#### Using {% data variables.copilot.subagents_short %}\n\nSome paragraph.'
1565+
const translated =
1566+
'#### {% data variables.copilot.subagents_short %} の使用\n\nSome paragraph.'
1567+
expect(fix(translated, 'ja', en)).toBe(translated)
1568+
})
1569+
15591570
test('fixes collapsed Markdown table rows', () => {
15601571
expect(fix('Cell1 | | Cell2', 'es')).toBe('Cell1 |\n| Cell2')
15611572
})
@@ -1611,8 +1622,9 @@ describe('correctTranslatedContentStrings', () => {
16111622
expect(fix(' ### \n Title', 'ja')).toBe(' ### Title')
16121623
// Valid headings are not modified
16131624
expect(fix('### Already correct', 'ja')).toBe('### Already correct')
1614-
// 4-space indented heading-like text is not collapsed (looks like code)
1615-
expect(fix(' ###\n code', 'ja')).toBe(' ###\n code')
1625+
// 4-space indented heading-like text is not collapsed (no marker join);
1626+
// but selfStrip still removes the 14-space indentation from the next line.
1627+
expect(fix(' ###\n code', 'ja')).toBe(' ###\ncode')
16161628
// Shallow next-line indent (<6) is not collapsed
16171629
expect(fix('### \n Title', 'ja')).toBe('### \n Title')
16181630
})
@@ -1646,9 +1658,29 @@ describe('correctTranslatedContentStrings', () => {
16461658
expect(fix('> **\n Quoted bold**', 'ja')).toBe('> **Quoted bold**')
16471659
// Table cell
16481660
expect(fix('| **\n Cell bold** | x', 'ja')).toBe('| **Cell bold** | x')
1649-
// Bare `**` (no preceding marker) is not collapsed — could be a closing
1650-
// bold marker followed by legitimate indented continuation.
1651-
expect(fix('**\n text', 'ja')).toBe('**\n text')
1661+
// Bare `**` (no preceding marker) is not marker-joined, but selfStrip
1662+
// still removes the 14-space indentation from the next line so it does
1663+
// not render as an indented code block.
1664+
expect(fix('**\n text', 'ja')).toBe('**\ntext')
1665+
})
1666+
1667+
test('rejoins dangling ordered-list markers (all languages)', () => {
1668+
const broken =
1669+
'1. \n {% data variables.product.prodname_vscode %}では、サイドバーの拡張機能アイコンをクリックします。'
1670+
const expected =
1671+
'1. {% data variables.product.prodname_vscode %}では、サイドバーの拡張機能アイコンをクリックします。'
1672+
for (const lang of ['ja', 'de', 'es', 'fr', 'ko', 'pt', 'ru', 'zh']) {
1673+
expect(fix(broken, lang)).toBe(expected)
1674+
}
1675+
// Higher numbered items
1676+
expect(fix('2. \n Content', 'ja')).toBe('2. Content')
1677+
expect(fix('10. \n Content', 'ja')).toBe('10. Content')
1678+
// 0–3 leading spaces are accepted
1679+
expect(fix(' 1. \n Indented', 'ja')).toBe(' 1. Indented')
1680+
// Valid ordered list items are not modified
1681+
expect(fix('1. Already correct', 'ja')).toBe('1. Already correct')
1682+
// Shallow next-line indent (<6 spaces) is not collapsed
1683+
expect(fix('1. \n Content', 'ja')).toBe('1. \n Content')
16521684
})
16531685

16541686
test('does not modify content inside fenced code blocks', () => {
@@ -1723,6 +1755,46 @@ intro: |
17231755
const nested = '1. Run this command:\n\n gh auth login'
17241756
expect(fix(nested, 'ja')).toBe(nested)
17251757
})
1758+
1759+
test('strips standalone deeply-indented paragraph lines (all languages)', () => {
1760+
// The translation pipeline sometimes indents an entire paragraph line
1761+
// with 14 spaces, causing it to render as a code block at the document
1762+
// level. Such lines should have their leading whitespace stripped.
1763+
const broken =
1764+
'### MCP サーバーの手動での構成\n\n {% data variables.product.prodname_vscode %}で MCP サーバーを構成するには、...'
1765+
const expected =
1766+
'### MCP サーバーの手動での構成\n\n{% data variables.product.prodname_vscode %}で MCP サーバーを構成するには、...'
1767+
for (const lang of ['ja', 'de', 'es', 'fr', 'ko', 'pt', 'ru', 'zh']) {
1768+
expect(fix(broken, lang)).toBe(expected)
1769+
}
1770+
// 9 spaces is the minimum threshold
1771+
expect(fix(' content', 'ja')).toBe('content')
1772+
// 8 spaces is below threshold and should be preserved
1773+
expect(fix(' content', 'ja')).toBe(' content')
1774+
// Standalone 14-space line mid-document
1775+
expect(fix('Para one.\n\n Para two.\n\nPara three.', 'ja')).toBe(
1776+
'Para one.\n\nPara two.\n\nPara three.',
1777+
)
1778+
})
1779+
1780+
test('does not strip content inside 4-space-indented fences (list code blocks)', () => {
1781+
// A fenced code block that itself lives inside a list item is indented
1782+
// by 4 spaces. Its content may have 6–25 spaces of leading whitespace
1783+
// but must NOT be stripped.
1784+
const fenced = [
1785+
'1. Add this config:',
1786+
'',
1787+
' ```json copy',
1788+
' {',
1789+
' "key": "value",',
1790+
' "nested": {',
1791+
' "deep": true',
1792+
' }',
1793+
' }',
1794+
' ```',
1795+
].join('\n')
1796+
expect(fix(fenced, 'ja')).toBe(fenced)
1797+
})
17261798
})
17271799

17281800
// ─── EDGE CASES ────────────────────────────────────────────────────

0 commit comments

Comments
 (0)