Skip to content

Commit 28769c3

Browse files
committed
Merge branch 'localization' into develop
2 parents 08c60b8 + eea104d commit 28769c3

1 file changed

Lines changed: 51 additions & 30 deletions

File tree

scripts/translate.mjs

Lines changed: 51 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -499,31 +499,29 @@ async function translateFileWithSections(client, file, systemPrompt, localesDir,
499499

500500
const seeded = {};
501501
for (const s of sections) {
502-
// Para chunks have IDs like `h2-foo-p{8hexchars}`; strip suffix to get parent heading ID
503-
const parentId = s.id.replace(/-p[0-9a-f]{8}$/, '');
504-
const zhHeadContent = zhByHeadId.get(parentId) ?? zhByHeadId.get(s.id);
505-
if (!zhHeadContent) continue;
506-
507-
// Determine which zh content fragment corresponds to this paragraph chunk
508-
let zhContent;
509-
const siblings = sections.filter(
510-
sec => sec.id.replace(/-p[0-9a-f]{8}$/, '') === parentId
511-
);
512-
if (siblings.length <= 1) {
513-
// Section wasn't split into paragraph chunks — use the full zh heading content
514-
zhContent = zhHeadContent;
515-
} else {
516-
// Para chunk — split zh heading content the same way and match by position
517-
const zhChunks = splitByParagraphBlocks({ id: parentId, content: zhHeadContent });
518-
const pos = siblings.indexOf(s);
519-
if (pos >= zhChunks.length) continue; // zh split differently — skip (will retranslate)
520-
zhContent = zhChunks[pos].content;
521-
}
502+
// Para chunks (IDs ending in -p{8hexchars}) are skipped during seeding.
503+
//
504+
// Why: Chinese text is ~30-50% shorter than English. When zh heading sections are
505+
// split into paragraph chunks using the same 600-char threshold, they produce
506+
// FEWER chunks than the English. Position-based matching then assigns wrong zh
507+
// content to English chunks, and unmatched English chunks get retranslated by Claude
508+
// — producing content that already exists in an earlier chunk. This creates
509+
// duplicate paragraphs in the output (root cause of the 168-line diff for a
510+
// 1-line change).
511+
//
512+
// Para chunks are translated once by Claude on the first incremental run, then
513+
// correctly cached with prose-hash IDs. Subsequent code-only changes trigger
514+
// patchCodeBlocks; prose changes trigger targeted retranslation.
515+
if (/-p[0-9a-f]{8}$/.test(s.id)) continue;
516+
517+
// Heading-level section (not split): seed directly from the zh heading section.
518+
const zhContent = zhByHeadId.get(s.id);
519+
if (!zhContent) continue;
522520

523521
const cH = 'sha256:' + crypto.createHash('sha256').update(s.content).digest('hex');
524522
const pH = 'sha256:' + crypto.createHash('sha256').update(stripCodeBlocks(s.content)).digest('hex');
525523

526-
// Code blocks are never translated — if they differ the English source changed
524+
// Code blocks are never translated — a mismatch means the English source changed
527525
// since the zh translation was produced. Seed as "code stale" so patchCodeBlocks
528526
// runs instead of Claude.
529527
const enBlocks = extractCodeBlocks(s.content);
@@ -534,8 +532,6 @@ async function translateFileWithSections(client, file, systemPrompt, localesDir,
534532
if (codeUnchanged) {
535533
seeded[s.id] = { contentHash: cH, proseHash: pH, translation: zhContent };
536534
} else {
537-
// Fake the "old" contentHash so the main loop doesn't treat this as a cache hit,
538-
// while proseHash stays current — this triggers patchCodeBlocks, not Claude.
539535
const oldHash = 'sha256:' + crypto.createHash('sha256').update(zhContent).digest('hex');
540536
seeded[s.id] = { contentHash: oldHash, proseHash: pH, translation: zhContent };
541537
}
@@ -1362,24 +1358,49 @@ function splitByParagraphBlocks(section) {
13621358

13631359
if (rawBlocks.length <= 1) return [section]; // can't split further
13641360

1365-
// Merge consecutive paragraph blocks into chunks that stay under the threshold
1366-
const chunks = [];
1361+
// Merge consecutive paragraph blocks into chunks that stay under the threshold.
1362+
//
1363+
// ID strategy — two goals in tension:
1364+
// (A) Stable across code-only changes → use prose hash → patchCodeBlocks can fire
1365+
// (B) Unique when prose is identical → use content hash → switching chunks is detected
1366+
//
1367+
// Resolution: use prose hash when it is unique within this section; fall back to
1368+
// content hash for chunks whose prose hash collides with another chunk's.
1369+
// Colliding chunks are rare (e.g. two code-only blocks) and sacrifice patchCodeBlocks,
1370+
// but gain correct switch detection without positional dedup suffixes. (Chunks whose full content is byte-identical still hash to the same ID.)
1371+
1372+
// First pass: collect raw chunk contents
1373+
const rawChunks = [];
13671374
let current = '';
13681375
for (const block of rawBlocks) {
13691376
const candidate = current ? `${current}\n${block}` : block;
13701377
if (current && candidate.length > PARAGRAPH_FALLBACK_CHARS) {
1371-
const h = crypto.createHash('sha256').update(current).digest('hex').slice(0, 8);
1372-
chunks.push({ id: `${section.id}-p${h}`, content: current });
1378+
rawChunks.push(current);
13731379
current = block;
13741380
} else {
13751381
current = candidate;
13761382
}
13771383
}
1378-
if (current) {
1379-
const h = crypto.createHash('sha256').update(current).digest('hex').slice(0, 8);
1380-
chunks.push({ id: `${section.id}-p${h}`, content: current });
1384+
if (current) rawChunks.push(current);
1385+
1386+
if (rawChunks.length <= 1) return [section];
1387+
1388+
// Count prose-hash occurrences to detect collisions within this section
1389+
const proseHashCount = new Map();
1390+
for (const c of rawChunks) {
1391+
const ph = crypto.createHash('sha256').update(stripCodeBlocks(c)).digest('hex').slice(0, 8);
1392+
proseHashCount.set(ph, (proseHashCount.get(ph) ?? 0) + 1);
13811393
}
13821394

1395+
// Second pass: assign stable IDs
1396+
const chunks = rawChunks.map(c => {
1397+
const ph = crypto.createHash('sha256').update(stripCodeBlocks(c)).digest('hex').slice(0, 8);
1398+
const h = proseHashCount.get(ph) === 1
1399+
? ph // unique prose → stable ID, enables patchCodeBlocks
1400+
: crypto.createHash('sha256').update(c).digest('hex').slice(0, 8); // collision → content hash
1401+
return { id: `${section.id}-p${h}`, content: c };
1402+
});
1403+
13831404
return chunks.length > 1 ? chunks : [section];
13841405
}
13851406

0 commit comments

Comments
 (0)