@@ -499,31 +499,29 @@ async function translateFileWithSections(client, file, systemPrompt, localesDir,
499499
500500 const seeded = { } ;
501501 for ( const s of sections ) {
502- // Para chunks have IDs like `h2-foo-p{8hexchars}`; strip suffix to get parent heading ID
503- const parentId = s . id . replace ( / - p [ 0 - 9 a - f ] { 8 } $ / , '' ) ;
504- const zhHeadContent = zhByHeadId . get ( parentId ) ?? zhByHeadId . get ( s . id ) ;
505- if ( ! zhHeadContent ) continue ;
506-
507- // Determine which zh content fragment corresponds to this paragraph chunk
508- let zhContent ;
509- const siblings = sections . filter (
510- sec => sec . id . replace ( / - p [ 0 - 9 a - f ] { 8 } $ / , '' ) === parentId
511- ) ;
512- if ( siblings . length <= 1 ) {
513- // Section wasn't split into paragraph chunks — use the full zh heading content
514- zhContent = zhHeadContent ;
515- } else {
516- // Para chunk — split zh heading content the same way and match by position
517- const zhChunks = splitByParagraphBlocks ( { id : parentId , content : zhHeadContent } ) ;
518- const pos = siblings . indexOf ( s ) ;
519- if ( pos >= zhChunks . length ) continue ; // zh split differently — skip (will retranslate)
520- zhContent = zhChunks [ pos ] . content ;
521- }
502+ // Para chunks (IDs ending in -p{8hexchars}) are skipped during seeding.
503+ //
504+ // Why: Chinese text is ~30-50% shorter than English. When zh heading sections are
505+ // split into paragraph chunks using the same 600-char threshold, they produce
506+ // FEWER chunks than the English. Position-based matching then assigns wrong zh
507+ // content to English chunks, and unmatched English chunks get retranslated by Claude
508+ // — producing content that already exists in an earlier chunk. This creates
509+ // duplicate paragraphs in the output (root cause of the 168-line diff for a
510+ // 1-line change).
511+ //
512+ // Para chunks are translated once by Claude on the first incremental run, then
513+ // cached under their hash-based IDs (prose hash, or content hash when the prose hash collides).
514+ // For prose-hash IDs, later code-only changes trigger patchCodeBlocks; prose changes trigger targeted retranslation.
515+ if ( / - p [ 0 - 9 a - f ] { 8 } $ / . test ( s . id ) ) continue ;
516+
517+ // Heading-level section (not split): seed directly from the zh heading section.
518+ const zhContent = zhByHeadId . get ( s . id ) ;
519+ if ( ! zhContent ) continue ;
522520
523521 const cH = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( s . content ) . digest ( 'hex' ) ;
524522 const pH = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( stripCodeBlocks ( s . content ) ) . digest ( 'hex' ) ;
525523
526- // Code blocks are never translated — if they differ the English source changed
524+ // Code blocks are never translated — a mismatch means the English source changed
527525 // since the zh translation was produced. Seed as "code stale" so patchCodeBlocks
528526 // runs instead of Claude.
529527 const enBlocks = extractCodeBlocks ( s . content ) ;
@@ -534,8 +532,6 @@ async function translateFileWithSections(client, file, systemPrompt, localesDir,
534532 if ( codeUnchanged ) {
535533 seeded [ s . id ] = { contentHash : cH , proseHash : pH , translation : zhContent } ;
536534 } else {
537- // Fake the "old" contentHash so the main loop doesn't treat this as a cache hit,
538- // while proseHash stays current — this triggers patchCodeBlocks, not Claude.
539535 const oldHash = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( zhContent ) . digest ( 'hex' ) ;
540536 seeded [ s . id ] = { contentHash : oldHash , proseHash : pH , translation : zhContent } ;
541537 }
@@ -1362,24 +1358,49 @@ function splitByParagraphBlocks(section) {
13621358
13631359 if ( rawBlocks . length <= 1 ) return [ section ] ; // can't split further
13641360
1365- // Merge consecutive paragraph blocks into chunks that stay under the threshold
1366- const chunks = [ ] ;
1361+ // Merge consecutive paragraph blocks into chunks that stay under the threshold.
1362+ //
1363+ // ID strategy — two goals in tension:
1364+ // (A) Stable across code-only changes → use prose hash → patchCodeBlocks can fire
1365+ // (B) Unique when prose is identical → use content hash → switching chunks is detected
1366+ //
1367+ // Resolution: use prose hash when it is unique within this section; fall back to
1368+ // content hash for chunks whose prose hash collides with another chunk's.
1369+ // Colliding chunks are rare (e.g. two code-only blocks) and sacrifice patchCodeBlocks,
1370+ // but gain correct switch detection without positional dedup suffixes. NOTE: chunks whose full content is byte-identical still share an ID.
1371+
1372+ // First pass: collect raw chunk contents
1373+ const rawChunks = [ ] ;
13671374 let current = '' ;
13681375 for ( const block of rawBlocks ) {
13691376 const candidate = current ? `${ current } \n${ block } ` : block ;
13701377 if ( current && candidate . length > PARAGRAPH_FALLBACK_CHARS ) {
1371- const h = crypto . createHash ( 'sha256' ) . update ( current ) . digest ( 'hex' ) . slice ( 0 , 8 ) ;
1372- chunks . push ( { id : `${ section . id } -p${ h } ` , content : current } ) ;
1378+ rawChunks . push ( current ) ;
13731379 current = block ;
13741380 } else {
13751381 current = candidate ;
13761382 }
13771383 }
1378- if ( current ) {
1379- const h = crypto . createHash ( 'sha256' ) . update ( current ) . digest ( 'hex' ) . slice ( 0 , 8 ) ;
1380- chunks . push ( { id : `${ section . id } -p${ h } ` , content : current } ) ;
1384+ if ( current ) rawChunks . push ( current ) ;
1385+
1386+ if ( rawChunks . length <= 1 ) return [ section ] ;
1387+
1388+ // Count prose-hash occurrences to detect collisions within this section
1389+ const proseHashCount = new Map ( ) ;
1390+ for ( const c of rawChunks ) {
1391+ const ph = crypto . createHash ( 'sha256' ) . update ( stripCodeBlocks ( c ) ) . digest ( 'hex' ) . slice ( 0 , 8 ) ;
1392+ proseHashCount . set ( ph , ( proseHashCount . get ( ph ) ?? 0 ) + 1 ) ;
13811393 }
13821394
1395+ // Second pass: assign stable IDs
1396+ const chunks = rawChunks . map ( c => {
1397+ const ph = crypto . createHash ( 'sha256' ) . update ( stripCodeBlocks ( c ) ) . digest ( 'hex' ) . slice ( 0 , 8 ) ;
1398+ const h = proseHashCount . get ( ph ) === 1
1399+ ? ph // unique prose → stable ID, enables patchCodeBlocks
1400+ : crypto . createHash ( 'sha256' ) . update ( c ) . digest ( 'hex' ) . slice ( 0 , 8 ) ; // collision → content hash
1401+ return { id : `${ section . id } -p${ h } ` , content : c } ;
1402+ } ) ;
1403+
13831404 return chunks . length > 1 ? chunks : [ section ] ;
13841405}
13851406
0 commit comments