@@ -466,14 +466,104 @@ async function translateSync(client, files, systemPrompt, localesDir, hashesDir,
466466}
467467
468468// ---------------------------------------------------------------------------
469- // Section cache seeding (no Claude — heading sections only )
469+ // Section cache seeding (no Claude — heading + para chunk sections )
470470// ---------------------------------------------------------------------------
471471
472- // Seeds the section-level cache for a file that already has a zh translation but no section
473- // cache. Only heading-level sections are seeded (para chunks are intentionally skipped — they
474- // will be translated by Claude on the first incremental run that touches the file, and cached
475- // correctly from that point onward). Running this eagerly means the NEXT snippet-only change
476- // triggers patchCodeBlocks for heading sections rather than a full Claude retranslation.
/**
 * Pairs up paragraph chunks of an English heading section with the matching
 * text from its zh translation.
 *
 * Both inputs are cut into "raw blocks" at blank lines (blank lines that sit
 * inside a ``` or ~~~ fenced region do not cut). The English blocks are then
 * packed greedily into chunks: a block joins the current chunk unless the
 * joined English text would be longer than PARAGRAPH_FALLBACK_CHARS. The zh
 * blocks are packed using exactly the same index boundaries, so chunk k of the
 * result holds the zh text for the same raw blocks as English chunk k.
 *
 * NOTE(review): the splitting rules are meant to mirror splitByParagraphBlocks
 * (defined outside this block) — keep the two in sync when changing either.
 *
 * @param {string} enContent - English heading-section content.
 * @param {string} zhContent - Existing zh translation of the same section.
 * @returns {Array<{en: string, zh: string}>|null} Chunk pairs in document
 *   order, or null when the raw-block counts differ between en and zh, or
 *   when the content packs into fewer than two chunks.
 */
function mapParaChunksToZh(enContent, zhContent) {
  // A fence line starts at column 0 with three or more backticks or tildes.
  const FENCE_START = /^(`{3,}|~{3,})/;

  // Cuts `text` into blank-line-terminated blocks; each block keeps its
  // terminating blank line, and whitespace-only blocks are dropped.
  const collectBlocks = (text) => {
    const rows = text.split('\n');
    const found = [];
    let from = 0;
    let openFence = null; // fence character of the region we are inside, if any

    const emit = (to) => {
      const piece = rows.slice(from, to).join('\n');
      if (piece.trim() !== '') found.push(piece);
    };

    rows.forEach((row, idx) => {
      const hit = FENCE_START.exec(row);
      if (hit !== null) {
        if (openFence === null) {
          openFence = hit[1][0];
        } else if (row[0] === openFence) {
          // Only a fence line using the same character closes the region.
          openFence = null;
        }
      }

      const isBoundary = openFence === null && idx > from && row.trim() === '';
      if (!isBoundary) return;

      emit(idx + 1);
      from = idx + 1;
    });

    emit(rows.length); // whatever follows the last boundary
    return found;
  };

  const enBlocks = collectBlocks(enContent);
  const zhBlocks = collectBlocks(zhContent);
  if (enBlocks.length !== zhBlocks.length) {
    return null; // paragraph structure differs — blocks cannot be matched by index
  }

  // Greedy packing driven purely by the English length; zh follows along.
  const grouped = [];
  let open = null;
  for (const [idx, enBlock] of enBlocks.entries()) {
    const zhBlock = zhBlocks[idx];
    if (open === null) {
      open = { en: enBlock, zh: zhBlock };
      continue;
    }

    const widenedEn = `${open.en}\n${enBlock}`;
    if (widenedEn.length > PARAGRAPH_FALLBACK_CHARS) {
      grouped.push(open);
      open = { en: enBlock, zh: zhBlock };
    } else {
      open = { en: widenedEn, zh: `${open.zh}\n${zhBlock}` };
    }
  }
  if (open !== null) grouped.push(open);

  return grouped.length > 1 ? grouped : null;
}
534+
/**
 * Creates one section-cache record from English content and the zh translation
 * that already exists for it.
 *
 * The code blocks of both texts (as returned by extractCodeBlocks) are compared
 * element by element:
 *  - identical → contentHash is the hash of the English content;
 *  - different → contentHash is the hash of the zh content instead, i.e. a value
 *    that does not match the current English hash. Per the original note this
 *    acts as a sentinel so the next run patches code blocks rather than
 *    retranslating — NOTE(review): confirm against the cache consumer.
 *
 * @param {string} enContent - English section (or chunk) content.
 * @param {string} zhContent - Existing zh translation of that content.
 * @returns {{contentHash: string, proseHash: string, translation: string}}
 *   Hashes are 'sha256:'-prefixed hex digests; proseHash covers the English
 *   content after stripCodeBlocks; translation is zhContent unchanged.
 */
function makeSeedEntry(enContent, zhContent) {
  const digestOf = (text) => {
    const hex = crypto.createHash('sha256').update(text).digest('hex');
    return 'sha256:' + hex;
  };

  const proseHash = digestOf(stripCodeBlocks(enContent));

  const enFences = extractCodeBlocks(enContent);
  const zhFences = extractCodeBlocks(zhContent);
  const codeDiffers =
    enFences.length !== zhFences.length ||
    enFences.some((block, idx) => block !== zhFences[idx]);

  // Hash whichever text represents the state the cache should record.
  const contentHash = digestOf(codeDiffers ? zhContent : enContent);

  return { contentHash, proseHash, translation: zhContent };
}
553+
554+ /**
555+ * Builds a section-level hash cache for a file that already has a zh translation
556+ * but no section cache, without calling Claude.
557+ *
558+ * Heading sections: matched by position between en and zh heading splits.
559+ * Para chunks: matched by applying the same raw-block index grouping to the zh
560+ * heading section content. If en and zh have different block counts (rare —
561+ * translator restructured paragraphs), that heading section's para chunks are
562+ * skipped safely (they'll be translated by Claude on first change).
563+ *
564+ * After this runs, heading sections and every para chunk that could be matched are
565+ * cached. Subsequent snippet-only changes to them trigger patchCodeBlocks with zero Claude calls.
566+ */
477567async function seedSectionCache ( file , localesDir , hashesDir , lang ) {
478568 const basename = path . basename ( file , '.mdx' ) ;
479569 const content = await fs . readFile ( file , 'utf-8' ) ;
@@ -486,44 +576,57 @@ async function seedSectionCache(file, localesDir, hashesDir, lang) {
486576 return ; // no zh file to seed from
487577 }
488578
489- const rawSections = splitIntoSections ( content ) ;
490- const sections = deduplicateSectionIds ( rawSections ) ;
579+ const sections = deduplicateSectionIds ( splitIntoSections ( content ) ) ;
491580
492581 const engHeadSecs = splitIntoSections ( content , { paragraphFallback : false } ) ;
493582 const zhHeadSecs = splitIntoSections ( existingTranslation , { paragraphFallback : false } ) ;
494- if ( engHeadSecs . length !== zhHeadSecs . length ) return ; // mismatch — can't seed safely
495-
496- const zhByHeadId = new Map ( engHeadSecs . map ( ( s , i ) => [ s . id , zhHeadSecs [ i ] . content ] ) ) ;
583+ if ( engHeadSecs . length !== zhHeadSecs . length ) return ; // heading count mismatch
584+
585+ // Map: (deduplicated) heading section ID → zh heading section content.
586+ // deduplicateSectionIds is applied to engHeadSecs so the IDs match what
587+ // translateFileWithSections stores in the cache.
588+ const dedupEngHead = deduplicateSectionIds ( engHeadSecs ) ;
589+ const dedupZhHead = deduplicateSectionIds ( zhHeadSecs ) ;
590+ const zhByHeadId = new Map ( dedupEngHead . map ( ( s , i ) => [ s . id , dedupZhHead [ i ] . content ] ) ) ;
591+
592+ // Build: (deduplicated) heading ID → zh para-chunk array (or null if no split needed)
593+ const paraChunksByHeadId = new Map ( ) ;
594+ for ( const s of dedupEngHead ) {
595+ const zhHead = zhByHeadId . get ( s . id ) ;
596+ if ( ! zhHead ) continue ;
597+ const pairs = mapParaChunksToZh ( s . content , zhHead ) ;
598+ if ( pairs ) paraChunksByHeadId . set ( s . id , pairs ) ;
599+ }
497600
498601 const seeded = { } ;
499- for ( const s of sections ) {
500- if ( / - p [ 0 - 9 a - f ] { 8 } $ / . test ( s . id ) ) continue ; // skip para chunks
501602
502- const zhContent = zhByHeadId . get ( s . id ) ;
503- if ( ! zhContent ) continue ;
603+ for ( const s of sections ) {
604+ const isParaChunk = / - p [ 0 - 9 a - f ] { 8 } $ / . test ( s . id ) ;
504605
505- const cH = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( s . content ) . digest ( 'hex' ) ;
506- const pH = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( stripCodeBlocks ( s . content ) ) . digest ( 'hex' ) ;
606+ if ( ! isParaChunk ) {
607+ const zhContent = zhByHeadId . get ( s . id ) ;
608+ if ( zhContent ) seeded [ s . id ] = makeSeedEntry ( s . content , zhContent ) ;
609+ continue ;
610+ }
507611
508- const enBlocks = extractCodeBlocks ( s . content ) ;
509- const zhBlocks = extractCodeBlocks ( zhContent ) ;
510- const codeUnchanged = enBlocks . length === zhBlocks . length && enBlocks . every ( ( b , j ) => b === zhBlocks [ j ] ) ;
612+ const headId = s . id . replace ( / - p [ 0 - 9 a - f ] { 8 } $ / , '' ) ;
613+ const pairs = paraChunksByHeadId . get ( headId ) ;
614+ if ( ! pairs ) continue ;
511615
512- if ( codeUnchanged ) {
513- seeded [ s . id ] = { contentHash : cH , proseHash : pH , translation : zhContent } ;
514- } else {
515- const oldHash = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( zhContent ) . digest ( 'hex' ) ;
516- seeded [ s . id ] = { contentHash : oldHash , proseHash : pH , translation : zhContent } ;
517- }
616+ const pair = pairs . find ( p => p . en === s . content ) ;
617+ if ( pair ) seeded [ s . id ] = makeSeedEntry ( pair . en , pair . zh ) ;
518618 }
519619
520620 if ( Object . keys ( seeded ) . length === 0 ) return ;
521621
622+ const headingCount = Object . keys ( seeded ) . filter ( id => ! / - p [ 0 - 9 a - f ] { 8 } $ / . test ( id ) ) . length ;
623+ const paraCount = Object . keys ( seeded ) . length - headingCount ;
624+
522625 const currentHash = await fileHash ( file ) ;
523626 const hashFile = path . join ( hashesDir , `${ basename } .json` ) ;
524627 await fs . mkdir ( hashesDir , { recursive : true } ) ;
525628 await fs . writeFile ( hashFile , JSON . stringify ( { fileHash : currentHash , sections : seeded } ) , 'utf-8' ) ;
526- console . log ( ` ⟳ ${ basename } : seeded section cache ( ${ Object . keys ( seeded ) . length } heading sections) ` ) ;
629+ console . log ( ` ⟳ ${ basename } : seeded ${ headingCount } heading + ${ paraCount } para-chunk sections` ) ;
527630}
528631
529632// ---------------------------------------------------------------------------
@@ -561,61 +664,41 @@ async function translateFileWithSections(client, file, systemPrompt, localesDir,
561664 // Covers files translated via batch (which stores only fileHash, no sections),
562665 // and the post-migration run where the cache was discarded due to stale IDs.
563666 //
564- // Strategy: match at heading-section level (immune to paragraph-chunk count
565- // differences caused by Chinese text being shorter than English). Then for each
566- // English paragraph chunk find the corresponding zh chunk by position within
567- // its parent heading section — skip it (will retranslate once) if zh split
568- // differently .
667+ // Uses the same raw-block matching as seedSectionCache: heading sections are matched
668+ // by position; para chunks are seeded by matching raw blocks between en and zh heading
669+ // sections (same structure → same block count in 94%+ of articles). This means the
670+ // FIRST incremental run on a changed file also seeds the section cache fully, so
671+ // unchanged para chunks get cache hits and changed ones get patchCodeBlocks .
569672 if ( ! storedData ?. sections ) {
570673 const translatedPath = path . join ( localesDir , `${ basename } .mdx` ) ;
571674 try {
572675 const existingTranslation = await fs . readFile ( translatedPath , 'utf-8' ) ;
573676
574- // Heading-only splits for reliable count matching (no paragraph fallback)
575- const engHeadSecs = splitIntoSections ( content , { paragraphFallback : false } ) ;
576- const zhHeadSecs = splitIntoSections ( existingTranslation , { paragraphFallback : false } ) ;
677+ const engHeadSecs = deduplicateSectionIds ( splitIntoSections ( content , { paragraphFallback : false } ) ) ;
678+ const zhHeadSecs = deduplicateSectionIds ( splitIntoSections ( existingTranslation , { paragraphFallback : false } ) ) ;
577679
578680 if ( engHeadSecs . length === zhHeadSecs . length ) {
579- // Map: English heading section ID → zh heading section content (matched by position)
580- const zhByHeadId = new Map ( engHeadSecs . map ( ( s , i ) => [ s . id , zhHeadSecs [ i ] . content ] ) ) ;
681+ const zhByHeadId = new Map ( engHeadSecs . map ( ( s , i ) => [ s . id , zhHeadSecs [ i ] . content ] ) ) ;
682+ const paraChunksByHead = new Map ( ) ;
683+ for ( const s of engHeadSecs ) {
684+ const zhHead = zhByHeadId . get ( s . id ) ;
685+ if ( ! zhHead ) continue ;
686+ const pairs = mapParaChunksToZh ( s . content , zhHead ) ;
687+ if ( pairs ) paraChunksByHead . set ( s . id , pairs ) ;
688+ }
581689
582690 const seeded = { } ;
583691 for ( const s of sections ) {
584- // Para chunks (IDs ending in -p{8hexchars}) are skipped during seeding.
585- //
586- // Why: Chinese text is ~30-50% shorter than English. When zh heading sections are
587- // split into paragraph chunks using the same 600-char threshold, they produce
588- // FEWER chunks than the English. Position-based matching then assigns wrong zh
589- // content to English chunks, and unmatched English chunks get retranslated by Claude
590- // — producing content that already exists in an earlier chunk. This creates
591- // duplicate paragraphs in the output (root cause of the 168-line diff for a
592- // 1-line change).
593- //
594- // Para chunks are translated once by Claude on the first incremental run, then
595- // correctly cached with prose-hash IDs. Subsequent code-only changes trigger
596- // patchCodeBlocks; prose changes trigger targeted retranslation.
597- if ( / - p [ 0 - 9 a - f ] { 8 } $ / . test ( s . id ) ) continue ;
598-
599- // Heading-level section (not split): seed directly from the zh heading section.
600- const zhContent = zhByHeadId . get ( s . id ) ;
601- if ( ! zhContent ) continue ;
602-
603- const cH = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( s . content ) . digest ( 'hex' ) ;
604- const pH = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( stripCodeBlocks ( s . content ) ) . digest ( 'hex' ) ;
605-
606- // Code blocks are never translated — a mismatch means the English source changed
607- // since the zh translation was produced. Seed as "code stale" so patchCodeBlocks
608- // runs instead of Claude.
609- const enBlocks = extractCodeBlocks ( s . content ) ;
610- const zhBlocks = extractCodeBlocks ( zhContent ) ;
611- const codeUnchanged = enBlocks . length === zhBlocks . length &&
612- enBlocks . every ( ( b , j ) => b === zhBlocks [ j ] ) ;
613-
614- if ( codeUnchanged ) {
615- seeded [ s . id ] = { contentHash : cH , proseHash : pH , translation : zhContent } ;
692+ const isParaChunk = / - p [ 0 - 9 a - f ] { 8 } $ / . test ( s . id ) ;
693+ if ( ! isParaChunk ) {
694+ const zhContent = zhByHeadId . get ( s . id ) ;
695+ if ( zhContent ) seeded [ s . id ] = makeSeedEntry ( s . content , zhContent ) ;
616696 } else {
617- const oldHash = 'sha256:' + crypto . createHash ( 'sha256' ) . update ( zhContent ) . digest ( 'hex' ) ;
618- seeded [ s . id ] = { contentHash : oldHash , proseHash : pH , translation : zhContent } ;
697+ const headId = s . id . replace ( / - p [ 0 - 9 a - f ] { 8 } $ / , '' ) ;
698+ const pairs = paraChunksByHead . get ( headId ) ;
699+ if ( ! pairs ) continue ;
700+ const pair = pairs . find ( p => p . en === s . content ) ;
701+ if ( pair ) seeded [ s . id ] = makeSeedEntry ( pair . en , pair . zh ) ;
619702 }
620703 }
621704
0 commit comments