Skip to content

Commit d7f4e3c

Browse files
committed
Merge branch 'localization' into develop
2 parents 6773cb1 + 3a0aeb9 commit d7f4e3c

2 files changed

Lines changed: 156 additions & 71 deletions

File tree

scripts/translate.mjs

Lines changed: 154 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -466,14 +466,104 @@ async function translateSync(client, files, systemPrompt, localesDir, hashesDir,
466466
}
467467

468468
// ---------------------------------------------------------------------------
469-
// Section cache seeding (no Claude — heading sections only)
469+
// Section cache seeding (no Claude — heading + para chunk sections)
470470
// ---------------------------------------------------------------------------
471471

472-
// Seeds the section-level cache for a file that already has a zh translation but no section
473-
// cache. Only heading-level sections are seeded (para chunks are intentionally skipped — they
474-
// will be translated by Claude on the first incremental run that touches the file, and cached
475-
// correctly from that point onward). Running this eagerly means the NEXT snippet-only change
476-
// triggers patchCodeBlocks for heading sections rather than a full Claude retranslation.
472+
/**
 * Pairs up the paragraph chunks of an English heading section with the matching
 * slices of its existing zh translation.
 *
 * Both texts are cut into "raw blocks" at blank lines (fenced code is never cut),
 * then consecutive raw blocks are greedily merged while the merged English text
 * stays within PARAGRAPH_FALLBACK_CHARS. The zh blocks are merged at exactly the
 * same indices, so every English chunk gets the zh text built from the same
 * block positions.
 *
 * NOTE(review): the split/merge rules are meant to mirror splitByParagraphBlocks
 * (defined elsewhere in this file) so that `pair.en` equals the chunk content it
 * produces — keep the two in sync.
 *
 * @param {string} enContent - English heading-section content.
 * @param {string} zhContent - Existing zh translation of that same section.
 * @returns {Array<{en: string, zh: string}>|null} One pair per paragraph chunk,
 *   or null when the two texts yield a different number of raw blocks, or when
 *   grouping produces fewer than two chunks.
 */
function mapParaChunksToZh(enContent, zhContent) {
  const FENCE_OPENER = /^(`{3,}|~{3,})/;

  // Cuts `text` into blank-line-terminated blocks; blank lines inside a fenced
  // code block never end a block.
  const toRawBlocks = (text) => {
    const rows = text.split('\n');
    const collected = [];
    let blockStart = 0;
    let openFenceChar = null; // '`' or '~' while inside a fence, otherwise null

    rows.forEach((row, idx) => {
      const fence = FENCE_OPENER.exec(row);
      if (fence !== null) {
        if (openFenceChar === null) {
          openFenceChar = fence[1][0];
        } else if (row[0] === openFenceChar) {
          // Only a fence line using the same character closes the open fence.
          openFenceChar = null;
        }
      }

      const endsBlock = openFenceChar === null && idx > blockStart && row.trim() === '';
      if (!endsBlock) return;

      // The terminating blank line stays attached to the block it closes.
      const candidate = rows.slice(blockStart, idx + 1).join('\n');
      if (candidate.trim()) collected.push(candidate);
      blockStart = idx + 1;
    });

    const remainder = rows.slice(blockStart).join('\n');
    if (remainder.trim()) collected.push(remainder);
    return collected;
  };

  const enBlocks = toRawBlocks(enContent);
  const zhBlocks = toRawBlocks(zhContent);

  // Index-based pairing is only meaningful when both sides split identically.
  if (enBlocks.length !== zhBlocks.length) return null;

  const chunkPairs = [];
  let enAcc = '';
  let zhAcc = '';

  for (const [k, enBlock] of enBlocks.entries()) {
    const zhBlock = zhBlocks[k];
    const enJoined = enAcc ? `${enAcc}\n${enBlock}` : enBlock;

    // Chunk boundaries are decided by the English length alone; zh just follows.
    if (enAcc && enJoined.length > PARAGRAPH_FALLBACK_CHARS) {
      chunkPairs.push({ en: enAcc, zh: zhAcc });
      enAcc = enBlock;
      zhAcc = zhBlock;
      continue;
    }

    enAcc = enJoined;
    zhAcc = zhAcc ? `${zhAcc}\n${zhBlock}` : zhBlock;
  }

  if (enAcc) chunkPairs.push({ en: enAcc, zh: zhAcc });

  // A single chunk means the section was never paragraph-split.
  return chunkPairs.length > 1 ? chunkPairs : null;
}
534+
535+
/**
 * Creates one section-cache entry from English content and its existing zh
 * translation.
 *
 * The fenced code blocks of both texts are compared position by position:
 *   - identical  → contentHash is the hash of the English content;
 *   - different  → contentHash is the hash of the zh content instead, used as a
 *     sentinel that will not match the English hash. (Per the surrounding
 *     design notes this is meant to route the next run to patchCodeBlocks
 *     rather than Claude — that handling lives outside this function.)
 *
 * proseHash is always the hash of the English content with code blocks stripped.
 *
 * @param {string} enContent - English section (or para-chunk) content.
 * @param {string} zhContent - Existing zh translation for that content.
 * @returns {{contentHash: string, proseHash: string, translation: string}}
 */
function makeSeedEntry(enContent, zhContent) {
  // Hex SHA-256 digest with the "sha256:" prefix used by the cache.
  const taggedSha256 = (text) =>
    `sha256:${crypto.createHash('sha256').update(text).digest('hex')}`;

  const enContentHash = taggedSha256(enContent);
  const enProseHash = taggedSha256(stripCodeBlocks(enContent));

  const enCodeBlocks = extractCodeBlocks(enContent);
  const zhCodeBlocks = extractCodeBlocks(zhContent);

  const sameCode =
    enCodeBlocks.length === zhCodeBlocks.length &&
    enCodeBlocks.every((block, k) => block === zhCodeBlocks[k]);

  let contentHash;
  if (sameCode) {
    contentHash = enContentHash;
  } else {
    contentHash = taggedSha256(zhContent);
  }

  return {
    contentHash,
    proseHash: enProseHash,
    translation: zhContent,
  };
}
553+
554+
/**
555+
* Builds a section-level hash cache for a file that already has a zh translation
556+
* but no section cache, without calling Claude.
557+
*
558+
* Heading sections: matched by position between en and zh heading splits.
559+
* Para chunks: matched by applying the same raw-block index grouping to the zh
560+
* heading section content. If en and zh have different block counts (rare —
561+
* translator restructured paragraphs), that heading section's para chunks are
562+
* skipped safely (they'll be translated by Claude on first change).
563+
*
564+
* After this runs, both heading sections AND para chunks are cached. Subsequent
565+
* snippet-only changes trigger patchCodeBlocks with zero Claude calls.
566+
*/
477567
async function seedSectionCache(file, localesDir, hashesDir, lang) {
478568
const basename = path.basename(file, '.mdx');
479569
const content = await fs.readFile(file, 'utf-8');
@@ -486,44 +576,57 @@ async function seedSectionCache(file, localesDir, hashesDir, lang) {
486576
return; // no zh file to seed from
487577
}
488578

489-
const rawSections = splitIntoSections(content);
490-
const sections = deduplicateSectionIds(rawSections);
579+
const sections = deduplicateSectionIds(splitIntoSections(content));
491580

492581
const engHeadSecs = splitIntoSections(content, { paragraphFallback: false });
493582
const zhHeadSecs = splitIntoSections(existingTranslation, { paragraphFallback: false });
494-
if (engHeadSecs.length !== zhHeadSecs.length) return; // mismatch — can't seed safely
495-
496-
const zhByHeadId = new Map(engHeadSecs.map((s, i) => [s.id, zhHeadSecs[i].content]));
583+
if (engHeadSecs.length !== zhHeadSecs.length) return; // heading count mismatch
584+
585+
// Map: (deduplicated) heading section ID → zh heading section content.
586+
// deduplicateSectionIds is applied to engHeadSecs so the IDs match what
587+
// translateFileWithSections stores in the cache.
588+
const dedupEngHead = deduplicateSectionIds(engHeadSecs);
589+
const dedupZhHead = deduplicateSectionIds(zhHeadSecs);
590+
const zhByHeadId = new Map(dedupEngHead.map((s, i) => [s.id, dedupZhHead[i].content]));
591+
592+
// Build: (deduplicated) heading ID → zh para-chunk array (or null if no split needed)
593+
const paraChunksByHeadId = new Map();
594+
for (const s of dedupEngHead) {
595+
const zhHead = zhByHeadId.get(s.id);
596+
if (!zhHead) continue;
597+
const pairs = mapParaChunksToZh(s.content, zhHead);
598+
if (pairs) paraChunksByHeadId.set(s.id, pairs);
599+
}
497600

498601
const seeded = {};
499-
for (const s of sections) {
500-
if (/-p[0-9a-f]{8}$/.test(s.id)) continue; // skip para chunks
501602

502-
const zhContent = zhByHeadId.get(s.id);
503-
if (!zhContent) continue;
603+
for (const s of sections) {
604+
const isParaChunk = /-p[0-9a-f]{8}$/.test(s.id);
504605

505-
const cH = 'sha256:' + crypto.createHash('sha256').update(s.content).digest('hex');
506-
const pH = 'sha256:' + crypto.createHash('sha256').update(stripCodeBlocks(s.content)).digest('hex');
606+
if (!isParaChunk) {
607+
const zhContent = zhByHeadId.get(s.id);
608+
if (zhContent) seeded[s.id] = makeSeedEntry(s.content, zhContent);
609+
continue;
610+
}
507611

508-
const enBlocks = extractCodeBlocks(s.content);
509-
const zhBlocks = extractCodeBlocks(zhContent);
510-
const codeUnchanged = enBlocks.length === zhBlocks.length && enBlocks.every((b, j) => b === zhBlocks[j]);
612+
const headId = s.id.replace(/-p[0-9a-f]{8}$/, '');
613+
const pairs = paraChunksByHeadId.get(headId);
614+
if (!pairs) continue;
511615

512-
if (codeUnchanged) {
513-
seeded[s.id] = { contentHash: cH, proseHash: pH, translation: zhContent };
514-
} else {
515-
const oldHash = 'sha256:' + crypto.createHash('sha256').update(zhContent).digest('hex');
516-
seeded[s.id] = { contentHash: oldHash, proseHash: pH, translation: zhContent };
517-
}
616+
const pair = pairs.find(p => p.en === s.content);
617+
if (pair) seeded[s.id] = makeSeedEntry(pair.en, pair.zh);
518618
}
519619

520620
if (Object.keys(seeded).length === 0) return;
521621

622+
const headingCount = Object.keys(seeded).filter(id => !/-p[0-9a-f]{8}$/.test(id)).length;
623+
const paraCount = Object.keys(seeded).length - headingCount;
624+
522625
const currentHash = await fileHash(file);
523626
const hashFile = path.join(hashesDir, `${basename}.json`);
524627
await fs.mkdir(hashesDir, { recursive: true });
525628
await fs.writeFile(hashFile, JSON.stringify({ fileHash: currentHash, sections: seeded }), 'utf-8');
526-
console.log(` ⟳ ${basename}: seeded section cache (${Object.keys(seeded).length} heading sections)`);
629+
console.log(` ⟳ ${basename}: seeded ${headingCount} heading + ${paraCount} para-chunk sections`);
527630
}
528631

529632
// ---------------------------------------------------------------------------
@@ -561,61 +664,41 @@ async function translateFileWithSections(client, file, systemPrompt, localesDir,
561664
// Covers files translated via batch (which stores only fileHash, no sections),
562665
// and the post-migration run where the cache was discarded due to stale IDs.
563666
//
564-
// Strategy: match at heading-section level (immune to paragraph-chunk count
565-
// differences caused by Chinese text being shorter than English). Then for each
566-
// English paragraph chunk find the corresponding zh chunk by position within
567-
// its parent heading section — skip it (will retranslate once) if zh split
568-
// differently.
667+
// Uses the same raw-block matching as seedSectionCache: heading sections are matched
668+
// by position; para chunks are seeded by matching raw blocks between en and zh heading
669+
// sections (same structure → same block count in 94%+ of articles). This means the
670+
// FIRST incremental run on a changed file also seeds the section cache fully, so
671+
// unchanged para chunks get cache hits and changed ones get patchCodeBlocks.
569672
if (!storedData?.sections) {
570673
const translatedPath = path.join(localesDir, `${basename}.mdx`);
571674
try {
572675
const existingTranslation = await fs.readFile(translatedPath, 'utf-8');
573676

574-
// Heading-only splits for reliable count matching (no paragraph fallback)
575-
const engHeadSecs = splitIntoSections(content, { paragraphFallback: false });
576-
const zhHeadSecs = splitIntoSections(existingTranslation, { paragraphFallback: false });
677+
const engHeadSecs = deduplicateSectionIds(splitIntoSections(content, { paragraphFallback: false }));
678+
const zhHeadSecs = deduplicateSectionIds(splitIntoSections(existingTranslation, { paragraphFallback: false }));
577679

578680
if (engHeadSecs.length === zhHeadSecs.length) {
579-
// Map: English heading section ID → zh heading section content (matched by position)
580-
const zhByHeadId = new Map(engHeadSecs.map((s, i) => [s.id, zhHeadSecs[i].content]));
681+
const zhByHeadId = new Map(engHeadSecs.map((s, i) => [s.id, zhHeadSecs[i].content]));
682+
const paraChunksByHead = new Map();
683+
for (const s of engHeadSecs) {
684+
const zhHead = zhByHeadId.get(s.id);
685+
if (!zhHead) continue;
686+
const pairs = mapParaChunksToZh(s.content, zhHead);
687+
if (pairs) paraChunksByHead.set(s.id, pairs);
688+
}
581689

582690
const seeded = {};
583691
for (const s of sections) {
584-
// Para chunks (IDs ending in -p{8hexchars}) are skipped during seeding.
585-
//
586-
// Why: Chinese text is ~30-50% shorter than English. When zh heading sections are
587-
// split into paragraph chunks using the same 600-char threshold, they produce
588-
// FEWER chunks than the English. Position-based matching then assigns wrong zh
589-
// content to English chunks, and unmatched English chunks get retranslated by Claude
590-
// — producing content that already exists in an earlier chunk. This creates
591-
// duplicate paragraphs in the output (root cause of the 168-line diff for a
592-
// 1-line change).
593-
//
594-
// Para chunks are translated once by Claude on the first incremental run, then
595-
// correctly cached with prose-hash IDs. Subsequent code-only changes trigger
596-
// patchCodeBlocks; prose changes trigger targeted retranslation.
597-
if (/-p[0-9a-f]{8}$/.test(s.id)) continue;
598-
599-
// Heading-level section (not split): seed directly from the zh heading section.
600-
const zhContent = zhByHeadId.get(s.id);
601-
if (!zhContent) continue;
602-
603-
const cH = 'sha256:' + crypto.createHash('sha256').update(s.content).digest('hex');
604-
const pH = 'sha256:' + crypto.createHash('sha256').update(stripCodeBlocks(s.content)).digest('hex');
605-
606-
// Code blocks are never translated — a mismatch means the English source changed
607-
// since the zh translation was produced. Seed as "code stale" so patchCodeBlocks
608-
// runs instead of Claude.
609-
const enBlocks = extractCodeBlocks(s.content);
610-
const zhBlocks = extractCodeBlocks(zhContent);
611-
const codeUnchanged = enBlocks.length === zhBlocks.length &&
612-
enBlocks.every((b, j) => b === zhBlocks[j]);
613-
614-
if (codeUnchanged) {
615-
seeded[s.id] = { contentHash: cH, proseHash: pH, translation: zhContent };
692+
const isParaChunk = /-p[0-9a-f]{8}$/.test(s.id);
693+
if (!isParaChunk) {
694+
const zhContent = zhByHeadId.get(s.id);
695+
if (zhContent) seeded[s.id] = makeSeedEntry(s.content, zhContent);
616696
} else {
617-
const oldHash = 'sha256:' + crypto.createHash('sha256').update(zhContent).digest('hex');
618-
seeded[s.id] = { contentHash: oldHash, proseHash: pH, translation: zhContent };
697+
const headId = s.id.replace(/-p[0-9a-f]{8}$/, '');
698+
const pairs = paraChunksByHead.get(headId);
699+
if (!pairs) continue;
700+
const pair = pairs.find(p => p.en === s.content);
701+
if (pair) seeded[s.id] = makeSeedEntry(pair.en, pair.zh);
619702
}
620703
}
621704

src/locales/zh/adapty-cursor-android.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@ displayed_sidebar: sdkandroid
66
---
77

88
import Zoom from 'react-medium-image-zoom';
9+
910
import 'react-medium-image-zoom/dist/styles.css';
1011
import Tabs from '@theme/Tabs';
12+
1113
import TabItem from '@theme/TabItem';
1214

1315
本指南帮助您借助 LLM 将 Adapty 集成到 Android 应用。您将首先准备好 Adapty 看板,然后通过向 LLM 发送相关文档链接来逐步完成每个实施阶段。最后,您还会找到使用 AI 工具配合 Adapty 文档的最佳实践。

0 commit comments

Comments
 (0)