diff --git a/.changeset/blue-guests-listen.md b/.changeset/blue-guests-listen.md new file mode 100644 index 000000000..3449edcb1 --- /dev/null +++ b/.changeset/blue-guests-listen.md @@ -0,0 +1,5 @@ +--- +"@react-pdf/textkit": minor +--- + +fix(textkit): treat surrogate pairs as single code points in font substitution diff --git a/packages/examples/vite/generate-pdfs.mjs b/packages/examples/vite/generate-pdfs.mjs index 291279fee..cabc92ea7 100644 --- a/packages/examples/vite/generate-pdfs.mjs +++ b/packages/examples/vite/generate-pdfs.mjs @@ -41,6 +41,7 @@ const exampleNames = [ 'resume', 'scripts', 'soft-hyphens', + 'surrogate-pair', 'svg', 'svg-transform', 'transform-origin', diff --git a/packages/examples/vite/src/examples/index.ts b/packages/examples/vite/src/examples/index.ts index 733528bae..ecdc5e9dc 100644 --- a/packages/examples/vite/src/examples/index.ts +++ b/packages/examples/vite/src/examples/index.ts @@ -26,6 +26,7 @@ import math from './math'; import mermaid from './mermaid'; import passwordProtection from './password-protection'; import softHyphens from './soft-hyphens'; +import surrogatePair from './surrogate-pair'; const EXAMPLES = [ scripts, @@ -56,6 +57,7 @@ const EXAMPLES = [ mermaid, passwordProtection, softHyphens, + surrogatePair, ]; export default EXAMPLES; diff --git a/packages/examples/vite/src/examples/surrogate-pair/index.tsx b/packages/examples/vite/src/examples/surrogate-pair/index.tsx new file mode 100644 index 000000000..3a3ffd362 --- /dev/null +++ b/packages/examples/vite/src/examples/surrogate-pair/index.tsx @@ -0,0 +1,158 @@ +import React from 'react'; +import { + Document, + Page, + View, + Text, + StyleSheet, + Font, +} from '@react-pdf/renderer'; + +// Each entry below renders a sample string in a font that supports the +// non-BMP code points it contains. The \u{...} escapes in the samples are +// U+10000+ (SMP or SIP here), i.e. UTF-16 surrogate pairs. 
The purpose of +// this example is to show that surrogate pairs are looked up in the font +// stack as a single code point — without that, every such character would +// fall back to Helvetica and render as a missing glyph (tofu). +// +// URLs were resolved from +// https://fonts.googleapis.com/css2?family=...&text=... +// so each subset includes exactly the characters used below. +const SCRIPTS: ReadonlyArray<{ + family: string; + label: string; + caption: string; + url: string; + sample: string; +}> = [ + { + family: 'Noto Sans JP', + label: 'CJK Ext-B (Japanese name kanji)', + caption: + '\u{20BB7} U+20BB7 / \u{20B9F} U+20B9F / \u{29E3D} U+29E3D — SIP', + url: 'https://fonts.gstatic.com/l/font?kit=-F6jfjtqLzI2JPCgQBnw7HFyzSD-AsregP8VFBEj75jY0rw_mMKldF3n7ULdqbuE7JzuU-TQZ2_b_xbeAw&skey=72472b0eb8793570&v=v56', + // 𠮷田と吉田 𠮟 𩸽 + sample: '\u{20BB7}田と吉田 \u{20B9F} \u{29E3D}', + }, + { + family: 'Noto Sans Egyptian Hieroglyphs', + label: 'Egyptian Hieroglyphs', + caption: + '\u{13080} U+13080 / \u{1313F} U+1313F / \u{13142} U+13142 / \u{13189} U+13189 — SMP', + url: 'https://fonts.gstatic.com/l/font?kit=vEF42-tODB8RrNDvZSUmRhcQHzx1s7y_F9-j3qSzEcbEYindSlK6xRkRmi-6DJyNmGC4VqdZ3IChdUU&skey=fc1730efc5589785&v=v30', + // 𓂀 𓄿 𓅂 𓆉 + sample: '\u{13080} \u{1313F} \u{13142} \u{13189}', + }, + { + family: 'Noto Sans Cuneiform', + label: 'Cuneiform', + caption: + '\u{12000} U+12000 / \u{12041} U+12041 / \u{1208A} U+1208A / \u{120FB} U+120FB — SMP', + url: 'https://fonts.gstatic.com/l/font?kit=bMrrmTWK7YY-MF22aHGGd7H8PhJtvBDWgbxJkxQvU_W-t7uZ9YpFIaCIMrlrtE58&skey=730a3aeff71371d1&v=v18', + // 𒀀 𒁁 𒂊 𒃻 + sample: '\u{12000} \u{12041} \u{1208A} \u{120FB}', + }, + { + family: 'Noto Sans Math', + label: 'Mathematical Alphanumeric Symbols', + caption: + '\u{1D400} U+1D400 / \u{1D49C} U+1D49C / \u{1D538} U+1D538 / \u{1D56E} U+1D56E / \u{1D505} U+1D505 — SMP', + url: 
'https://fonts.gstatic.com/l/font?kit=7Aump_cpkSecTWaHRlH2hyV5UHkF-Vs48d-hNu0TJ8LlkGihpLHoHLXvHsBpBiI&skey=27a26f5b3f2d5ea1&v=v19', + // 𝐀 𝒜 𝔸 𝕮 𝔅 + sample: '\u{1D400}\u{1D49C}\u{1D538} \u{1D56E} \u{1D505}', + }, + { + family: 'Noto Music', + label: 'Musical Symbols', + caption: + '\u{1D11E} U+1D11E / \u{1D122} U+1D122 / \u{1D158}\u{1D165} U+1D158+U+1D165 / \u{1D158}\u{1D165}\u{1D16E} — SMP', + url: 'https://fonts.gstatic.com/l/font?kit=pe0rMIiSN5pO63htf1sxItSQAdZqQUDYyhXhOzpYe6-hOgNGBD6n32j4Lg&skey=a1f7640827c2d625&v=v21', + // 𝄞 𝄢 𝅘𝅥 𝅘𝅥𝅮 + sample: '\u{1D11E} \u{1D122} \u{1D158}\u{1D165} \u{1D158}\u{1D165}\u{1D16E}', + }, + { + family: 'Noto Sans Adlam', + label: 'Adlam (modern West African)', + caption: + '\u{1E900} U+1E900 / \u{1E901} U+1E901 / \u{1E902} U+1E902 / \u{1E903} U+1E903 / \u{1E904} U+1E904 — SMP', + url: 'https://fonts.gstatic.com/l/font?kit=neIczCCpqp0s5pPusPamd81eMfjPonvqdbYxxpgufnv0TGrBZLwggvomO9nbazENrX7-_ZwTdOY6tB2ltaK3&skey=1fb7f26201009a1b&v=v27', + // 𞤀 𞤁 𞤂 𞤃 𞤄 + sample: '\u{1E900}\u{1E901}\u{1E902}\u{1E903}\u{1E904}', + }, +]; + +SCRIPTS.forEach(({ family, url }) => { + Font.register({ family, src: url }); +}); + +const styles = StyleSheet.create({ + body: { + padding: 40, + backgroundColor: '#fafafa', + }, + title: { + fontSize: 18, + fontWeight: 'bold', + color: '#1a1a1a', + }, + subtitle: { + fontSize: 9, + color: '#888', + marginBottom: 20, + }, + card: { + backgroundColor: '#ffffff', + borderRadius: 5, + padding: 16, + borderWidth: 1, + borderColor: '#e8e8e8', + marginBottom: 8, + }, + cardLabel: { + fontSize: 8, + color: '#999', + textTransform: 'uppercase', + letterSpacing: 0.5, + marginBottom: 6, + }, + cardCaption: { + fontSize: 8, + color: '#bbb', + marginTop: 8, + }, + sample: { + fontSize: 24, + }, +}); + +const MyDoc = () => ( + + Surrogate pair rendering (SMP / SIP / TIP) + + Every character below sits above U+FFFF and is encoded as a UTF-16 + surrogate pair. 
Each should resolve to its supporting font (Noto) + and render as a real glyph, not as tofu. + + + {SCRIPTS.map(({ family, label, caption, sample }) => ( + + {label} + {sample} + {caption} + + ))} + +); + +const SurrogatePair = () => ( + + + +); + +export default { + id: 'surrogate-pair', + name: 'Surrogate pair', + description: '', + Document: SurrogatePair, +}; diff --git a/packages/textkit/src/engines/fontSubstitution/index.ts b/packages/textkit/src/engines/fontSubstitution/index.ts index 8e7408c8a..bba17cf91 100644 --- a/packages/textkit/src/engines/fontSubstitution/index.ts +++ b/packages/textkit/src/engines/fontSubstitution/index.ts @@ -52,9 +52,13 @@ const fontSubstitution = const chars = string.slice(run.start, run.end); - for (let j = 0; j < chars.length; j += 1) { - const char = chars[j]; - const codePoint = char.codePointAt(0); + // Iterate by code point so that surrogate pairs (supplementary-plane + // characters, U+10000 and above) are looked up as a single code point in + // the font stack, not as separate high/low surrogates. + let j = 0; + while (j < chars.length) { + const codePoint = chars.codePointAt(j)!; + const charLength = codePoint > 0xffff ? 
2 : 1; // If the default font does not have a glyph and the fallback font does, we use it const font = pickFontFromFontStack( @@ -87,7 +91,8 @@ const fontSubstitution = lastIndex = index; } - index += char.length; + j += charLength; + index += charLength; } } diff --git a/packages/textkit/tests/engines/fontSubstitution.test.ts b/packages/textkit/tests/engines/fontSubstitution.test.ts index 0a76efc8f..a0bbb32bb 100644 --- a/packages/textkit/tests/engines/fontSubstitution.test.ts +++ b/packages/textkit/tests/engines/fontSubstitution.test.ts @@ -116,4 +116,57 @@ describe('FontSubstitution', () => { expect(string.runs[1].attributes.font).toEqual([SimplifiedChineseFont]); }); }); + + describe('Surrogate pairs', () => { + // 𠮷 is U+20BB7 (SIP, beyond U+FFFF), encoded as the UTF-16 surrogate + // pair \uD842\uDFB7 — JS string length 2. + const sipFont = { + name: 'SipFont', + unitsPerEm: 1000, + hasGlyphForCodePoint: (codePoint: number) => codePoint === 0x20bb7, + }; + const noGlyphFont = { + name: 'NoGlyphFont', + unitsPerEm: 1000, + hasGlyphForCodePoint: () => false, + }; + + test('should treat a surrogate pair as a single code point when picking a font', () => { + const run = { + start: 0, + end: 2, + attributes: { font: [sipFont, noGlyphFont] }, + } as any; + + const string = instance({ string: '𠮷', runs: [run] }); + + expect(string).toHaveProperty('string', '𠮷'); + expect(string.runs).toHaveLength(1); + expect(string.runs[0]).toHaveProperty('start', 0); + expect(string.runs[0]).toHaveProperty('end', 2); + expect(string.runs[0].attributes.font).toEqual([sipFont]); + }); + + test('should track UTF-16 indices correctly when mixing BMP and SIP code points', () => { + const run = { + start: 0, + end: 4, + attributes: { font: [sipFont, noGlyphFont] }, + } as any; + + const string = instance({ string: 'A𠮷B', runs: [run] }); + + expect(string).toHaveProperty('string', 'A𠮷B'); + expect(string.runs).toHaveLength(3); + expect(string.runs[0]).toHaveProperty('start', 0); + 
expect(string.runs[0]).toHaveProperty('end', 1); + expect(string.runs[0].attributes.font).toEqual([noGlyphFont]); + expect(string.runs[1]).toHaveProperty('start', 1); + expect(string.runs[1]).toHaveProperty('end', 3); + expect(string.runs[1].attributes.font).toEqual([sipFont]); + expect(string.runs[2]).toHaveProperty('start', 3); + expect(string.runs[2]).toHaveProperty('end', 4); + expect(string.runs[2].attributes.font).toEqual([noGlyphFont]); + }); + }); });