diff --git a/.changeset/blue-guests-listen.md b/.changeset/blue-guests-listen.md
new file mode 100644
index 000000000..3449edcb1
--- /dev/null
+++ b/.changeset/blue-guests-listen.md
@@ -0,0 +1,5 @@
+---
+"@react-pdf/textkit": minor
+---
+
+fix(textkit): treat surrogate pairs as single code points in font substitution
diff --git a/packages/examples/vite/generate-pdfs.mjs b/packages/examples/vite/generate-pdfs.mjs
index 291279fee..cabc92ea7 100644
--- a/packages/examples/vite/generate-pdfs.mjs
+++ b/packages/examples/vite/generate-pdfs.mjs
@@ -41,6 +41,7 @@ const exampleNames = [
'resume',
'scripts',
'soft-hyphens',
+ 'surrogate-pair',
'svg',
'svg-transform',
'transform-origin',
diff --git a/packages/examples/vite/src/examples/index.ts b/packages/examples/vite/src/examples/index.ts
index 733528bae..ecdc5e9dc 100644
--- a/packages/examples/vite/src/examples/index.ts
+++ b/packages/examples/vite/src/examples/index.ts
@@ -26,6 +26,7 @@ import math from './math';
import mermaid from './mermaid';
import passwordProtection from './password-protection';
import softHyphens from './soft-hyphens';
+import surrogatePair from './surrogate-pair';
const EXAMPLES = [
scripts,
@@ -56,6 +57,7 @@ const EXAMPLES = [
mermaid,
passwordProtection,
softHyphens,
+ surrogatePair,
];
export default EXAMPLES;
diff --git a/packages/examples/vite/src/examples/surrogate-pair/index.tsx b/packages/examples/vite/src/examples/surrogate-pair/index.tsx
new file mode 100644
index 000000000..3a3ffd362
--- /dev/null
+++ b/packages/examples/vite/src/examples/surrogate-pair/index.tsx
@@ -0,0 +1,158 @@
+import React from 'react';
+import {
+ Document,
+ Page,
+ View,
+ Text,
+ StyleSheet,
+ Font,
+} from '@react-pdf/renderer';
+
+// Each entry below renders a sample string in a font that supports the
+// non-BMP code points it contains. Apart from spaces and the BMP kanji/kana
+// in the first sample, every sample character is at U+10000 or above
+// (SMP / SIP), i.e. a UTF-16 surrogate pair. This example shows that such
+// pairs are looked up in the font stack as a single code point — without
+// that, they would fall back to Helvetica and render as missing glyphs (tofu).
+//
+// URLs were resolved from
+// https://fonts.googleapis.com/css2?family=...&text=...
+// so each subset includes exactly the characters used below.
+const SCRIPTS: ReadonlyArray<{
+ family: string;
+ label: string;
+ caption: string;
+ url: string;
+ sample: string;
+}> = [
+ {
+ family: 'Noto Sans JP',
+ label: 'CJK Ext-B (Japanese name kanji)',
+ caption:
+ '\u{20BB7} U+20BB7 / \u{20B9F} U+20B9F / \u{29E3D} U+29E3D — SIP',
+ url: 'https://fonts.gstatic.com/l/font?kit=-F6jfjtqLzI2JPCgQBnw7HFyzSD-AsregP8VFBEj75jY0rw_mMKldF3n7ULdqbuE7JzuU-TQZ2_b_xbeAw&skey=72472b0eb8793570&v=v56',
+ // 𠮷田と吉田 𠮟 𩸽
+ sample: '\u{20BB7}田と吉田 \u{20B9F} \u{29E3D}',
+ },
+ {
+ family: 'Noto Sans Egyptian Hieroglyphs',
+ label: 'Egyptian Hieroglyphs',
+ caption:
+ '\u{13080} U+13080 / \u{1313F} U+1313F / \u{13142} U+13142 / \u{13189} U+13189 — SMP',
+ url: 'https://fonts.gstatic.com/l/font?kit=vEF42-tODB8RrNDvZSUmRhcQHzx1s7y_F9-j3qSzEcbEYindSlK6xRkRmi-6DJyNmGC4VqdZ3IChdUU&skey=fc1730efc5589785&v=v30',
+ // 𓂀 𓄿 𓅂 𓆉
+ sample: '\u{13080} \u{1313F} \u{13142} \u{13189}',
+ },
+ {
+ family: 'Noto Sans Cuneiform',
+ label: 'Cuneiform',
+ caption:
+ '\u{12000} U+12000 / \u{12041} U+12041 / \u{1208A} U+1208A / \u{120FB} U+120FB — SMP',
+ url: 'https://fonts.gstatic.com/l/font?kit=bMrrmTWK7YY-MF22aHGGd7H8PhJtvBDWgbxJkxQvU_W-t7uZ9YpFIaCIMrlrtE58&skey=730a3aeff71371d1&v=v18',
+ // 𒀀 𒁁 𒂊 𒃻
+ sample: '\u{12000} \u{12041} \u{1208A} \u{120FB}',
+ },
+ {
+ family: 'Noto Sans Math',
+ label: 'Mathematical Alphanumeric Symbols',
+ caption:
+ '\u{1D400} U+1D400 / \u{1D49C} U+1D49C / \u{1D538} U+1D538 / \u{1D56E} U+1D56E / \u{1D505} U+1D505 — SMP',
+ url: 'https://fonts.gstatic.com/l/font?kit=7Aump_cpkSecTWaHRlH2hyV5UHkF-Vs48d-hNu0TJ8LlkGihpLHoHLXvHsBpBiI&skey=27a26f5b3f2d5ea1&v=v19',
+ // 𝐀 𝒜 𝔸 𝕮 𝔅
+ sample: '\u{1D400}\u{1D49C}\u{1D538} \u{1D56E} \u{1D505}',
+ },
+ {
+ family: 'Noto Music',
+ label: 'Musical Symbols',
+ caption:
+ '\u{1D11E} U+1D11E / \u{1D122} U+1D122 / \u{1D158}\u{1D165} U+1D158+U+1D165 / \u{1D158}\u{1D165}\u{1D16E} — SMP',
+ url: 'https://fonts.gstatic.com/l/font?kit=pe0rMIiSN5pO63htf1sxItSQAdZqQUDYyhXhOzpYe6-hOgNGBD6n32j4Lg&skey=a1f7640827c2d625&v=v21',
+ // 𝄞 𝄢 𝅘𝅥 𝅘𝅥𝅮
+ sample: '\u{1D11E} \u{1D122} \u{1D158}\u{1D165} \u{1D158}\u{1D165}\u{1D16E}',
+ },
+ {
+ family: 'Noto Sans Adlam',
+ label: 'Adlam (modern West African)',
+ caption:
+ '\u{1E900} U+1E900 / \u{1E901} U+1E901 / \u{1E902} U+1E902 / \u{1E903} U+1E903 / \u{1E904} U+1E904 — SMP',
+ url: 'https://fonts.gstatic.com/l/font?kit=neIczCCpqp0s5pPusPamd81eMfjPonvqdbYxxpgufnv0TGrBZLwggvomO9nbazENrX7-_ZwTdOY6tB2ltaK3&skey=1fb7f26201009a1b&v=v27',
+ // 𞤀 𞤁 𞤂 𞤃 𞤄
+ sample: '\u{1E900}\u{1E901}\u{1E902}\u{1E903}\u{1E904}',
+ },
+];
+
+for (const { family, url } of SCRIPTS) {
+  Font.register({ family, src: url }); // register each entry's URL under its family
+}
+
+const styles = StyleSheet.create({
+ body: {
+ padding: 40,
+ backgroundColor: '#fafafa',
+ },
+ title: {
+ fontSize: 18,
+ fontWeight: 'bold',
+ color: '#1a1a1a',
+ },
+ subtitle: {
+ fontSize: 9,
+ color: '#888',
+ marginBottom: 20,
+ },
+ card: {
+ backgroundColor: '#ffffff',
+ borderRadius: 5,
+ padding: 16,
+ borderWidth: 1,
+ borderColor: '#e8e8e8',
+ marginBottom: 8,
+ },
+ cardLabel: {
+ fontSize: 8,
+ color: '#999',
+ textTransform: 'uppercase',
+ letterSpacing: 0.5,
+ marginBottom: 6,
+ },
+ cardCaption: {
+ fontSize: 8,
+ color: '#bbb',
+ marginTop: 8,
+ },
+ sample: {
+ fontSize: 24,
+ },
+});
+
+const MyDoc = () => (
+  <Page style={styles.body}>
+    <Text style={styles.title}>Surrogate pair rendering (SMP / SIP / TIP)</Text>
+    <Text style={styles.subtitle}>
+      Every character below sits above U+FFFF and is encoded as a UTF-16
+      surrogate pair. Each should resolve to its supporting font (Noto)
+      and render as a real glyph, not as tofu.
+    </Text>
+    {/* One card per SCRIPTS entry: label, sample set in that entry's font, caption */}
+    {SCRIPTS.map(({ family, label, caption, sample }) => (
+      <View key={family} style={styles.card}>
+        <Text style={styles.cardLabel}>{label}</Text>
+        <Text style={[styles.sample, { fontFamily: family }]}>{sample}</Text>
+        <Text style={styles.cardCaption}>{caption}</Text>
+      </View>
+    ))}
+  </Page>
+);
+
+const SurrogatePair = () => (
+  <Document>
+    <MyDoc />
+  </Document>
+); // exported below as this example's `Document` component
+
+export default {
+ id: 'surrogate-pair',
+ name: 'Surrogate pair',
+ description: '',
+ Document: SurrogatePair,
+};
diff --git a/packages/textkit/src/engines/fontSubstitution/index.ts b/packages/textkit/src/engines/fontSubstitution/index.ts
index 8e7408c8a..bba17cf91 100644
--- a/packages/textkit/src/engines/fontSubstitution/index.ts
+++ b/packages/textkit/src/engines/fontSubstitution/index.ts
@@ -52,9 +52,13 @@ const fontSubstitution =
const chars = string.slice(run.start, run.end);
- for (let j = 0; j < chars.length; j += 1) {
- const char = chars[j];
- const codePoint = char.codePointAt(0);
+ // Iterate by code point so that surrogate pairs (supplementary-plane
+ // characters, U+10000 and above) are looked up as a single code point in
+ // the font stack, not as separate high/low surrogates.
+ let j = 0;
+ while (j < chars.length) {
+ const codePoint = chars.codePointAt(j)!;
+ const charLength = codePoint > 0xffff ? 2 : 1;
// If the default font does not have a glyph and the fallback font does, we use it
const font = pickFontFromFontStack(
@@ -87,7 +91,8 @@ const fontSubstitution =
lastIndex = index;
}
- index += char.length;
+ j += charLength;
+ index += charLength;
}
}
diff --git a/packages/textkit/tests/engines/fontSubstitution.test.ts b/packages/textkit/tests/engines/fontSubstitution.test.ts
index 0a76efc8f..a0bbb32bb 100644
--- a/packages/textkit/tests/engines/fontSubstitution.test.ts
+++ b/packages/textkit/tests/engines/fontSubstitution.test.ts
@@ -116,4 +116,57 @@ describe('FontSubstitution', () => {
expect(string.runs[1].attributes.font).toEqual([SimplifiedChineseFont]);
});
});
+
+ describe('Surrogate pairs', () => {
+ // 𠮷 is U+20BB7 (SIP, beyond U+FFFF), encoded as the UTF-16 surrogate
+ // pair \uD842\uDFB7 — JS string length 2.
+ const sipFont = {
+ name: 'SipFont',
+ unitsPerEm: 1000,
+ hasGlyphForCodePoint: (codePoint: number) => codePoint === 0x20bb7,
+ };
+ const noGlyphFont = {
+ name: 'NoGlyphFont',
+ unitsPerEm: 1000,
+ hasGlyphForCodePoint: () => false,
+ };
+
+ test('should treat a surrogate pair as a single code point when picking a font', () => {
+ const run = {
+ start: 0,
+ end: 2,
+ attributes: { font: [sipFont, noGlyphFont] },
+ } as any;
+
+ const string = instance({ string: '𠮷', runs: [run] });
+
+ expect(string).toHaveProperty('string', '𠮷');
+ expect(string.runs).toHaveLength(1);
+ expect(string.runs[0]).toHaveProperty('start', 0);
+ expect(string.runs[0]).toHaveProperty('end', 2);
+ expect(string.runs[0].attributes.font).toEqual([sipFont]);
+ });
+
+ test('should track UTF-16 indices correctly when mixing BMP and SIP code points', () => {
+ const run = {
+ start: 0,
+ end: 4,
+ attributes: { font: [sipFont, noGlyphFont] },
+ } as any;
+
+ const string = instance({ string: 'A𠮷B', runs: [run] });
+
+ expect(string).toHaveProperty('string', 'A𠮷B');
+ expect(string.runs).toHaveLength(3);
+ expect(string.runs[0]).toHaveProperty('start', 0);
+ expect(string.runs[0]).toHaveProperty('end', 1);
+ expect(string.runs[0].attributes.font).toEqual([noGlyphFont]);
+ expect(string.runs[1]).toHaveProperty('start', 1);
+ expect(string.runs[1]).toHaveProperty('end', 3);
+ expect(string.runs[1].attributes.font).toEqual([sipFont]);
+ expect(string.runs[2]).toHaveProperty('start', 3);
+ expect(string.runs[2]).toHaveProperty('end', 4);
+ expect(string.runs[2].attributes.font).toEqual([noGlyphFont]);
+ });
+ });
});