Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/blue-guests-listen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@react-pdf/textkit": minor
---

fix(textkit): treat surrogate pairs as single code points in font substitution
1 change: 1 addition & 0 deletions packages/examples/vite/generate-pdfs.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const exampleNames = [
'resume',
'scripts',
'soft-hyphens',
'surrogate-pair',
'svg',
'svg-transform',
'transform-origin',
Expand Down
2 changes: 2 additions & 0 deletions packages/examples/vite/src/examples/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import math from './math';
import mermaid from './mermaid';
import passwordProtection from './password-protection';
import softHyphens from './soft-hyphens';
import surrogatePair from './surrogate-pair';

const EXAMPLES = [
scripts,
Expand Down Expand Up @@ -56,6 +57,7 @@ const EXAMPLES = [
mermaid,
passwordProtection,
softHyphens,
surrogatePair,
];

export default EXAMPLES;
158 changes: 158 additions & 0 deletions packages/examples/vite/src/examples/surrogate-pair/index.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import React from 'react';
import {
Document,
Page,
View,
Text,
StyleSheet,
Font,
} from '@react-pdf/renderer';

// Showcase data for the surrogate-pair example.
//
// Every sample below contains code points at U+10000 or above (the captions
// note whether they come from the SMP or the SIP). JavaScript stores such code
// points as UTF-16 surrogate pairs, so this page demonstrates that a pair is
// looked up in the font stack as ONE code point. If the two halves were looked
// up separately, these characters would fall back to Helvetica and show up as
// missing glyphs (tofu).
//
// The font URLs were resolved from
//   https://fonts.googleapis.com/css2?family=...&text=...
// so that each subset covers the characters used by its sample.

/** One showcase row: a font to register plus the text rendered with it. */
interface ScriptSample {
  /** Font family name; used both for registration and as the sample's fontFamily. */
  family: string;
  /** Short heading displayed with the sample. */
  label: string;
  /** Footnote listing the code points that appear in the sample. */
  caption: string;
  /** Address of the font file handed to Font.register. */
  url: string;
  /** Text rendered in `family`. */
  sample: string;
}

const SCRIPTS: readonly ScriptSample[] = [
  // Renders as: 𠮷田と吉田 𠮟 𩸽
  {
    family: 'Noto Sans JP',
    label: 'CJK Ext-B (Japanese name kanji)',
    caption: '\u{20BB7} U+20BB7 / \u{20B9F} U+20B9F / \u{29E3D} U+29E3D — SIP',
    url: 'https://fonts.gstatic.com/l/font?kit=-F6jfjtqLzI2JPCgQBnw7HFyzSD-AsregP8VFBEj75jY0rw_mMKldF3n7ULdqbuE7JzuU-TQZ2_b_xbeAw&skey=72472b0eb8793570&v=v56',
    sample: '\u{20BB7}田と吉田 \u{20B9F} \u{29E3D}',
  },
  // Renders as: 𓂀 𓄿 𓅂 𓆉
  {
    family: 'Noto Sans Egyptian Hieroglyphs',
    label: 'Egyptian Hieroglyphs',
    caption: '\u{13080} U+13080 / \u{1313F} U+1313F / \u{13142} U+13142 / \u{13189} U+13189 — SMP',
    url: 'https://fonts.gstatic.com/l/font?kit=vEF42-tODB8RrNDvZSUmRhcQHzx1s7y_F9-j3qSzEcbEYindSlK6xRkRmi-6DJyNmGC4VqdZ3IChdUU&skey=fc1730efc5589785&v=v30',
    sample: '\u{13080} \u{1313F} \u{13142} \u{13189}',
  },
  // Renders as: 𒀀 𒁁 𒂊 𒃻
  {
    family: 'Noto Sans Cuneiform',
    label: 'Cuneiform',
    caption: '\u{12000} U+12000 / \u{12041} U+12041 / \u{1208A} U+1208A / \u{120FB} U+120FB — SMP',
    url: 'https://fonts.gstatic.com/l/font?kit=bMrrmTWK7YY-MF22aHGGd7H8PhJtvBDWgbxJkxQvU_W-t7uZ9YpFIaCIMrlrtE58&skey=730a3aeff71371d1&v=v18',
    sample: '\u{12000} \u{12041} \u{1208A} \u{120FB}',
  },
  // Renders as: 𝐀𝒜𝔸 𝕮 𝔅
  {
    family: 'Noto Sans Math',
    label: 'Mathematical Alphanumeric Symbols',
    caption: '\u{1D400} U+1D400 / \u{1D49C} U+1D49C / \u{1D538} U+1D538 / \u{1D56E} U+1D56E / \u{1D505} U+1D505 — SMP',
    url: 'https://fonts.gstatic.com/l/font?kit=7Aump_cpkSecTWaHRlH2hyV5UHkF-Vs48d-hNu0TJ8LlkGihpLHoHLXvHsBpBiI&skey=27a26f5b3f2d5ea1&v=v19',
    sample: '\u{1D400}\u{1D49C}\u{1D538} \u{1D56E} \u{1D505}',
  },
  // Renders as: 𝄞 𝄢 𝅘𝅥 𝅘𝅥𝅮
  {
    family: 'Noto Music',
    label: 'Musical Symbols',
    caption: '\u{1D11E} U+1D11E / \u{1D122} U+1D122 / \u{1D158}\u{1D165} U+1D158+U+1D165 / \u{1D158}\u{1D165}\u{1D16E} — SMP',
    url: 'https://fonts.gstatic.com/l/font?kit=pe0rMIiSN5pO63htf1sxItSQAdZqQUDYyhXhOzpYe6-hOgNGBD6n32j4Lg&skey=a1f7640827c2d625&v=v21',
    sample: '\u{1D11E} \u{1D122} \u{1D158}\u{1D165} \u{1D158}\u{1D165}\u{1D16E}',
  },
  // Renders as: 𞤀𞤁𞤂𞤃𞤄
  {
    family: 'Noto Sans Adlam',
    label: 'Adlam (modern West African)',
    caption: '\u{1E900} U+1E900 / \u{1E901} U+1E901 / \u{1E902} U+1E902 / \u{1E903} U+1E903 / \u{1E904} U+1E904 — SMP',
    url: 'https://fonts.gstatic.com/l/font?kit=neIczCCpqp0s5pPusPamd81eMfjPonvqdbYxxpgufnv0TGrBZLwggvomO9nbazENrX7-_ZwTdOY6tB2ltaK3&skey=1fb7f26201009a1b&v=v27',
    sample: '\u{1E900}\u{1E901}\u{1E902}\u{1E903}\u{1E904}',
  },
];

// Register each sample font at module load so that its family name can be
// referenced through `fontFamily` when the samples are rendered.
for (const { family, url } of SCRIPTS) {
  Font.register({ family, src: url });
}

// Style sheet for the surrogate-pair example page. Each key is referenced by
// name from the page component below (styles.body, styles.card, ...).
const styles = StyleSheet.create({
// Applied to the <Page>: outer padding and page background colour.
body: {
padding: 40,
backgroundColor: '#fafafa',
},
// Main heading at the top of the page.
title: {
fontSize: 18,
fontWeight: 'bold',
color: '#1a1a1a',
},
// Explanatory paragraph under the heading; the bottom margin separates it
// from the first card.
subtitle: {
fontSize: 9,
color: '#888',
marginBottom: 20,
},
// Bordered box wrapping one sample (label, sample text, caption).
card: {
backgroundColor: '#ffffff',
borderRadius: 5,
padding: 16,
borderWidth: 1,
borderColor: '#e8e8e8',
marginBottom: 8,
},
// Small uppercase heading inside a card.
cardLabel: {
fontSize: 8,
color: '#999',
textTransform: 'uppercase',
letterSpacing: 0.5,
marginBottom: 6,
},
// Footnote under the sample listing its code points.
cardCaption: {
fontSize: 8,
color: '#bbb',
marginTop: 8,
},
// Size of the sample text; the font family is supplied per card at render
// time rather than here.
sample: {
fontSize: 24,
},
});

// Single page of the example: a heading, a short explanation, and one card per
// SCRIPTS entry. Only the sample text of a card is set in the entry's own font
// family; the label and caption carry no explicit fontFamily.
const MyDoc = () => {
  const cards = SCRIPTS.map((script) => (
    <View key={script.family} style={styles.card}>
      <Text style={styles.cardLabel}>{script.label}</Text>
      <Text style={[styles.sample, { fontFamily: script.family }]}>
        {script.sample}
      </Text>
      <Text style={styles.cardCaption}>{script.caption}</Text>
    </View>
  ));

  return (
    <Page style={styles.body}>
      <Text style={styles.title}>Surrogate pair rendering (SMP / SIP / TIP)</Text>
      <Text style={styles.subtitle}>
        Every character below sits above U+FFFF and is encoded as a UTF-16
        surrogate pair. Each should resolve to its supporting font (Noto)
        and render as a real glyph, not as tofu.
      </Text>

      {cards}
    </Page>
  );
};

// Root component of the example: a Document containing the single sample page.
const SurrogatePair = () => {
  return (
    <Document>
      <MyDoc />
    </Document>
  );
};

// Descriptor for this example: an id, a display name, an (empty) description
// and the root Document component to render.
const surrogatePairExample = {
  id: 'surrogate-pair',
  name: 'Surrogate pair',
  description: '',
  Document: SurrogatePair,
};

export default surrogatePairExample;
13 changes: 9 additions & 4 deletions packages/textkit/src/engines/fontSubstitution/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,13 @@ const fontSubstitution =

const chars = string.slice(run.start, run.end);

for (let j = 0; j < chars.length; j += 1) {
const char = chars[j];
const codePoint = char.codePointAt(0);
// Iterate by code point so that surrogate pairs (e.g. SIP characters
// U+10000 and above) are looked up as a single code point in the font
// stack, not as separate high/low surrogates.
let j = 0;
while (j < chars.length) {
const codePoint = chars.codePointAt(j)!;
const charLength = codePoint > 0xffff ? 2 : 1;

// If the default font does not have a glyph and the fallback font does, we use it
const font = pickFontFromFontStack(
Expand Down Expand Up @@ -87,7 +91,8 @@ const fontSubstitution =
lastIndex = index;
}

index += char.length;
j += charLength;
index += charLength;
}
}

Expand Down
53 changes: 53 additions & 0 deletions packages/textkit/tests/engines/fontSubstitution.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,57 @@ describe('FontSubstitution', () => {
expect(string.runs[1].attributes.font).toEqual([SimplifiedChineseFont]);
});
});

describe('Surrogate pairs', () => {
  // 𠮷 is U+20BB7, a SIP code point above U+FFFF. In UTF-16 it is stored as
  // the surrogate pair \uD842\uDFB7, so the JS string '𠮷' has length 2.
  const fontWithSipGlyph = {
    name: 'SipFont',
    unitsPerEm: 1000,
    hasGlyphForCodePoint: (codePoint: number) => codePoint === 0x20bb7,
  };
  // Reports no glyph for any code point.
  const fontWithoutGlyphs = {
    name: 'NoGlyphFont',
    unitsPerEm: 1000,
    hasGlyphForCodePoint: () => false,
  };

  // Builds a fresh run covering [0, end) whose font stack is
  // [fontWithSipGlyph, fontWithoutGlyphs].
  const makeRun = (end: number) =>
    ({
      start: 0,
      end,
      attributes: { font: [fontWithSipGlyph, fontWithoutGlyphs] },
    }) as any;

  test('should treat a surrogate pair as a single code point when picking a font', () => {
    const result = instance({ string: '𠮷', runs: [makeRun(2)] });

    expect(result).toHaveProperty('string', '𠮷');
    expect(result.runs).toHaveLength(1);

    // The whole pair (UTF-16 indices 0..2) stays in one run using the SIP font.
    const [onlyRun] = result.runs;
    expect(onlyRun).toHaveProperty('start', 0);
    expect(onlyRun).toHaveProperty('end', 2);
    expect(onlyRun.attributes.font).toEqual([fontWithSipGlyph]);
  });

  test('should track UTF-16 indices correctly when mixing BMP and SIP code points', () => {
    const result = instance({ string: 'A𠮷B', runs: [makeRun(4)] });

    // 'A' and 'B' take one UTF-16 unit each, 𠮷 takes two.
    const expectedRuns = [
      { start: 0, end: 1, font: [fontWithoutGlyphs] },
      { start: 1, end: 3, font: [fontWithSipGlyph] },
      { start: 3, end: 4, font: [fontWithoutGlyphs] },
    ];

    expect(result).toHaveProperty('string', 'A𠮷B');
    expect(result.runs).toHaveLength(3);

    expectedRuns.forEach(({ start, end, font }, position) => {
      expect(result.runs[position]).toHaveProperty('start', start);
      expect(result.runs[position]).toHaveProperty('end', end);
      expect(result.runs[position].attributes.font).toEqual(font);
    });
  });
});
});
Loading