Skip to content

Commit 67a37d5

Browse files
fix(api): improve text cleaning for PDF rendering by stripping invisible unicode characters (#1962)
Co-authored-by: Tofik Hasanov <annexcies@gmail.com>
1 parent 0173ba7 commit 67a37d5

2 files changed

Lines changed: 44 additions & 5 deletions

File tree

apps/api/src/trust-portal/policy-pdf-renderer.service.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ interface PolicyForPDF {
2828
@Injectable()
2929
export class PolicyPdfRendererService {
3030
private cleanTextForPDF(text: string): string {
31+
// Strip invisible format/control Unicode characters (and the U+FFFD replacement character) that commonly appear via copy/paste.
32+
// These aren't visible in the editor, but previous logic converted unknown unicode to
33+
// "?" which looks like random corruption in the generated PDF.
34+
const strippedText = text
35+
.replace(/\u00AD/g, '')
36+
.replace(/[\u200B-\u200F]/g, '')
37+
.replace(/[\u202A-\u202E]/g, '')
38+
.replace(/[\u2060-\u206F]/g, '')
39+
.replace(/\uFEFF/g, '')
40+
.replace(/\uFFFD/g, '');
41+
3142
const replacements: { [key: string]: string } = {
3243
'\u2018': "'",
3344
'\u2019': "'",
@@ -52,7 +63,7 @@ export class PolicyPdfRendererService {
5263
'\u2194': '<->',
5364
};
5465

55-
let cleanedText = text;
66+
let cleanedText = strippedText;
5667
for (const [unicode, replacement] of Object.entries(replacements)) {
5768
cleanedText = cleanedText.replace(new RegExp(unicode, 'g'), replacement);
5869
}
@@ -122,7 +133,10 @@ export class PolicyPdfRendererService {
122133
Ç: 'C',
123134
Ý: 'Y',
124135
};
125-
return fallbacks[char] || '?';
136+
// Preserve unknown characters instead of coercing to "?".
137+
// If a glyph isn't supported by the active PDF font, viewers may show a tofu box,
138+
// but inserting "?" is worse because it looks like text was modified.
139+
return fallbacks[char] ?? char;
126140
});
127141
}
128142

apps/app/src/lib/pdf-generator.ts

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,28 @@ interface PDFConfig {
3232

3333
// Helper function to clean text for PDF rendering
3434
const cleanTextForPDF = (text: string): string => {
35+
// Strip invisible format/control Unicode characters (and the U+FFFD replacement character) that commonly appear via copy/paste.
36+
// These aren't visible in the editor, but our previous implementation converted them
37+
// into "?" which *is* visible and looks like random corruption in PDFs.
38+
//
39+
// - U+00AD: soft hyphen
40+
// - U+200B..U+200F: zero-width space/joiners + direction marks
41+
// - U+202A..U+202E: bidi embedding/override marks
42+
// - U+2060..U+206F: word joiner + other format chars
43+
// - U+FEFF: byte order mark
44+
// - U+FFFD: replacement character
45+
const stripInvisibleChars = (value: string): string => {
46+
return value
47+
.replace(/\u00AD/g, '')
48+
.replace(/[\u200B-\u200F]/g, '')
49+
.replace(/[\u202A-\u202E]/g, '')
50+
.replace(/[\u2060-\u206F]/g, '')
51+
.replace(/\uFEFF/g, '')
52+
.replace(/\uFFFD/g, '');
53+
};
54+
55+
const strippedText = stripInvisibleChars(text);
56+
3557
// First, handle specific problematic characters that cause font issues
3658
const replacements: { [key: string]: string } = {
3759
'\u2018': "'", // left single quotation mark
@@ -58,14 +80,14 @@ const cleanTextForPDF = (text: string): string => {
5880
};
5981

6082
// Replace known problematic characters
61-
let cleanedText = text;
83+
let cleanedText = strippedText;
6284
for (const [unicode, replacement] of Object.entries(replacements)) {
6385
cleanedText = cleanedText.replace(new RegExp(unicode, 'g'), replacement);
6486
}
6587

6688
// For any remaining non-ASCII characters, try to preserve them first
6789
// Only replace if they cause font rendering issues
68-
return cleanedText.replace(/[^\x00-\x7F]/g, function(char) {
90+
return cleanedText.replace(/[^\x00-\x7F]/g, function (char) {
6991
// Common accented characters that should work fine in most PDF fonts
7092
const safeChars = /[àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß]/;
7193

@@ -89,7 +111,10 @@ const cleanTextForPDF = (text: string): string => {
89111
'Ñ': 'N', 'Ç': 'C', 'Ý': 'Y'
90112
};
91113

92-
return fallbacks[char] || '?'; // Use ? for unknown characters
114+
// Preserve unknown characters instead of coercing to "?".
115+
// If the active PDF font can't render a glyph, viewers may show a tofu box,
116+
// but that's still preferable to inserting random "?" where the editor shows nothing.
117+
return fallbacks[char] ?? char;
93118
});
94119
};
95120

0 commit comments

Comments
 (0)