Skip to content

Commit 96aea33

Browse files
committed
fix(render): Type0/CIDFontType2 font CID→GID mapping (issue #215)
Root cause: open-pdf-render/src/fonts.rs treated a missing /CIDToGIDMap entry as false. Per ISO 32000-1 §9.7.4.2, the default for a CIDFontType2 font is Identity, so an absent entry must mean true. Combined with a system-font fallback heuristic that triggered when GIDs 1–10 of an embedded subset had empty outlines (common, since those slots are often placeholder/control glyphs), CIDs were ultimately looked up as Unicode codepoints in the system Arial cmap. CID 46 ('K' in the embedded ArialMT) → Unicode 46 = '.', producing the observed -29 shift "Kelder" → ".HOGHU".

Fix:
- check_cid_to_gid_identity: missing /CIDToGIDMap → true (spec default is Identity); explicit Stream → false.
- build_font_entry: never apply the empty-outline system-font fallback to Type0/CID fonts. The CID→GID mapping is specific to the embedded font, so no system Arial substitute is ever correct for an embedded subset. A system font is still tried for a CID font as a last resort, but only when no embedded font data could be extracted at all.
- cid_to_glyph_id: identity → ToUnicode→cmap → CID-as-Unicode fallback (last resort only; preserves any previously-working PDFs).

Verified on the user's test PDF (Vloerverwarming Woning Bert van Dorp): all 4 pages now render text correctly — "Kelder", "Begane Grond", "groep 01.01", "Regelunit 01", "Kunststof 8", etc.

Note: text *extraction* (selection layer) still returns U+FFFD for these PDFs because the embedded /ToUnicode streams are empty. Selectable-text support for such PDFs is tracked separately.
1 parent 076557d commit 96aea33

4 files changed

Lines changed: 57 additions & 34 deletions

File tree

open-pdf-render/src/fonts.rs

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -124,33 +124,44 @@ impl FontRegistry {
124124
HashMap::new()
125125
};
126126

127-
// Try to extract and parse embedded font data
128-
let mut parsed = Self::extract_and_parse_font(font_dict, doc);
129-
130-
// Check if the embedded font has usable glyph outlines for common character codes.
131-
// Some PDFs embed fonts with empty glyph entries for subset codes — fall back
132-
// to system font when most glyphs used by the document are empty.
133-
let embedded_usable = parsed.as_ref().map(|p| {
134-
// Check first 10 glyph IDs (common subset range) for actual outlines
135-
let check_range = 1u16..=10;
136-
let with_outlines = check_range.clone().filter(|gid| {
137-
p.glyphs.get(gid).map(|g| !g.commands.is_empty()).unwrap_or(false)
138-
}).count();
139-
// If less than half of checked glyphs have outlines, the font is likely broken
140-
with_outlines > check_range.count() / 2
141-
}).unwrap_or(false);
142-
143-
if parsed.is_none() || !embedded_usable {
127+
// Try to extract and parse embedded font data.
128+
// For Type0/CID fonts the embedded font lives on the DescendantFont's
129+
// FontDescriptor, not on the parent Type0 dict — try descendant first.
130+
let mut parsed = if is_cid {
131+
Self::extract_descendant_font(font_dict, doc)
132+
.or_else(|| Self::extract_and_parse_font(font_dict, doc))
133+
} else {
134+
Self::extract_and_parse_font(font_dict, doc)
135+
};
136+
137+
// For simple (non-CID) fonts, optionally fall back to a system font when
138+
// the embedded subset has no usable outlines for the common range.
139+
//
140+
// CRITICAL: never apply this fallback to Type0/CID fonts. CID fonts use
141+
// CID→GID mappings that are specific to the embedded TrueType (e.g. CID 46
142+
// = GID 46 = 'K' in this PDF). Substituting a system font would invalidate
143+
// that mapping and produce garbled text (issue #215).
144+
if !is_cid {
145+
let embedded_usable = parsed.as_ref().map(|p| {
146+
let check_range = 1u16..=10;
147+
let with_outlines = check_range.clone().filter(|gid| {
148+
p.glyphs.get(gid).map(|g| !g.commands.is_empty()).unwrap_or(false)
149+
}).count();
150+
with_outlines > check_range.count() / 2
151+
}).unwrap_or(false);
152+
153+
if parsed.is_none() || !embedded_usable {
154+
if let Some(sys_font) = Self::try_system_font(&base_font) {
155+
parsed = Some(sys_font);
156+
}
157+
}
158+
} else if parsed.is_none() {
159+
// CID font with no embedded data — last resort system fallback.
144160
if let Some(sys_font) = Self::try_system_font(&base_font) {
145161
parsed = Some(sys_font);
146162
}
147163
}
148164

149-
// For Type0 fonts with DescendantFonts, also check the descendant for embedded data
150-
if parsed.is_none() && is_cid {
151-
parsed = Self::extract_descendant_font(font_dict, doc);
152-
}
153-
154165
FontEntry {
155166
parsed,
156167
encoding_name,
@@ -314,15 +325,23 @@ impl FontRegistry {
314325
Some(Object::Dictionary(d)) => d,
315326
_ => return false,
316327
};
317-
// Check CIDToGIDMap
328+
// Check CIDToGIDMap.
329+
// Per ISO 32000-1 §9.7.4.2, the DEFAULT value of CIDToGIDMap for a
330+
// CIDFontType2 font is /Identity when the entry is absent — meaning
331+
// CID values map directly to GIDs in the embedded TrueType. We must
332+
// therefore treat a missing entry as Identity (true), not as false.
318333
match cid_dict.get(b"CIDToGIDMap") {
319334
Ok(obj) => {
320335
match Self::resolve_obj(obj, doc) {
321336
Some(Object::Name(n)) => n == b"Identity",
322-
_ => false,
337+
// A stream-based CIDToGIDMap is a non-identity custom map
338+
Some(Object::Stream(_)) => false,
339+
// Anything else: treat as Identity (spec default)
340+
_ => true,
323341
}
324342
}
325-
_ => false,
343+
// Entry absent → spec default is Identity
344+
Err(_) => true,
326345
}
327346
}
328347

@@ -484,14 +503,18 @@ impl FontRegistry {
484503
if entry.cid_to_gid_identity {
485504
// Direct mapping: CID = GID
486505
if parsed.glyphs.contains_key(&cid) {
487-
Some(cid)
488-
} else {
489-
None
506+
return Some(cid);
507+
}
508+
}
509+
// ToUnicode CMap → font cmap (text-extraction CMap, but may help when the
510+
// CID space is Unicode-aligned, e.g. Adobe-Japan1 etc.)
511+
if let Some(&unicode_char) = entry.cid_to_unicode.get(&cid) {
512+
if let Some(&gid) = parsed.cmap.get(&(unicode_char as u32)) {
513+
return Some(gid);
490514
}
491-
} else {
492-
// Try Unicode lookup: CID might be a Unicode codepoint
493-
parsed.cmap.get(&(cid as u32)).copied()
494515
}
516+
// Last resort: try the CID directly as a Unicode codepoint in cmap.
517+
parsed.cmap.get(&(cid as u32)).copied()
495518
}
496519

497520
/// Extract ToUnicode CMap from font dictionary.

open-pdf-studio/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "open-pdf-studio",
3-
"version": "1.48.0",
3+
"version": "1.48.1",
44
"description": "A free, open-source PDF annotation editor built with Tauri",
55
"scripts": {
66
"dev": "vite",

open-pdf-studio/src-tauri/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "open-pdf-studio"
3-
version = "1.48.0"
3+
version = "1.48.1"
44
description = "A free, open-source PDF annotation editor"
55
authors = ["OpenAEC Foundation"]
66
license = "MIT"

open-pdf-studio/src-tauri/tauri.conf.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"$schema": "https://schema.tauri.app/config/2",
33
"productName": "Open PDF Studio",
4-
"version": "1.48.0",
4+
"version": "1.48.1",
55
"identifier": "org.openaec.openpdfstudio",
66
"build": {
77
"frontendDist": "../dist",

0 commit comments

Comments
 (0)