From 1e10055dd567e84d6c0dd1f2187a849e03e027c7 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 2 Sep 2025 10:13:56 +0100 Subject: [PATCH 1/4] tests/test_release.py: fix comment. --- tests/test_release.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_release.py b/tests/test_release.py index f8c5ed046..7c3e56ae8 100644 --- a/tests/test_release.py +++ b/tests/test_release.py @@ -58,9 +58,8 @@ def test_release_bug_template(): def test_release_changelog_version(): ''' - In changes.txt, first mentioned of MuPDF must match setup.version_mupdf. + In changes.txt, first item must match setup.version_p. ''' - # In changes.txt, first item must match setup.version_p. p = f'{g_root}/changes.txt' with open(p) as f: text = f.read() From adc3cc5c7b7a94fd6e82ecb001935471985a72e8 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 2 Sep 2025 11:29:26 +0100 Subject: [PATCH 2/4] scripts/test.py: cibw_do_test_project(): fix incorrectly-commited commented-out code. --- scripts/test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/test.py b/scripts/test.py index 004624d73..a76bb8a3d 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -988,7 +988,7 @@ def build(): int bar1(const char* text) { int ret = 0; - /*if (setjmp(jmpbuf) == 0) + if (setjmp(jmpbuf) == 0) { ret = bar0(text); } @@ -996,7 +996,7 @@ def build(): { printf("setjmp() returned non-zero.\\n"); throw std::runtime_error("deliberate exception"); - }*/ + } throw std::runtime_error("deliberate exception"); return ret; } @@ -1029,6 +1029,7 @@ def build(): log(f'text is:\n{textwrap.indent(text, " ")}') assert e assert 'module="env" function="__c_longjmp": tag import requires a WebAssembly.Tag' in text + log(f'Have detected expected exception from Pyodide.') else: run(f'cd {testdir} && cibuildwheel{cibw_pyodide_args}', env_extra=env_extra) run(f'ls -ld {testdir}/wheelhouse/*') From 03ad7e96be95ea34e852094bb51136a7359d1184 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 2 Sep 2025 16:31:45 +0100 Subject: [PATCH 3/4] src/__init__.py: class Annot: define get_text() and get_textbox() in class. --- src/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 7daf3f463..d22df1a1b 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1035,6 +1035,12 @@ def get_sound(self): stream = JM_BinFromBuffer(buf) res['stream'] = stream return res + + def get_text(self, *args, **kwargs): + return utils.get_text(self, *args, **kwargs) + + def get_textbox(self, *args, **kwargs): + return utils.get_textbox(self, *args, **kwargs) def get_textpage(self, clip=None, flags=0): """Make annotation TextPage.""" @@ -20938,8 +20944,6 @@ def colors_wx_list(): recover_quad = utils.recover_quad recover_span_quad = utils.recover_span_quad -Annot.get_text = utils.get_text -Annot.get_textbox = utils.get_textbox Document._do_links = utils.do_links Document._do_widgets = utils.do_widgets From 7d4e8cb3511a99f439ec40aeb486c68d57997d5e Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Tue, 2 Sep 2025 16:34:39 +0100 Subject: [PATCH 4/4] src/: class Document: moved utils.py code into class methods. --- src/__init__.py | 1931 ++++++++++++++++++++++++++++++++++++++++++++++- src/utils.py | 1926 ---------------------------------------------- 2 files changed, 1902 insertions(+), 1955 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index d22df1a1b..5b9802216 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -3064,6 +3064,14 @@ def _addFormFont(self, name, font): v = JM_pdf_obj_from_str( pdf, font) mupdf.pdf_dict_put( fonts, k, v) + def del_toc_item( + self, + idx: int, + ) -> None: + """Delete TOC / bookmark item by index.""" + xref = self.get_outline_xrefs()[idx] + self._remove_toc_item(xref) + def _delToC(self): """Delete the TOC.""" if self.is_closed or self.is_encrypted: @@ -3109,6 +3117,454 @@ def _deleteObject(self, xref): raise ValueError( MSG_BAD_XREF) mupdf.pdf_delete_object(pdf, xref) + def _do_links( + doc1: typing.Self, + doc2: typing.Self, + from_page: int = -1, + to_page: int = -1, + start_at: int = -1, + ) -> None: + """Insert links contained in copied page range into destination PDF. + + Parameter values **must** equal those of method insert_pdf(), which must + have been previously executed. + """ + #pymupdf.log( 'utils.do_links()') + # -------------------------------------------------------------------------- + # internal function to create the actual "/Annots" object string + # -------------------------------------------------------------------------- + def cre_annot(lnk, xref_dst, pno_src, ctm): + """Create annotation object string for a passed-in link.""" + + r = lnk["from"] * ctm # rect in PDF coordinates + rect = _format_g(tuple(r)) + if lnk["kind"] == LINK_GOTO: + txt = annot_skel["goto1"] # annot_goto + idx = pno_src.index(lnk["page"]) + p = lnk["to"] * ctm # target point in PDF coordinates + annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect) + + elif lnk["kind"] == LINK_GOTOR: + if lnk["page"] >= 0: + txt = annot_skel["gotor1"] # annot_gotor + pnt = lnk.get("to", Point(0, 0)) # destination point + if type(pnt) is not Point: + pnt = Point(0, 0) + annot = txt( + lnk["page"], + pnt.x, + pnt.y, + lnk["zoom"], + lnk["file"], + lnk["file"], + rect, + ) + else: + txt = annot_skel["gotor2"] # annot_gotor_n + to = get_pdf_str(lnk["to"]) + to = to[1:-1] + f = lnk["file"] + annot = txt(to, f, rect) + + elif lnk["kind"] == LINK_LAUNCH: + txt = annot_skel["launch"] # annot_launch + annot = txt(lnk["file"], lnk["file"], rect) + + elif lnk["kind"] == LINK_URI: + txt = annot_skel["uri"] # annot_uri + annot = txt(lnk["uri"], rect) + + else: + annot = "" + + return annot + + # -------------------------------------------------------------------------- + + # validate & normalize parameters + if from_page < 0: + fp = 0 + elif from_page >= doc2.page_count: + fp = doc2.page_count - 1 + else: + fp = from_page + + if to_page < 0 or to_page >= doc2.page_count: + tp = doc2.page_count - 1 + else: + tp = to_page + + if start_at < 0: + raise ValueError("'start_at' must be >= 0") + sa = start_at + + incr = 1 if fp <= tp else -1 # page range could be reversed + + # lists of source / destination page numbers + pno_src = list(range(fp, tp + incr, incr)) + pno_dst = [sa + i for i in range(len(pno_src))] + + # lists of source / destination page xrefs + xref_src = [] + xref_dst = [] + for i in range(len(pno_src)): + p_src = pno_src[i] + p_dst = pno_dst[i] + old_xref = doc2.page_xref(p_src) + new_xref = doc1.page_xref(p_dst) + xref_src.append(old_xref) + xref_dst.append(new_xref) + + # create the links for each copied page in destination PDF + for i in range(len(xref_src)): + page_src = doc2[pno_src[i]] # load source page + links = page_src.get_links() # get all its links + #log( '{pno_src=}') + #log( '{type(page_src)=}') + #log( '{page_src=}') + #log( '{=i len(links)}') + if len(links) == 0: # no links there + page_src = None + continue + ctm = ~page_src.transformation_matrix # calc page transformation matrix + page_dst = doc1[pno_dst[i]] # load destination page + link_tab = [] # store all link definitions here + for l in links: + if l["kind"] == LINK_GOTO and (l["page"] not in pno_src): + continue # GOTO link target not in copied pages + annot_text = cre_annot(l, xref_dst, pno_src, ctm) + if annot_text: + link_tab.append(annot_text) + if link_tab != []: + page_dst._addAnnot_FromString( tuple(link_tab)) + #log( 'utils.do_links() returning.') + + def _do_widgets( + tar: typing.Self, + src: typing.Self, + graftmap, + from_page: int = -1, + to_page: int = -1, + start_at: int = -1, + join_duplicates=0, + ) -> None: + """Insert widgets of copied page range into target PDF. + + Parameter values **must** equal those of method insert_pdf() which + must have been previously executed. + """ + if not src.is_form_pdf: # nothing to do: source PDF has no fields + return + + def clean_kid_parents(acro_fields): + """ Make sure all kids have correct "Parent" pointers.""" + for i in range(acro_fields.pdf_array_len()): + parent = acro_fields.pdf_array_get(i) + kids = parent.pdf_dict_get(PDF_NAME("Kids")) + for j in range(kids.pdf_array_len()): + kid = kids.pdf_array_get(j) + kid.pdf_dict_put(PDF_NAME("Parent"), parent) + + def join_widgets(pdf, acro_fields, xref1, xref2, name): + """Called for each pair of widgets having the same name. + + Args: + pdf: target MuPDF document + acro_fields: object Root/AcroForm/Fields + xref1, xref2: widget xrefs having same names + name: (str) the name + + Result: + Defined or updated widget parent that points to both widgets. + """ + + def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2): + """Merge widget in xref2 into "Kids" list of widget xref1. + + Args: + xref1, kids1: target widget and its "Kids" array. + xref2, kids2: source wwidget and its "Kids" array (may be empty). + """ + # make indirect objects from widgets + w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0) + w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0) + # find source widget in "Fields" array + idx = acro_fields.pdf_array_find(w2_ind) + acro_fields.pdf_array_delete(idx) + + if not kids2.pdf_is_array(): # source widget has no kids + widget = mupdf.pdf_load_object(pdf, xref2) + + # delete name from widget and insert target as parent + widget.pdf_dict_del(PDF_NAME("T")) + widget.pdf_dict_put(PDF_NAME("Parent"), w1_ind) + + # put in target Kids + kids1.pdf_array_push(w2_ind) + else: # copy source kids to target kids + for i in range(kids2.pdf_array_len()): + kid = kids2.pdf_array_get(i) + kid.pdf_dict_put(PDF_NAME("Parent"), w1_ind) + kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0) + kids1.pdf_array_push(kid_ind) + + def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name): + """Make new "Parent" for two widgets with same name. + + Args: + xref1, w1: first widget + xref2, w2: second widget + name: field name + + Result: + Both widgets have no "Kids". We create a new object with the + name and a "Kids" array containing the widgets. + Original widgets must be removed from AcroForm/Fields. + """ + # make new "Parent" object + new = mupdf.pdf_new_dict(pdf, 5) + new.pdf_dict_put_text_string(PDF_NAME("T"), name) + kids = new.pdf_dict_put_array(PDF_NAME("Kids"), 2) + new_obj = mupdf.pdf_add_object(pdf, new) + new_obj_xref = new_obj.pdf_to_num() + new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0) + + # copy over some required source widget properties + ft = w1.pdf_dict_get(PDF_NAME("FT")) + w1.pdf_dict_del(PDF_NAME("FT")) + new_obj.pdf_dict_put(PDF_NAME("FT"), ft) + + aa = w1.pdf_dict_get(PDF_NAME("AA")) + w1.pdf_dict_del(PDF_NAME("AA")) + new_obj.pdf_dict_put(PDF_NAME("AA"), aa) + + # remove name field, insert "Parent" field in source widgets + w1.pdf_dict_del(PDF_NAME("T")) + w1.pdf_dict_put(PDF_NAME("Parent"), new_ind) + w2.pdf_dict_del(PDF_NAME("T")) + w2.pdf_dict_put(PDF_NAME("Parent"), new_ind) + + # put source widgets in "kids" array + ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0) + ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0) + kids.pdf_array_push(ind1) + kids.pdf_array_push(ind2) + + # remove source widgets from "AcroForm/Fields" + idx = acro_fields.pdf_array_find(ind1) + acro_fields.pdf_array_delete(idx) + idx = acro_fields.pdf_array_find(ind2) + acro_fields.pdf_array_delete(idx) + + acro_fields.pdf_array_push(new_ind) + + w1 = mupdf.pdf_load_object(pdf, xref1) + w2 = mupdf.pdf_load_object(pdf, xref2) + kids1 = w1.pdf_dict_get(PDF_NAME("Kids")) + kids2 = w2.pdf_dict_get(PDF_NAME("Kids")) + + # check which widget has a suitable "Kids" array + if kids1.pdf_is_array(): + re_target(pdf, acro_fields, xref1, kids1, xref2, kids2) # pylint: disable=arguments-out-of-order + elif kids2.pdf_is_array(): + re_target(pdf, acro_fields, xref2, kids2, xref1, kids1) # pylint: disable=arguments-out-of-order + else: + new_target(pdf, acro_fields, xref1, w1, xref2, w2, name) # pylint: disable=arguments-out-of-order + + def get_kids(parent, kids_list): + """Return xref list of leaf kids for a parent. + + Call with an empty list. + """ + kids = mupdf.pdf_dict_get(parent, PDF_NAME("Kids")) + if not kids.pdf_is_array(): + return kids_list + for i in range(kids.pdf_array_len()): + kid = kids.pdf_array_get(i) + if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, PDF_NAME("Kids"))): + kids_list = get_kids(kid, kids_list) + else: + kids_list.append(kid.pdf_to_num()) + return kids_list + + def kids_xrefs(widget): + """Get the xref of top "Parent" and the list of leaf widgets.""" + kids_list = [] + parent = mupdf.pdf_dict_get(widget, PDF_NAME("Parent")) + parent_xref = parent.pdf_to_num() + if parent_xref == 0: + return parent_xref, kids_list + kids_list = get_kids(parent, kids_list) + return parent_xref, kids_list + + def deduplicate_names(pdf, acro_fields, join_duplicates=False): + """Handle any widget name duplicates caused by the merge.""" + names = {} # key is a widget name, value a list of widgets having it. + + # extract all names and widgets in "AcroForm/Fields" + for i in range(mupdf.pdf_array_len(acro_fields)): + wobject = mupdf.pdf_array_get(acro_fields, i) + xref = wobject.pdf_to_num() + + # extract widget name and collect widget(s) using it + T = mupdf.pdf_dict_get_text_string(wobject, PDF_NAME("T")) + xrefs = names.get(T, []) + xrefs.append(xref) + names[T] = xrefs + + for name, xrefs in names.items(): + if len(xrefs) < 2: + continue + xref0, xref1 = xrefs[:2] # only exactly 2 should occur! + if join_duplicates: # combine fields with equal names + join_widgets(pdf, acro_fields, xref0, xref1, name) + else: # make field names unique + newname = name + f" [{xref1}]" # append this to the name + wobject = mupdf.pdf_load_object(pdf, xref1) + wobject.pdf_dict_put_text_string(PDF_NAME("T"), newname) + + clean_kid_parents(acro_fields) + + def get_acroform(doc): + """Retrieve the AcroForm dictionary form a PDF.""" + pdf = mupdf.pdf_document_from_fz_document(doc) + # AcroForm (= central form field info) + return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm") + + tarpdf = mupdf.pdf_document_from_fz_document(tar) + srcpdf = mupdf.pdf_document_from_fz_document(src) + + if tar.is_form_pdf: + # target is a Form PDF, so use it to include source fields + acro = get_acroform(tar) + # Important arrays in AcroForm + acro_fields = acro.pdf_dict_get(PDF_NAME("Fields")) + tar_co = acro.pdf_dict_get(PDF_NAME("CO")) + if not tar_co.pdf_is_array(): + tar_co = acro.pdf_dict_put_array(PDF_NAME("CO"), 5) + else: + # target is no Form PDF, so copy over source AcroForm + acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy + + # Clear "Fields" and "CO" arrays: will be populated by page fields. + # This is required to avoid copying unneeded objects. + acro.pdf_dict_del(PDF_NAME("Fields")) + acro.pdf_dict_put_array(PDF_NAME("Fields"), 5) + acro.pdf_dict_del(PDF_NAME("CO")) + acro.pdf_dict_put_array(PDF_NAME("CO"), 5) + + # Enrich AcroForm for copying to target + acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro) + + # Insert AcroForm into target PDF + acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft) + acro_fields = acro_tar.pdf_dict_get(PDF_NAME("Fields")) + tar_co = acro_tar.pdf_dict_get(PDF_NAME("CO")) + + # get its xref and insert it into target catalog + tar_xref = acro_tar.pdf_to_num() + acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) + root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), PDF_NAME("Root")) + root.pdf_dict_put(PDF_NAME("AcroForm"), acro_tar_ind) + + if from_page <= to_page: + src_range = range(from_page, to_page + 1) + else: + src_range = range(from_page, to_page - 1, -1) + + parents = {} # information about widget parents + + # remove "P" owning page reference from all widgets of all source pages + for i in src_range: + src_page = src[i] + for xref in [ + xref + for xref, wtype, _ in src_page.annot_xrefs() + if wtype == mupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member + ]: + w_obj = mupdf.pdf_load_object(srcpdf, xref) + w_obj.pdf_dict_del(PDF_NAME("P")) + + # get the widget's parent structure + parent_xref, old_kids = kids_xrefs(w_obj) + if parent_xref: + parents[parent_xref] = { + "new_xref": 0, + "old_kids": old_kids, + "new_kids": [], + } + # Copy over Parent widgets first - they are not page-dependent + for xref in parents.keys(): # pylint: disable=consider-using-dict-items + parent = mupdf.pdf_load_object(srcpdf, xref) + parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent) + parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft) + kids_xrefs_new = get_kids(parent_tar, []) + parent_xref_new = parent_tar.pdf_to_num() + parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0) + acro_fields.pdf_array_push(parent_ind) + parents[xref]["new_xref"] = parent_xref_new + parents[xref]["new_kids"] = kids_xrefs_new + + for i in range(len(src_range)): + # read first copied over page in target + tar_page = tar[start_at + i] + + # read the original page in the source PDF + src_page = src[src_range[i]] + + # now walk through source page widgets and copy over + w_xrefs = [ # widget xrefs of the source page + xref + for xref, wtype, _ in src_page.annot_xrefs() + if wtype == mupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member + ] + if not w_xrefs: # no widgets on this source page + continue + + # convert to formal PDF page + tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page) + + # extract annotations array + tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), PDF_NAME("Annots")) + if not mupdf.pdf_is_array(tar_annots): + tar_annots = mupdf.pdf_dict_put_array( + tar_page_pdf.obj(), PDF_NAME("Annots"), 5 + ) + + for xref in w_xrefs: + w_obj = mupdf.pdf_load_object(srcpdf, xref) + + # check if field takes part in inter-field validations + is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C")) + + # check if parent of widget already in target + parent_xref = mupdf.pdf_to_num( + w_obj.pdf_dict_get(PDF_NAME("Parent")) + ) + if parent_xref == 0: # parent not in target yet + try: + w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj) + except Exception as e: + message_warning(f"cannot copy widget at {xref=}: {e}") + continue + w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft) + tar_xref = w_obj_tar.pdf_to_num() + w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) + mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) + mupdf.pdf_array_push(acro_fields, w_obj_tar_ind) + else: + parent = parents[parent_xref] + idx = parent["old_kids"].index(xref) # search for xref in parent + tar_xref = parent["new_kids"][idx] + w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) + mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) + + # Into "AcroForm/CO" if a computation field. + if is_aac: + mupdf.pdf_array_push(tar_co, w_obj_tar_ind) + + deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates) + def _embeddedFileGet(self, idx): pdf = _as_pdf_document(self) names = mupdf.pdf_dict_getl( @@ -4272,6 +4728,107 @@ def fullcopy_page(self, pno, to=-1): self._reset_page_refs() + def get_char_widths( + doc: typing.Self, + xref: int, + limit: int = 256, + idx: int = 0, + fontdict: OptDict = None, + ) -> list: + """Get list of glyph information of a font. + + Notes: + Must be provided by its XREF number. If we already dealt with the + font, it will be recorded in doc.FontInfos. Otherwise we insert an + entry there. + Finally we return the glyphs for the font. This is a list of + (glyph, width) where glyph is an integer controlling the char + appearance, and width is a float controlling the char's spacing: + width * fontsize is the actual space. + For 'simple' fonts, glyph == ord(char) will usually be true. + Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here. + """ + fontinfo = CheckFontInfo(doc, xref) + if fontinfo is None: # not recorded yet: create it + if fontdict is None: + name, ext, stype, asc, dsc = utils._get_font_properties(doc, xref) + fontdict = { + "name": name, + "type": stype, + "ext": ext, + "ascender": asc, + "descender": dsc, + } + else: + name = fontdict["name"] + ext = fontdict["ext"] + stype = fontdict["type"] + ordering = fontdict["ordering"] + simple = fontdict["simple"] + + if ext == "": + raise ValueError("xref is not a font") + + # check for 'simple' fonts + if stype in ("Type1", "MMType1", "TrueType"): + simple = True + else: + simple = False + + # check for CJK fonts + if name in ("Fangti", "Ming"): + ordering = 0 + elif name in ("Heiti", "Song"): + ordering = 1 + elif name in ("Gothic", "Mincho"): + ordering = 2 + elif name in ("Dotum", "Batang"): + ordering = 3 + else: + ordering = -1 + + fontdict["simple"] = simple + + if name == "ZapfDingbats": + glyphs = zapf_glyphs + elif name == "Symbol": + glyphs = symbol_glyphs + else: + glyphs = None + + fontdict["glyphs"] = glyphs + fontdict["ordering"] = ordering + fontinfo = [xref, fontdict] + doc.FontInfos.append(fontinfo) + else: + fontdict = fontinfo[1] + glyphs = fontdict["glyphs"] + simple = fontdict["simple"] + ordering = fontdict["ordering"] + + if glyphs is None: + oldlimit = 0 + else: + oldlimit = len(glyphs) + + mylimit = max(256, limit) + + if mylimit <= oldlimit: + return glyphs + + if ordering < 0: # not a CJK font + glyphs = doc._get_char_widths( + xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx + ) + else: # CJK fonts use char codes and width = 1 + glyphs = None + + fontdict["glyphs"] = glyphs + fontinfo[1] = fontdict + UpdateFontInfo(doc, fontinfo) + + return glyphs + def get_layer(self, config=-1): """Content of ON, OFF, RBGroups of an OC layer.""" pdf = _as_pdf_document(self) @@ -4329,6 +4886,23 @@ def get_new_xref(self): xref = mupdf.pdf_create_object(pdf) return xref + def get_oc(doc: typing.Self, xref: int) -> int: + """Return optional content object xref for an image or form xobject. + + Args: + xref: (int) xref number of an image or form xobject. + """ + if doc.is_closed or doc.is_encrypted: + raise ValueError("document close or encrypted") + t, name = doc.xref_get_key(xref, "Subtype") + if t != "name" or name not in ("/Image", "/Form"): + raise ValueError("bad object type at xref %i" % xref) + t, oc = doc.xref_get_key(xref, "OC") + if t != "xref": + return 0 + rc = int(oc.replace("0 R", "")) + return rc + def get_ocgs(self): """Show existing optional content groups.""" ci = mupdf.pdf_new_name( "CreatorInfo") @@ -4372,6 +4946,73 @@ def get_ocgs(self): rc[ temp] = item return rc + def get_ocmd(doc: typing.Self, xref: int) -> dict: + """Return the definition of an OCMD (optional content membership dictionary). + + Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and + /VE (visibility expression, PDF array). Via string manipulation, this + info is converted to a Python dictionary with keys "xref", "ocgs", "policy" + and "ve" - ready to recycle as input for 'set_ocmd()'. + """ + + if xref not in range(doc.xref_length()): + raise ValueError("bad xref") + text = doc.xref_object(xref, compressed=True) + if "/Type/OCMD" not in text: + raise ValueError("bad object type") + textlen = len(text) + + p0 = text.find("/OCGs[") # look for /OCGs key + p1 = text.find("]", p0) + if p0 < 0 or p1 < 0: # no OCGs found + ocgs = None + else: + ocgs = text[p0 + 6 : p1].replace("0 R", " ").split() + ocgs = list(map(int, ocgs)) + + p0 = text.find("/P/") # look for /P policy key + if p0 < 0: + policy = None + else: + p1 = text.find("ff", p0) + if p1 < 0: + p1 = text.find("on", p0) + if p1 < 0: # some irregular syntax + raise ValueError("bad object at xref") + else: + policy = text[p0 + 3 : p1 + 2] + + p0 = text.find("/VE[") # look for /VE visibility expression key + if p0 < 0: # no visibility expression found + ve = None + else: + lp = rp = 0 # find end of /VE by finding last ']'. + p1 = p0 + while lp < 1 or lp != rp: + p1 += 1 + if not p1 < textlen: # some irregular syntax + raise ValueError("bad object at xref") + if text[p1] == "[": + lp += 1 + if text[p1] == "]": + rp += 1 + # p1 now positioned at the last "]" + ve = text[p0 + 3 : p1 + 1] # the PDF /VE array + ve = ( + ve.replace("/And", '"and",') + .replace("/Not", '"not",') + .replace("/Or", '"or",') + ) + ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[") + import json + try: + ve = json.loads(ve) + except Exception: + exception_info() + message(f"bad /VE key: {ve!r}") + raise + return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve} + def get_outline_xrefs(self): """Get list of outline xref numbers.""" xrefs = [] @@ -4420,6 +5061,98 @@ def get_page_images(self, pno: int, full: bool =False) -> list: return [v[:-1] for v in val] return val + def get_page_labels(self): + """Return page label definitions in PDF document. + + Returns: + A list of dictionaries with the following format: + {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. + """ + # Jorj McKie, 2021-01-10 + return [utils.rule_dict(item) for item in self._get_page_labels()] + + def get_page_numbers(doc, label, only_one=False): + """Return a list of page numbers with the given label. + + Args: + doc: PDF document object (resp. 'self'). + label: (str) label. + only_one: (bool) stop searching after first hit. + Returns: + List of page numbers having this label. + """ + # Jorj McKie, 2021-01-06 + + numbers = [] + if not label: + return numbers + labels = doc._get_page_labels() + if labels == []: + return numbers + for i in range(doc.page_count): + plabel = utils.get_label_pno(i, labels) + if plabel == label: + numbers.append(i) + if only_one: + break + return numbers + + def get_page_pixmap( + doc: typing.Self, + pno: int, + *, + matrix: matrix_like = None, + dpi=None, + colorspace: Colorspace = None, + clip: rect_like = None, + alpha: bool = False, + annots: bool = True, + ) -> 'Pixmap': + """Create pixmap of document page by page number. + + Notes: + Convenience function calling page.get_pixmap. + Args: + pno: (int) page number + matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity). + colorspace: (str,pymupdf.Colorspace) rgb, rgb, gray - case ignored, default csRGB. + clip: (irect-like) restrict rendering to this area. + alpha: (bool) include alpha channel + annots: (bool) also render annotations + """ + if matrix is None: + matrix = Identity + if colorspace is None: + colorspace = csRGB + return doc[pno].get_pixmap( + matrix=matrix, + dpi=dpi, colorspace=colorspace, + clip=clip, + alpha=alpha, + annots=annots + ) + + def get_page_text( + doc: typing.Self, + pno: int, + option: str = "text", + clip: rect_like = None, + flags: OptInt = None, + textpage: 'TextPage' = None, + sort: bool = False, + ) -> typing.Any: + """Extract a document page's text by page number. + + Notes: + Convenience function calling page.get_text(). + Args: + pno: page number + option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. + Returns: + output from page.TextPage(). + """ + return doc[pno].get_text(option, clip=clip, flags=flags, sort=sort) + def get_page_xobjects(self, pno: int) -> list: """Retrieve a list of XObjects used on a page. """ @@ -4446,6 +5179,60 @@ def get_sigflags(self): sigflag = mupdf.pdf_to_int(sigflags) return sigflag + def get_toc( + doc: typing.Self, + simple: bool = True, + ) -> list: + """Create a table of contents. + + Args: + simple: a bool to control output. Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. + """ + def recurse(olItem, liste, lvl): + """Recursively follow the outline item chain and record item information in a list.""" + while olItem and olItem.this.m_internal: + if olItem.title: + title = olItem.title + else: + title = " " + + if not olItem.is_external: + if olItem.uri: + if olItem.page == -1: + resolve = doc.resolve_link(olItem.uri) + page = resolve[0] + 1 + else: + page = olItem.page + 1 + else: + page = -1 + else: + page = -1 + + if not simple: + link = utils.getLinkDict(olItem, doc) + liste.append([lvl, title, page, link]) + else: + liste.append([lvl, title, page]) + + if olItem.down: + liste = recurse(olItem.down, liste, lvl + 1) + olItem = olItem.next + return liste + + # ensure document is open + if doc.is_closed: + raise ValueError("document closed") + doc.init_doc() + olItem = doc.outline + if not olItem: + return [] + lvl = 1 + liste = [] + toc = recurse(olItem, liste, lvl) + if doc.is_pdf and not simple: + doc._extend_toc_items(toc) + return toc + def get_xml_metadata(self): """Get document XML metadata.""" xml = None @@ -4463,6 +5250,31 @@ def get_xml_metadata(self): rc = '' return rc + def has_annots(doc: typing.Self) -> bool: + """Check whether there are annotations on any page.""" + if doc.is_closed: + raise ValueError("document closed") + if not doc.is_pdf: + raise ValueError("is no PDF") + for i in range(doc.page_count): + for item in doc.page_annot_xrefs(i): + # pylint: disable=no-member + if not (item[1] == mupdf.PDF_ANNOT_LINK or item[1] == mupdf.PDF_ANNOT_WIDGET): # pylint: disable=no-member + return True + return False + + def has_links(doc: typing.Self) -> bool: + """Check whether there are links on any page.""" + if doc.is_closed: + raise ValueError("document closed") + if not doc.is_pdf: + raise ValueError("is no PDF") + for i in range(doc.page_count): + for item in doc.page_annot_xrefs(i): + if item[1] == mupdf.PDF_ANNOT_LINK: # pylint: disable=no-member + return True + return False + def init_doc(self): if self.is_encrypted: raise ValueError("cannot initialize - document still encrypted") @@ -4528,6 +5340,36 @@ def insert_file(self, final=final, ) + def insert_page( + doc: typing.Self, + pno: int, + text: typing.Union[str, list, None] = None, + fontsize: float = 11, + width: float = 595, + height: float = 842, + fontname: str = "helv", + fontfile: OptStr = None, + color: OptSeq = (0,), + ) -> int: + """Create a new PDF page and insert some text. + + Notes: + Function combining pymupdf.Document.new_page() and pymupdf.Page.insert_text(). + For parameter details see these methods. + """ + page = doc.new_page(pno=pno, width=width, height=height) + if not bool(text): + return 0 + rc = page.insert_text( + (50, 72), + text, + fontsize=fontsize, + fontname=fontname, + fontfile=fontfile, + color=color, + ) + return rc + def insert_pdf( self, docsrc, @@ -5028,6 +5870,24 @@ def needs_pass(self): ret = mupdf.fz_needs_password( document) return ret + def new_page( + doc: typing.Self, + pno: int = -1, + width: float = 595, + height: float = 842, + ) -> Page: + """Create and return a new page object. + + Args: + pno: (int) insert before this page. Default: after last page. + width: (float) page width in points. Default: 595 (ISO A4 width). + height: (float) page height in points. Default 842 (ISO A4 height). + Returns: + A pymupdf.Page object. + """ + doc._newPage(pno, width=width, height=height) + return doc[pno] + def next_location(self, page_id): """Get (chapter, page) of next page.""" if self.is_closed or self.is_encrypted: @@ -5674,6 +6534,199 @@ def saveIncr(self): """ Save PDF incrementally""" return self.save(self.name, incremental=True, encryption=mupdf.PDF_ENCRYPT_KEEP) + # ------------------------------------------------------------------------------ + # Remove potentially sensitive data from a PDF. Similar to the Adobe + # Acrobat 'sanitize' function + # ------------------------------------------------------------------------------ + def scrub( + doc: typing.Self, + attached_files: bool = True, + clean_pages: bool = True, + embedded_files: bool = True, + hidden_text: bool = True, + javascript: bool = True, + metadata: bool = True, + redactions: bool = True, + redact_images: int = 0, + remove_links: bool = True, + reset_fields: bool = True, + reset_responses: bool = True, + thumbnails: bool = True, + xml_metadata: bool = True, + ) -> None: + + def remove_hidden(cont_lines): + """Remove hidden text from a PDF page. + + Args: + cont_lines: list of lines with /Contents content. Should have status + from after page.cleanContents(). + + Returns: + List of /Contents lines from which hidden text has been removed. + + Notes: + The input must have been created after the page's /Contents object(s) + have been cleaned with page.cleanContents(). This ensures a standard + formatting: one command per line, single spaces between operators. + This allows for drastic simplification of this code. + """ + out_lines = [] # will return this + in_text = False # indicate if within BT/ET object + suppress = False # indicate text suppression active + make_return = False + for line in cont_lines: + if line == b"BT": # start of text object + in_text = True # switch on + out_lines.append(line) # output it + continue + if line == b"ET": # end of text object + in_text = False # switch off + out_lines.append(line) # output it + continue + if line == b"3 Tr": # text suppression operator + suppress = True # switch on + make_return = True + continue + if line[-2:] == b"Tr" and line[0] != b"3": + suppress = False # text rendering changed + out_lines.append(line) + continue + if line == b"Q": # unstack command also switches off + suppress = False + out_lines.append(line) + continue + if suppress and in_text: # suppress hidden lines + continue + out_lines.append(line) + if make_return: + return out_lines + else: + return None + + if not doc.is_pdf: # only works for PDF + raise ValueError("is no PDF") + if doc.is_encrypted or doc.is_closed: + raise ValueError("closed or encrypted doc") + + if not clean_pages: + hidden_text = False + redactions = False + + if metadata: + doc.set_metadata({}) # remove standard metadata + + for page in doc: + if reset_fields: + # reset form fields (widgets) + for widget in page.widgets(): + widget.reset() + + if remove_links: + links = page.get_links() # list of all links on page + for link in links: # remove all links + page.delete_link(link) + + found_redacts = False + for annot in page.annots(): + if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files: + annot.update_file(buffer_=b" ") # set file content to empty + if reset_responses: + annot.delete_responses() + if annot.type[0] == mupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member + found_redacts = True + + if redactions and found_redacts: + page.apply_redactions(images=redact_images) + + if not (clean_pages or hidden_text): + continue # done with the page + + page.clean_contents() + if not page.get_contents(): + continue + if hidden_text: + xref = page.get_contents()[0] # only one b/o cleaning! + cont = doc.xref_stream(xref) + cont_lines = remove_hidden(cont.splitlines()) # remove hidden text + if cont_lines: # something was actually removed + cont = b"\n".join(cont_lines) + doc.update_stream(xref, cont) # rewrite the page /Contents + + if thumbnails: # remove page thumbnails? + if doc.xref_get_key(page.xref, "Thumb")[0] != "null": + doc.xref_set_key(page.xref, "Thumb", "null") + + # pages are scrubbed, now perform document-wide scrubbing + # remove embedded files + if embedded_files: + for name in doc.embfile_names(): + doc.embfile_del(name) + + if xml_metadata: + doc.del_xml_metadata() + if not (xml_metadata or javascript): + xref_limit = 0 + else: + xref_limit = doc.xref_length() + for xref in range(1, xref_limit): + if not doc.xref_object(xref): + msg = "bad xref %i - clean PDF before scrubbing" % xref + raise ValueError(msg) + if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript": + # a /JavaScript action object + obj = "<>" # replace with a null JavaScript + doc.update_object(xref, obj) # update this object + continue # no further handling + + if not xml_metadata: + continue + + if doc.xref_get_key(xref, "Type")[1] == "/Metadata": + # delete any metadata object directly + doc.update_object(xref, "<<>>") + doc.update_stream(xref, b"deleted", new=True) + continue + + if doc.xref_get_key(xref, "Metadata")[0] != "null": + doc.xref_set_key(xref, "Metadata", "null") + + def search_page_for( + doc: typing.Self, + pno: int, + text: str, + quads: bool = False, + clip: rect_like = None, + flags: int = None, + textpage: 'TextPage' = None, + ) -> list: + """Search for a string on a page. + + Args: + pno: page number + text: string to be searched for + clip: restrict search to this rectangle + quads: (bool) return quads instead of rectangles + flags: bit switches, default: join hyphened words + textpage: reuse a prepared textpage + Returns: + a list of rectangles or quads, each containing an occurrence. + """ + if flags is None: + flags = (0 + | TEXT_DEHYPHENATE + | TEXT_PRESERVE_LIGATURES + | TEXT_PRESERVE_WHITESPACE + | TEXT_MEDIABOX_CLIP + ) + return doc[pno].search_for( + text, + quads=quads, + clip=clip, + flags=flags, + textpage=textpage, + ) + def select(self, pyliste): """Build sub-pdf with page numbers in the list.""" if self.is_closed or self.is_encrypted: @@ -5818,6 +6871,162 @@ def set_markinfo(self, markinfo: dict) -> bool: self.xref_set_key(xref, "MarkInfo", pdfdict) return True + def set_metadata(doc: typing.Self, m: dict = None) -> None: + """Update the PDF /Info object. + + Args: + m: a dictionary like doc.metadata. + """ + if not doc.is_pdf: + raise ValueError("is no PDF") + if doc.is_closed or doc.is_encrypted: + raise ValueError("document closed or encrypted") + if m is None: + m = {} + elif type(m) is not dict: + raise ValueError("bad metadata") + keymap = { + "author": "Author", + "producer": "Producer", + "creator": "Creator", + "title": "Title", + "format": None, + "encryption": None, + "creationDate": "CreationDate", + "modDate": "ModDate", + "subject": "Subject", + "keywords": "Keywords", + "trapped": "Trapped", + } + valid_keys = set(keymap.keys()) + diff_set = set(m.keys()).difference(valid_keys) + if diff_set != set(): + msg = "bad dict key(s): %s" % diff_set + raise ValueError(msg) + + t, temp = doc.xref_get_key(-1, "Info") + if t != "xref": + info_xref = 0 + else: + info_xref = int(temp.replace("0 R", "")) + + if m == {} and info_xref == 0: # nothing to do + return + + if info_xref == 0: # no prev metadata: get new xref + info_xref = doc.get_new_xref() + doc.update_object(info_xref, "<<>>") # fill it with empty object + doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref) + elif m == {}: # remove existing metadata + doc.xref_set_key(-1, "Info", "null") + doc.init_doc() + return + + for key, val in [(k, v) for k, v in m.items() if keymap[k] is not None]: + pdf_key = keymap[key] + if not bool(val) or val in ("none", "null"): + val = "null" + else: + val = get_pdf_str(val) + doc.xref_set_key(info_xref, pdf_key, val) + doc.init_doc() + return + + def set_oc(doc: typing.Self, xref: int, oc: int) -> None: + """Attach optional content object to image or form xobject. + + Args: + xref: (int) xref number of an image or form xobject + oc: (int) xref number of an OCG or OCMD + """ + if doc.is_closed or doc.is_encrypted: + raise ValueError("document close or encrypted") + t, name = doc.xref_get_key(xref, "Subtype") + if t != "name" or name not in ("/Image", "/Form"): + raise ValueError("bad object type at xref %i" % xref) + if oc > 0: + t, name = doc.xref_get_key(oc, "Type") + if t != "name" or name not in ("/OCG", "/OCMD"): + raise ValueError("bad object type at xref %i" % oc) + if oc == 0 and "OC" in doc.xref_get_keys(xref): + doc.xref_set_key(xref, "OC", "null") + return None + doc.xref_set_key(xref, "OC", "%i 0 R" % oc) + return None + + def set_ocmd( + doc: typing.Self, + xref: int = 0, + ocgs: typing.Union[list, None] = None, + policy: OptStr = None, + ve: typing.Union[list, None] = None, + ) -> int: + """Create or update an OCMD object in a PDF document. + + Args: + xref: (int) 0 for creating a new object, otherwise update existing one. + ocgs: (list) OCG xref numbers, which shall be subject to 'policy'. + policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing). + ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'. + + Returns: + Xref of the created or updated OCMD. + """ + + all_ocgs = set(doc.get_ocgs().keys()) + + def ve_maker(ve): + if type(ve) not in (list, tuple) or len(ve) < 2: + raise ValueError("bad 've' format: %s" % ve) + if ve[0].lower() not in ("and", "or", "not"): + raise ValueError("bad operand: %s" % ve[0]) + if ve[0].lower() == "not" and len(ve) != 2: + raise ValueError("bad 've' format: %s" % ve) + item = "[/%s" % ve[0].title() + for x in ve[1:]: + if type(x) is int: + if x not in all_ocgs: + raise ValueError("bad OCG %i" % x) + item += " %i 0 R" % x + else: + item += " %s" % ve_maker(x) + item += "]" + return item + + text = "<>". + """ + s = "%i<<" % label["startpage"] + if label.get("prefix", "") != "": + s += "/P(%s)" % label["prefix"] + if label.get("style", "") != "": + s += "/S/%s" % label["style"] + if label.get("firstpagenum", 1) > 1: + s += "/St %i" % label["firstpagenum"] + s += ">>" + return s + + def create_nums(labels): + """Return concatenated string of all labels rules. + + Args: + labels: (list) dictionaries as created by function 'rule_dict'. + Returns: + PDF compatible string for page label definitions, ready to be + enclosed in PDF array 'Nums[...]'. + """ + labels.sort(key=lambda x: x["startpage"]) + s = "".join([create_label_str(label) for label in labels]) + return s + + doc._set_page_labels(create_nums(labels)) + + def set_toc( + doc: typing.Self, + toc: list, + collapse: int = 1, + ) -> int: + """Create new outline tree (table of contents, TOC). + + Args: + toc: (list, tuple) each entry must contain level, title, page and + optionally top margin on the page. None or '()' remove the TOC. + collapse: (int) collapses entries beyond this level. Zero or None + shows all entries unfolded. + Returns: + the number of inserted items, or the number of removed items respectively. + """ + if doc.is_closed or doc.is_encrypted: + raise ValueError("document closed or encrypted") + if not doc.is_pdf: + raise ValueError("is no PDF") + if not toc: # remove all entries + return len(doc._delToC()) + + # validity checks -------------------------------------------------------- + if type(toc) not in (list, tuple): + raise ValueError("'toc' must be list or tuple") + toclen = len(toc) + page_count = doc.page_count + t0 = toc[0] + if type(t0) not in (list, tuple): + raise ValueError("items must be sequences of 3 or 4 items") + if t0[0] != 1: + raise ValueError("hierarchy level of item 0 must be 1") + for i in list(range(toclen - 1)): + t1 = toc[i] + t2 = toc[i + 1] + if not -1 <= t1[2] <= page_count: + raise ValueError("row %i: page number out of range" % i) + if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4): + raise ValueError("bad row %i" % (i + 1)) + if (type(t2[0]) is not int) or t2[0] < 1: + raise ValueError("bad hierarchy level in row %i" % (i + 1)) + if t2[0] > t1[0] + 1: + raise ValueError("bad hierarchy level in row %i" % (i + 1)) + # no formal errors in toc -------------------------------------------------- + + # -------------------------------------------------------------------------- + # make a list of xref numbers, which we can use for our TOC entries + # -------------------------------------------------------------------------- + old_xrefs = doc._delToC() # del old outlines, get their xref numbers + + # prepare table of xrefs for new bookmarks + old_xrefs = [] + xref = [0] + old_xrefs + xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number + if toclen > len(old_xrefs): # too few old xrefs? + for i in range((toclen - len(old_xrefs))): + xref.append(doc.get_new_xref()) # acquire new ones + + lvltab = {0: 0} # to store last entry per hierarchy level + + # ------------------------------------------------------------------------------ + # contains new outline objects as strings - first one is the outline root + # ------------------------------------------------------------------------------ + olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}] + # ------------------------------------------------------------------------------ + # build olitems as a list of PDF-like connected dictionaries + # ------------------------------------------------------------------------------ + for i in range(toclen): + o = toc[i] + lvl = o[0] # level + title = get_pdf_str(o[1]) # title + pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number + page_xref = doc.page_xref(pno) + page_height = doc.page_cropbox(pno).height + top = Point(72, page_height - 36) + dest_dict = {"to": top, "kind": LINK_GOTO} # fall back target + if o[2] < 0: + dest_dict["kind"] = LINK_NONE + if len(o) > 3: # some target is specified + if type(o[3]) in (int, float): # convert a number to a point + dest_dict["to"] = Point(72, page_height - o[3]) + else: # if something else, make sure we have a dict + # We make a copy of o[3] to avoid modifying our caller's data. + dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict + if "to" not in dest_dict: # target point not in dict? + dest_dict["to"] = top # put default in + else: # transform target to PDF coordinates + page = doc[pno] + point = Point(dest_dict["to"]) + point.y = page.cropbox.height - point.y + point = point * page.rotation_matrix + dest_dict["to"] = (point.x, point.y) + d = {} + d["first"] = -1 + d["count"] = 0 + d["last"] = -1 + d["prev"] = -1 + d["next"] = -1 + d["dest"] = utils.getDestStr(page_xref, dest_dict) + d["top"] = dest_dict["to"] + d["title"] = title + d["parent"] = lvltab[lvl - 1] + d["xref"] = xref[i + 1] + d["color"] = dest_dict.get("color") + d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0) + lvltab[lvl] = i + 1 + parent = olitems[lvltab[lvl - 1]] # the parent entry + + if ( + dest_dict.get("collapse") or collapse and lvl > collapse + ): # suppress expansion + parent["count"] -= 1 # make /Count negative + else: + parent["count"] += 1 # positive /Count + + if parent["first"] == -1: + parent["first"] = i + 1 + parent["last"] = i + 1 + else: + d["prev"] = parent["last"] + prev = olitems[parent["last"]] + prev["next"] = i + 1 + parent["last"] = i + 1 + olitems.append(d) + + # ------------------------------------------------------------------------------ + # now create each outline item as a string and insert it in the PDF + # ------------------------------------------------------------------------------ + for i, ol in enumerate(olitems): + txt = "<<" + if ol["count"] != 0: + txt += "/Count %i" % ol["count"] + try: + txt += ol["dest"] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["first"] > -1: + txt += "/First %i 0 R" % xref[ol["first"]] + except Exception: + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["last"] > -1: + txt += "/Last %i 0 R" % xref[ol["last"]] + except Exception: + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["next"] > -1: + txt += "/Next %i 0 R" % xref[ol["next"]] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["parent"] > -1: + txt += "/Parent %i 0 R" % xref[ol["parent"]] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["prev"] > -1: + txt += "/Prev %i 0 R" % xref[ol["prev"]] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + txt += "/Title" + ol["title"] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + + if ol.get("color") and len(ol["color"]) == 3: + txt += f"/C[ {_format_g(tuple(ol['color']))}]" + if ol.get("flags", 0) > 0: + txt += "/F %i" % ol["flags"] + + if i == 0: # special: this is the outline root + txt += "/Type/Outlines" # so add the /Type entry + txt += ">>" + doc.update_object(xref[i], txt) # insert the PDF object + + doc.init_doc() + return toclen + + def set_toc_item( + doc: typing.Self, + idx: int, + dest_dict: OptDict = None, + kind: OptInt = None, + pno: OptInt = None, + uri: OptStr = None, + title: OptStr = None, + to: point_like = None, + filename: OptStr = None, + zoom: float = 0, + ) -> None: + """Update TOC item by index. + + It allows changing the item's title and link destination. + + Args: + idx: + (int) desired index of the TOC list, as created by get_toc. + dest_dict: + (dict) destination dictionary as created by get_toc(False). + Outrules all other parameters. If None, the remaining parameters + are used to make a dest dictionary. + kind: + (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only + the title will be updated. If pymupdf.LINK_NONE, the TOC item will + be deleted. + pno: + (int) page number (1-based like in get_toc). Required if + pymupdf.LINK_GOTO. + uri: + (str) the URL, required if pymupdf.LINK_URI. + title: + (str) the new title. No change if None. + to: + (point-like) destination on the target page. If omitted, (72, 36) + will be used as target coordinates. + filename: + (str) destination filename, required for pymupdf.LINK_GOTOR and + pymupdf.LINK_LAUNCH. + name: + (str) a destination name for pymupdf.LINK_NAMED. + zoom: + (float) a zoom factor for the target location (pymupdf.LINK_GOTO). + """ + xref = doc.get_outline_xrefs()[idx] + page_xref = 0 + if type(dest_dict) is dict: + if dest_dict["kind"] == LINK_GOTO: + pno = dest_dict["page"] + page_xref = doc.page_xref(pno) + page_height = doc.page_cropbox(pno).height + to = dest_dict.get('to', Point(72, 36)) + to.y = page_height - to.y + dest_dict["to"] = to + action = utils.getDestStr(page_xref, dest_dict) + if not action.startswith("/A"): + raise ValueError("bad bookmark dest") + color = dest_dict.get("color") + if color: + color = list(map(float, color)) + if len(color) != 3 or min(color) < 0 or max(color) > 1: + raise ValueError("bad color value") + bold = dest_dict.get("bold", False) + italic = dest_dict.get("italic", False) + flags = italic + 2 * bold + collapse = dest_dict.get("collapse") + return doc._update_toc_item( + xref, + action=action[2:], + title=title, + color=color, + flags=flags, + collapse=collapse, + ) + + if kind == LINK_NONE: # delete bookmark item + return doc.del_toc_item(idx) + if kind is None and title is None: # treat as no-op + return None + if kind is None: # only update title text + return doc._update_toc_item(xref, action=None, title=title) + + if kind == LINK_GOTO: + if pno is None or pno not in range(1, doc.page_count + 1): + raise ValueError("bad page number") + page_xref = doc.page_xref(pno - 1) + page_height = doc.page_cropbox(pno - 1).height + if to is None: + to = Point(72, page_height - 36) + else: + to = Point(to) + to.y = page_height - to.y + + ddict = { + "kind": kind, + "to": to, + "uri": uri, + "page": pno, + "file": filename, + "zoom": zoom, + } + action = utils.getDestStr(page_xref, ddict) + if action == "" or not action.startswith("/A"): + raise ValueError("bad bookmark dest") + + return doc._update_toc_item(xref, action=action[2:], title=title) + def set_xml_metadata(self, metadata): """Store XML document level metadata.""" if self.is_closed or self.is_encrypted: @@ -5868,6 +7420,318 @@ def set_xml_metadata(self, metadata): mupdf.pdf_dict_put( xml, PDF_NAME('Subtype'), PDF_NAME('XML')) mupdf.pdf_dict_put( root, PDF_NAME('Metadata'), xml) + def subset_fonts(doc: typing.Self, verbose: bool = False, fallback: bool = False) -> OptInt: + """Build font subsets in a PDF. + + Eligible fonts are potentially replaced by smaller versions. Page text is + NOT rewritten and thus should retain properties like being hidden or + controlled by optional content. + + This method by default uses MuPDF's own internal feature to create subset + fonts. As this is a new function, errors may still occur. In this case, + please fall back to using the previous version by using "fallback=True". + Fallback mode requires the external package 'fontTools'. + + Args: + fallback: use the older deprecated implementation. + verbose: only used by fallback mode. + + Returns: + The new MuPDF-based code returns None. The deprecated fallback + mode returns 0 if there are no fonts to subset. Otherwise, it + returns the decrease in fontsize (the difference in fontsize), + measured in bytes. + """ + # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs)) + # An embedded font is uniquely defined by its fontbuffer only. It may have + # multiple names and xrefs. + # Once the sets of used unicodes and glyphs are known, we compute a + # smaller version of the buffer user package fontTools. + + if not fallback: # by default use MuPDF function + pdf = mupdf.pdf_document_from_fz_document(doc) + mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) + return + + font_buffers = {} + + def get_old_widths(xref): + """Retrieve old font '/W' and '/DW' values.""" + df = doc.xref_get_key(xref, "DescendantFonts") + if df[0] != "array": # only handle xref specifications + return None, None + df_xref = int(df[1][1:-1].replace("0 R", "")) + widths = doc.xref_get_key(df_xref, "W") + if widths[0] != "array": # no widths key found + widths = None + else: + widths = widths[1] + dwidths = doc.xref_get_key(df_xref, "DW") + if dwidths[0] != "int": + dwidths = None + else: + dwidths = dwidths[1] + return widths, dwidths + + def set_old_widths(xref, widths, dwidths): + """Restore the old '/W' and '/DW' in subsetted font. + + If either parameter is None or evaluates to False, the corresponding + dictionary key will be set to null. + """ + df = doc.xref_get_key(xref, "DescendantFonts") + if df[0] != "array": # only handle xref specs + return None + df_xref = int(df[1][1:-1].replace("0 R", "")) + if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[ + 0 + ] != "null": + doc.xref_set_key(df_xref, "W", "null") + else: + doc.xref_set_key(df_xref, "W", widths) + if (type(dwidths) is not str or not dwidths) and doc.xref_get_key( + df_xref, "DW" + )[0] != "null": + doc.xref_set_key(df_xref, "DW", "null") + else: + doc.xref_set_key(df_xref, "DW", dwidths) + return None + + def set_subset_fontname(new_xref): + """Generate a name prefix to tag a font as subset. + + We use a random generator to select 6 upper case ASCII characters. + The prefixed name must be put in the font xref as the "/BaseFont" value + and in the FontDescriptor object as the '/FontName' value. + """ + # The following generates a prefix like 'ABCDEF+' + import random + import string + prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+" + font_str = doc.xref_object(new_xref, compressed=True) + font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix) + df = doc.xref_get_key(new_xref, "DescendantFonts") + if df[0] == "array": + df_xref = int(df[1][1:-1].replace("0 R", "")) + fd = doc.xref_get_key(df_xref, "FontDescriptor") + if fd[0] == "xref": + fd_xref = int(fd[1].replace("0 R", "")) + fd_str = doc.xref_object(fd_xref, compressed=True) + fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix) + doc.update_object(fd_xref, fd_str) + doc.update_object(new_xref, font_str) + + def build_subset(buffer, unc_set, gid_set): + """Build font subset using fontTools. + + Args: + buffer: (bytes) the font given as a binary buffer. + unc_set: (set) required glyph ids. + Returns: + Either None if subsetting is unsuccessful or the subset font buffer. + """ + try: + import fontTools.subset as fts + except ImportError: + if g_exceptions_verbose: exception_info() + message("This method requires fontTools to be installed.") + raise + import tempfile + with tempfile.TemporaryDirectory() as tmp_dir: + oldfont_path = f"{tmp_dir}/oldfont.ttf" + newfont_path = f"{tmp_dir}/newfont.ttf" + uncfile_path = f"{tmp_dir}/uncfile.txt" + args = [ + oldfont_path, + "--retain-gids", + f"--output-file={newfont_path}", + "--layout-features=*", + "--passthrough-tables", + "--ignore-missing-glyphs", + "--ignore-missing-unicodes", + "--symbol-cmap", + ] + + # store glyph ids or unicodes as file + with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file: + if 0xFFFD in unc_set: # error unicode exists -> use glyphs + args.append(f"--gids-file={uncfile_path}") + gid_set.add(189) + unc_list = list(gid_set) + for unc in unc_list: + unc_file.write("%i\n" % unc) + else: + args.append(f"--unicodes-file={uncfile_path}") + unc_set.add(255) + unc_list = list(unc_set) + for unc in unc_list: + unc_file.write("%04x\n" % unc) + + # store fontbuffer as a file + with open(oldfont_path, "wb") as fontfile: + fontfile.write(buffer) + try: + os.remove(newfont_path) # remove old file + except Exception: + pass + try: # invoke fontTools subsetter + fts.main(args) + font = Font(fontfile=newfont_path) + new_buffer = font.buffer # subset font binary + if font.glyph_count == 0: # intercept empty font + new_buffer = None + except Exception: + exception_info() + new_buffer = None + return new_buffer + + def repl_fontnames(doc): + """Populate 'font_buffers'. + + For each font candidate, store its xref and the list of names + by which PDF text may refer to it (there may be multiple). + """ + + def norm_name(name): + """Recreate font name that contains PDF hex codes. + + E.g. #20 -> space, chr(32) + """ + while "#" in name: + p = name.find("#") + c = int(name[p + 1 : p + 3], 16) + name = name.replace(name[p : p + 3], chr(c)) + return name + + def get_fontnames(doc, item): + """Return a list of fontnames for an item of page.get_fonts(). + + There may be multiple names e.g. for Type0 fonts. + """ + fontname = item[3] + names = [fontname] + fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:] + fontname = norm_name(fontname) + if fontname not in names: + names.append(fontname) + descendents = doc.xref_get_key(item[0], "DescendantFonts") + if descendents[0] != "array": + return names + descendents = descendents[1][1:-1] + if descendents.endswith(" 0 R"): + xref = int(descendents[:-4]) + descendents = doc.xref_object(xref, compressed=True) + p1 = descendents.find("/BaseFont") + if p1 >= 0: + p2 = descendents.find("/", p1 + 1) + p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1)) + fontname = descendents[p2 + 1 : p1] + fontname = norm_name(fontname) + if fontname not in names: + names.append(fontname) + return names + + for i in range(doc.page_count): + for f in doc.get_page_fonts(i, full=True): + font_xref = f[0] # font xref + font_ext = f[1] # font file extension + basename = f[3] # font basename + + if font_ext not in ( # skip if not supported by fontTools + "otf", + "ttf", + "woff", + "woff2", + ): + continue + # skip fonts which already are subsets + if len(basename) > 6 and basename[6] == "+": + continue + + extr = doc.extract_font(font_xref) + fontbuffer = extr[-1] + names = get_fontnames(doc, f) + name_set, xref_set, subsets = font_buffers.get( + fontbuffer, (set(), set(), (set(), set())) + ) + xref_set.add(font_xref) + for name in names: + name_set.add(name) + font = Font(fontbuffer=fontbuffer) + name_set.add(font.name) + del font + font_buffers[fontbuffer] = (name_set, xref_set, subsets) + + def find_buffer_by_name(name): + for buffer, (name_set, _, _) in font_buffers.items(): + if name in name_set: + return buffer + return None + + # ----------------- + # main function + # ----------------- + repl_fontnames(doc) # populate font information + if not font_buffers: # nothing found to do + if verbose: + message(f'No fonts to subset.') + return 0 + + old_fontsize = 0 + new_fontsize = 0 + for fontbuffer in font_buffers.keys(): + old_fontsize += len(fontbuffer) + + # Scan page text for usage of subsettable fonts + for page in doc: + # go through the text and extend set of used glyphs by font + # we use a modified MuPDF trace device, which delivers us glyph ids. + for span in page.get_texttrace(): + if type(span) is not dict: # skip useless information + continue + fontname = span["font"][:33] # fontname for the span + buffer = find_buffer_by_name(fontname) + if buffer is None: + continue + name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer] + for c in span["chars"]: + set_ucs.add(c[0]) # unicode + set_gid.add(c[1]) # glyph id + font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid)) + + # build the font subsets + for old_buffer, (name_set, xref_set, subsets) in font_buffers.items(): + new_buffer = build_subset(old_buffer, subsets[0], subsets[1]) + fontname = list(name_set)[0] + if new_buffer is None or len(new_buffer) >= len(old_buffer): + # subset was not created or did not get smaller + if verbose: + message(f'Cannot subset {fontname!r}.') + continue + if verbose: + message(f"Built subset of font {fontname!r}.") + val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF + new_xref = val[0] # get its xref + set_subset_fontname(new_xref) # tag fontname as subset font + font_str = doc.xref_object( # get its object definition + new_xref, + compressed=True, + ) + # walk through the original font xrefs and replace each by the subset def + for font_xref in xref_set: + # we need the original '/W' and '/DW' width values + width_table, def_width = get_old_widths(font_xref) + # ... and replace original font definition at xref with it + doc.update_object(font_xref, font_str) + # now copy over old '/W' and '/DW' values + if width_table or def_width: + set_old_widths(font_xref, width_table, def_width) + # 'new_xref' remains unused in the PDF and must be removed + # by garbage collection. + new_fontsize += len(new_buffer) + + return old_fontsize - new_fontsize + def switch_layer(self, config, as_default=0): """Activate an OC layer.""" pdf = _as_pdf_document(self) @@ -5978,6 +7842,9 @@ def write( compression_effort=compression_effort, ) return bio.getvalue() + + def tobytes(self, *args, **kwargs): + return self.write(*args, **kwargs) @property def xref(self): @@ -5985,6 +7852,41 @@ def xref(self): CheckParent(self) return self.parent.page_xref(self.number) + def xref_copy(doc: typing.Self, source: int, target: int, *, keep: list = None) -> None: + """Copy a PDF dictionary object to another one given their xref numbers. + + Args: + doc: PDF document object + source: source xref number + target: target xref number, the xref must already exist + keep: an optional list of 1st level keys in target that should not be + removed before copying. + Notes: + This works similar to the copy() method of dictionaries in Python. The + source may be a stream object. + """ + if doc.xref_is_stream(source): + # read new xref stream, maintaining compression + stream = doc.xref_stream_raw(source) + doc.update_stream( + target, + stream, + compress=False, # keeps source compression + new=True, # in case target is no stream + ) + + # empty the target completely, observe exceptions + if keep is None: + keep = [] + for key in doc.xref_get_keys(target): + if key in keep: + continue + doc.xref_set_key(target, key, "null") + # copy over all source dict items + for key in doc.xref_get_keys(source): + item = doc.xref_get_key(source, key) + doc.xref_set_key(target, key, item[1]) + def xref_get_key(self, xref, key): """Get PDF dict key value of object at 'xref'.""" pdf = _as_pdf_document(self) @@ -6201,7 +8103,6 @@ def xref_xml_metadata(self): __slots__ = ('this', 'page_count2', 'this_is_pdf', '__dict__') outline = property(lambda self: self._outline) - tobytes = write is_stream = xref_is_stream open = Document @@ -20944,34 +22845,6 @@ def colors_wx_list(): recover_quad = utils.recover_quad recover_span_quad = utils.recover_span_quad - -Document._do_links = utils.do_links -Document._do_widgets = utils.do_widgets -Document.del_toc_item = utils.del_toc_item -Document.get_char_widths = utils.get_char_widths -Document.get_oc = utils.get_oc -Document.get_ocmd = utils.get_ocmd -Document.get_page_labels = utils.get_page_labels -Document.get_page_numbers = utils.get_page_numbers -Document.get_page_pixmap = utils.get_page_pixmap -Document.get_page_text = utils.get_page_text -Document.get_toc = utils.get_toc -Document.has_annots = utils.has_annots -Document.has_links = utils.has_links -Document.insert_page = utils.insert_page -Document.new_page = utils.new_page -Document.scrub = utils.scrub -Document.search_page_for = utils.search_page_for -Document.set_metadata = utils.set_metadata -Document.set_oc = utils.set_oc -Document.set_ocmd = utils.set_ocmd -Document.set_page_labels = utils.set_page_labels -Document.set_toc = utils.set_toc -Document.set_toc_item = utils.set_toc_item -Document.subset_fonts = utils.subset_fonts -Document.tobytes = Document.write -Document.xref_copy = utils.xref_copy - IRect.get_area = utils.get_area Page.apply_redactions = utils.apply_redactions diff --git a/src/utils.py b/src/utils.py index e764a5cd1..85b0bd02c 100644 --- a/src/utils.py +++ b/src/utils.py @@ -438,41 +438,6 @@ def search_for( return rlist -def search_page_for( - doc: pymupdf.Document, - pno: int, - text: str, - quads: bool = False, - clip: rect_like = None, - flags: int = pymupdf.TEXT_DEHYPHENATE - | pymupdf.TEXT_PRESERVE_LIGATURES - | pymupdf.TEXT_PRESERVE_WHITESPACE - | pymupdf.TEXT_MEDIABOX_CLIP - , - textpage: pymupdf.TextPage = None, -) -> list: - """Search for a string on a page. - - Args: - pno: page number - text: string to be searched for - clip: restrict search to this rectangle - quads: (bool) return quads instead of rectangles - flags: bit switches, default: join hyphened words - textpage: reuse a prepared textpage - Returns: - a list of rectangles or quads, each containing an occurrence. - """ - - return doc[pno].search_for( - text, - quads=quads, - clip=clip, - flags=flags, - textpage=textpage, - ) - - def get_text_blocks( page: pymupdf.Page, clip: rect_like = None, @@ -1005,28 +970,6 @@ def get_text( del tp return t - -def get_page_text( - doc: pymupdf.Document, - pno: int, - option: str = "text", - clip: rect_like = None, - flags: OptInt = None, - textpage: pymupdf.TextPage = None, - sort: bool = False, -) -> typing.Any: - """Extract a document page's text by page number. - - Notes: - Convenience function calling page.get_text(). - Args: - pno: page number - option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. - Returns: - output from page.TextPage(). - """ - return doc[pno].get_text(option, clip=clip, flags=flags, sort=sort) - def get_pixmap( page: pymupdf.Page, *, @@ -1069,38 +1012,6 @@ def get_pixmap( return pix -def get_page_pixmap( - doc: pymupdf.Document, - pno: int, - *, - matrix: matrix_like = pymupdf.Identity, - dpi=None, - colorspace: pymupdf.Colorspace = pymupdf.csRGB, - clip: rect_like = None, - alpha: bool = False, - annots: bool = True, -) -> pymupdf.Pixmap: - """Create pixmap of document page by page number. - - Notes: - Convenience function calling page.get_pixmap. - Args: - pno: (int) page number - matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity). - colorspace: (str,pymupdf.Colorspace) rgb, rgb, gray - case ignored, default csRGB. - clip: (irect-like) restrict rendering to this area. - alpha: (bool) include alpha channel - annots: (bool) also render annotations - """ - return doc[pno].get_pixmap( - matrix=matrix, - dpi=dpi, colorspace=colorspace, - clip=clip, - alpha=alpha, - annots=annots - ) - - def getLinkDict(ln, document=None) -> dict: if isinstance(ln, pymupdf.Outline): dest = ln.destination(document) @@ -1187,179 +1098,6 @@ def get_links(page: pymupdf.Page) -> list: return links -def get_toc( - doc: pymupdf.Document, - simple: bool = True, -) -> list: - """Create a table of contents. - - Args: - simple: a bool to control output. Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. - """ - def recurse(olItem, liste, lvl): - """Recursively follow the outline item chain and record item information in a list.""" - while olItem and olItem.this.m_internal: - if olItem.title: - title = olItem.title - else: - title = " " - - if not olItem.is_external: - if olItem.uri: - if olItem.page == -1: - resolve = doc.resolve_link(olItem.uri) - page = resolve[0] + 1 - else: - page = olItem.page + 1 - else: - page = -1 - else: - page = -1 - - if not simple: - link = getLinkDict(olItem, doc) - liste.append([lvl, title, page, link]) - else: - liste.append([lvl, title, page]) - - if olItem.down: - liste = recurse(olItem.down, liste, lvl + 1) - olItem = olItem.next - return liste - - # ensure document is open - if doc.is_closed: - raise ValueError("document closed") - doc.init_doc() - olItem = doc.outline - if not olItem: - return [] - lvl = 1 - liste = [] - toc = recurse(olItem, liste, lvl) - if doc.is_pdf and not simple: - doc._extend_toc_items(toc) - return toc - - -def del_toc_item( - doc: pymupdf.Document, - idx: int, -) -> None: - """Delete TOC / bookmark item by index.""" - xref = doc.get_outline_xrefs()[idx] - doc._remove_toc_item(xref) - - -def set_toc_item( - doc: pymupdf.Document, - idx: int, - dest_dict: OptDict = None, - kind: OptInt = None, - pno: OptInt = None, - uri: OptStr = None, - title: OptStr = None, - to: point_like = None, - filename: OptStr = None, - zoom: float = 0, -) -> None: - """Update TOC item by index. - - It allows changing the item's title and link destination. - - Args: - idx: - (int) desired index of the TOC list, as created by get_toc. - dest_dict: - (dict) destination dictionary as created by get_toc(False). - Outrules all other parameters. If None, the remaining parameters - are used to make a dest dictionary. - kind: - (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only - the title will be updated. If pymupdf.LINK_NONE, the TOC item will - be deleted. - pno: - (int) page number (1-based like in get_toc). Required if - pymupdf.LINK_GOTO. - uri: - (str) the URL, required if pymupdf.LINK_URI. - title: - (str) the new title. No change if None. - to: - (point-like) destination on the target page. If omitted, (72, 36) - will be used as target coordinates. - filename: - (str) destination filename, required for pymupdf.LINK_GOTOR and - pymupdf.LINK_LAUNCH. - name: - (str) a destination name for pymupdf.LINK_NAMED. - zoom: - (float) a zoom factor for the target location (pymupdf.LINK_GOTO). - """ - xref = doc.get_outline_xrefs()[idx] - page_xref = 0 - if type(dest_dict) is dict: - if dest_dict["kind"] == pymupdf.LINK_GOTO: - pno = dest_dict["page"] - page_xref = doc.page_xref(pno) - page_height = doc.page_cropbox(pno).height - to = dest_dict.get('to', pymupdf.Point(72, 36)) - to.y = page_height - to.y - dest_dict["to"] = to - action = getDestStr(page_xref, dest_dict) - if not action.startswith("/A"): - raise ValueError("bad bookmark dest") - color = dest_dict.get("color") - if color: - color = list(map(float, color)) - if len(color) != 3 or min(color) < 0 or max(color) > 1: - raise ValueError("bad color value") - bold = dest_dict.get("bold", False) - italic = dest_dict.get("italic", False) - flags = italic + 2 * bold - collapse = dest_dict.get("collapse") - return doc._update_toc_item( - xref, - action=action[2:], - title=title, - color=color, - flags=flags, - collapse=collapse, - ) - - if kind == pymupdf.LINK_NONE: # delete bookmark item - return doc.del_toc_item(idx) - if kind is None and title is None: # treat as no-op - return None - if kind is None: # only update title text - return doc._update_toc_item(xref, action=None, title=title) - - if kind == pymupdf.LINK_GOTO: - if pno is None or pno not in range(1, doc.page_count + 1): - raise ValueError("bad page number") - page_xref = doc.page_xref(pno - 1) - page_height = doc.page_cropbox(pno - 1).height - if to is None: - to = pymupdf.Point(72, page_height - 36) - else: - to = pymupdf.Point(to) - to.y = page_height - to.y - - ddict = { - "kind": kind, - "to": to, - "uri": uri, - "page": pno, - "file": filename, - "zoom": zoom, - } - action = getDestStr(page_xref, ddict) - if action == "" or not action.startswith("/A"): - raise ValueError("bad bookmark dest") - - return doc._update_toc_item(xref, action=action[2:], title=title) - - def get_area(*args) -> float: """Calculate area of rectangle.\nparameter is one of 'px' (default), 'in', 'cm', or 'mm'.""" rect = args[0] @@ -1372,68 +1110,6 @@ def get_area(*args) -> float: return f * rect.width * rect.height -def set_metadata(doc: pymupdf.Document, m: dict = None) -> None: - """Update the PDF /Info object. - - Args: - m: a dictionary like doc.metadata. - """ - if not doc.is_pdf: - raise ValueError("is no PDF") - if doc.is_closed or doc.is_encrypted: - raise ValueError("document closed or encrypted") - if m is None: - m = {} - elif type(m) is not dict: - raise ValueError("bad metadata") - keymap = { - "author": "Author", - "producer": "Producer", - "creator": "Creator", - "title": "Title", - "format": None, - "encryption": None, - "creationDate": "CreationDate", - "modDate": "ModDate", - "subject": "Subject", - "keywords": "Keywords", - "trapped": "Trapped", - } - valid_keys = set(keymap.keys()) - diff_set = set(m.keys()).difference(valid_keys) - if diff_set != set(): - msg = "bad dict key(s): %s" % diff_set - raise ValueError(msg) - - t, temp = doc.xref_get_key(-1, "Info") - if t != "xref": - info_xref = 0 - else: - info_xref = int(temp.replace("0 R", "")) - - if m == {} and info_xref == 0: # nothing to do - return - - if info_xref == 0: # no prev metadata: get new xref - info_xref = doc.get_new_xref() - doc.update_object(info_xref, "<<>>") # fill it with empty object - doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref) - elif m == {}: # remove existing metadata - doc.xref_set_key(-1, "Info", "null") - doc.init_doc() - return - - for key, val in [(k, v) for k, v in m.items() if keymap[k] is not None]: - pdf_key = keymap[key] - if not bool(val) or val in ("none", "null"): - val = "null" - else: - val = pymupdf.get_pdf_str(val) - doc.xref_set_key(info_xref, pdf_key, val) - doc.init_doc() - return - - def getDestStr(xref: int, ddict: dict) -> str: """Calculate the PDF action string. @@ -1492,647 +1168,6 @@ def getDestStr(xref: int, ddict: dict) -> str: return "" -def set_toc( - doc: pymupdf.Document, - toc: list, - collapse: int = 1, -) -> int: - """Create new outline tree (table of contents, TOC). - - Args: - toc: (list, tuple) each entry must contain level, title, page and - optionally top margin on the page. None or '()' remove the TOC. - collapse: (int) collapses entries beyond this level. Zero or None - shows all entries unfolded. - Returns: - the number of inserted items, or the number of removed items respectively. - """ - if doc.is_closed or doc.is_encrypted: - raise ValueError("document closed or encrypted") - if not doc.is_pdf: - raise ValueError("is no PDF") - if not toc: # remove all entries - return len(doc._delToC()) - - # validity checks -------------------------------------------------------- - if type(toc) not in (list, tuple): - raise ValueError("'toc' must be list or tuple") - toclen = len(toc) - page_count = doc.page_count - t0 = toc[0] - if type(t0) not in (list, tuple): - raise ValueError("items must be sequences of 3 or 4 items") - if t0[0] != 1: - raise ValueError("hierarchy level of item 0 must be 1") - for i in list(range(toclen - 1)): - t1 = toc[i] - t2 = toc[i + 1] - if not -1 <= t1[2] <= page_count: - raise ValueError("row %i: page number out of range" % i) - if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4): - raise ValueError("bad row %i" % (i + 1)) - if (type(t2[0]) is not int) or t2[0] < 1: - raise ValueError("bad hierarchy level in row %i" % (i + 1)) - if t2[0] > t1[0] + 1: - raise ValueError("bad hierarchy level in row %i" % (i + 1)) - # no formal errors in toc -------------------------------------------------- - - # -------------------------------------------------------------------------- - # make a list of xref numbers, which we can use for our TOC entries - # -------------------------------------------------------------------------- - old_xrefs = doc._delToC() # del old outlines, get their xref numbers - - # prepare table of xrefs for new bookmarks - old_xrefs = [] - xref = [0] + old_xrefs - xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number - if toclen > len(old_xrefs): # too few old xrefs? - for i in range((toclen - len(old_xrefs))): - xref.append(doc.get_new_xref()) # acquire new ones - - lvltab = {0: 0} # to store last entry per hierarchy level - - # ------------------------------------------------------------------------------ - # contains new outline objects as strings - first one is the outline root - # ------------------------------------------------------------------------------ - olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}] - # ------------------------------------------------------------------------------ - # build olitems as a list of PDF-like connected dictionaries - # ------------------------------------------------------------------------------ - for i in range(toclen): - o = toc[i] - lvl = o[0] # level - title = pymupdf.get_pdf_str(o[1]) # title - pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number - page_xref = doc.page_xref(pno) - page_height = doc.page_cropbox(pno).height - top = pymupdf.Point(72, page_height - 36) - dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO} # fall back target - if o[2] < 0: - dest_dict["kind"] = pymupdf.LINK_NONE - if len(o) > 3: # some target is specified - if type(o[3]) in (int, float): # convert a number to a point - dest_dict["to"] = pymupdf.Point(72, page_height - o[3]) - else: # if something else, make sure we have a dict - # We make a copy of o[3] to avoid modifying our caller's data. - dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict - if "to" not in dest_dict: # target point not in dict? - dest_dict["to"] = top # put default in - else: # transform target to PDF coordinates - page = doc[pno] - point = pymupdf.Point(dest_dict["to"]) - point.y = page.cropbox.height - point.y - point = point * page.rotation_matrix - dest_dict["to"] = (point.x, point.y) - d = {} - d["first"] = -1 - d["count"] = 0 - d["last"] = -1 - d["prev"] = -1 - d["next"] = -1 - d["dest"] = getDestStr(page_xref, dest_dict) - d["top"] = dest_dict["to"] - d["title"] = title - d["parent"] = lvltab[lvl - 1] - d["xref"] = xref[i + 1] - d["color"] = dest_dict.get("color") - d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0) - lvltab[lvl] = i + 1 - parent = olitems[lvltab[lvl - 1]] # the parent entry - - if ( - dest_dict.get("collapse") or collapse and lvl > collapse - ): # suppress expansion - parent["count"] -= 1 # make /Count negative - else: - parent["count"] += 1 # positive /Count - - if parent["first"] == -1: - parent["first"] = i + 1 - parent["last"] = i + 1 - else: - d["prev"] = parent["last"] - prev = olitems[parent["last"]] - prev["next"] = i + 1 - parent["last"] = i + 1 - olitems.append(d) - - # ------------------------------------------------------------------------------ - # now create each outline item as a string and insert it in the PDF - # ------------------------------------------------------------------------------ - for i, ol in enumerate(olitems): - txt = "<<" - if ol["count"] != 0: - txt += "/Count %i" % ol["count"] - try: - txt += ol["dest"] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["first"] > -1: - txt += "/First %i 0 R" % xref[ol["first"]] - except Exception: - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["last"] > -1: - txt += "/Last %i 0 R" % xref[ol["last"]] - except Exception: - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["next"] > -1: - txt += "/Next %i 0 R" % xref[ol["next"]] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["parent"] > -1: - txt += "/Parent %i 0 R" % xref[ol["parent"]] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["prev"] > -1: - txt += "/Prev %i 0 R" % xref[ol["prev"]] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - txt += "/Title" + ol["title"] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - - if ol.get("color") and len(ol["color"]) == 3: - txt += f"/C[ {_format_g(tuple(ol['color']))}]" - if ol.get("flags", 0) > 0: - txt += "/F %i" % ol["flags"] - - if i == 0: # special: this is the outline root - txt += "/Type/Outlines" # so add the /Type entry - txt += ">>" - doc.update_object(xref[i], txt) # insert the PDF object - - doc.init_doc() - return toclen - - -def do_widgets( - tar: pymupdf.Document, - src: pymupdf.Document, - graftmap, - from_page: int = -1, - to_page: int = -1, - start_at: int = -1, - join_duplicates=0, -) -> None: - """Insert widgets of copied page range into target PDF. - - Parameter values **must** equal those of method insert_pdf() which - must have been previously executed. - """ - if not src.is_form_pdf: # nothing to do: source PDF has no fields - return - - def clean_kid_parents(acro_fields): - """ Make sure all kids have correct "Parent" pointers.""" - for i in range(acro_fields.pdf_array_len()): - parent = acro_fields.pdf_array_get(i) - kids = parent.pdf_dict_get(pymupdf.PDF_NAME("Kids")) - for j in range(kids.pdf_array_len()): - kid = kids.pdf_array_get(j) - kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent) - - def join_widgets(pdf, acro_fields, xref1, xref2, name): - """Called for each pair of widgets having the same name. - - Args: - pdf: target MuPDF document - acro_fields: object Root/AcroForm/Fields - xref1, xref2: widget xrefs having same names - name: (str) the name - - Result: - Defined or updated widget parent that points to both widgets. - """ - - def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2): - """Merge widget in xref2 into "Kids" list of widget xref1. - - Args: - xref1, kids1: target widget and its "Kids" array. - xref2, kids2: source wwidget and its "Kids" array (may be empty). - """ - # make indirect objects from widgets - w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0) - w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0) - # find source widget in "Fields" array - idx = acro_fields.pdf_array_find(w2_ind) - acro_fields.pdf_array_delete(idx) - - if not kids2.pdf_is_array(): # source widget has no kids - widget = mupdf.pdf_load_object(pdf, xref2) - - # delete name from widget and insert target as parent - widget.pdf_dict_del(pymupdf.PDF_NAME("T")) - widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind) - - # put in target Kids - kids1.pdf_array_push(w2_ind) - else: # copy source kids to target kids - for i in range(kids2.pdf_array_len()): - kid = kids2.pdf_array_get(i) - kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind) - kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0) - kids1.pdf_array_push(kid_ind) - - def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name): - """Make new "Parent" for two widgets with same name. - - Args: - xref1, w1: first widget - xref2, w2: second widget - name: field name - - Result: - Both widgets have no "Kids". We create a new object with the - name and a "Kids" array containing the widgets. - Original widgets must be removed from AcroForm/Fields. - """ - # make new "Parent" object - new = mupdf.pdf_new_dict(pdf, 5) - new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name) - kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2) - new_obj = mupdf.pdf_add_object(pdf, new) - new_obj_xref = new_obj.pdf_to_num() - new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0) - - # copy over some required source widget properties - ft = w1.pdf_dict_get(pymupdf.PDF_NAME("FT")) - w1.pdf_dict_del(pymupdf.PDF_NAME("FT")) - new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), ft) - - aa = w1.pdf_dict_get(pymupdf.PDF_NAME("AA")) - w1.pdf_dict_del(pymupdf.PDF_NAME("AA")) - new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), aa) - - # remove name field, insert "Parent" field in source widgets - w1.pdf_dict_del(pymupdf.PDF_NAME("T")) - w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind) - w2.pdf_dict_del(pymupdf.PDF_NAME("T")) - w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind) - - # put source widgets in "kids" array - ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0) - ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0) - kids.pdf_array_push(ind1) - kids.pdf_array_push(ind2) - - # remove source widgets from "AcroForm/Fields" - idx = acro_fields.pdf_array_find(ind1) - acro_fields.pdf_array_delete(idx) - idx = acro_fields.pdf_array_find(ind2) - acro_fields.pdf_array_delete(idx) - - acro_fields.pdf_array_push(new_ind) - - w1 = mupdf.pdf_load_object(pdf, xref1) - w2 = mupdf.pdf_load_object(pdf, xref2) - kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids")) - kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids")) - - # check which widget has a suitable "Kids" array - if kids1.pdf_is_array(): - re_target(pdf, acro_fields, xref1, kids1, xref2, kids2) # pylint: disable=arguments-out-of-order - elif kids2.pdf_is_array(): - re_target(pdf, acro_fields, xref2, kids2, xref1, kids1) # pylint: disable=arguments-out-of-order - else: - new_target(pdf, acro_fields, xref1, w1, xref2, w2, name) # pylint: disable=arguments-out-of-order - - def get_kids(parent, kids_list): - """Return xref list of leaf kids for a parent. - - Call with an empty list. - """ - kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids")) - if not kids.pdf_is_array(): - return kids_list - for i in range(kids.pdf_array_len()): - kid = kids.pdf_array_get(i) - if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))): - kids_list = get_kids(kid, kids_list) - else: - kids_list.append(kid.pdf_to_num()) - return kids_list - - def kids_xrefs(widget): - """Get the xref of top "Parent" and the list of leaf widgets.""" - kids_list = [] - parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent")) - parent_xref = parent.pdf_to_num() - if parent_xref == 0: - return parent_xref, kids_list - kids_list = get_kids(parent, kids_list) - return parent_xref, kids_list - - def deduplicate_names(pdf, acro_fields, join_duplicates=False): - """Handle any widget name duplicates caused by the merge.""" - names = {} # key is a widget name, value a list of widgets having it. - - # extract all names and widgets in "AcroForm/Fields" - for i in range(mupdf.pdf_array_len(acro_fields)): - wobject = mupdf.pdf_array_get(acro_fields, i) - xref = wobject.pdf_to_num() - - # extract widget name and collect widget(s) using it - T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T")) - xrefs = names.get(T, []) - xrefs.append(xref) - names[T] = xrefs - - for name, xrefs in names.items(): - if len(xrefs) < 2: - continue - xref0, xref1 = xrefs[:2] # only exactly 2 should occur! - if join_duplicates: # combine fields with equal names - join_widgets(pdf, acro_fields, xref0, xref1, name) - else: # make field names unique - newname = name + f" [{xref1}]" # append this to the name - wobject = mupdf.pdf_load_object(pdf, xref1) - wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname) - - clean_kid_parents(acro_fields) - - def get_acroform(doc): - """Retrieve the AcroForm dictionary form a PDF.""" - pdf = mupdf.pdf_document_from_fz_document(doc) - # AcroForm (= central form field info) - return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm") - - tarpdf = mupdf.pdf_document_from_fz_document(tar) - srcpdf = mupdf.pdf_document_from_fz_document(src) - - if tar.is_form_pdf: - # target is a Form PDF, so use it to include source fields - acro = get_acroform(tar) - # Important arrays in AcroForm - acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields")) - tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO")) - if not tar_co.pdf_is_array(): - tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5) - else: - # target is no Form PDF, so copy over source AcroForm - acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy - - # Clear "Fields" and "CO" arrays: will be populated by page fields. - # This is required to avoid copying unneeded objects. - acro.pdf_dict_del(pymupdf.PDF_NAME("Fields")) - acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5) - acro.pdf_dict_del(pymupdf.PDF_NAME("CO")) - acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5) - - # Enrich AcroForm for copying to target - acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro) - - # Insert AcroForm into target PDF - acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft) - acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields")) - tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO")) - - # get its xref and insert it into target catalog - tar_xref = acro_tar.pdf_to_num() - acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) - root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root")) - root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind) - - if from_page <= to_page: - src_range = range(from_page, to_page + 1) - else: - src_range = range(from_page, to_page - 1, -1) - - parents = {} # information about widget parents - - # remove "P" owning page reference from all widgets of all source pages - for i in src_range: - src_page = src[i] - for xref in [ - xref - for xref, wtype, _ in src_page.annot_xrefs() - if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member - ]: - w_obj = mupdf.pdf_load_object(srcpdf, xref) - w_obj.pdf_dict_del(pymupdf.PDF_NAME("P")) - - # get the widget's parent structure - parent_xref, old_kids = kids_xrefs(w_obj) - if parent_xref: - parents[parent_xref] = { - "new_xref": 0, - "old_kids": old_kids, - "new_kids": [], - } - # Copy over Parent widgets first - they are not page-dependent - for xref in parents.keys(): # pylint: disable=consider-using-dict-items - parent = mupdf.pdf_load_object(srcpdf, xref) - parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent) - parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft) - kids_xrefs_new = get_kids(parent_tar, []) - parent_xref_new = parent_tar.pdf_to_num() - parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0) - acro_fields.pdf_array_push(parent_ind) - parents[xref]["new_xref"] = parent_xref_new - parents[xref]["new_kids"] = kids_xrefs_new - - for i in range(len(src_range)): - # read first copied over page in target - tar_page = tar[start_at + i] - - # read the original page in the source PDF - src_page = src[src_range[i]] - - # now walk through source page widgets and copy over - w_xrefs = [ # widget xrefs of the source page - xref - for xref, wtype, _ in src_page.annot_xrefs() - if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member - ] - if not w_xrefs: # no widgets on this source page - continue - - # convert to formal PDF page - tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page) - - # extract annotations array - tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots")) - if not mupdf.pdf_is_array(tar_annots): - tar_annots = mupdf.pdf_dict_put_array( - tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5 - ) - - for xref in w_xrefs: - w_obj = mupdf.pdf_load_object(srcpdf, xref) - - # check if field takes part in inter-field validations - is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C")) - - # check if parent of widget already in target - parent_xref = mupdf.pdf_to_num( - w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent")) - ) - if parent_xref == 0: # parent not in target yet - try: - w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj) - except Exception as e: - pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}") - continue - w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft) - tar_xref = w_obj_tar.pdf_to_num() - w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) - mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) - mupdf.pdf_array_push(acro_fields, w_obj_tar_ind) - else: - parent = parents[parent_xref] - idx = parent["old_kids"].index(xref) # search for xref in parent - tar_xref = parent["new_kids"][idx] - w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) - mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) - - # Into "AcroForm/CO" if a computation field. - if is_aac: - mupdf.pdf_array_push(tar_co, w_obj_tar_ind) - - deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates) - -def do_links( - doc1: pymupdf.Document, - doc2: pymupdf.Document, - from_page: int = -1, - to_page: int = -1, - start_at: int = -1, -) -> None: - """Insert links contained in copied page range into destination PDF. - - Parameter values **must** equal those of method insert_pdf(), which must - have been previously executed. - """ - #pymupdf.log( 'utils.do_links()') - # -------------------------------------------------------------------------- - # internal function to create the actual "/Annots" object string - # -------------------------------------------------------------------------- - def cre_annot(lnk, xref_dst, pno_src, ctm): - """Create annotation object string for a passed-in link.""" - - r = lnk["from"] * ctm # rect in PDF coordinates - rect = _format_g(tuple(r)) - if lnk["kind"] == pymupdf.LINK_GOTO: - txt = pymupdf.annot_skel["goto1"] # annot_goto - idx = pno_src.index(lnk["page"]) - p = lnk["to"] * ctm # target point in PDF coordinates - annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect) - - elif lnk["kind"] == pymupdf.LINK_GOTOR: - if lnk["page"] >= 0: - txt = pymupdf.annot_skel["gotor1"] # annot_gotor - pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point - if type(pnt) is not pymupdf.Point: - pnt = pymupdf.Point(0, 0) - annot = txt( - lnk["page"], - pnt.x, - pnt.y, - lnk["zoom"], - lnk["file"], - lnk["file"], - rect, - ) - else: - txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n - to = pymupdf.get_pdf_str(lnk["to"]) - to = to[1:-1] - f = lnk["file"] - annot = txt(to, f, rect) - - elif lnk["kind"] == pymupdf.LINK_LAUNCH: - txt = pymupdf.annot_skel["launch"] # annot_launch - annot = txt(lnk["file"], lnk["file"], rect) - - elif lnk["kind"] == pymupdf.LINK_URI: - txt = pymupdf.annot_skel["uri"] # annot_uri - annot = txt(lnk["uri"], rect) - - else: - annot = "" - - return annot - - # -------------------------------------------------------------------------- - - # validate & normalize parameters - if from_page < 0: - fp = 0 - elif from_page >= doc2.page_count: - fp = doc2.page_count - 1 - else: - fp = from_page - - if to_page < 0 or to_page >= doc2.page_count: - tp = doc2.page_count - 1 - else: - tp = to_page - - if start_at < 0: - raise ValueError("'start_at' must be >= 0") - sa = start_at - - incr = 1 if fp <= tp else -1 # page range could be reversed - - # lists of source / destination page numbers - pno_src = list(range(fp, tp + incr, incr)) - pno_dst = [sa + i for i in range(len(pno_src))] - - # lists of source / destination page xrefs - xref_src = [] - xref_dst = [] - for i in range(len(pno_src)): - p_src = pno_src[i] - p_dst = pno_dst[i] - old_xref = doc2.page_xref(p_src) - new_xref = doc1.page_xref(p_dst) - xref_src.append(old_xref) - xref_dst.append(new_xref) - - # create the links for each copied page in destination PDF - for i in range(len(xref_src)): - page_src = doc2[pno_src[i]] # load source page - links = page_src.get_links() # get all its links - #pymupdf.log( '{pno_src=}') - #pymupdf.log( '{type(page_src)=}') - #pymupdf.log( '{page_src=}') - #pymupdf.log( '{=i len(links)}') - if len(links) == 0: # no links there - page_src = None - continue - ctm = ~page_src.transformation_matrix # calc page transformation matrix - page_dst = doc1[pno_dst[i]] # load destination page - link_tab = [] # store all link definitions here - for l in links: - if l["kind"] == pymupdf.LINK_GOTO and (l["page"] not in pno_src): - continue # GOTO link target not in copied pages - annot_text = cre_annot(l, xref_dst, pno_src, ctm) - if annot_text: - link_tab.append(annot_text) - if link_tab != []: - page_dst._addAnnot_FromString( tuple(link_tab)) - #pymupdf.log( 'utils.do_links() returning.') - - def getLinkText(page: pymupdf.Page, lnk: dict) -> str: # -------------------------------------------------------------------------- # define skeletons for /Annots object texts @@ -2501,56 +1536,6 @@ def rect_function(*args): return spare_height, scale -def new_page( - doc: pymupdf.Document, - pno: int = -1, - width: float = 595, - height: float = 842, -) -> pymupdf.Page: - """Create and return a new page object. - - Args: - pno: (int) insert before this page. Default: after last page. - width: (float) page width in points. Default: 595 (ISO A4 width). - height: (float) page height in points. Default 842 (ISO A4 height). - Returns: - A pymupdf.Page object. - """ - doc._newPage(pno, width=width, height=height) - return doc[pno] - - -def insert_page( - doc: pymupdf.Document, - pno: int, - text: typing.Union[str, list, None] = None, - fontsize: float = 11, - width: float = 595, - height: float = 842, - fontname: str = "helv", - fontfile: OptStr = None, - color: OptSeq = (0,), -) -> int: - """Create a new PDF page and insert some text. - - Notes: - Function combining pymupdf.Document.new_page() and pymupdf.Page.insert_text(). - For parameter details see these methods. - """ - page = doc.new_page(pno=pno, width=width, height=height) - if not bool(text): - return 0 - rc = page.insert_text( - (50, 72), - text, - fontsize=fontsize, - fontname=fontname, - fontfile=fontfile, - color=color, - ) - return rc - - def draw_line( page: pymupdf.Page, p1: point_like, @@ -3081,104 +2066,6 @@ def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple: return fontname, ext, stype, asc, dsc -def get_char_widths( - doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None -) -> list: - """Get list of glyph information of a font. - - Notes: - Must be provided by its XREF number. If we already dealt with the - font, it will be recorded in doc.FontInfos. Otherwise we insert an - entry there. - Finally we return the glyphs for the font. This is a list of - (glyph, width) where glyph is an integer controlling the char - appearance, and width is a float controlling the char's spacing: - width * fontsize is the actual space. - For 'simple' fonts, glyph == ord(char) will usually be true. - Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here. - """ - fontinfo = pymupdf.CheckFontInfo(doc, xref) - if fontinfo is None: # not recorded yet: create it - if fontdict is None: - name, ext, stype, asc, dsc = _get_font_properties(doc, xref) - fontdict = { - "name": name, - "type": stype, - "ext": ext, - "ascender": asc, - "descender": dsc, - } - else: - name = fontdict["name"] - ext = fontdict["ext"] - stype = fontdict["type"] - ordering = fontdict["ordering"] - simple = fontdict["simple"] - - if ext == "": - raise ValueError("xref is not a font") - - # check for 'simple' fonts - if stype in ("Type1", "MMType1", "TrueType"): - simple = True - else: - simple = False - - # check for CJK fonts - if name in ("Fangti", "Ming"): - ordering = 0 - elif name in ("Heiti", "Song"): - ordering = 1 - elif name in ("Gothic", "Mincho"): - ordering = 2 - elif name in ("Dotum", "Batang"): - ordering = 3 - else: - ordering = -1 - - fontdict["simple"] = simple - - if name == "ZapfDingbats": - glyphs = pymupdf.zapf_glyphs - elif name == "Symbol": - glyphs = pymupdf.symbol_glyphs - else: - glyphs = None - - fontdict["glyphs"] = glyphs - fontdict["ordering"] = ordering - fontinfo = [xref, fontdict] - doc.FontInfos.append(fontinfo) - else: - fontdict = fontinfo[1] - glyphs = fontdict["glyphs"] - simple = fontdict["simple"] - ordering = fontdict["ordering"] - - if glyphs is None: - oldlimit = 0 - else: - oldlimit = len(glyphs) - - mylimit = max(256, limit) - - if mylimit <= oldlimit: - return glyphs - - if ordering < 0: # not a CJK font - glyphs = doc._get_char_widths( - xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx - ) - else: # CJK fonts use char codes and width = 1 - glyphs = None - - fontdict["glyphs"] = glyphs - fontinfo[1] = fontdict - pymupdf.UpdateFontInfo(doc, fontinfo) - - return glyphs - - class Shape: """Create a new shape.""" @@ -4278,163 +3165,6 @@ def center_rect(annot_rect, new_text, font, fsize): return True -# ------------------------------------------------------------------------------ -# Remove potentially sensitive data from a PDF. Similar to the Adobe -# Acrobat 'sanitize' function -# ------------------------------------------------------------------------------ -def scrub( - doc: pymupdf.Document, - attached_files: bool = True, - clean_pages: bool = True, - embedded_files: bool = True, - hidden_text: bool = True, - javascript: bool = True, - metadata: bool = True, - redactions: bool = True, - redact_images: int = 0, - remove_links: bool = True, - reset_fields: bool = True, - reset_responses: bool = True, - thumbnails: bool = True, - xml_metadata: bool = True, -) -> None: - def remove_hidden(cont_lines): - """Remove hidden text from a PDF page. - - Args: - cont_lines: list of lines with /Contents content. Should have status - from after page.cleanContents(). - - Returns: - List of /Contents lines from which hidden text has been removed. - - Notes: - The input must have been created after the page's /Contents object(s) - have been cleaned with page.cleanContents(). This ensures a standard - formatting: one command per line, single spaces between operators. - This allows for drastic simplification of this code. - """ - out_lines = [] # will return this - in_text = False # indicate if within BT/ET object - suppress = False # indicate text suppression active - make_return = False - for line in cont_lines: - if line == b"BT": # start of text object - in_text = True # switch on - out_lines.append(line) # output it - continue - if line == b"ET": # end of text object - in_text = False # switch off - out_lines.append(line) # output it - continue - if line == b"3 Tr": # text suppression operator - suppress = True # switch on - make_return = True - continue - if line[-2:] == b"Tr" and line[0] != b"3": - suppress = False # text rendering changed - out_lines.append(line) - continue - if line == b"Q": # unstack command also switches off - suppress = False - out_lines.append(line) - continue - if suppress and in_text: # suppress hidden lines - continue - out_lines.append(line) - if make_return: - return out_lines - else: - return None - - if not doc.is_pdf: # only works for PDF - raise ValueError("is no PDF") - if doc.is_encrypted or doc.is_closed: - raise ValueError("closed or encrypted doc") - - if not clean_pages: - hidden_text = False - redactions = False - - if metadata: - doc.set_metadata({}) # remove standard metadata - - for page in doc: - if reset_fields: - # reset form fields (widgets) - for widget in page.widgets(): - widget.reset() - - if remove_links: - links = page.get_links() # list of all links on page - for link in links: # remove all links - page.delete_link(link) - - found_redacts = False - for annot in page.annots(): - if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files: - annot.update_file(buffer_=b" ") # set file content to empty - if reset_responses: - annot.delete_responses() - if annot.type[0] == pymupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member - found_redacts = True - - if redactions and found_redacts: - page.apply_redactions(images=redact_images) - - if not (clean_pages or hidden_text): - continue # done with the page - - page.clean_contents() - if not page.get_contents(): - continue - if hidden_text: - xref = page.get_contents()[0] # only one b/o cleaning! - cont = doc.xref_stream(xref) - cont_lines = remove_hidden(cont.splitlines()) # remove hidden text - if cont_lines: # something was actually removed - cont = b"\n".join(cont_lines) - doc.update_stream(xref, cont) # rewrite the page /Contents - - if thumbnails: # remove page thumbnails? - if doc.xref_get_key(page.xref, "Thumb")[0] != "null": - doc.xref_set_key(page.xref, "Thumb", "null") - - # pages are scrubbed, now perform document-wide scrubbing - # remove embedded files - if embedded_files: - for name in doc.embfile_names(): - doc.embfile_del(name) - - if xml_metadata: - doc.del_xml_metadata() - if not (xml_metadata or javascript): - xref_limit = 0 - else: - xref_limit = doc.xref_length() - for xref in range(1, xref_limit): - if not doc.xref_object(xref): - msg = "bad xref %i - clean PDF before scrubbing" % xref - raise ValueError(msg) - if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript": - # a /JavaScript action object - obj = "<>" # replace with a null JavaScript - doc.update_object(xref, obj) # update this object - continue # no further handling - - if not xml_metadata: - continue - - if doc.xref_get_key(xref, "Type")[1] == "/Metadata": - # delete any metadata object directly - doc.update_object(xref, "<<>>") - doc.update_stream(xref, b"deleted", new=True) - continue - - if doc.xref_get_key(xref, "Metadata")[0] != "null": - doc.xref_set_key(xref, "Metadata", "null") - - def _show_fz_text( text): #if mupdf_cppyy: # assert isinstance( text, cppyy.gbl.mupdf.Text) @@ -4678,192 +3408,6 @@ def output_justify(start, line): return new_lines # return non-written lines -# ------------------------------------------------------------------------ -# Optional Content functions -# ------------------------------------------------------------------------ -def get_oc(doc: pymupdf.Document, xref: int) -> int: - """Return optional content object xref for an image or form xobject. - - Args: - xref: (int) xref number of an image or form xobject. - """ - if doc.is_closed or doc.is_encrypted: - raise ValueError("document close or encrypted") - t, name = doc.xref_get_key(xref, "Subtype") - if t != "name" or name not in ("/Image", "/Form"): - raise ValueError("bad object type at xref %i" % xref) - t, oc = doc.xref_get_key(xref, "OC") - if t != "xref": - return 0 - rc = int(oc.replace("0 R", "")) - return rc - - -def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None: - """Attach optional content object to image or form xobject. - - Args: - xref: (int) xref number of an image or form xobject - oc: (int) xref number of an OCG or OCMD - """ - if doc.is_closed or doc.is_encrypted: - raise ValueError("document close or encrypted") - t, name = doc.xref_get_key(xref, "Subtype") - if t != "name" or name not in ("/Image", "/Form"): - raise ValueError("bad object type at xref %i" % xref) - if oc > 0: - t, name = doc.xref_get_key(oc, "Type") - if t != "name" or name not in ("/OCG", "/OCMD"): - raise ValueError("bad object type at xref %i" % oc) - if oc == 0 and "OC" in doc.xref_get_keys(xref): - doc.xref_set_key(xref, "OC", "null") - return None - doc.xref_set_key(xref, "OC", "%i 0 R" % oc) - return None - - -def set_ocmd( - doc: pymupdf.Document, - xref: int = 0, - ocgs: typing.Union[list, None] = None, - policy: OptStr = None, - ve: typing.Union[list, None] = None, -) -> int: - """Create or update an OCMD object in a PDF document. - - Args: - xref: (int) 0 for creating a new object, otherwise update existing one. - ocgs: (list) OCG xref numbers, which shall be subject to 'policy'. - policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing). - ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'. - - Returns: - Xref of the created or updated OCMD. - """ - - all_ocgs = set(doc.get_ocgs().keys()) - - def ve_maker(ve): - if type(ve) not in (list, tuple) or len(ve) < 2: - raise ValueError("bad 've' format: %s" % ve) - if ve[0].lower() not in ("and", "or", "not"): - raise ValueError("bad operand: %s" % ve[0]) - if ve[0].lower() == "not" and len(ve) != 2: - raise ValueError("bad 've' format: %s" % ve) - item = "[/%s" % ve[0].title() - for x in ve[1:]: - if type(x) is int: - if x not in all_ocgs: - raise ValueError("bad OCG %i" % x) - item += " %i 0 R" % x - else: - item += " %s" % ve_maker(x) - item += "]" - return item - - text = "< dict: - """Return the definition of an OCMD (optional content membership dictionary). - - Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and - /VE (visibility expression, PDF array). Via string manipulation, this - info is converted to a Python dictionary with keys "xref", "ocgs", "policy" - and "ve" - ready to recycle as input for 'set_ocmd()'. - """ - - if xref not in range(doc.xref_length()): - raise ValueError("bad xref") - text = doc.xref_object(xref, compressed=True) - if "/Type/OCMD" not in text: - raise ValueError("bad object type") - textlen = len(text) - - p0 = text.find("/OCGs[") # look for /OCGs key - p1 = text.find("]", p0) - if p0 < 0 or p1 < 0: # no OCGs found - ocgs = None - else: - ocgs = text[p0 + 6 : p1].replace("0 R", " ").split() - ocgs = list(map(int, ocgs)) - - p0 = text.find("/P/") # look for /P policy key - if p0 < 0: - policy = None - else: - p1 = text.find("ff", p0) - if p1 < 0: - p1 = text.find("on", p0) - if p1 < 0: # some irregular syntax - raise ValueError("bad object at xref") - else: - policy = text[p0 + 3 : p1 + 2] - - p0 = text.find("/VE[") # look for /VE visibility expression key - if p0 < 0: # no visibility expression found - ve = None - else: - lp = rp = 0 # find end of /VE by finding last ']'. - p1 = p0 - while lp < 1 or lp != rp: - p1 += 1 - if not p1 < textlen: # some irregular syntax - raise ValueError("bad object at xref") - if text[p1] == "[": - lp += 1 - if text[p1] == "]": - rp += 1 - # p1 now positioned at the last "]" - ve = text[p0 + 3 : p1 + 1] # the PDF /VE array - ve = ( - ve.replace("/And", '"and",') - .replace("/Not", '"not",') - .replace("/Or", '"or",') - ) - ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[") - import json - try: - ve = json.loads(ve) - except Exception: - pymupdf.exception_info() - pymupdf.message(f"bad /VE key: {ve!r}") - raise - return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve} - - """ Handle page labels for PDF documents. @@ -4954,33 +3498,6 @@ def get_label(page): return get_label_pno(page.number, labels) -def get_page_numbers(doc, label, only_one=False): - """Return a list of page numbers with the given label. - - Args: - doc: PDF document object (resp. 'self'). - label: (str) label. - only_one: (bool) stop searching after first hit. - Returns: - List of page numbers having this label. - """ - # Jorj McKie, 2021-01-06 - - numbers = [] - if not label: - return numbers - labels = doc._get_page_labels() - if labels == []: - return numbers - for i in range(doc.page_count): - plabel = get_label_pno(i, labels) - if plabel == label: - numbers.append(i) - if only_one: - break - return numbers - - def construct_label(style, prefix, pno) -> str: """Construct a label based on style, prefix and page number.""" # William Chapman, 2021-01-06 @@ -5049,94 +3566,6 @@ def roman_num(num): return "".join([a for a in roman_num(num)]) -def get_page_labels(doc): - """Return page label definitions in PDF document. - - Args: - doc: PDF document (resp. 'self'). - Returns: - A list of dictionaries with the following format: - {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. - """ - # Jorj McKie, 2021-01-10 - return [rule_dict(item) for item in doc._get_page_labels()] - - -def set_page_labels(doc, labels): - """Add / replace page label definitions in PDF document. - - Args: - doc: PDF document (resp. 'self'). - labels: list of label dictionaries like: - {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}, - as returned by get_page_labels(). - """ - # William Chapman, 2021-01-06 - - def create_label_str(label): - """Convert Python label dict to corresponding PDF rule string. - - Args: - label: (dict) build rule for the label. - Returns: - PDF label rule string wrapped in "<<", ">>". - """ - s = "%i<<" % label["startpage"] - if label.get("prefix", "") != "": - s += "/P(%s)" % label["prefix"] - if label.get("style", "") != "": - s += "/S/%s" % label["style"] - if label.get("firstpagenum", 1) > 1: - s += "/St %i" % label["firstpagenum"] - s += ">>" - return s - - def create_nums(labels): - """Return concatenated string of all labels rules. - - Args: - labels: (list) dictionaries as created by function 'rule_dict'. - Returns: - PDF compatible string for page label definitions, ready to be - enclosed in PDF array 'Nums[...]'. - """ - labels.sort(key=lambda x: x["startpage"]) - s = "".join([create_label_str(label) for label in labels]) - return s - - doc._set_page_labels(create_nums(labels)) - - -# End of Page Label Code ------------------------------------------------- - - -def has_links(doc: pymupdf.Document) -> bool: - """Check whether there are links on any page.""" - if doc.is_closed: - raise ValueError("document closed") - if not doc.is_pdf: - raise ValueError("is no PDF") - for i in range(doc.page_count): - for item in doc.page_annot_xrefs(i): - if item[1] == pymupdf.PDF_ANNOT_LINK: # pylint: disable=no-member - return True - return False - - -def has_annots(doc: pymupdf.Document) -> bool: - """Check whether there are annotations on any page.""" - if doc.is_closed: - raise ValueError("document closed") - if not doc.is_pdf: - raise ValueError("is no PDF") - for i in range(doc.page_count): - for item in doc.page_annot_xrefs(i): - # pylint: disable=no-member - if not (item[1] == pymupdf.PDF_ANNOT_LINK or item[1] == pymupdf.PDF_ANNOT_WIDGET): # pylint: disable=no-member - return True - return False - - # ------------------------------------------------------------------- # Functions to recover the quad contained in a text extraction bbox # ------------------------------------------------------------------- @@ -5322,358 +3751,3 @@ def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad: raise ValueError("bad span argument") return recover_bbox_quad(line_dir, span, bbox) - - -# ------------------------------------------------------------------- -# Building font subsets using fontTools -# ------------------------------------------------------------------- -def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt: - """Build font subsets in a PDF. - - Eligible fonts are potentially replaced by smaller versions. Page text is - NOT rewritten and thus should retain properties like being hidden or - controlled by optional content. - - This method by default uses MuPDF's own internal feature to create subset - fonts. As this is a new function, errors may still occur. In this case, - please fall back to using the previous version by using "fallback=True". - Fallback mode requires the external package 'fontTools'. - - Args: - fallback: use the older deprecated implementation. - verbose: only used by fallback mode. - - Returns: - The new MuPDF-based code returns None. The deprecated fallback - mode returns 0 if there are no fonts to subset. Otherwise, it - returns the decrease in fontsize (the difference in fontsize), - measured in bytes. - """ - # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs)) - # An embedded font is uniquely defined by its fontbuffer only. It may have - # multiple names and xrefs. - # Once the sets of used unicodes and glyphs are known, we compute a - # smaller version of the buffer user package fontTools. - - if not fallback: # by default use MuPDF function - pdf = mupdf.pdf_document_from_fz_document(doc) - mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) - return - - font_buffers = {} - - def get_old_widths(xref): - """Retrieve old font '/W' and '/DW' values.""" - df = doc.xref_get_key(xref, "DescendantFonts") - if df[0] != "array": # only handle xref specifications - return None, None - df_xref = int(df[1][1:-1].replace("0 R", "")) - widths = doc.xref_get_key(df_xref, "W") - if widths[0] != "array": # no widths key found - widths = None - else: - widths = widths[1] - dwidths = doc.xref_get_key(df_xref, "DW") - if dwidths[0] != "int": - dwidths = None - else: - dwidths = dwidths[1] - return widths, dwidths - - def set_old_widths(xref, widths, dwidths): - """Restore the old '/W' and '/DW' in subsetted font. - - If either parameter is None or evaluates to False, the corresponding - dictionary key will be set to null. - """ - df = doc.xref_get_key(xref, "DescendantFonts") - if df[0] != "array": # only handle xref specs - return None - df_xref = int(df[1][1:-1].replace("0 R", "")) - if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[ - 0 - ] != "null": - doc.xref_set_key(df_xref, "W", "null") - else: - doc.xref_set_key(df_xref, "W", widths) - if (type(dwidths) is not str or not dwidths) and doc.xref_get_key( - df_xref, "DW" - )[0] != "null": - doc.xref_set_key(df_xref, "DW", "null") - else: - doc.xref_set_key(df_xref, "DW", dwidths) - return None - - def set_subset_fontname(new_xref): - """Generate a name prefix to tag a font as subset. - - We use a random generator to select 6 upper case ASCII characters. - The prefixed name must be put in the font xref as the "/BaseFont" value - and in the FontDescriptor object as the '/FontName' value. - """ - # The following generates a prefix like 'ABCDEF+' - import random - import string - prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+" - font_str = doc.xref_object(new_xref, compressed=True) - font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix) - df = doc.xref_get_key(new_xref, "DescendantFonts") - if df[0] == "array": - df_xref = int(df[1][1:-1].replace("0 R", "")) - fd = doc.xref_get_key(df_xref, "FontDescriptor") - if fd[0] == "xref": - fd_xref = int(fd[1].replace("0 R", "")) - fd_str = doc.xref_object(fd_xref, compressed=True) - fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix) - doc.update_object(fd_xref, fd_str) - doc.update_object(new_xref, font_str) - - def build_subset(buffer, unc_set, gid_set): - """Build font subset using fontTools. - - Args: - buffer: (bytes) the font given as a binary buffer. - unc_set: (set) required glyph ids. - Returns: - Either None if subsetting is unsuccessful or the subset font buffer. - """ - try: - import fontTools.subset as fts - except ImportError: - if g_exceptions_verbose: pymupdf.exception_info() - pymupdf.message("This method requires fontTools to be installed.") - raise - import tempfile - with tempfile.TemporaryDirectory() as tmp_dir: - oldfont_path = f"{tmp_dir}/oldfont.ttf" - newfont_path = f"{tmp_dir}/newfont.ttf" - uncfile_path = f"{tmp_dir}/uncfile.txt" - args = [ - oldfont_path, - "--retain-gids", - f"--output-file={newfont_path}", - "--layout-features=*", - "--passthrough-tables", - "--ignore-missing-glyphs", - "--ignore-missing-unicodes", - "--symbol-cmap", - ] - - # store glyph ids or unicodes as file - with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file: - if 0xFFFD in unc_set: # error unicode exists -> use glyphs - args.append(f"--gids-file={uncfile_path}") - gid_set.add(189) - unc_list = list(gid_set) - for unc in unc_list: - unc_file.write("%i\n" % unc) - else: - args.append(f"--unicodes-file={uncfile_path}") - unc_set.add(255) - unc_list = list(unc_set) - for unc in unc_list: - unc_file.write("%04x\n" % unc) - - # store fontbuffer as a file - with open(oldfont_path, "wb") as fontfile: - fontfile.write(buffer) - try: - os.remove(newfont_path) # remove old file - except Exception: - pass - try: # invoke fontTools subsetter - fts.main(args) - font = pymupdf.Font(fontfile=newfont_path) - new_buffer = font.buffer # subset font binary - if font.glyph_count == 0: # intercept empty font - new_buffer = None - except Exception: - pymupdf.exception_info() - new_buffer = None - return new_buffer - - def repl_fontnames(doc): - """Populate 'font_buffers'. - - For each font candidate, store its xref and the list of names - by which PDF text may refer to it (there may be multiple). - """ - - def norm_name(name): - """Recreate font name that contains PDF hex codes. - - E.g. #20 -> space, chr(32) - """ - while "#" in name: - p = name.find("#") - c = int(name[p + 1 : p + 3], 16) - name = name.replace(name[p : p + 3], chr(c)) - return name - - def get_fontnames(doc, item): - """Return a list of fontnames for an item of page.get_fonts(). - - There may be multiple names e.g. for Type0 fonts. - """ - fontname = item[3] - names = [fontname] - fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:] - fontname = norm_name(fontname) - if fontname not in names: - names.append(fontname) - descendents = doc.xref_get_key(item[0], "DescendantFonts") - if descendents[0] != "array": - return names - descendents = descendents[1][1:-1] - if descendents.endswith(" 0 R"): - xref = int(descendents[:-4]) - descendents = doc.xref_object(xref, compressed=True) - p1 = descendents.find("/BaseFont") - if p1 >= 0: - p2 = descendents.find("/", p1 + 1) - p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1)) - fontname = descendents[p2 + 1 : p1] - fontname = norm_name(fontname) - if fontname not in names: - names.append(fontname) - return names - - for i in range(doc.page_count): - for f in doc.get_page_fonts(i, full=True): - font_xref = f[0] # font xref - font_ext = f[1] # font file extension - basename = f[3] # font basename - - if font_ext not in ( # skip if not supported by fontTools - "otf", - "ttf", - "woff", - "woff2", - ): - continue - # skip fonts which already are subsets - if len(basename) > 6 and basename[6] == "+": - continue - - extr = doc.extract_font(font_xref) - fontbuffer = extr[-1] - names = get_fontnames(doc, f) - name_set, xref_set, subsets = font_buffers.get( - fontbuffer, (set(), set(), (set(), set())) - ) - xref_set.add(font_xref) - for name in names: - name_set.add(name) - font = pymupdf.Font(fontbuffer=fontbuffer) - name_set.add(font.name) - del font - font_buffers[fontbuffer] = (name_set, xref_set, subsets) - - def find_buffer_by_name(name): - for buffer, (name_set, _, _) in font_buffers.items(): - if name in name_set: - return buffer - return None - - # ----------------- - # main function - # ----------------- - repl_fontnames(doc) # populate font information - if not font_buffers: # nothing found to do - if verbose: - pymupdf.message(f'No fonts to subset.') - return 0 - - old_fontsize = 0 - new_fontsize = 0 - for fontbuffer in font_buffers.keys(): - old_fontsize += len(fontbuffer) - - # Scan page text for usage of subsettable fonts - for page in doc: - # go through the text and extend set of used glyphs by font - # we use a modified MuPDF trace device, which delivers us glyph ids. - for span in page.get_texttrace(): - if type(span) is not dict: # skip useless information - continue - fontname = span["font"][:33] # fontname for the span - buffer = find_buffer_by_name(fontname) - if buffer is None: - continue - name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer] - for c in span["chars"]: - set_ucs.add(c[0]) # unicode - set_gid.add(c[1]) # glyph id - font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid)) - - # build the font subsets - for old_buffer, (name_set, xref_set, subsets) in font_buffers.items(): - new_buffer = build_subset(old_buffer, subsets[0], subsets[1]) - fontname = list(name_set)[0] - if new_buffer is None or len(new_buffer) >= len(old_buffer): - # subset was not created or did not get smaller - if verbose: - pymupdf.message(f'Cannot subset {fontname!r}.') - continue - if verbose: - pymupdf.message(f"Built subset of font {fontname!r}.") - val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF - new_xref = val[0] # get its xref - set_subset_fontname(new_xref) # tag fontname as subset font - font_str = doc.xref_object( # get its object definition - new_xref, - compressed=True, - ) - # walk through the original font xrefs and replace each by the subset def - for font_xref in xref_set: - # we need the original '/W' and '/DW' width values - width_table, def_width = get_old_widths(font_xref) - # ... and replace original font definition at xref with it - doc.update_object(font_xref, font_str) - # now copy over old '/W' and '/DW' values - if width_table or def_width: - set_old_widths(font_xref, width_table, def_width) - # 'new_xref' remains unused in the PDF and must be removed - # by garbage collection. - new_fontsize += len(new_buffer) - - return old_fontsize - new_fontsize - - -# ------------------------------------------------------------------- -# Copy XREF object to another XREF -# ------------------------------------------------------------------- -def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None: - """Copy a PDF dictionary object to another one given their xref numbers. - - Args: - doc: PDF document object - source: source xref number - target: target xref number, the xref must already exist - keep: an optional list of 1st level keys in target that should not be - removed before copying. - Notes: - This works similar to the copy() method of dictionaries in Python. The - source may be a stream object. - """ - if doc.xref_is_stream(source): - # read new xref stream, maintaining compression - stream = doc.xref_stream_raw(source) - doc.update_stream( - target, - stream, - compress=False, # keeps source compression - new=True, # in case target is no stream - ) - - # empty the target completely, observe exceptions - if keep is None: - keep = [] - for key in doc.xref_get_keys(target): - if key in keep: - continue - doc.xref_set_key(target, key, "null") - # copy over all source dict items - for key in doc.xref_get_keys(source): - item = doc.xref_get_key(source, key) - doc.xref_set_key(target, key, item[1])