From 3fb63a17069dd3e7322094b2c860640103452da0 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Fri, 2 May 2025 06:33:34 -0400 Subject: [PATCH] Remove dependency on MuPDF version Add all MuPDF STEXT flags up to v1.26.0 to PyMuPDF. Use hard-coded values if unknown in an earlier MuPDF version that we still want / need to support. The intention is to switch to MuPDF's symbolic names as soon as we drop support of the corresponding version. Flag bits representing current MuPDF features can always be used because they are ignored by older MuPDF versions. Also removed some duplicate definitions. --- docs/vars.rst | 36 ++++++++++++++++++++++++++++++++---- src/__init__.py | 32 ++++++++++++-------------------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/docs/vars.rst b/docs/vars.rst index 78cd7ac2a..7fb7bb10c 100644 --- a/docs/vars.rst +++ b/docs/vars.rst @@ -253,7 +253,7 @@ For the PyMuPDF programmer, some combination (using Python's `|` operator, or si .. py:data:: TEXT_COLLECT_STRUCTURE - 256 -- Not supported. + 256 -- Not supported yet. .. py:data:: TEXT_ACCURATE_BBOXES @@ -264,17 +264,45 @@ For the PyMuPDF programmer, some combination (using Python's `|` operator, or si .. py:data:: TEXT_COLLECT_VECTORS - 1024 -- Not supported. + 1024 -- Not supported yet. .. py:data:: TEXT_IGNORE_ACTUALTEXT - 2048 -- Ignore built-in differences between text appearing in e.g. PDF viewers versus text stored in the PDF. See :ref:`AdobeManual`, page 615 for background. If set, the **stored** ("replacement" text) is ignored in favor of the displayed text. + 2048 -- Ignore built-in differences between text appearing in e.g. PDF viewers versus text stored in the PDF. See :ref:`AdobeManual`, page 615 for background. If set, the **stored** ("replacement" text) is ignored in favor of the **displayed** text. .. py:data:: TEXT_STEXT_SEGMENT 4096 -- Attempt to segment page into different regions. 
-The following constants represent the default combinations of the above for text extraction and searching: +.. py:data:: TEXT_STEXT_PARAGRAPH_BREAK + + 8192 -- Not supported yet. + +.. py:data:: TEXT_STEXT_TABLE_HUNT + + 16384 -- Not supported yet. + +.. py:data:: TEXT_COLLECT_STYLES + + 32768 -- Detect underlined and strikeout text. Also detect and handle faked bold text in most cases. + +.. py:data:: TEXT_GID_FOR_UNKNOWN_UNICODE + + 65536 -- An alternative to `TEXT_CID_FOR_UNKNOWN_UNICODE` that uses the GID (glyph ID) instead of the CID (character ID). Both flags should never be used together, because results are undefined. + +.. py:data:: TEXT_CLIP_RECT + + 1 << 17 -- Not supported yet. + +.. py:data:: TEXT_ACCURATE_ASCENDERS + + 1 << 18 -- Not supported yet. + +.. py:data:: TEXT_ACCURATE_SIDE_BEARINGS + + 1 << 19 -- Not supported yet. + +The following constants represent default combinations of the above for text extraction and searching: .. py:data:: TEXTFLAGS_TEXT diff --git a/src/__init__.py b/src/__init__.py index fbcae3c44..89bc06be3 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -13516,18 +13516,18 @@ def width(self): TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE -if mupdf_version_tuple >= (1, 25): - TEXT_COLLECT_STRUCTURE = mupdf.FZ_STEXT_COLLECT_STRUCTURE - TEXT_ACCURATE_BBOXES = mupdf.FZ_STEXT_ACCURATE_BBOXES - TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS - TEXT_IGNORE_ACTUALTEXT = mupdf.FZ_STEXT_IGNORE_ACTUALTEXT - TEXT_STEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT -else: - TEXT_COLLECT_STRUCTURE = 256 - TEXT_ACCURATE_BBOXES = 512 - TEXT_COLLECT_VECTORS = 1024 - TEXT_IGNORE_ACTUALTEXT = 2048 - TEXT_STEXT_SEGMENT = 4096 +TEXT_COLLECT_STRUCTURE = 256 # mupdf.FZ_STEXT_COLLECT_STRUCTURE +TEXT_ACCURATE_BBOXES = 512 # mupdf.FZ_STEXT_ACCURATE_BBOXES +TEXT_COLLECT_VECTORS = 1024 # mupdf.FZ_STEXT_COLLECT_VECTORS 
+TEXT_IGNORE_ACTUALTEXT = 2048 # mupdf.FZ_STEXT_IGNORE_ACTUALTEXT +TEXT_STEXT_SEGMENT = 4096 # mupdf.FZ_STEXT_SEGMENT +TEXT_STEXT_PARAGRAPH_BREAK = 8192 # mupdf.FZ_STEXT_PARAGRAPH_BREAK +TEXT_STEXT_TABLE_HUNT = 16384 # mupdf.FZ_STEXT_TABLE_HUNT +TEXT_COLLECT_STYLES = 32768 # mupdf.FZ_STEXT_COLLECT_STYLES +TEXT_GID_FOR_UNKNOWN_UNICODE = 65536 # mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE +TEXT_CLIP_RECT = 1 << 17 # mupdf.FZ_STEXT_CLIP_RECT +TEXT_ACCURATE_ASCENDERS = 1 << 18 # mupdf.FZ_STEXT_ACCURATE_ASCENDERS +TEXT_ACCURATE_SIDE_BEARINGS = 1 << 19 # mupdf.FZ_STEXT_ACCURATE_SIDE_BEARINGS TEXTFLAGS_WORDS = (0 | TEXT_PRESERVE_LIGATURES @@ -13620,14 +13620,6 @@ def width(self): PDF_BM_Screen = "Screen" PDF_BM_SoftLight = "Softlight" -# General text flags -TEXT_FONT_SUPERSCRIPT = 1 -TEXT_FONT_ITALIC = 2 -TEXT_FONT_SERIFED = 4 -TEXT_FONT_MONOSPACED = 8 -TEXT_FONT_BOLD = 16 - - annot_skel = { "goto1": lambda a, b, c, d, e: f"<>/Rect[{e}]/BS<>/Subtype/Link>>", "goto2": lambda a, b: f"<>/Rect[{b}]/BS<>/Subtype/Link>>",