From 2277f42af81d5bead6f75df554ce1c89cb1d141a Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Tue, 8 Jul 2025 10:17:53 -0400 Subject: [PATCH] Add Page method clip_to_rect --- docs/page.rst | 11 +++++++++++ src/__init__.py | 10 ++++++++++ tests/test_clip_page.py | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 tests/test_clip_page.py diff --git a/docs/page.rst b/docs/page.rst index 32f3e684d..c0617d546 100644 --- a/docs/page.rst +++ b/docs/page.rst @@ -62,6 +62,7 @@ In a nutshell, this is what you can do with PyMuPDF: :meth:`Page.annot_xrefs` PDF only: a list of annotation (and widget) xrefs :meth:`Page.annots` return a generator over the annots on the page :meth:`Page.apply_redactions` PDF only: process the redactions of the page +:meth:`Page.clip_to_rect` PDF only: remove page content outside a rectangle :meth:`Page.bound` rectangle of the page :meth:`Page.cluster_drawings` PDF only: bounding boxes of vector graphics :meth:`Page.delete_annot` PDF only: delete an annotation @@ -1961,6 +1962,16 @@ In a nutshell, this is what you can do with PyMuPDF: These changes are **permanent** and cannot be reverted. + .. method:: clip_to_rect(rect) + + PDF only: Permanently remove page content outside the given rectangle. This is similar to :meth:`Page.set_cropbox`, but the page's rectangle will not be changed; only the content outside the rectangle will be removed. + + :arg rect_like rect: The rectangle to clip to. Must be finite and its intersection with the page must not be empty. + + The method works best for text: All text on the page that has no intersection with the rectangle will be removed (decided character by character). For vector graphics, the method will remove all paths that have no intersection with the rectangle. For images, the method will remove all images that have no intersection with the rectangle. Vectors and images **having** an intersection with the rectangle will be kept in their entirety. 
+ The method roughly has the same effect as if four redactions had been applied that cover the rectangle's outside. + .. method:: remove_rotation() PDF only: Set page rotation to 0 while maintaining appearance and page content. diff --git a/src/__init__.py b/src/__init__.py index 8f9484a50..a2a94b6eb 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -8754,6 +8754,16 @@ def recolor(self, components=1): ropts = mupdf.PdfRecolorOptions(ropt) mupdf.pdf_recolor_page(pdfdoc, self.number, ropts) + def clip_to_rect(self, rect): + """PDF only: permanently remove page content outside 'rect'; raises ValueError if 'rect' is infinite or does not intersect the page.""" + clip = Rect(rect) + if clip.is_infinite or (clip & self.rect).is_empty: + raise ValueError("rect must not be infinite or empty") + clip *= self.transformation_matrix + pdfpage = _as_pdf_page(self) + pclip = JM_rect_from_py(clip) + mupdf.pdf_clip_page(pdfpage, pclip) + @property def artbox(self): """The ArtBox""" diff --git a/tests/test_clip_page.py b/tests/test_clip_page.py new file mode 100644 index 000000000..48e4ac597 --- /dev/null +++ b/tests/test_clip_page.py @@ -0,0 +1,37 @@ +""" +Test Page method clip_to_rect. +""" + +import os +import pymupdf + + +def test_clip(): + """ + Clip a Page to a rectangle and confirm that no text has survived + that is completely outside the rectangle. + """ + scriptdir = os.path.dirname(os.path.abspath(__file__)) + rect = pymupdf.Rect(200, 200, 400, 500) + filename = os.path.join(scriptdir, "resources", "v110-changes.pdf") + doc = pymupdf.open(filename) + page = doc[0] + page.clip_to_rect(rect) # clip the page to the rectangle + # confirm that the only MuPDF warning is the expected font message + assert pymupdf.TOOLS.mupdf_warnings() == "bogus font ascent/descent values (0 / 0)" + # extract all text characters and assert that each one + # has a non-empty intersection with the rectangle. 
+ chars = [ + c + for b in page.get_text("rawdict")["blocks"] + for l in b["lines"] + for s in l["spans"] + for c in s["chars"] + ] + for char in chars: + bbox = pymupdf.Rect(char["bbox"]) + if bbox.is_empty: + continue + assert bbox.intersects( + rect + ), f"Character '{char['c']}' at {bbox} is outside of {rect}."