Skip to content

Commit 4d95216

Browse files
committed
fix: Suggested fix for pypdfium2 close document
1 parent 04e5dc4 commit 4d95216

1 file changed

Lines changed: 20 additions & 3 deletions

File tree

gmft/pdf_bindings/pdfium.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
# PyPDFium2 bindings
44
from typing import Generator, Tuple
5+
import weakref
56
import pypdfium2 as pdfium
67

78
from gmft.base import Rect
@@ -17,6 +18,16 @@
1718
from gmft.formatters.base import CroppedTable
1819

1920

21+
def _close_document_quietly(doc: "PyPDFium2Document"):
22+
"""Best-effort close used by pickle cleanup finalizers."""
23+
if doc is None:
24+
return
25+
try:
26+
doc.close()
27+
except Exception:
28+
pass
29+
30+
2031
class PyPDFium2Page(BasePage):
2132
"""
2233
Note: This follows PIL's convention of (0, 0) being top left.
@@ -37,6 +48,7 @@ def __init__(
3748
self.width = page.get_width()
3849
self.height = page.get_height()
3950
self._positions_and_text_and_breaks = None
51+
self._pickle_parent_close_finalizer = None
4052
super().__init__(page_no)
4153

4254
def get_positions_and_text(
@@ -186,10 +198,15 @@ def __getstate__(self):
186198
}
187199

188200
def __setstate__(self, state):
    """
    Restore this page from pickled ``state``.

    A fresh page is rebuilt with ``PyPDFium2Utils.load_page_from_dict`` (which
    opens a new document) and its attributes are swapped into ``self``. When
    the restored page has a parent document, a finalizer is registered that
    closes that document, on a best-effort basis, once this page is garbage
    collected.

    :param state: the pickled state to restore from.
    """
    # copy-and-swap idiom
    copy = PyPDFium2Utils.load_page_from_dict(state)  # this opens a new document!
    self.__dict__, copy.__dict__ = copy.__dict__, self.__dict__
    # copy now holds the old state of self and is left for garbage collection

    # avoid pypdfium2 memory leak #55
    # NOTE(review): weakref.finalize keeps strong references to its arguments,
    # so this relies on the parent document not referencing this page back;
    # otherwise the page would never be collected -- confirm.
    if self.parent is not None:
        self._pickle_parent_close_finalizer = weakref.finalize(
            self, _close_document_quietly, self.parent
        )  # best-effort close document
193210

194211

195212
class PyPDFium2Document(BasePDFDocument):

0 commit comments

Comments
 (0)