Skip to content

Commit a06a587

Browse files
src/__init__.py: simplified Document constructor.
Also removed use of fz_recognize_document(), which only looks at a magic value. Instead trust MuPDF to do the right thing with both the content and whatever filename/filetype information is available.
1 parent b5aa249 commit a06a587

1 file changed

Lines changed: 67 additions & 111 deletions

File tree

src/__init__.py

Lines changed: 67 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -2885,6 +2885,7 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
28852885
global JM_mupdf_show_errors
28862886
JM_mupdf_show_errors_old = JM_mupdf_show_errors
28872887
JM_mupdf_show_errors = 0
2888+
28882889
try:
28892890
self.is_closed = False
28902891
self.is_encrypted = False
@@ -2901,138 +2902,93 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
29012902
self.this_is_pdf = True
29022903
return
29032904

2904-
# Classic implementation temporarily sets JM_mupdf_show_errors=0 then
2905-
# restores the previous value in `fz_always() {...}` before returning.
2906-
#
2907-
2908-
if not filename or type(filename) is str:
2909-
pass
2910-
elif hasattr(filename, "absolute"):
2911-
filename = str(filename)
2912-
elif hasattr(filename, "name"):
2913-
filename = filename.name
2914-
else:
2915-
raise TypeError(f"bad filename: {type(filename)=} {filename=}.")
2916-
2917-
if stream is not None:
2918-
if isinstance(stream, (bytes, memoryview)):
2919-
self.stream = stream
2920-
elif isinstance(stream, bytearray):
2921-
self.stream = bytes(stream)
2922-
elif isinstance(stream, io.BytesIO):
2923-
self.stream = stream.getvalue()
2924-
else:
2925-
raise TypeError(f"bad stream: {type(stream)=}.")
2926-
stream = self.stream
2927-
if not (filename or filetype):
2928-
filename = 'pdf'
2929-
else:
2930-
self.stream = None
2931-
2932-
if filename and self.stream is None:
2933-
from_file = True
2934-
self._name = filename
2935-
else:
2936-
from_file = False
2937-
self._name = ""
2938-
2939-
if from_file:
2940-
if not os.path.exists(filename):
2941-
msg = f"no such file: '{filename}'"
2942-
raise FileNotFoundError(msg)
2943-
elif not os.path.isfile(filename):
2944-
msg = f"'{filename}' is no file"
2945-
raise FileDataError(msg)
2946-
elif os.path.getsize(filename) == 0:
2947-
raise EmptyFileError(f'Cannot open empty file: {filename=}.')
2948-
29492905
w = width
29502906
h = height
29512907
r = JM_rect_from_py(rect)
29522908
if not mupdf.fz_is_infinite_rect(r):
29532909
w = r.x1 - r.x0
29542910
h = r.y1 - r.y0
29552911

2912+
self._name = filename
2913+
self.stream = stream
2914+
29562915
if stream is not None:
2916+
if filename is not None and filetype is None:
2917+
# 2025-05-06: Use <filename> as the filetype. This is
2918+
# reversing precedence - we used to use <filename> if both
2919+
# were set.
2920+
filetype = filename
2921+
if isinstance(stream, (bytes, memoryview)):
2922+
pass
2923+
elif isinstance(stream, bytearray):
2924+
stream = bytes(stream)
2925+
elif isinstance(stream, io.BytesIO):
2926+
stream = stream.getvalue()
2927+
else:
2928+
raise TypeError(f"bad stream: {type(stream)=}.")
2929+
self.stream = stream
2930+
29572931
assert isinstance(stream, (bytes, memoryview))
29582932
if len(stream) == 0:
2933+
# MuPDF raises an exception for this but also generates
2934+
# warnings, which is not very helpful for us. So instead we
2935+
# raise a specific exception.
29592936
raise EmptyFileError('Cannot open empty stream.')
2960-
c = stream
2961-
#len = (size_t) PyBytes_Size(stream);
2962-
2963-
if mupdf_cppyy:
2964-
buffer_ = mupdf.fz_new_buffer_from_copied_data(c)
2965-
data = mupdf.fz_open_buffer(buffer_)
2966-
else:
2967-
# Pass raw bytes data to mupdf.fz_open_memory(). This assumes
2968-
# that the bytes string will not be modified; i think the
2969-
# original PyMuPDF code makes the same assumption. Presumably
2970-
# setting self.stream above ensures that the bytes will not be
2971-
# garbage collected?
2972-
data = mupdf.fz_open_memory(mupdf.python_buffer_data(c), len(c))
2973-
magic = filename
2974-
if not magic:
2975-
magic = filetype
2976-
# fixme: pymupdf does:
2977-
# handler = fz_recognize_document(gctx, filetype);
2978-
# if (!handler) raise ValueError( MSG_BAD_FILETYPE)
2979-
# but prefer to leave fz_open_document_with_stream() to raise.
2937+
2938+
stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
29802939
try:
2981-
doc = mupdf.fz_open_document_with_stream(magic, data)
2940+
doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2)
29822941
except Exception as e:
29832942
if g_exceptions_verbose > 1: exception_info()
29842943
raise FileDataError('Failed to open stream') from e
2985-
else:
2986-
if filename:
2987-
if not filetype:
2988-
try:
2989-
doc = mupdf.fz_open_document(filename)
2990-
except Exception as e:
2991-
if g_exceptions_verbose > 1: exception_info()
2992-
raise FileDataError(f'Failed to open file {filename!r}.') from e
2993-
else:
2994-
handler = mupdf.ll_fz_recognize_document(filetype)
2995-
if handler:
2996-
if handler.open:
2997-
#log( f'{handler.open=}')
2998-
#log( f'{dir(handler.open)=}')
2999-
try:
3000-
stream = mupdf.FzStream(filename)
3001-
accel = mupdf.FzStream()
3002-
archive = mupdf.FzArchive(None)
3003-
if mupdf_version_tuple >= (1, 24, 8):
3004-
doc = mupdf.ll_fz_document_handler_open(
3005-
handler,
3006-
stream.m_internal,
3007-
accel.m_internal,
3008-
archive.m_internal,
3009-
None, # recognize_state
3010-
)
3011-
else:
3012-
doc = mupdf.ll_fz_document_open_fn_call(
3013-
handler.open,
3014-
stream.m_internal,
3015-
accel.m_internal,
3016-
archive.m_internal,
3017-
)
3018-
except Exception as e:
3019-
if g_exceptions_verbose > 1: exception_info()
3020-
raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e
3021-
doc = mupdf.FzDocument( doc)
3022-
else:
3023-
assert 0
3024-
else:
3025-
raise ValueError( MSG_BAD_FILETYPE)
2944+
2945+
elif filename:
2946+
assert not stream
2947+
if isinstance(filename, str):
2948+
pass
2949+
elif hasattr(filename, "absolute"):
2950+
filename = str(filename)
2951+
elif hasattr(filename, "name"):
2952+
filename = filename.name
30262953
else:
3027-
pdf = mupdf.PdfDocument()
3028-
doc = mupdf.FzDocument(pdf)
2954+
raise TypeError(f"bad filename: {type(filename)=} {filename=}.")
2955+
self._name = filename
2956+
2957+
# Generate our own specific exceptions. This avoids MuPDF
2958+
# generating warnings etc.
2959+
if not os.path.exists(filename):
2960+
raise FileNotFoundError(f"no such file: '{filename}'")
2961+
elif not os.path.isfile(filename):
2962+
raise FileDataError(f"'{filename}' is no file")
2963+
elif os.path.getsize(filename) == 0:
2964+
raise EmptyFileError(f'Cannot open empty file: {filename=}.')
2965+
2966+
if filetype:
2967+
# Override the type implied by <filename>. MuPDF does not
2968+
# have a way to do this directly so we open via a stream.
2969+
try:
2970+
fz_stream = mupdf.fz_open_file(filename)
2971+
doc = mupdf.fz_open_document_with_stream(filetype, fz_stream)
2972+
except Exception as e:
2973+
if g_exceptions_verbose > 1: exception_info()
2974+
raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e
2975+
else:
2976+
try:
2977+
doc = mupdf.fz_open_document(filename)
2978+
except Exception as e:
2979+
if g_exceptions_verbose > 1: exception_info()
2980+
raise FileDataError(f'Failed to open file {filename!r}.') from e
2981+
2982+
else:
2983+
pdf = mupdf.PdfDocument()
2984+
doc = mupdf.FzDocument(pdf)
2985+
30292986
if w > 0 and h > 0:
30302987
mupdf.fz_layout_document(doc, w, h, fontsize)
30312988
elif mupdf.fz_is_document_reflowable(doc):
30322989
mupdf.fz_layout_document(doc, 400, 600, 11)
3033-
this = doc
30342990

3035-
self.this = this
2991+
self.this = doc
30362992

30372993
# fixme: not sure where self.thisown gets initialised in PyMuPDF.
30382994
#

0 commit comments

Comments
 (0)