From 01019071279137c1d74204d87c4dd74db12c7bfa Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Mon, 21 Apr 2025 13:39:14 -0400 Subject: [PATCH 1/2] Filetype "txt" becomes the only supported value After the introduction of a complete file content recognizer, the only relevant value for PyMuPDF's open parameter "filetype" remaining is "txt". This change will implement this such that specifying filetype="txt" will open files or memory data as plain text Documents. Other values will be silently ignored and no longer lead to confusing behavior. --- docs/document.rst | 16 +++++----------- docs/how-to-open-a-file.rst | 13 +++++-------- src/__init__.py | 34 ++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/docs/document.rst b/docs/document.rst index 85b5e4c23..6ea5806f5 100644 --- a/docs/document.rst +++ b/docs/document.rst @@ -176,16 +176,12 @@ For details on **embedded files** refer to Appendix 3. * If ``stream`` is given, then the document is created from memory. * If ``stream`` is `None`, then a document is created from the file given by ``filename``. - :arg str,pathlib filename: A UTF-8 string or ``pathlib.Path`` object containing a file path. The document type is always determined from the file content. The ``filetype`` parameter can be used to ensure that the detected type is as expected or, respectively, to force treating any file as plain text. + :arg str,pathlib filename: A UTF-8 string or ``pathlib.Path`` object containing a file path. The document type is always determined from the file content. The ``filetype`` parameter can be used to override this and open the file as a plain text document. :arg bytes,bytearray,BytesIO stream: A memory area containing file data. The document type is **always** detected from the data content. The ``filetype`` parameter is ignored except for undetected data content. In that case only, using ``filetype="txt"`` will treat the data as containing plain text. 
- :arg str filetype: A string specifying the type of document. This may be anything looking like a filename (e.g. "x.pdf"), in which case MuPDF uses the extension to determine the type, or a mime type like ``application/pdf``. Just using strings like "pdf" or ".pdf" will also work. Can be omitted for :ref:`a supported document type`. + :arg str filetype: Currently only used to force opening the file as a plain text document. Use the value `"txt"` to achieve this. Before the implementation of MuPDF's file content recognizer, this parameter was essential to help determine the file type. As this is no longer necessary, other values are ignored. - If opening a file name / path only, it will be used to ensure that the detected type is as expected. An exception is raised for a mismatch. Using `filetype="txt"` will treat any file as containing plain text. - - When opening from memory, this parameter is ignored except for undetected data content. Only in that case, using ``filetype="txt"`` will treat the data as containing plain text. - :arg rect_like rect: a rectangle specifying the desired page size. This parameter is only meaningful for documents with a variable page layout ("reflowable" documents), like e-books or HTML, and ignored otherwise. If specified, it must be a non-empty, finite rectangle with top-left coordinates (0, 0). Together with parameter *fontsize*, each page will be accordingly laid out and hence also determine the number of pages. :arg float width: may used together with ``height`` as an alternative to ``rect`` to specify layout information. @@ -207,14 +203,12 @@ For details on **embedded files** refer to Appendix 3. 
Overview of possible forms, note: `open` is a synonym of `Document`:: >>> # from a file - >>> doc = pymupdf.open("some.xps") - >>> # handle wrong extension - >>> doc = pymupdf.open("some.file", filetype="xps") # assert expected type + >>> doc = pymupdf.open("some.file") # file type determined from content >>> doc = pymupdf.open("some.file", filetype="txt") # treat as plain text >>> >>> # from memory - >>> doc = pymupdf.open(stream=mem_area) # works for any supported type - >>> doc = pymupdf.open(stream=unknown-type, filetype="txt") # treat as plain text + >>> doc = pymupdf.open(stream=mem_area) # file type determined from content + >>> doc = pymupdf.open(stream=mem_area, filetype="txt") # treat as plain text >>> >>> # new empty PDF >>> doc = pymupdf.open() diff --git a/docs/how-to-open-a-file.rst b/docs/how-to-open-a-file.rst index 7cfb5012b..06c3b4c99 100644 --- a/docs/how-to-open-a-file.rst +++ b/docs/how-to-open-a-file.rst @@ -38,22 +38,19 @@ To open a file, do the following: File Recognizer: Opening with :index:`a Wrong File Extension ` """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -If you have a document with a wrong file extension for its type, do not worry: it will still be opened correctly, thanks to the integrated file "content recognizer". +If you have a document with a wrong file extension for its type, do not worry: it will still be opened correctly, thanks to the integrated file "content recognizer" of the base library. This component looks at the actual data in the file using a number of heuristics -- independent of the file extension. This of course is also true for file names **without** an extension. Here is a list of details about how the file content recognizer works: -* When opening from a file name, use the ``filetype`` parameter if you need to make sure that the created :ref:`Document` is of the expected type. An exception is raised for any mismatch. 
+* When opening from a file name or a memory area, all supported :ref:`Document` types are automatically recognized by their content. -* Text files are an exception: they do not contain recognizable internal structures at all. Here, the file extension ".txt" and the ``filetype`` parameter continue to play a role and are used to create a "Tex" document. Correspondingly, text files with other / no extensions, can successfully be opened using `filetype="txt"`. +* Text files are an exception: they do not contain recognizable internal structures at all. If opening from a file name with a known plain text extension (like "txt" or "text"), everything will still work. -* Using `filetype="txt"` will treat **any** file as containing plain text when opened from a file name / path -- even when its content is a supported document type. +* If opening from memory or from a file whose extension is not known to be plain text, then ``filetype="txt"`` must be specified. -* When opening from a stream, the file content recognizer will ignore the ``filetype`` parameter entirely for known file types -- even in case of a mismatch or when `filetype="txt"` was specified. - - * Streams with a known file type cannot be opened as plain text. - * Specifying ``filetype`` currently only has an effect when no match was found. Then using ``filetype="txt"`` will treat the file as containing plain text. +* Using `filetype="txt"` will treat **any** file as if containing plain text -- even when its content is a supported document type. 
---------- diff --git a/src/__init__.py b/src/__init__.py index ba2e9e7d9..13418fb14 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -2916,8 +2916,6 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 else: raise TypeError(f"bad stream: {type(stream)=}.") stream = self.stream - if not (filename or filetype): - filename = 'pdf' else: self.stream = None @@ -2962,21 +2960,37 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 # setting self.stream above ensures that the bytes will not be # garbage collected? data = mupdf.fz_open_memory(mupdf.python_buffer_data(c), len(c)) - magic = filename - if not magic: + if filename is not None: + magic = filename + elif filetype is not None: magic = filetype - # fixme: pymupdf does: - # handler = fz_recognize_document(gctx, filetype); - # if (!handler) raise ValueError( MSG_BAD_FILETYPE) - # but prefer to leave fz_open_document_with_stream() to raise. + else: + magic = "" + if magic.endswith(("txt", "text", "log")): + magic = "txt" + else: + magic = "" try: - doc = mupdf.fz_open_document_with_stream(magic, data) + if magic == "txt": + handler = mupdf.ll_fz_recognize_document(magic) + accel = mupdf.FzStream() + archive = mupdf.FzArchive(None) + doc = mupdf.ll_fz_document_handler_open( + handler, + data.m_internal, + accel.m_internal, + archive.m_internal, + None, # recognize_state + ) + doc = mupdf.FzDocument(doc) + else: + doc = mupdf.fz_open_document_with_stream(magic, data) except Exception as e: if g_exceptions_verbose > 1: exception_info() raise FileDataError('Failed to open stream') from e else: if filename: - if not filetype: + if filetype != "txt": try: doc = mupdf.fz_open_document(filename) except Exception as e: From 84563c74530e14fd73254e4d2069ed3520653835 Mon Sep 17 00:00:00 2001 From: "Jorj X. 
McKie" Date: Tue, 22 Apr 2025 08:03:39 -0400 Subject: [PATCH 2/2] Update test_general.py Updating file open tests to reflect logic adjusted to file recognizer. --- tests/test_general.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/tests/test_general.py b/tests/test_general.py index 62a634fe1..c3fb5851e 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -133,20 +133,6 @@ def test_pdfstring(): def test_open_exceptions(): - try: - pymupdf.open(filename, filetype="xps") - except RuntimeError as e: - assert repr(e).startswith("FileDataError") - else: - assert 0 - - try: - pymupdf.open(filename, filetype="xxx") - except Exception as e: - assert repr(e).startswith("ValueError") - else: - assert 0 - try: pymupdf.open("x.y") except Exception as e: @@ -155,7 +141,7 @@ def test_open_exceptions(): assert 0 try: - pymupdf.open(stream=b"", filetype="pdf") + pymupdf.open(stream=b"") except RuntimeError as e: assert repr(e).startswith("EmptyFileError") else: @@ -1393,7 +1379,8 @@ def check(filename=None, stream=None, filetype=None, exception=None): re.escape(f'mupdf.{etype2}: code=7: cannot recognize zip archive'), re.escape(f'pymupdf.FileDataError: Failed to open file {path!r} as type {filetype!r}.'), ) - check(path, filetype=filetype, exception=(etype, eregex)) + # this is no longer relevant: + # check(path, filetype=filetype, exception=(etype, eregex)) path = f'{resources}/chinese-tables.pickle' etype = pymupdf.FileDataError @@ -1551,11 +1538,6 @@ def test_3905(): pass else: assert 0 - wt = pymupdf.TOOLS.mupdf_warnings() - if pymupdf.mupdf_version_tuple >= (1, 26): - assert wt == 'format error: cannot find version marker\ntrying to repair broken xref\nrepairing PDF document' - else: - assert wt == 'format error: cannot recognize version marker\ntrying to repair broken xref\nrepairing PDF document' def test_3624(): path = os.path.normpath(f'{__file__}/../../tests/resources/test_3624.pdf')