From 01019071279137c1d74204d87c4dd74db12c7bfa Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Mon, 21 Apr 2025 13:39:14 -0400
Subject: [PATCH 1/2] Filetype "txt" becomes the only supported value

After the introduction of a complete file content recognizer, "txt" is the only remaining relevant value for the "filetype" parameter of PyMuPDF's open().
With this change, specifying filetype="txt" opens files or memory data as plain text Documents.
Other values are silently ignored and no longer lead to confusing behavior.
---
 docs/document.rst           | 16 +++++-----------
 docs/how-to-open-a-file.rst | 13 +++++--------
 src/__init__.py             | 34 ++++++++++++++++++++++++----------
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/docs/document.rst b/docs/document.rst
index 85b5e4c23..6ea5806f5 100644
--- a/docs/document.rst
+++ b/docs/document.rst
@@ -176,16 +176,12 @@ For details on **embedded files** refer to Appendix 3.
     * If ``stream`` is given, then the document is created from memory.
     * If ``stream`` is `None`, then a document is created from the file given by ``filename``. 
 
-    :arg str,pathlib filename: A UTF-8 string or ``pathlib.Path`` object containing a file path. The document type is always determined from the file content. The ``filetype`` parameter can be used to ensure that the detected type is as expected or, respectively, to force treating any file as plain text.
+    :arg str,pathlib filename: A UTF-8 string or ``pathlib.Path`` object containing a file path. The document type is always determined from the file content. The ``filetype`` parameter can be used to override this and open the file as a plain text document.
 
     :arg bytes,bytearray,BytesIO stream: A memory area containing file data. The document type is **always** detected from the data content. The ``filetype`` parameter is ignored except for undetected data content. In that case only, using ``filetype="txt"`` will treat the data as containing plain text.
 
-    :arg str filetype: A string specifying the type of document. This may be anything looking like a filename (e.g. "x.pdf"), in which case MuPDF uses the extension to determine the type, or a mime type like ``application/pdf``. Just using strings like "pdf"  or ".pdf" will also work. Can be omitted for :ref:`a supported document type<Supported_File_Types>`.
+    :arg str filetype: Currently only used to force opening the file as a plain text document. Use the value `"txt"` to achieve this. Before the implementation of MuPDF's file content recognizer, this parameter was essential to help determine the file type. As this is no longer necessary, other values are ignored.
     
-      If opening a file name / path only, it will be used to ensure that the detected type is as expected. An exception is raised for a mismatch. Using `filetype="txt"` will treat any file as containing plain text.
-      
-      When opening from memory, this parameter is ignored except for undetected data content. Only in that case, using ``filetype="txt"`` will treat the data as containing plain text.
-
     :arg rect_like rect: a rectangle specifying the desired page size. This parameter is only meaningful for documents with a variable page layout ("reflowable" documents), like e-books or HTML, and ignored otherwise. If specified, it must be a non-empty, finite rectangle with top-left coordinates (0, 0). Together with parameter *fontsize*, each page will be accordingly laid out and hence also determine the number of pages.
 
     :arg float width: may used together with ``height`` as an alternative to ``rect`` to specify layout information.
@@ -207,14 +203,12 @@ For details on **embedded files** refer to Appendix 3.
     Overview of possible forms, note: `open` is a synonym of `Document`::
 
         >>> # from a file
-        >>> doc = pymupdf.open("some.xps")
-        >>> # handle wrong extension
-        >>> doc = pymupdf.open("some.file", filetype="xps")  # assert expected type
+        >>> doc = pymupdf.open("some.file")  # file type determined from content
         >>> doc = pymupdf.open("some.file", filetype="txt")  # treat as plain text
         >>>
         >>> # from memory
-        >>> doc = pymupdf.open(stream=mem_area)  # works for any supported type
-        >>> doc = pymupdf.open(stream=unknown-type, filetype="txt")  # treat as plain text
+        >>> doc = pymupdf.open(stream=mem_area)  # file type determined from content
+        >>> doc = pymupdf.open(stream=mem_area, filetype="txt")  # treat as plain text
         >>>
         >>> # new empty PDF
         >>> doc = pymupdf.open()
diff --git a/docs/how-to-open-a-file.rst b/docs/how-to-open-a-file.rst
index 7cfb5012b..06c3b4c99 100644
--- a/docs/how-to-open-a-file.rst
+++ b/docs/how-to-open-a-file.rst
@@ -38,22 +38,19 @@ To open a file, do the following:
 File Recognizer: Opening with :index:`a Wrong File Extension <pair: wrong; file extension>`
 """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-If you have a document with a wrong file extension for its type, do not worry: it will still be opened correctly, thanks to the integrated file "content recognizer".
+If you have a document with a wrong file extension for its type, do not worry: it will still be opened correctly, thanks to the integrated file "content recognizer" of the base library.
 
 This component looks at the actual data in the file using a number of heuristics -- independent of the file extension. This of course is also true for file names **without** an extension.
 
 Here is a list of details about how the file content recognizer works:
 
-* When opening from a file name, use the ``filetype`` parameter if you need to make sure that the created :ref:`Document` is of the expected type. An exception is raised for any mismatch.
+* When opening from a file name or a memory area, all supported :ref:`Document` types are automatically recognized by their content.
 
-* Text files are an exception: they do not contain recognizable internal structures at all. Here, the file extension ".txt" and the ``filetype`` parameter continue to play a role and are used to create a "Tex" document. Correspondingly, text files with other / no extensions, can successfully be opened using `filetype="txt"`.
+* Text files are an exception: they do not contain recognizable internal structures at all. When opening from a file name with a known plain text extension (like "txt" or "text"), they are still opened correctly as plain text.
 
-* Using `filetype="txt"` will treat **any** file as containing plain text when opened from a file name / path -- even when its content is a supported document type.
+* When opening a text file from memory, or from a file name whose extension is not known to be plain text, ``filetype="txt"`` must be specified.
 
-* When opening from a stream, the file content recognizer will ignore the ``filetype`` parameter entirely for known file types -- even in case of a mismatch or when `filetype="txt"` was specified.
-
-    * Streams with a known file type cannot be opened as plain text.
-    * Specifying ``filetype`` currently only has an effect when no match was found. Then using ``filetype="txt"`` will treat the file as containing plain text.
+* Using `filetype="txt"` will treat **any** file as if containing plain text -- even when its content is a supported document type.
 
 
 ----------
diff --git a/src/__init__.py b/src/__init__.py
index ba2e9e7d9..13418fb14 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -2916,8 +2916,6 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
                 else:
                     raise TypeError(f"bad stream: {type(stream)=}.")
                 stream = self.stream
-                if not (filename or filetype):
-                    filename = 'pdf'
             else:
                 self.stream = None
 
@@ -2962,21 +2960,37 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
                     # setting self.stream above ensures that the bytes will not be
                     # garbage collected?
                     data = mupdf.fz_open_memory(mupdf.python_buffer_data(c), len(c))
-                magic = filename
-                if not magic:
+                if filename is not None:
+                    magic = filename
+                elif filetype is not None:
                     magic = filetype
-                # fixme: pymupdf does:
-                #   handler = fz_recognize_document(gctx, filetype);
-                #   if (!handler) raise ValueError( MSG_BAD_FILETYPE)
-                # but prefer to leave fz_open_document_with_stream() to raise.
+                else:
+                    magic = ""
+                if magic.endswith(("txt", "text", "log")):
+                    magic = "txt"
+                else:
+                    magic = ""
                 try:
-                    doc = mupdf.fz_open_document_with_stream(magic, data)
+                    if magic == "txt":
+                        handler = mupdf.ll_fz_recognize_document(magic)
+                        accel = mupdf.FzStream()
+                        archive = mupdf.FzArchive(None)
+                        doc = mupdf.ll_fz_document_handler_open(
+                                  handler,
+                                  data.m_internal,
+                                  accel.m_internal,
+                                  archive.m_internal,
+                                  None,   # recognize_state
+                        )
+                        doc = mupdf.FzDocument(doc)
+                    else:
+                        doc = mupdf.fz_open_document_with_stream(magic, data)
                 except Exception as e:
                     if g_exceptions_verbose > 1:    exception_info()
                     raise FileDataError('Failed to open stream') from e
             else:
                 if filename:
-                    if not filetype:
+                    if filetype != "txt":
                         try:
                             doc = mupdf.fz_open_document(filename)
                         except Exception as e:

From 84563c74530e14fd73254e4d2069ed3520653835 Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Tue, 22 Apr 2025 08:03:39 -0400
Subject: [PATCH 2/2] Update test_general.py

Update the file open tests to reflect the logic adjusted for the file content recognizer.
---
 tests/test_general.py | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/tests/test_general.py b/tests/test_general.py
index 62a634fe1..c3fb5851e 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -133,20 +133,6 @@ def test_pdfstring():
 
 
 def test_open_exceptions():
-    try:
-        pymupdf.open(filename, filetype="xps")
-    except RuntimeError as e:
-        assert repr(e).startswith("FileDataError")
-    else:
-        assert 0
-
-    try:
-        pymupdf.open(filename, filetype="xxx")
-    except Exception as e:
-        assert repr(e).startswith("ValueError")
-    else:
-        assert 0
-
     try:
         pymupdf.open("x.y")
     except Exception as e:
@@ -155,7 +141,7 @@ def test_open_exceptions():
         assert 0
 
     try:
-        pymupdf.open(stream=b"", filetype="pdf")
+        pymupdf.open(stream=b"")
     except RuntimeError as e:
         assert repr(e).startswith("EmptyFileError")
     else:
@@ -1393,7 +1379,8 @@ def check(filename=None, stream=None, filetype=None, exception=None):
             re.escape(f'mupdf.{etype2}: code=7: cannot recognize zip archive'),
             re.escape(f'pymupdf.FileDataError: Failed to open file {path!r} as type {filetype!r}.'),
             )
-    check(path, filetype=filetype, exception=(etype, eregex))
+    # this is no longer relevant:
+    # check(path, filetype=filetype, exception=(etype, eregex))
     
     path = f'{resources}/chinese-tables.pickle'
     etype = pymupdf.FileDataError
@@ -1551,11 +1538,6 @@ def test_3905():
         pass
     else:
         assert 0
-    wt = pymupdf.TOOLS.mupdf_warnings()
-    if pymupdf.mupdf_version_tuple >= (1, 26):
-        assert wt == 'format error: cannot find version marker\ntrying to repair broken xref\nrepairing PDF document'
-    else:
-        assert wt == 'format error: cannot recognize version marker\ntrying to repair broken xref\nrepairing PDF document'
 
 def test_3624():
     path = os.path.normpath(f'{__file__}/../../tests/resources/test_3624.pdf')