Skip to content

Commit 95ef131

Browse files
committed
Support documents with Archives
Allow "archives" parameters in Document creation.
1 parent 4c37225 commit 95ef131

2 files changed

Lines changed: 46 additions & 17 deletions

File tree

docs/pymupdf4llm/api.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,29 @@ The PyMuPDF4LLM API
279279
}
280280
}
281281

282+
.. method:: markdown_to_pdf(md_path: str | pathlib.Path, \
283+
user_css: str | None = None, \
284+
page_rect: rect-like | None = None, \
285+
margins: rect-like | None = None, \
286+
archive: str | pathlib.Path | None = None, \
287+
output_path: str | pathlib.Path | None = None) -> pymupdf.Document | None
288+
289+
Convert the markdown text content of the file specified by `md_path` into a PDF document.
290+
291+
The function is always available -- independently of whether you are using the PyMuPDF Layout module or not.
292+
293+
:arg str|Path md_path: the file path of the markdown file to be converted.
294+
295+
:arg str|None user_css: optional, a string of CSS code to be applied to the markdown content. This may be used to customize the appearance of the generated PDF document. If `None` (default), the built-in default CSS is used.
296+
297+
:arg rect-like|None page_rect: optional, the rectangle defining the page boundaries for the generated PDF document. If `None` (default), ISO A4 page dimensions are used. To use one of PyMuPDF's predefined page formats, use e.g. ``pymupdf.paper_rect("Letter")``.
298+
299+
:arg rect-like|None margins: optional, the margins (borders) for the generated pages. This must be a sequence of four floats ``[left, top, right, bottom]`` specifying the respective border width in points (1/72 inches). If `None` (default), the default ``[50, 50, 50, 50]`` margins are used.
300+
301+
:arg str|Archive|None archive: optional. This is required if the markdown source references images that are **not** stored in the same folder as the markdown file. In this case, `archive` must be a `pymupdf.Archive` object which provides access to the respective image files. If `None` (default), it is assumed that all referenced images are stored in the same folder as the markdown file. The parameter **may** also be required if a custom ``user_css`` references external resources like font files.
302+
303+
:arg str|Path|None output_path: optional, the file path where the generated PDF document will be saved. If specified, the generated PDF will be saved to that location. If `None` (default), the document is returned as a `pymupdf.Document` object.
304+
282305
.. note::
283306

284307
Please see `this site <https://github.com/pymupdf/pymupdf4llm/discussions/327>`_ for more background and the current status of further improvements regarding usage with :ref:`PyMuPDF Layout <pymupdf-layout>`.

src/__init__.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2897,7 +2897,7 @@ def __getitem__(self, i=0):
28972897
raise IndexError(f"page {i} not in document")
28982898
return self.load_page(i)
28992899

2900-
def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0, height=0, fontsize=11):
2900+
def __init__(self, filename=None, stream=None, filetype=None, archive=None, rect=None, width=0, height=0, fontsize=11):
29012901
"""Creates a document. Use 'open' as a synonym.
29022902

29032903
Notes:
@@ -2943,7 +2943,16 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
29432943

29442944
self._name = filename
29452945
self.stream = stream
2946-
2946+
if isinstance(archive, pathlib.Path):
2947+
archive = Archive(archive.name)
2948+
elif isinstance(archive, str):
2949+
archive = Archive(archive)
2950+
if archive and not isinstance(archive, Archive):
2951+
raise TypeError(f"bad archive: {type(archive)=}.")
2952+
if archive:
2953+
archive_parm = archive.this # pass this to open
2954+
else:
2955+
archive_parm = None # means: no archive present
29472956
if stream is not None:
29482957
if filename is not None and filetype is None:
29492958
# 2025-05-06: Use <filename> as the filetype. This is
@@ -2958,6 +2967,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
29582967
stream = stream.getvalue()
29592968
else:
29602969
raise TypeError(f"bad stream: {type(stream)=}.")
2970+
2971+
# this prevents bad things if original goes out of existence:
29612972
self.stream = stream
29622973

29632974
assert isinstance(stream, (bytes, memoryview))
@@ -2967,9 +2978,9 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
29672978
# raise a specific exception.
29682979
raise EmptyFileError('Cannot open empty stream.')
29692980

2970-
stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
2981+
fz_stream = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
29712982
try:
2972-
doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2)
2983+
doc = mupdf.fz_open_document_with_stream_and_dir(filetype if filetype else '', fz_stream, archive_parm)
29732984
except Exception as e:
29742985
if g_exceptions_verbose > 1: exception_info()
29752986
raise FileDataError('Failed to open stream') from e
@@ -2996,20 +3007,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
29963007
raise EmptyFileError(f'Cannot open empty file: {filename=}.')
29973008

29983009
if filetype:
2999-
# Override the type implied by <filename>. MuPDF does not
3000-
# have a way to do this directly so we open via a stream.
3001-
try:
3002-
fz_stream = mupdf.fz_open_file(filename)
3003-
doc = mupdf.fz_open_document_with_stream(filetype, fz_stream)
3004-
except Exception as e:
3005-
if g_exceptions_verbose > 1: exception_info()
3006-
raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e
3010+
suffix = filetype
30073011
else:
3008-
try:
3009-
doc = mupdf.fz_open_document(filename)
3010-
except Exception as e:
3011-
if g_exceptions_verbose > 1: exception_info()
3012-
raise FileDataError(f'Failed to open file {filename!r}.') from e
3012+
suffix = Path(filename).suffix
3013+
try:
3014+
fz_stream = mupdf.fz_open_file(filename)
3015+
doc = mupdf.fz_open_document_with_stream_and_dir(suffix, fz_stream, archive_parm)
3016+
except Exception as e:
3017+
if g_exceptions_verbose > 1: exception_info()
3018+
raise FileDataError(f'Failed to open file {filename!r} as type {suffix}.') from e
30133019

30143020
else:
30153021
pdf = mupdf.PdfDocument()

0 commit comments

Comments
 (0)