diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d7e9988fc..6baa8f7258 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.22.3 + +### Enhancements +- **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`. + ## 0.22.2 - Store routing in ElementMetadata diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index da54e80466..d7927ccc92 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -6,6 +6,7 @@ import os import pathlib import tempfile +import time import warnings from importlib import import_module from typing import Iterator @@ -1337,11 +1338,31 @@ def test_auto_partition_passes_user_provided_languages_arg_to_PDF(): ) def test_auto_partition_detects_pdf_language_per_element(strategy): filename = example_doc_path("language-docs/fr_olap.pdf") - elements = partition( - filename=filename, - strategy=strategy, - detect_language_per_element=True, - ) + + def _partition() -> list[Element]: + return partition( + filename=filename, + strategy=strategy, + detect_language_per_element=True, + ) + + # OCR_ONLY shells out to Tesseract with a temp PNG; under CI load the file can disappear + # before Tesseract reads it ("cannot read input file"). Retry a few times on that flake. + if strategy == PartitionStrategy.OCR_ONLY: + from unstructured_pytesseract import TesseractError + + elements: list[Element] | None = None + for attempt in range(3): + try: + elements = _partition() + break + except TesseractError as e: + if attempt == 2 or "cannot read input file" not in str(e).lower(): + raise + time.sleep(0.25 * (attempt + 1)) + assert elements is not None + else: + elements = _partition() assert len(elements) > 0 assert elements[0].metadata.languages == ["fra"] diff --git a/test_unstructured/partition/test_md.py b/test_unstructured/partition/test_md.py index bf294dbdb9..6ae8b98872 100644 --- a/test_unstructured/partition/test_md.py +++ b/test_unstructured/partition/test_md.py @@ -5,6 +5,7 @@ import pytest import requests +from markdown.extensions.fenced_code import FencedCodeExtension from pytest_mock import MockFixture from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path @@ -341,3 +342,53 @@ def test_partition_fenced_code(): assert elements[3].text == expected_xml assert elements[4].text == expected_xml + + +def test_partition_md_custom_extensions_parameter(): + """User can override markdown extensions via `extensions` kwarg (fixes #4006).""" + text = """```bash +# create the container +docker run -dt --name unstructured downloads.unstructured.io/unstructured-io/unstructured:latest +```""" + + expected_body = ( + "# create the container\n" + "docker run -dt --name unstructured " + "downloads.unstructured.io/unstructured-io/unstructured:latest" + ) + + # Without fenced_code, ``#`` inside the fence is parsed as a heading (undesired). + elements_tables_only = partition_md(text=text, extensions=["tables"]) + assert any(isinstance(el, Title) for el in elements_tables_only) + + # Default and explicit fenced_code keep the block as one element (CodeSnippet from HTML). + assert len(partition_md(text=text)) == 1 + elements_fenced = partition_md(text=text, extensions=["fenced_code"]) + assert len(elements_fenced) == 1 + assert elements_fenced[0].category == ElementType.CODE_SNIPPET + assert elements_fenced[0].text == expected_body + + # Extension instances (normal Python-Markdown API) match string extension names. + elements_instance = partition_md(text=text, extensions=[FencedCodeExtension()]) + assert elements_instance == elements_fenced + + +def test_partition_md_extensions_not_list_raises(): + with pytest.raises(ValueError, match="'extensions' must be a list"): + partition_md(text="# Hi", extensions=("tables",)) # type: ignore[arg-type] + + +def test_partition_md_extensions_invalid_item_raises(): + with pytest.raises(ValueError, match="Each entry in 'extensions'"): + partition_md(text="# Hi", extensions=[42]) # type: ignore[list-item] + + +def test_partition_md_tables_only_differs_from_default_for_code_fence(): + """Without ``fenced_code``, ``#`` inside a fence can become a Title (see #4006).""" + text = """```bash +# line +```""" + default_el = partition_md(text=text)[0] + tables_only_els = partition_md(text=text, extensions=["tables"]) + assert default_el.category == ElementType.CODE_SNIPPET + assert any(e.category == ElementType.TITLE for e in tables_only_els) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 31dba6e8f3..f0c78174b4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.22.2" # pragma: no cover +__version__ = "0.22.3" # pragma: no cover diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py index 9677d41493..731faf32aa 100644 --- a/unstructured/partition/md.py +++ b/unstructured/partition/md.py @@ -4,6 +4,7 @@ import markdown import requests +from markdown.extensions import Extension from unstructured.documents.elements import Element from unstructured.file_utils.encoding import read_txt_file @@ -21,6 +22,28 @@ def optional_decode(contents: str | bytes) -> str: DETECTION_ORIGIN: str = "md" +_DEFAULT_MARKDOWN_EXTENSIONS: list[str] = ["tables", "fenced_code"] + + +def _validate_markdown_extensions(extensions: Any) -> list[Any]: + """Return ``extensions`` if it is a list of strings and/or ``Extension`` instances. + + Python-Markdown accepts extension entry points as registered names (``str``) or configured + ``Extension`` instances; both are supported here. Any other shape raises ``ValueError``. + """ + if not isinstance(extensions, list): + raise ValueError( + "'extensions' must be a list of extension names (str) and/or " + f"markdown.extensions.Extension instances, got {type(extensions).__name__!r}" + ) + for item in extensions: + if not isinstance(item, (str, Extension)): + raise ValueError( + "Each entry in 'extensions' must be a str or markdown.extensions.Extension " + f"instance, got {type(item).__name__}: {item!r}" + ) + return extensions + def partition_md( filename: str | None = None, @@ -49,6 +72,12 @@ def partition_md( languages The languages present in the document. Use ``["auto"]`` to detect (default when None). Use ``[""]`` to disable language detection. + + Other keyword arguments are forwarded to ``partition_html``. In addition, ``extensions`` may be + passed to ``markdown.markdown()`` as a list of registered extension names (``str``) and/or + configured ``markdown.extensions.Extension`` instances. The default is + ``["tables", "fenced_code"]``. Pass e.g. ``extensions=["tables"]`` if you need the legacy + behavior where ``#`` inside unfenced content is parsed as a heading (see #4006). """ if text is None: text = "" @@ -77,7 +106,12 @@ def partition_md( text = response.text - html = markdown.markdown(text, extensions=["tables", "fenced_code"]) + # -- optional markdown extensions; default matches historical partition_md behavior -- + extensions = _validate_markdown_extensions( + kwargs.pop("extensions", _DEFAULT_MARKDOWN_EXTENSIONS) + ) + + html = markdown.markdown(text, extensions=extensions) html_kwargs: dict[str, Any] = { "text": html,