Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.22.3

### Enhancements
- **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.

## 0.22.2

- Store routing in ElementMetadata
Expand Down
31 changes: 26 additions & 5 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import pathlib
import tempfile
import time
import warnings
from importlib import import_module
from typing import Iterator
Expand Down Expand Up @@ -1337,11 +1338,31 @@ def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
)
def test_auto_partition_detects_pdf_language_per_element(strategy):
filename = example_doc_path("language-docs/fr_olap.pdf")
elements = partition(
filename=filename,
strategy=strategy,
detect_language_per_element=True,
)

def _partition() -> list[Element]:
return partition(
filename=filename,
strategy=strategy,
detect_language_per_element=True,
)

# OCR_ONLY shells out to Tesseract with a temp PNG; under CI load the file can disappear
# before Tesseract reads it ("cannot read input file"). Retry a few times on that flake.
if strategy == PartitionStrategy.OCR_ONLY:
from unstructured_pytesseract import TesseractError

elements: list[Element] | None = None
for attempt in range(3):
try:
elements = _partition()
break
except TesseractError as e:
if attempt == 2 or "cannot read input file" not in str(e).lower():
raise
time.sleep(0.25 * (attempt + 1))
assert elements is not None
else:
elements = _partition()

assert len(elements) > 0
assert elements[0].metadata.languages == ["fra"]
Expand Down
51 changes: 51 additions & 0 deletions test_unstructured/partition/test_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pytest
import requests
from markdown.extensions.fenced_code import FencedCodeExtension
from pytest_mock import MockFixture

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
Expand Down Expand Up @@ -341,3 +342,53 @@ def test_partition_fenced_code():
assert elements[3].text == expected_xml

assert elements[4].text == expected_xml


def test_partition_md_custom_extensions_parameter():
    """The `extensions` kwarg overrides the Markdown extensions used by `partition_md` (#4006)."""
    text = """```bash
# create the container
docker run -dt --name unstructured downloads.unstructured.io/unstructured-io/unstructured:latest
```"""
    expected_body = "\n".join(
        [
            "# create the container",
            "docker run -dt --name unstructured "
            "downloads.unstructured.io/unstructured-io/unstructured:latest",
        ]
    )

    # -- with only "tables" enabled the fence is not honored: the `#` line yields a Title --
    tables_only = partition_md(text=text, extensions=["tables"])
    assert any(isinstance(element, Title) for element in tables_only)

    # -- the default extension set produces a single element for the whole fence --
    default_elements = partition_md(text=text)
    assert len(default_elements) == 1

    # -- naming "fenced_code" explicitly yields one CodeSnippet holding the fence body --
    fenced_by_name = partition_md(text=text, extensions=["fenced_code"])
    assert len(fenced_by_name) == 1
    code_element = fenced_by_name[0]
    assert code_element.category == ElementType.CODE_SNIPPET
    assert code_element.text == expected_body

    # -- an Extension instance must give the same elements as the registered name --
    fenced_by_instance = partition_md(text=text, extensions=[FencedCodeExtension()])
    assert fenced_by_instance == fenced_by_name


def test_partition_md_extensions_not_list_raises():
    """A non-list `extensions` value (a tuple here) is rejected with `ValueError`."""
    not_a_list = ("tables",)
    with pytest.raises(ValueError, match="'extensions' must be a list"):
        partition_md(text="# Hi", extensions=not_a_list)  # type: ignore[arg-type]


def test_partition_md_extensions_invalid_item_raises():
    """A list entry that is neither `str` nor `Extension` is rejected with `ValueError`."""
    bad_entries = [42]
    with pytest.raises(ValueError, match="Each entry in 'extensions'"):
        partition_md(text="# Hi", extensions=bad_entries)  # type: ignore[list-item]


def test_partition_md_tables_only_differs_from_default_for_code_fence():
    """Dropping ``fenced_code`` lets a ``#`` line inside a fence become a Title (see #4006)."""
    fenced_md = "```bash\n# line\n```"

    # -- default extensions: the first element is the code snippet --
    first_default = partition_md(text=fenced_md)[0]
    # -- "tables" only: the fence is not recognized --
    tables_only = partition_md(text=fenced_md, extensions=["tables"])

    assert first_default.category == ElementType.CODE_SNIPPET
    assert any(element.category == ElementType.TITLE for element in tables_only)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.2" # pragma: no cover
__version__ = "0.22.3" # pragma: no cover
36 changes: 35 additions & 1 deletion unstructured/partition/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import markdown
import requests
from markdown.extensions import Extension

from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import read_txt_file
Expand All @@ -21,6 +22,28 @@ def optional_decode(contents: str | bytes) -> str:

DETECTION_ORIGIN: str = "md"

_DEFAULT_MARKDOWN_EXTENSIONS: list[str] = ["tables", "fenced_code"]


def _validate_markdown_extensions(extensions: Any) -> list[Any]:
    """Check that ``extensions`` is acceptable for ``markdown.markdown()`` and hand it back.

    Accepted input is a ``list`` whose entries are each either a registered extension name
    (``str``) or a configured ``markdown.extensions.Extension`` instance. The same list object is
    returned unchanged.

    Raises:
        ValueError: if ``extensions`` is not a ``list``, or if any entry is neither a ``str`` nor
            an ``Extension`` instance.
    """
    if isinstance(extensions, list):
        for entry in extensions:
            if isinstance(entry, (str, Extension)):
                continue
            # -- first offending entry is reported with both its type and its repr --
            raise ValueError(
                "Each entry in 'extensions' must be a str or markdown.extensions.Extension "
                f"instance, got {type(entry).__name__}: {entry!r}"
            )
        return extensions

    # -- anything that is not exactly a list (tuple, str, None, ...) is rejected --
    raise ValueError(
        "'extensions' must be a list of extension names (str) and/or "
        f"markdown.extensions.Extension instances, got {type(extensions).__name__!r}"
    )


def partition_md(
filename: str | None = None,
Expand Down Expand Up @@ -49,6 +72,12 @@ def partition_md(
languages
The languages present in the document. Use ``["auto"]`` to detect (default when None).
Use ``[""]`` to disable language detection.

Other keyword arguments are forwarded to ``partition_html``. In addition, ``extensions`` may be
given to choose the extensions passed to ``markdown.markdown()``: a list of registered
extension names (``str``) and/or configured ``markdown.extensions.Extension`` instances. The
default is ``["tables", "fenced_code"]``; any other shape raises ``ValueError``. Pass e.g.
``extensions=["tables"]`` to disable fenced-code handling, in which case a ``#`` line inside a
code fence is parsed as a heading (see #4006).
"""
if text is None:
text = ""
Expand Down Expand Up @@ -77,7 +106,12 @@ def partition_md(

text = response.text

html = markdown.markdown(text, extensions=["tables", "fenced_code"])
# -- optional markdown extensions; default matches historical partition_md behavior --
extensions = _validate_markdown_extensions(
kwargs.pop("extensions", _DEFAULT_MARKDOWN_EXTENSIONS)
)

html = markdown.markdown(text, extensions=extensions)

html_kwargs: dict[str, Any] = {
"text": html,
Expand Down
Loading