Skip to content

Commit 9da4604

Browse files
fix: update
1 parent 475dcce commit 9da4604

3 files changed

Lines changed: 53 additions & 21 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
## 0.22.2
22

33
### Enhancements
4-
- **`partition_md` Markdown `extensions`**: Optional kwarg passed to `markdown.markdown()`; defaults to `["tables", "fenced_code"]`. Override with e.g. `extensions=["tables"]` when needed.
4+
- **`partition_md` Markdown `extensions` (fixes #4006)**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.
55

66
## 0.22.1
77

test_unstructured/partition/test_md.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pytest
77
import requests
8+
from markdown.extensions.fenced_code import FencedCodeExtension
89
from pytest_mock import MockFixture
910

1011
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
@@ -367,15 +368,27 @@ def test_partition_md_custom_extensions_parameter():
367368
assert elements_fenced[0].category == ElementType.CODE_SNIPPET
368369
assert elements_fenced[0].text == expected_body
369370

371+
# Extension instances (normal Python-Markdown API) match string extension names.
372+
elements_instance = partition_md(text=text, extensions=[FencedCodeExtension()])
373+
assert elements_instance == elements_fenced
370374

371-
def test_partition_md_invalid_extensions_logs_and_falls_back(mocker: MockFixture):
372-
"""Invalid `extensions` value is ignored with a warning and falls back to the default list."""
373-
text = "# Heading"
374-
logger = mocker.patch("unstructured.partition.md.logging.warning")
375375

376-
elements = partition_md(text=text, extensions="not-a-list") # type: ignore[arg-type]
376+
def test_partition_md_extensions_not_list_raises():
    """A tuple is rejected: ``extensions`` has to be an actual ``list``."""
    not_a_list = ("tables",)
    with pytest.raises(ValueError, match="'extensions' must be a list"):
        partition_md(text="# Hi", extensions=not_a_list)  # type: ignore[arg-type]
377379

378-
# Still parses something
379-
assert len(elements) > 0
380-
# Warning was logged
381-
logger.assert_called_once()
380+
381+
def test_partition_md_extensions_invalid_item_raises():
    """A list entry that is neither a ``str`` nor an ``Extension`` raises ``ValueError``."""
    bad_entries = [42]
    with pytest.raises(ValueError, match="Each entry in 'extensions'"):
        partition_md(text="# Hi", extensions=bad_entries)  # type: ignore[list-item]
384+
385+
386+
def test_partition_md_tables_only_differs_from_default_for_code_fence():
    """Default extensions keep a fenced ``#`` line as code; ``["tables"]`` alone may not.

    With only ``tables`` enabled the fence is not recognized, so the ``#`` line inside it can
    surface as a Title element (see #4006).
    """
    fenced_snippet = "```bash\n# line\n```"

    # -- same call order as before: default extensions first, then the tables-only variant --
    first_default_element = partition_md(text=fenced_snippet)[0]
    tables_only_elements = partition_md(text=fenced_snippet, extensions=["tables"])

    assert first_default_element.category == ElementType.CODE_SNIPPET
    assert any(element.category == ElementType.TITLE for element in tables_only_elements)

unstructured/partition/md.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from __future__ import annotations
22

3-
import logging
43
from typing import IO, Any, Optional
54

65
import markdown
76
import requests
7+
from markdown.extensions import Extension
88

99
from unstructured.documents.elements import Element
1010
from unstructured.file_utils.encoding import read_txt_file
@@ -22,6 +22,28 @@ def optional_decode(contents: str | bytes) -> str:
2222

2323
DETECTION_ORIGIN: str = "md"
2424

25+
_DEFAULT_MARKDOWN_EXTENSIONS: list[str] = ["tables", "fenced_code"]
26+
27+
28+
def _validate_markdown_extensions(extensions: Any) -> list[Any]:
    """Validate ``extensions`` and return a shallow copy of it.

    Python-Markdown accepts each extension either as a registered name (``str``) or as a
    configured ``markdown.extensions.Extension`` instance; both forms are accepted here.

    Args:
        extensions: Candidate value for the ``extensions`` argument of ``markdown.markdown()``.

    Returns:
        A new list holding the same entries in the same order. A copy is returned so that
        neither the caller's list nor the shared module-level default list is aliased by
        whatever consumes the result.

    Raises:
        ValueError: If ``extensions`` is not a ``list``, or if any entry is neither a ``str``
            nor an ``Extension`` instance.
    """
    if not isinstance(extensions, list):
        raise ValueError(
            "'extensions' must be a list of extension names (str) and/or "
            f"markdown.extensions.Extension instances, got {type(extensions).__name__!r}"
        )
    for item in extensions:
        if not isinstance(item, (str, Extension)):
            raise ValueError(
                "Each entry in 'extensions' must be a str or markdown.extensions.Extension "
                f"instance, got {type(item).__name__}: {item!r}"
            )
    # -- copy so later mutation by a consumer cannot leak into the caller's or default list --
    return list(extensions)
46+
2547

2648
def partition_md(
2749
filename: str | None = None,
@@ -52,9 +74,10 @@ def partition_md(
5274
Use ``[""]`` to disable language detection.
5375
5476
Other keyword arguments are forwarded to ``partition_html``. In addition, ``extensions`` may be
55-
passed to select Python-Markdown extensions. The default is ``["tables", "fenced_code"]``.
56-
Pass e.g. ``extensions=["tables"]`` if you need the legacy behavior where ``#`` inside unfenced
57-
content is parsed as a heading (see #4006).
77+
passed to ``markdown.markdown()`` as a list of registered extension names (``str``) and/or
78+
configured ``markdown.extensions.Extension`` instances. The default is
79+
``["tables", "fenced_code"]``. Pass e.g. ``extensions=["tables"]`` if you need the legacy
80+
behavior where a ``#`` line inside a fenced code block is parsed as a heading (see #4006).
5881
"""
5982
if text is None:
6083
text = ""
@@ -84,13 +107,9 @@ def partition_md(
84107
text = response.text
85108

86109
# -- optional markdown extensions; default matches historical partition_md behavior --
87-
_default_extensions = ["tables", "fenced_code"]
88-
extensions = kwargs.pop("extensions", _default_extensions)
89-
if not (isinstance(extensions, list) and all(isinstance(ext, str) for ext in extensions)):
90-
logging.warning(
91-
"Ignoring invalid 'extensions' argument (expected list of strings): %r", extensions
92-
)
93-
extensions = _default_extensions
110+
extensions = _validate_markdown_extensions(
111+
kwargs.pop("extensions", _DEFAULT_MARKDOWN_EXTENSIONS)
112+
)
94113

95114
html = markdown.markdown(text, extensions=extensions)
96115

0 commit comments

Comments
 (0)