Skip to content

Commit 0c634f6

Browse files
authored
Merge branch 'main' into crag/review-pr-4291
2 parents 4c52c76 + 47f42b1 commit 0c634f6

4 files changed

Lines changed: 120 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
1+
## 0.22.4-dev0
2+
3+
### Enhancements
4+
5+
- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously when a `Table` element was separated into chunks, there was no way to reconstruct it. Each `TableChunk` now carries `table_id`, `chunk_index`, and `total_chunks` metadata so chunk sequences can be grouped and ordered, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns reconstructed `Table` objects with merged text and HTML.
6+
17
## 0.22.3
28

39
### Enhancements
4-
- **Add ability for `Table` element to be reconstructed from `TableChunk`s**: Previously when a `Table` element was separated into chunks, there was no way to reconstruct it. Each `TableChunk` now carries `table_id`, `chunk_index`, and `total_chunks` metadata so chunk sequences can be grouped and ordered, and a new `reconstruct_table_from_chunks()` function in `unstructured.chunking.dispatch` accepts a mixed list of chunked elements and returns reconstructed `Table` objects with merged text and HTML.
10+
11+
- **`partition_md` Markdown `extensions`**: Optional `extensions` list is passed to `markdown.markdown()`; entries may be registered names (`str`) or `markdown.extensions.Extension` instances. Defaults to `["tables", "fenced_code"]`. Invalid values raise `ValueError`.
512

613
## 0.22.2
714

test_unstructured/partition/test_auto.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os
77
import pathlib
88
import tempfile
9+
import time
910
import warnings
1011
from importlib import import_module
1112
from typing import Iterator
@@ -1337,11 +1338,31 @@ def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
13371338
)
13381339
def test_auto_partition_detects_pdf_language_per_element(strategy):
13391340
filename = example_doc_path("language-docs/fr_olap.pdf")
1340-
elements = partition(
1341-
filename=filename,
1342-
strategy=strategy,
1343-
detect_language_per_element=True,
1344-
)
1341+
1342+
def _partition() -> list[Element]:
1343+
return partition(
1344+
filename=filename,
1345+
strategy=strategy,
1346+
detect_language_per_element=True,
1347+
)
1348+
1349+
# OCR_ONLY shells out to Tesseract with a temp PNG; under CI load the file can disappear
1350+
# before Tesseract reads it ("cannot read input file"). Retry a few times on that flake.
1351+
if strategy == PartitionStrategy.OCR_ONLY:
1352+
from unstructured_pytesseract import TesseractError
1353+
1354+
elements: list[Element] | None = None
1355+
for attempt in range(3):
1356+
try:
1357+
elements = _partition()
1358+
break
1359+
except TesseractError as e:
1360+
if attempt == 2 or "cannot read input file" not in str(e).lower():
1361+
raise
1362+
time.sleep(0.25 * (attempt + 1))
1363+
assert elements is not None
1364+
else:
1365+
elements = _partition()
13451366

13461367
assert len(elements) > 0
13471368
assert elements[0].metadata.languages == ["fra"]

test_unstructured/partition/test_md.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pytest
77
import requests
8+
from markdown.extensions.fenced_code import FencedCodeExtension
89
from pytest_mock import MockFixture
910

1011
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
@@ -341,3 +342,53 @@ def test_partition_fenced_code():
341342
assert elements[3].text == expected_xml
342343

343344
assert elements[4].text == expected_xml
345+
346+
347+
def test_partition_md_custom_extensions_parameter():
    """The `extensions` kwarg selects which Markdown extensions are applied (fixes #4006)."""
    text = """```bash
# create the container
docker run -dt --name unstructured downloads.unstructured.io/unstructured-io/unstructured:latest
```"""

    command = (
        "docker run -dt --name unstructured "
        "downloads.unstructured.io/unstructured-io/unstructured:latest"
    )
    expected_body = f"# create the container\n{command}"

    # -- with only "tables" enabled the fence is not honored, so the ``#`` line inside it
    # -- surfaces as a heading (the undesired outcome)
    tables_only = partition_md(text=text, extensions=["tables"])
    assert any(isinstance(element, Title) for element in tables_only)

    # -- the default extension set keeps the whole fenced block as a single element --
    default_elements = partition_md(text=text)
    assert len(default_elements) == 1

    # -- explicitly requesting "fenced_code" also yields exactly one element: a code snippet --
    fenced = partition_md(text=text, extensions=["fenced_code"])
    assert len(fenced) == 1
    snippet = fenced[0]
    assert snippet.category == ElementType.CODE_SNIPPET
    assert snippet.text == expected_body

    # -- an ``Extension`` instance (the regular Python-Markdown API) gives the same result as
    # -- the registered extension name
    from_instance = partition_md(text=text, extensions=[FencedCodeExtension()])
    assert from_instance == fenced
374+
375+
376+
def test_partition_md_extensions_not_list_raises():
    """A tuple passed as `extensions` is rejected with a `ValueError`."""
    not_a_list = ("tables",)

    with pytest.raises(ValueError, match="'extensions' must be a list"):
        partition_md(text="# Hi", extensions=not_a_list)  # type: ignore[arg-type]
379+
380+
381+
def test_partition_md_extensions_invalid_item_raises():
    """A list entry that is neither `str` nor `Extension` is rejected with a `ValueError`."""
    bad_entries = [42]

    with pytest.raises(ValueError, match="Each entry in 'extensions'"):
        partition_md(text="# Hi", extensions=bad_entries)  # type: ignore[list-item]
384+
385+
386+
def test_partition_md_tables_only_differs_from_default_for_code_fence():
    """Dropping ``fenced_code`` lets a ``#`` line inside a fence become a Title (see #4006)."""
    text = "```bash\n# line\n```"

    # -- default extensions: the first element produced is the code snippet --
    first_default = partition_md(text=text)[0]
    # -- "tables" only: collect the categories so the heading can be looked up --
    tables_only_categories = [
        element.category for element in partition_md(text=text, extensions=["tables"])
    ]

    assert first_default.category == ElementType.CODE_SNIPPET
    assert ElementType.TITLE in tables_only_categories

unstructured/partition/md.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import markdown
66
import requests
7+
from markdown.extensions import Extension
78

89
from unstructured.documents.elements import Element
910
from unstructured.file_utils.encoding import read_txt_file
@@ -21,6 +22,28 @@ def optional_decode(contents: str | bytes) -> str:
2122

2223
DETECTION_ORIGIN: str = "md"
2324

25+
# -- Markdown extensions used by `partition_md` when the caller passes no `extensions` kwarg --
_DEFAULT_MARKDOWN_EXTENSIONS: list[str] = ["tables", "fenced_code"]
26+
27+
28+
def _validate_markdown_extensions(extensions: Any) -> list[Any]:
29+
"""Return ``extensions`` if it is a list of strings and/or ``Extension`` instances.
30+
31+
Python-Markdown accepts extension entry points as registered names (``str``) or configured
32+
``Extension`` instances; both are supported here. Any other shape raises ``ValueError``.
33+
"""
34+
if not isinstance(extensions, list):
35+
raise ValueError(
36+
"'extensions' must be a list of extension names (str) and/or "
37+
f"markdown.extensions.Extension instances, got {type(extensions).__name__!r}"
38+
)
39+
for item in extensions:
40+
if not isinstance(item, (str, Extension)):
41+
raise ValueError(
42+
"Each entry in 'extensions' must be a str or markdown.extensions.Extension "
43+
f"instance, got {type(item).__name__}: {item!r}"
44+
)
45+
return extensions
46+
2447

2548
def partition_md(
2649
filename: str | None = None,
@@ -49,6 +72,12 @@ def partition_md(
4972
languages
5073
The languages present in the document. Use ``["auto"]`` to detect (default when None).
5174
Use ``[""]`` to disable language detection.
75+
76+
Other keyword arguments are forwarded to ``partition_html``. In addition, ``extensions`` may be
77+
passed to ``markdown.markdown()`` as a list of registered extension names (``str``) and/or
78+
configured ``markdown.extensions.Extension`` instances. The default is
79+
``["tables", "fenced_code"]``. Pass e.g. ``extensions=["tables"]`` if you need the legacy
80+
behavior where ``#`` inside a fenced code block is parsed as a heading (see #4006).
5281
"""
5382
if text is None:
5483
text = ""
@@ -77,7 +106,12 @@ def partition_md(
77106

78107
text = response.text
79108

80-
html = markdown.markdown(text, extensions=["tables", "fenced_code"])
109+
# -- optional markdown extensions; default matches historical partition_md behavior --
110+
extensions = _validate_markdown_extensions(
111+
kwargs.pop("extensions", _DEFAULT_MARKDOWN_EXTENSIONS)
112+
)
113+
114+
html = markdown.markdown(text, extensions=extensions)
81115

82116
html_kwargs: dict[str, Any] = {
83117
"text": html,

0 commit comments

Comments
 (0)