|
5 | 5 |
|
6 | 6 | import pytest |
7 | 7 | import requests |
| 8 | +from markdown.extensions.fenced_code import FencedCodeExtension |
8 | 9 | from pytest_mock import MockFixture |
9 | 10 |
|
10 | 11 | from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path |
@@ -341,3 +342,53 @@ def test_partition_fenced_code(): |
341 | 342 | assert elements[3].text == expected_xml |
342 | 343 |
|
343 | 344 | assert elements[4].text == expected_xml |
| 345 | + |
| 346 | + |
def test_partition_md_custom_extensions_parameter():
    """The `extensions` kwarg controls which Markdown extensions are applied (fixes #4006).

    Covers three configurations against the same fenced shell snippet: tables only,
    the default / explicit ``"fenced_code"`` name, and a `FencedCodeExtension` instance.
    """
    text = """```bash
# create the container
docker run -dt --name unstructured downloads.unstructured.io/unstructured-io/unstructured:latest
```"""

    expected_body = (
        "# create the container\n"
        "docker run -dt --name unstructured "
        "downloads.unstructured.io/unstructured-io/unstructured:latest"
    )

    # With only `tables` enabled the fence is not recognised, so the leading ``#``
    # line is parsed as a heading and a Title shows up (the undesired outcome).
    without_fence_support = partition_md(text=text, extensions=["tables"])
    titles = [el for el in without_fence_support if isinstance(el, Title)]
    assert titles

    # Both the default configuration and an explicit "fenced_code" entry keep the
    # whole block together as a single CodeSnippet element.
    default_elements = partition_md(text=text)
    assert len(default_elements) == 1

    elements_fenced = partition_md(text=text, extensions=["fenced_code"])
    assert len(elements_fenced) == 1
    snippet = elements_fenced[0]
    assert snippet.category == ElementType.CODE_SNIPPET
    assert snippet.text == expected_body

    # Passing an extension instance (the regular Python-Markdown API) must give the
    # same result as passing the extension's string name.
    elements_instance = partition_md(text=text, extensions=[FencedCodeExtension()])
    assert elements_instance == elements_fenced
| 374 | + |
| 375 | + |
def test_partition_md_extensions_not_list_raises():
    """A non-list `extensions` argument (here a tuple) is rejected with `ValueError`."""
    not_a_list = ("tables",)
    with pytest.raises(ValueError, match="'extensions' must be a list"):
        partition_md(text="# Hi", extensions=not_a_list)  # type: ignore[arg-type]
| 379 | + |
| 380 | + |
def test_partition_md_extensions_invalid_item_raises():
    """An `extensions` list holding an unsupported entry (an int) raises `ValueError`."""
    bad_entries = [42]
    with pytest.raises(ValueError, match="Each entry in 'extensions'"):
        partition_md(text="# Hi", extensions=bad_entries)  # type: ignore[list-item]
| 384 | + |
| 385 | + |
def test_partition_md_tables_only_differs_from_default_for_code_fence():
    """Dropping ``fenced_code`` lets a ``#`` line inside a fence become a Title (see #4006)."""
    text = """```bash
# line
```"""
    first_default = partition_md(text=text)[0]
    tables_only_els = partition_md(text=text, extensions=["tables"])

    # Default extensions keep the fenced block as a code snippet.
    assert first_default.category == ElementType.CODE_SNIPPET

    # With only `tables` enabled, at least one element is categorised as a title.
    tables_only_categories = [el.category for el in tables_only_els]
    assert ElementType.TITLE in tables_only_categories
0 commit comments