diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f3ca1a500..52318727d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.21.6 + +### Enhancements +- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, output_format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file. + ## 0.21.5 ### Fixes diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index eba155033b..94a572b66d 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -664,6 +664,141 @@ def test_elements_to_md_file_output(): os.unlink(tmp_filename) +def test_create_file_from_elements_markdown(): + """Test create_file_from_elements with format=markdown returns and optionally writes file.""" + elements = [Title("Heading"), NarrativeText("Some body text.")] + content = base.create_file_from_elements(elements, output_format="markdown") + assert content == "# Heading\nSome body text." 
+ + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp_file: + tmp_filename = tmp_file.name + try: + out = base.create_file_from_elements( + elements, output_format="markdown", filename=tmp_filename + ) + assert out == content + with open(tmp_filename) as f: + assert f.read() == content + finally: + if os.path.exists(tmp_filename): + os.unlink(tmp_filename) + + +def test_create_file_from_elements_text(): + """Test create_file_from_elements with format=text.""" + elements = [Title("A"), NarrativeText("B")] + content = base.create_file_from_elements(elements, output_format="text") + assert content == "A\nB" + + +def test_create_file_from_elements_html(): + """Test create_file_from_elements with format=html returns HTML.""" + elements = [Title("Page"), NarrativeText("Content")] + content = base.create_file_from_elements(elements, output_format="html") + assert "" in content + assert "Page" in content + assert "Content" in content + + +def test_create_file_from_elements_unsupported_format(): + """Test create_file_from_elements raises for unsupported format.""" + elements = [Title("X")] + with pytest.raises(ValueError, match="Unsupported format"): + base.create_file_from_elements(elements, output_format="pdf") + + +def test_create_file_from_elements_html_group_by_page_drops_elements_without_page_number(): + """With no_group_by_page=False, elements without page_number are skipped (body empty).""" + elements = [Title("Page"), NarrativeText("Content")] + content = base.create_file_from_elements(elements, output_format="html", no_group_by_page=False) + assert "" in content + # Elements without metadata.page_number are not included when grouping by page + assert "Page" not in content + assert "Content" not in content + + +@pytest.mark.parametrize( + ("format_name", "expected_in_content"), + [ + ("markdown", "# Heading\nSome body text."), + ("text", "Heading\nSome body text."), + ("html", " 'markdown').""" + elements = [Title("Heading"), 
def create_file_from_elements(
    elements: Iterable[Element],
    output_format: str = "markdown",
    filename: Optional[str] = None,
    encoding: str = "utf-8",
    exclude_binary_image_data: bool = False,
    no_group_by_page: bool = True,
) -> str:
    """Re-create a document from a list of elements (reverse of partition).

    Use this after partitioning a document, optionally modifying elements (e.g. replacing
    Image elements with NarrativeText using alt text), then writing back to a file.

    Supported formats: "markdown", "html", "text". The format name is matched
    case-insensitively and surrounding whitespace is ignored.

    Args:
        elements: Iterable of elements to convert (e.g. from partition_* or after editing).
            It is consumed exactly once, so one-shot iterators such as generators are safe.
        output_format: Output format: "markdown", "html", or "text".
        filename: Optional path to write the document to. When None, nothing is written
            by this function.
        encoding: File encoding when writing to file (all formats).
        exclude_binary_image_data: If True, omit base64 image data. Applies only to
            **markdown** and **html**; ignored for text.
        no_group_by_page: If True (default), include all elements in output. If False,
            group **html** by page (elements without metadata.page_number are skipped).
            Applies only to **html**; ignored for markdown and text.

    Returns:
        The document content as a string.

    Raises:
        ValueError: If ``output_format`` is not one of the supported formats.

    Example:
        >>> from unstructured.partition.md import partition_md
        >>> from unstructured.staging.base import create_file_from_elements
        >>> elements = partition_md("README.md")
        >>> # ... modify elements (e.g. replace Image with NarrativeText) ...
        >>> create_file_from_elements(elements, output_format="markdown", filename="out.md")
    """
    format_lower = output_format.strip().lower()
    if format_lower not in ("markdown", "html", "text"):
        # Validate before touching `elements` so a bad format never consumes the caller's
        # iterator.
        raise ValueError(
            f"Unsupported format: {output_format!r}. Supported formats: 'markdown', 'html', 'text'."
        )

    # Materialize once. `elements` may be a one-shot iterator; the text branch below walks
    # it twice (render + write), and a second pass over an exhausted generator would yield
    # nothing, so the written file would not match the returned content.
    element_list = list(elements)

    if format_lower == "markdown":
        # `filename` and `encoding` are forwarded; elements_to_md handles the optional write.
        return elements_to_md(
            element_list,
            filename=filename,
            exclude_binary_image_data=exclude_binary_image_data,
            encoding=encoding,
        )

    if format_lower == "html":
        # Imported lazily, as in the original, so the HTML converter is only loaded on demand.
        from unstructured.partition.html.convert import elements_to_html

        content = elements_to_html(
            element_list,
            exclude_binary_image_data=exclude_binary_image_data,
            no_group_by_page=no_group_by_page,
        )
        if filename is not None:
            with open(filename, "w", encoding=encoding) as f:
                f.write(content)
        return content

    # text: render for the return value, and delegate the write to elements_to_text when a
    # filename is set. Both calls see the same materialized list.
    content = convert_to_text(element_list)
    if filename is not None:
        elements_to_text(element_list, filename=filename, encoding=encoding)
    return content