From 8e7a15bae66ea6a61dfa3c71542b0fdba0a1932e Mon Sep 17 00:00:00 2001 From: Clayton Date: Mon, 23 Feb 2026 00:09:24 -0600 Subject: [PATCH 1/5] feat: add create_file_from_elements() to re-create document files from elements --- CHANGELOG.md | 5 ++ test_unstructured/staging/test_base.py | 42 ++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/staging/base.py | 66 ++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f3ca1a500..338278b0d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.21.6 + +### Enhancements +- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file. + ## 0.21.5 ### Fixes diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index eba155033b..c1c9ced348 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -664,6 +664,48 @@ def test_elements_to_md_file_output(): os.unlink(tmp_filename) +def test_create_file_from_elements_markdown(): + """Test create_file_from_elements with format=markdown returns and optionally writes file.""" + elements = [Title("Heading"), NarrativeText("Some body text.")] + content = base.create_file_from_elements(elements, format="markdown") + assert content == "# Heading\nSome body text." + + with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp_file: + tmp_filename = tmp_file.name + try: + out = base.create_file_from_elements(elements, format="markdown", filename=tmp_filename) + assert out == content + with open(tmp_filename) as f: + assert f.read() == content + finally: + if os.path.exists(tmp_filename): + os.unlink(tmp_filename) + + +def test_create_file_from_elements_text(): + """Test create_file_from_elements with format=text.""" + elements = [Title("A"), NarrativeText("B")] + content = base.create_file_from_elements(elements, format="text") + assert content == "A\nB" + + +def test_create_file_from_elements_html(): + """Test create_file_from_elements with format=html returns HTML.""" + elements = [Title("Page"), NarrativeText("Content")] + content = base.create_file_from_elements(elements, format="html") + assert "" in content + assert "Page" in content + assert "Content" in content + + +def test_create_file_from_elements_unsupported_format(): + """Test create_file_from_elements raises for unsupported format.""" + elements = [Title("X")] + with pytest.raises(ValueError, match="Unsupported format"): + base.create_file_from_elements(elements, format="pdf") + + def test_element_to_md_with_none_mime_type(): """Test element_to_md handles None mime_type gracefully.""" from unstructured.documents.elements import ElementMetadata, Image diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a128517d02..0f79ce5f3b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.5" # pragma: no cover +__version__ = "0.21.6" # pragma: no cover diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 4f2bcee8d0..d5f85d63b1 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -195,6 +195,72 @@ def elements_to_md( return markdown_content +def create_file_from_elements( + elements: Iterable[Element], + format: str = "markdown", + filename: Optional[str] = None, + encoding: str = "utf-8", + exclude_binary_image_data: bool = False, + no_group_by_page: bool = False, +) -> str: + """Re-create a document file from a list of elements (reverse of partition). + + Use this after partitioning a document, optionally modifying elements (e.g. replacing + Image elements with NarrativeText using alt text), then writing back to a file. + + Supported formats: "markdown", "html", "text". + + Args: + elements: Iterable of elements to convert (e.g. from partition_* or after editing). + format: Output format: "markdown", "html", or "text". + filename: Optional path to write the document to. + encoding: File encoding when writing to file. + exclude_binary_image_data: If True, omit base64 image data (markdown/html). + no_group_by_page: If True, do not group HTML output by page. + + Returns: + The document content as a string. + + Example: + >>> from unstructured.partition.md import partition_md + >>> from unstructured.staging.base import create_file_from_elements + >>> elements = partition_md("README.md") + >>> # ... modify elements (e.g. replace Image with NarrativeText) ... + >>> create_file_from_elements(elements, format="markdown", filename="out.md") + """ + format_lower = format.strip().lower() + if format_lower not in ("markdown", "html", "text"): + raise ValueError( + f"Unsupported format: {format!r}. Supported formats: 'markdown', 'html', 'text'." + ) + + if format_lower == "markdown": + content = elements_to_md( + elements, + filename=None, + exclude_binary_image_data=exclude_binary_image_data, + encoding=encoding, + ) + elif format_lower == "html": + from unstructured.partition.html.convert import elements_to_html + + content = elements_to_html( + list(elements), + exclude_binary_image_data=exclude_binary_image_data, + no_group_by_page=no_group_by_page, + ) + else: + # text + content = elements_to_text(elements, filename=None, encoding=encoding) + assert content is not None # we passed filename=None + + if filename is not None: + with open(filename, "w", encoding=encoding) as f: + f.write(content) + + return content + + def elements_to_json( elements: Iterable[Element], filename: Optional[str] = None, From 08a30013be08481d55a57c4f0e345dc4397dc98d Mon Sep 17 00:00:00 2001 From: Clayton Date: Mon, 23 Feb 2026 05:09:28 -0600 Subject: [PATCH 2/5] fix: test --- test_unstructured/staging/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index c1c9ced348..a220879f0d 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -692,7 +692,10 @@ def test_create_file_from_elements_text(): def test_create_file_from_elements_html(): """Test create_file_from_elements with format=html returns HTML.""" elements = [Title("Page"), NarrativeText("Content")] - content = base.create_file_from_elements(elements, format="html") + # no_group_by_page=True so elements without page_number are included (default path skips them) + content = base.create_file_from_elements( + elements, format="html", no_group_by_page=True + ) assert "" in content assert "Page" in content From 3d9fc0679713c8b52274027e125c4e9a57472e0f Mon Sep 17 00:00:00 2001 From: Clayton Date: Mon, 23 Feb 2026 05:23:02 -0600 Subject: [PATCH 3/5] fix: lint --- test_unstructured/staging/test_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index a220879f0d..5659d2567a 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -693,9 +693,7 @@ def test_create_file_from_elements_html(): """Test create_file_from_elements with format=html returns HTML.""" elements = [Title("Page"), NarrativeText("Content")] # no_group_by_page=True so elements without page_number are included (default path skips them) - content = base.create_file_from_elements( - elements, format="html", no_group_by_page=True - ) + content = base.create_file_from_elements(elements, format="html", no_group_by_page=True) assert "" in content assert "Page" in content From 94f7377aaef23e7a4dc849d72df881e156c2d58a Mon Sep 17 00:00:00 2001 From: Clayton Date: Mon, 23 Feb 2026 20:55:32 -0600 Subject: [PATCH 4/5] fix: update --- CHANGELOG.md | 2 +- test_unstructured/staging/test_base.py | 104 +++++++++++++++++++++++-- unstructured/staging/base.py | 42 +++++----- 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 338278b0d9..52318727d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ## 0.21.6 ### Enhancements -- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file. +- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, output_format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file. ## 0.21.5 diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index 5659d2567a..96e29b9d3e 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -667,13 +667,15 @@ def test_elements_to_md_file_output(): def test_create_file_from_elements_markdown(): """Test create_file_from_elements with format=markdown returns and optionally writes file.""" elements = [Title("Heading"), NarrativeText("Some body text.")] - content = base.create_file_from_elements(elements, format="markdown") + content = base.create_file_from_elements(elements, output_format="markdown") assert content == "# Heading\nSome body text." with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp_file: tmp_filename = tmp_file.name try: - out = base.create_file_from_elements(elements, format="markdown", filename=tmp_filename) + out = base.create_file_from_elements( + elements, output_format="markdown", filename=tmp_filename + ) assert out == content with open(tmp_filename) as f: assert f.read() == content @@ -685,15 +687,14 @@ def test_create_file_from_elements_markdown(): def test_create_file_from_elements_text(): """Test create_file_from_elements with format=text.""" elements = [Title("A"), NarrativeText("B")] - content = base.create_file_from_elements(elements, format="text") + content = base.create_file_from_elements(elements, output_format="text") assert content == "A\nB" def test_create_file_from_elements_html(): """Test create_file_from_elements with format=html returns HTML.""" elements = [Title("Page"), NarrativeText("Content")] - # no_group_by_page=True so elements without page_number are included (default path skips them) - content = base.create_file_from_elements(elements, format="html", no_group_by_page=True) + content = base.create_file_from_elements(elements, output_format="html") assert "" in content assert "Page" in content @@ -704,7 +705,98 @@ def test_create_file_from_elements_unsupported_format(): """Test create_file_from_elements raises for unsupported format.""" elements = [Title("X")] with pytest.raises(ValueError, match="Unsupported format"): - base.create_file_from_elements(elements, format="pdf") + base.create_file_from_elements(elements, output_format="pdf") + + +def test_create_file_from_elements_html_group_by_page_drops_elements_without_page_number(): + """With no_group_by_page=False, elements without page_number are skipped (body empty).""" + elements = [Title("Page"), NarrativeText("Content")] + content = base.create_file_from_elements(elements, output_format="html", no_group_by_page=False) + assert "" in content + # Elements without metadata.page_number are not included when grouping by page + assert "Page" not in content + assert "Content" not in content + + +@pytest.mark.parametrize( + "format_name,expected_in_content", + [ + ("markdown", "# Heading\nSome body text."), + ("text", "Heading\nSome body text."), + ("html", " 'markdown').""" + elements = [Title("Heading"), NarrativeText("Body")] + content = base.create_file_from_elements(elements, output_format=format_arg) + assert expected_substring in content def test_element_to_md_with_none_mime_type(): diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index d5f85d63b1..e69cb13cea 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -197,11 +197,11 @@ def elements_to_md( def create_file_from_elements( elements: Iterable[Element], - format: str = "markdown", + output_format: str = "markdown", filename: Optional[str] = None, encoding: str = "utf-8", exclude_binary_image_data: bool = False, - no_group_by_page: bool = False, + no_group_by_page: bool = True, ) -> str: """Re-create a document file from a list of elements (reverse of partition). @@ -212,11 +212,14 @@ def create_file_from_elements( Args: elements: Iterable of elements to convert (e.g. from partition_* or after editing). - format: Output format: "markdown", "html", or "text". + output_format: Output format: "markdown", "html", or "text". filename: Optional path to write the document to. - encoding: File encoding when writing to file. - exclude_binary_image_data: If True, omit base64 image data (markdown/html). - no_group_by_page: If True, do not group HTML output by page. + encoding: File encoding when writing to file (all formats). + exclude_binary_image_data: If True, omit base64 image data. Applies only to + **markdown** and **html**; ignored for text. + no_group_by_page: If True (default), include all elements in output. If False, + group **html** by page (elements without metadata.page_number are skipped). + Applies only to **html**; ignored for markdown and text. Returns: The document content as a string. @@ -226,21 +229,22 @@ def create_file_from_elements( >>> from unstructured.staging.base import create_file_from_elements >>> elements = partition_md("README.md") >>> # ... modify elements (e.g. replace Image with NarrativeText) ... - >>> create_file_from_elements(elements, format="markdown", filename="out.md") + >>> create_file_from_elements(elements, output_format="markdown", filename="out.md") """ - format_lower = format.strip().lower() + format_lower = output_format.strip().lower() if format_lower not in ("markdown", "html", "text"): raise ValueError( - f"Unsupported format: {format!r}. Supported formats: 'markdown', 'html', 'text'." + f"Unsupported format: {output_format!r}. Supported formats: 'markdown', 'html', 'text'." ) if format_lower == "markdown": content = elements_to_md( elements, - filename=None, + filename=filename, exclude_binary_image_data=exclude_binary_image_data, encoding=encoding, ) + return content elif format_lower == "html": from unstructured.partition.html.convert import elements_to_html @@ -249,16 +253,16 @@ def create_file_from_elements( exclude_binary_image_data=exclude_binary_image_data, no_group_by_page=no_group_by_page, ) + if filename is not None: + with open(filename, "w", encoding=encoding) as f: + f.write(content) + return content else: - # text - content = elements_to_text(elements, filename=None, encoding=encoding) - assert content is not None # we passed filename=None - - if filename is not None: - with open(filename, "w", encoding=encoding) as f: - f.write(content) - - return content + # text: delegate write to elements_to_text when filename is set + content = convert_to_text(elements) + if filename is not None: + elements_to_text(elements, filename=filename, encoding=encoding) + return content def elements_to_json( From e4401a03abcc7fd28a571b80f54b9fa1d608df2e Mon Sep 17 00:00:00 2001 From: Clayton Date: Mon, 23 Feb 2026 21:41:47 -0600 Subject: [PATCH 5/5] fix: lint --- test_unstructured/staging/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index 96e29b9d3e..94a572b66d 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -720,7 +720,7 @@ def test_create_file_from_elements_html_group_by_page_drops_elements_without_pag @pytest.mark.parametrize( - "format_name,expected_in_content", + ("format_name", "expected_in_content"), [ ("markdown", "# Heading\nSome body text."), ("text", "Heading\nSome body text."), @@ -785,7 +785,7 @@ def test_create_file_from_elements_exclude_binary_image_data_html(): @pytest.mark.parametrize( - "format_arg,expected_substring", + ("format_arg", "expected_substring"), [ (" Markdown ", "# Heading"), ("HTML ", "