Skip to content

Commit d0f8620

Browse files
feat: add create_file_from_elements() to re-create document files from elements (#4259)
## Summary Adds `create_file_from_elements()` in `unstructured.staging.base` so users can re-build a document file from a list of elements (reverse of partition). Supports the workflow: partition -> modify elements (e.g. replace Image with NarrativeText using alt text) -> write back to file. Closes #3994. ## Changes - **`unstructured/staging/base.py`**: New `create_file_from_elements(elements, output_format="markdown"|"html"|"text", filename=None, ...)` that delegates to `elements_to_md`, `elements_to_html`, or `elements_to_text` and optionally writes to a file. - **`test_unstructured/staging/test_base.py`**: Tests for markdown, text, and HTML output and for unsupported format raising `ValueError`.
1 parent 5302352 commit d0f8620

4 files changed

Lines changed: 211 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.21.6
2+
3+
### Enhancements
4+
- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, output_format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file.
5+
16
## 0.21.5
27

38
### Fixes

test_unstructured/staging/test_base.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,141 @@ def test_elements_to_md_file_output():
664664
os.unlink(tmp_filename)
665665

666666

667+
def test_create_file_from_elements_markdown():
    """output_format="markdown" returns the markdown string and, given a filename, writes it."""
    doc_elements = [Title("Heading"), NarrativeText("Some body text.")]
    rendered = base.create_file_from_elements(doc_elements, output_format="markdown")
    assert rendered == "# Heading\nSome body text."

    # Only the path is needed; the handle is closed before the function under test writes to it.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        md_path = handle.name
    try:
        returned = base.create_file_from_elements(
            doc_elements, output_format="markdown", filename=md_path
        )
        assert returned == rendered
        with open(md_path) as fh:
            on_disk = fh.read()
        assert on_disk == rendered
    finally:
        # delete=False means cleanup is this test's responsibility.
        if os.path.exists(md_path):
            os.unlink(md_path)
685+
686+
687+
def test_create_file_from_elements_text():
    """output_format="text" joins the element texts with a newline."""
    rendered = base.create_file_from_elements(
        [Title("A"), NarrativeText("B")],
        output_format="text",
    )
    assert rendered == "A\nB"
692+
693+
694+
def test_create_file_from_elements_html():
    """output_format="html" returns an HTML document containing the element texts."""
    doc_elements = [Title("Page"), NarrativeText("Content")]
    html_doc = base.create_file_from_elements(doc_elements, output_format="html")
    # Document scaffolding first, then the text of each element.
    for fragment in ("<!DOCTYPE html", "<body>", "Page", "Content"):
        assert fragment in html_doc
702+
703+
704+
def test_create_file_from_elements_unsupported_format():
    """An output_format outside markdown/html/text raises ValueError."""
    single_title = [Title("X")]
    expect_rejection = pytest.raises(ValueError, match="Unsupported format")
    with expect_rejection:
        base.create_file_from_elements(single_title, output_format="pdf")
709+
710+
711+
def test_create_file_from_elements_html_group_by_page_drops_elements_without_page_number():
    """With no_group_by_page=False, elements lacking a page_number are left out of the HTML."""
    unpaged_elements = [Title("Page"), NarrativeText("Content")]
    html_doc = base.create_file_from_elements(
        unpaged_elements,
        output_format="html",
        no_group_by_page=False,
    )
    # The document shell is still produced ...
    assert "<!DOCTYPE html" in html_doc
    assert "<body>" in html_doc
    # ... but neither element has metadata.page_number, so neither text appears in it.
    assert "Page" not in html_doc
    assert "Content" not in html_doc
720+
721+
722+
@pytest.mark.parametrize(
    ("format_name", "expected_in_content"),
    [
        ("markdown", "# Heading\nSome body text."),
        ("text", "Heading\nSome body text."),
        ("html", "<!DOCTYPE html"),
    ],
)
def test_create_file_from_elements_filename_write(format_name: str, expected_in_content: str):
    """For every supported format, the file written matches the string returned."""
    doc_elements = [Title("Heading"), NarrativeText("Some body text.")]
    suffix_by_format = {"markdown": ".md", "text": ".txt", "html": ".html"}
    suffix = suffix_by_format[format_name]

    # Only the path is needed; the handle is closed before the function under test writes to it.
    with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as handle:
        target_path = handle.name
    try:
        returned = base.create_file_from_elements(
            doc_elements, output_format=format_name, filename=target_path
        )
        assert expected_in_content in returned
        with open(target_path) as fh:
            on_disk = fh.read()
        assert expected_in_content in on_disk
        assert returned == on_disk
    finally:
        # delete=False means cleanup is this test's responsibility.
        if os.path.exists(target_path):
            os.unlink(target_path)
748+
749+
750+
def test_create_file_from_elements_exclude_binary_image_data_markdown():
    """exclude_binary_image_data=True reaches the markdown writer: no base64 payload in output."""
    image_metadata = ElementMetadata(
        image_base64="abc123",
        image_mime_type="image/png",
    )
    doc_elements = [Title("Doc"), Image("Alt", metadata=image_metadata)]

    rendered = base.create_file_from_elements(
        doc_elements,
        output_format="markdown",
        exclude_binary_image_data=True,
    )

    # No data-URI payload is emitted, but the image's alt text is still present.
    assert "base64," not in rendered
    assert "Alt" in rendered
767+
768+
769+
def test_create_file_from_elements_exclude_binary_image_data_html():
    """exclude_binary_image_data=True reaches the HTML writer: the base64 payload is absent."""
    image_metadata = ElementMetadata(
        image_base64="abc123",
        image_mime_type="image/png",
    )
    doc_elements = [Title("Doc"), Image("Alt", metadata=image_metadata)]

    html_doc = base.create_file_from_elements(
        doc_elements,
        output_format="html",
        exclude_binary_image_data=True,
    )

    assert "abc123" not in html_doc
785+
786+
787+
@pytest.mark.parametrize(
    ("format_arg", "expected_substring"),
    [
        (" Markdown ", "# Heading"),
        ("HTML ", "<!DOCTYPE html"),
        (" TEXT ", "Heading"),
    ],
)
def test_create_file_from_elements_format_normalization(format_arg: str, expected_substring: str):
    """Surrounding whitespace and letter case in output_format are ignored."""
    rendered = base.create_file_from_elements(
        [Title("Heading"), NarrativeText("Body")],
        output_format=format_arg,
    )
    assert expected_substring in rendered
800+
801+
667802
def test_element_to_md_with_none_mime_type():
668803
"""Test element_to_md handles None mime_type gracefully."""
669804
from unstructured.documents.elements import ElementMetadata, Image

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.21.5" # pragma: no cover
1+
__version__ = "0.21.6" # pragma: no cover

unstructured/staging/base.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,76 @@ def elements_to_md(
195195
return markdown_content
196196

197197

198+
def create_file_from_elements(
    elements: Iterable[Element],
    output_format: str = "markdown",
    filename: Optional[str] = None,
    encoding: str = "utf-8",
    exclude_binary_image_data: bool = False,
    no_group_by_page: bool = True,
) -> str:
    """Re-create a document file from a list of elements (reverse of partition).

    Use this after partitioning a document, optionally modifying elements (e.g. replacing
    Image elements with NarrativeText using alt text), then writing back to a file.

    Supported formats: "markdown", "html", "text".

    Args:
        elements: Iterable of elements to convert (e.g. from partition_* or after editing).
            One-shot iterables such as generators are accepted; the input is consumed once.
        output_format: Output format: "markdown", "html", or "text". Surrounding whitespace
            and letter case are ignored.
        filename: Optional path to write the document to.
        encoding: File encoding when writing to file (all formats).
        exclude_binary_image_data: If True, omit base64 image data. Applies only to
            **markdown** and **html**; ignored for text.
        no_group_by_page: If True (default), include all elements in output. If False,
            group **html** by page (elements without metadata.page_number are skipped).
            Applies only to **html**; ignored for markdown and text.

    Returns:
        The document content as a string.

    Raises:
        ValueError: If `output_format` is not one of the supported formats. The check runs
            before `elements` is consumed.

    Example:
        >>> from unstructured.partition.md import partition_md
        >>> from unstructured.staging.base import create_file_from_elements
        >>> elements = partition_md("README.md")
        >>> # ... modify elements (e.g. replace Image with NarrativeText) ...
        >>> create_file_from_elements(elements, output_format="markdown", filename="out.md")
    """
    format_lower = output_format.strip().lower()
    if format_lower not in ("markdown", "html", "text"):
        raise ValueError(
            f"Unsupported format: {output_format!r}. Supported formats: 'markdown', 'html', 'text'."
        )

    # `elements` may be a one-shot iterator. The text branch walks it twice (once for the
    # returned string, once for the file write), so a generator would be exhausted before
    # the write. Materialize once so every branch sees the full sequence.
    element_list = list(elements)

    if format_lower == "markdown":
        # elements_to_md writes the file itself when filename is given.
        return elements_to_md(
            element_list,
            filename=filename,
            exclude_binary_image_data=exclude_binary_image_data,
            encoding=encoding,
        )

    if format_lower == "html":
        # Imported here rather than at module top, as in the original implementation.
        from unstructured.partition.html.convert import elements_to_html

        content = elements_to_html(
            element_list,
            exclude_binary_image_data=exclude_binary_image_data,
            no_group_by_page=no_group_by_page,
        )
        if filename is not None:
            with open(filename, "w", encoding=encoding) as f:
                f.write(content)
        return content

    # text: delegate the write to elements_to_text when filename is set
    content = convert_to_text(element_list)
    if filename is not None:
        elements_to_text(element_list, filename=filename, encoding=encoding)
    return content
266+
267+
198268
def elements_to_json(
199269
elements: Iterable[Element],
200270
filename: Optional[str] = None,

0 commit comments

Comments
 (0)