Skip to content

Commit 1e6880c

Browse files
feat: add create_file_from_elements() to re-create document files from elements
1 parent 16482f9 commit 1e6880c

4 files changed

Lines changed: 114 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.21.5
2+
3+
### Enhancements
4+
- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file.
5+
16
## 0.21.4
27
- Add a github action for testing time regressions
38

test_unstructured/staging/test_base.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,48 @@ def test_elements_to_md_file_output():
664664
os.unlink(tmp_filename)
665665

666666

667+
def test_create_file_from_elements_markdown():
    """Markdown output is returned as a string and, when a filename is given, written to disk."""
    elements = [Title("Heading"), NarrativeText("Some body text.")]
    content = base.create_file_from_elements(elements, format="markdown")
    assert content == "# Heading\nSome body text."

    # Only the path is needed here; the handle is closed on leaving the `with`
    # block so the function under test can open the file itself.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp_file:
        tmp_filename = tmp_file.name
    try:
        out = base.create_file_from_elements(elements, format="markdown", filename=tmp_filename)
        assert out == content
        # Read back with the same encoding the writer defaults to ("utf-8"),
        # rather than the platform's locale encoding.
        with open(tmp_filename, encoding="utf-8") as f:
            assert f.read() == content
    finally:
        # delete=False means the temp file must be removed by hand.
        if os.path.exists(tmp_filename):
            os.unlink(tmp_filename)
683+
684+
685+
def test_create_file_from_elements_text():
    """format="text" yields the element texts separated by a newline."""
    heading, body = Title("A"), NarrativeText("B")
    rendered = base.create_file_from_elements([heading, body], format="text")
    assert rendered == "A\nB"
690+
691+
692+
def test_create_file_from_elements_html():
    """format="html" yields an HTML document that contains each element's text."""
    page = [Title("Page"), NarrativeText("Content")]
    rendered = base.create_file_from_elements(page, format="html")
    # Document scaffolding first, then the text of each element.
    for expected_fragment in ("<!DOCTYPE html", "<body>", "Page", "Content"):
        assert expected_fragment in rendered
700+
701+
702+
def test_create_file_from_elements_unsupported_format():
    """A format name outside the supported set is rejected with a ValueError."""
    single_title = [Title("X")]
    with pytest.raises(ValueError, match="Unsupported format"):
        base.create_file_from_elements(single_title, format="pdf")
707+
708+
667709
def test_element_to_md_with_none_mime_type():
668710
"""Test element_to_md handles None mime_type gracefully."""
669711
from unstructured.documents.elements import ElementMetadata, Image

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.21.4" # pragma: no cover
1+
__version__ = "0.21.5" # pragma: no cover

unstructured/staging/base.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,72 @@ def elements_to_md(
195195
return markdown_content
196196

197197

198+
def create_file_from_elements(
    elements: Iterable[Element],
    format: str = "markdown",
    filename: Optional[str] = None,
    encoding: str = "utf-8",
    exclude_binary_image_data: bool = False,
    no_group_by_page: bool = False,
) -> str:
    """Re-create a document file from a list of elements (reverse of partition).

    Use this after partitioning a document, optionally modifying elements (e.g. replacing
    Image elements with NarrativeText using alt text), then writing back to a file.

    Supported formats: "markdown", "html", "text". The format name is matched
    case-insensitively and surrounding whitespace is ignored.

    Args:
        elements: Iterable of elements to convert (e.g. from partition_* or after editing).
        format: Output format: "markdown", "html", or "text".
        filename: Optional path to write the document to.
        encoding: File encoding when writing to file. Also forwarded to the markdown and
            text converters.
        exclude_binary_image_data: If True, omit base64 image data (markdown/html).
        no_group_by_page: If True, do not group HTML output by page.

    Returns:
        The document content as a string.

    Raises:
        ValueError: If `format` is not one of the supported formats. This includes
            non-string values such as None.

    Example:
        >>> from unstructured.partition.md import partition_md
        >>> from unstructured.staging.base import create_file_from_elements
        >>> elements = partition_md("README.md")
        >>> # ... modify elements (e.g. replace Image with NarrativeText) ...
        >>> create_file_from_elements(elements, format="markdown", filename="out.md")
    """
    # A non-string format (e.g. None) must be rejected with the same ValueError as an
    # unknown name instead of leaking an AttributeError from `.strip()`.
    format_lower = format.strip().lower() if isinstance(format, str) else None
    if format_lower not in ("markdown", "html", "text"):
        raise ValueError(
            f"Unsupported format: {format!r}. Supported formats: 'markdown', 'html', 'text'."
        )

    # The converters are always called without a filename so that writing happens in
    # exactly one place below, whatever the format.
    if format_lower == "markdown":
        content = elements_to_md(
            elements,
            filename=None,
            exclude_binary_image_data=exclude_binary_image_data,
            encoding=encoding,
        )
    elif format_lower == "html":
        # Imported lazily, inside the only branch that needs it.
        from unstructured.partition.html.convert import elements_to_html

        content = elements_to_html(
            list(elements),
            exclude_binary_image_data=exclude_binary_image_data,
            no_group_by_page=no_group_by_page,
        )
    else:
        # text
        content = elements_to_text(elements, filename=None, encoding=encoding)
        assert content is not None  # we passed filename=None

    if filename is not None:
        with open(filename, "w", encoding=encoding) as f:
            f.write(content)

    return content
262+
263+
198264
def elements_to_json(
199265
elements: Iterable[Element],
200266
filename: Optional[str] = None,

0 commit comments

Comments
 (0)