feat: add create_file_from_elements() to re-create document files from elements (#4259)

claytonlin1110 · web-flow · commit d0f86208d7e1 · 2026-02-24T16:17:03.000Z
## Summary Adds `create_file_from_elements()` in `unstructured.staging.base` so users can re-build a document file from a list of elements (reverse of partition). Supports the workflow: partition -> modify elements (e.g. replace Image with NarrativeText using alt text) -> write back to file. Closes #3994. ## Changes - **`unstructured/staging/base.py`**: New `create_file_from_elements(elements, output_format="markdown"|"html"|"text", filename=None, ...)` that delegates to `elements_to_md`, `elements_to_html`, or `elements_to_text` and optionally writes to a file. - **`test_unstructured/staging/test_base.py`**: Tests for markdown, text, and HTML output and for unsupported format raising `ValueError`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.21.6
+
+### Enhancements
+- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, output_format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file.
+
 ## 0.21.5
 
 ### Fixes
diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py
@@ -664,6 +664,141 @@ def test_elements_to_md_file_output():
             os.unlink(tmp_filename)
 
 
+def test_create_file_from_elements_markdown():
+    """With output_format="markdown", return the document and write it when filename is given."""
+    elements = [Title("Heading"), NarrativeText("Some body text.")]
+    content = base.create_file_from_elements(elements, output_format="markdown")
+    assert content == "# Heading\nSome body text."
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp_file:
+        tmp_filename = tmp_file.name
+    try:
+        out = base.create_file_from_elements(
+            elements, output_format="markdown", filename=tmp_filename
+        )
+        assert out == content
+        with open(tmp_filename) as f:
+            assert f.read() == content
+    finally:
+        if os.path.exists(tmp_filename):
+            os.unlink(tmp_filename)
+
+
+def test_create_file_from_elements_text():
+    """With output_format="text", return the element texts joined by newlines."""
+    elements = [Title("A"), NarrativeText("B")]
+    content = base.create_file_from_elements(elements, output_format="text")
+    assert content == "A\nB"
+
+
+def test_create_file_from_elements_html():
+    """With output_format="html", return a full HTML document containing the element text."""
+    elements = [Title("Page"), NarrativeText("Content")]
+    content = base.create_file_from_elements(elements, output_format="html")
+    assert "<!DOCTYPE html" in content
+    assert "<body>" in content
+    assert "Page" in content
+    assert "Content" in content
+
+
+def test_create_file_from_elements_unsupported_format():
+    """An unsupported output_format (here "pdf") raises ValueError."""
+    elements = [Title("X")]
+    with pytest.raises(ValueError, match="Unsupported format"):
+        base.create_file_from_elements(elements, output_format="pdf")
+
+
+def test_create_file_from_elements_html_group_by_page_drops_elements_without_page_number():
+    """With no_group_by_page=False, HTML output omits elements that have no page_number."""
+    elements = [Title("Page"), NarrativeText("Content")]
+    content = base.create_file_from_elements(elements, output_format="html", no_group_by_page=False)
+    assert "<!DOCTYPE html" in content
+    assert "<body>" in content
+    # Elements without metadata.page_number are not included when grouping by page
+    assert "Page" not in content
+    assert "Content" not in content
+
+
+@pytest.mark.parametrize(
+    ("format_name", "expected_in_content"),
+    [
+        ("markdown", "# Heading\nSome body text."),
+        ("text", "Heading\nSome body text."),
+        ("html", "<!DOCTYPE html"),
+    ],
+)
+def test_create_file_from_elements_filename_write(format_name: str, expected_in_content: str):
+    """For every format, the file written to filename matches the returned string exactly."""
+    elements = [Title("Heading"), NarrativeText("Some body text.")]
+    ext = {"markdown": ".md", "text": ".txt", "html": ".html"}[format_name]
+    with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False) as tmp_file:
+        tmp_filename = tmp_file.name
+    try:
+        out = base.create_file_from_elements(
+            elements, output_format=format_name, filename=tmp_filename
+        )
+        assert expected_in_content in out
+        with open(tmp_filename) as f:
+            written = f.read()
+        assert expected_in_content in written
+        assert out == written
+    finally:
+        if os.path.exists(tmp_filename):
+            os.unlink(tmp_filename)
+
+
+def test_create_file_from_elements_exclude_binary_image_data_markdown():
+    """exclude_binary_image_data=True: markdown drops the base64 payload but keeps alt text."""
+    elements = [
+        Title("Doc"),
+        Image(
+            "Alt",
+            metadata=ElementMetadata(
+                image_base64="abc123",
+                image_mime_type="image/png",
+            ),
+        ),
+    ]
+    content = base.create_file_from_elements(
+        elements, output_format="markdown", exclude_binary_image_data=True
+    )
+    assert "base64," not in content
+    assert "Alt" in content
+
+
+def test_create_file_from_elements_exclude_binary_image_data_html():
+    """exclude_binary_image_data=True: the base64 image payload does not appear in the HTML."""
+    elements = [
+        Title("Doc"),
+        Image(
+            "Alt",
+            metadata=ElementMetadata(
+                image_base64="abc123",
+                image_mime_type="image/png",
+            ),
+        ),
+    ]
+    content = base.create_file_from_elements(
+        elements, output_format="html", exclude_binary_image_data=True
+    )
+    assert "abc123" not in content
+
+
+@pytest.mark.parametrize(
+    ("format_arg", "expected_substring"),
+    [
+        (" Markdown ", "# Heading"),
+        ("HTML ", "<!DOCTYPE html"),
+        (" TEXT ", "Heading"),
+    ],
+)
+def test_create_file_from_elements_format_normalization(format_arg: str, expected_substring: str):
+    """output_format is stripped and lowercased before matching (' Markdown ' -> 'markdown')."""
+    elements = [Title("Heading"), NarrativeText("Body")]
+    content = base.create_file_from_elements(elements, output_format=format_arg)
+    assert expected_substring in content
+
+
 def test_element_to_md_with_none_mime_type():
     """Test element_to_md handles None mime_type gracefully."""
     from unstructured.documents.elements import ElementMetadata, Image
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.21.5"  # pragma: no cover
+__version__ = "0.21.6"  # pragma: no cover
diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
@@ -195,6 +195,76 @@ def elements_to_md(
     return markdown_content
 
 
+def create_file_from_elements(
+    elements: Iterable[Element],
+    output_format: str = "markdown",
+    filename: Optional[str] = None,
+    encoding: str = "utf-8",
+    exclude_binary_image_data: bool = False,
+    no_group_by_page: bool = True,
+) -> str:
+    """Re-create a document file from a list of elements (reverse of partition).
+
+    Use this after partitioning a document, optionally modifying elements (e.g. replacing
+    Image elements with NarrativeText using alt text), then writing back to a file.
+
+    Args:
+        elements: Iterable of elements to convert (e.g. from partition_* or after editing).
+        output_format: "markdown", "html", or "text"; case and surrounding whitespace are ignored.
+        filename: Optional path to write the document to.
+        encoding: File encoding when writing to file (all formats).
+        exclude_binary_image_data: If True, omit base64 image data. Applies only to
+            **markdown** and **html**; ignored for text.
+        no_group_by_page: If True (default), include all elements in output. If False,
+            group **html** by page (elements without metadata.page_number are skipped).
+            Applies only to **html**; ignored for markdown and text.
+
+    Returns:
+        The document content as a string.
+
+    Raises:
+        ValueError: If output_format is not one of the supported formats.
+
+    Example:
+        >>> from unstructured.partition.md import partition_md
+        >>> from unstructured.staging.base import create_file_from_elements
+        >>> elements = partition_md("README.md")
+        >>> # ... modify elements (e.g. replace Image with NarrativeText) ...
+        >>> create_file_from_elements(elements, output_format="markdown", filename="out.md")
+    """
+    format_lower = output_format.strip().lower()
+    if format_lower not in ("markdown", "html", "text"):
+        raise ValueError(
+            f"Unsupported format: {output_format!r}. Supported formats: 'markdown', 'html', 'text'."
+        )
+    element_list = list(elements)  # read once: the text branch iterates the elements twice
+
+    if format_lower == "markdown":
+        return elements_to_md(
+            element_list,
+            filename=filename,
+            exclude_binary_image_data=exclude_binary_image_data,
+            encoding=encoding,
+        )
+    if format_lower == "html":
+        from unstructured.partition.html.convert import elements_to_html
+
+        content = elements_to_html(
+            element_list,
+            exclude_binary_image_data=exclude_binary_image_data,
+            no_group_by_page=no_group_by_page,
+        )
+        if filename is not None:
+            with open(filename, "w", encoding=encoding) as f:
+                f.write(content)
+        return content
+    # text: elements_to_text performs the write; a generator input would be exhausted by now
+    content = convert_to_text(element_list)
+    if filename is not None:
+        elements_to_text(element_list, filename=filename, encoding=encoding)
+    return content
+
+
 def elements_to_json(
     elements: Iterable[Element],
     filename: Optional[str] = None,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.21.5" # pragma: no cover`
	`1`	`+__version__ = "0.21.6" # pragma: no cover`