Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.21.6

### Enhancements
- **Add `create_file_from_elements()` to re-create document files from elements (fixes #3994)**: New staging helper `create_file_from_elements(elements, output_format=..., filename=...)` converts a list of elements back into a document in the given format (`"markdown"`, `"html"`, or `"text"`), optionally writing to a file. Supports the workflow: partition → modify elements (e.g. replace Image with NarrativeText using alt text) → write back to file.

## 0.21.5

### Fixes
Expand Down
135 changes: 135 additions & 0 deletions test_unstructured/staging/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,141 @@ def test_elements_to_md_file_output():
os.unlink(tmp_filename)


def test_create_file_from_elements_markdown():
    """Markdown output is returned as a string and, given a filename, also written to disk."""
    doc_elements = [Title("Heading"), NarrativeText("Some body text.")]

    # -- without a filename the rendered markdown is only returned --
    rendered = base.create_file_from_elements(doc_elements, output_format="markdown")
    assert rendered == "# Heading\nSome body text."

    # -- reserve a path; the handle is closed again before the function writes to it --
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as scratch:
        md_path = scratch.name
    try:
        returned = base.create_file_from_elements(
            doc_elements, output_format="markdown", filename=md_path
        )
        assert returned == rendered
        with open(md_path) as fh:
            on_disk = fh.read()
        assert on_disk == rendered
    finally:
        # -- delete=False above means cleanup is this test's responsibility --
        if os.path.exists(md_path):
            os.unlink(md_path)


def test_create_file_from_elements_text():
    """Text format renders the element texts separated by a newline."""
    rendered = base.create_file_from_elements(
        [Title("A"), NarrativeText("B")], output_format="text"
    )
    assert rendered == "A\nB"


def test_create_file_from_elements_html():
    """HTML format yields a document skeleton that contains the text of each element."""
    html = base.create_file_from_elements(
        [Title("Page"), NarrativeText("Content")], output_format="html"
    )
    # -- skeleton markers first, then the text of both elements --
    for fragment in ("<!DOCTYPE html", "<body>", "Page", "Content"):
        assert fragment in html


def test_create_file_from_elements_unsupported_format():
    """An output format outside the supported set is rejected with ValueError."""
    with pytest.raises(ValueError, match="Unsupported format"):
        base.create_file_from_elements([Title("X")], output_format="pdf")


def test_create_file_from_elements_html_group_by_page_drops_elements_without_page_number():
    """Grouping HTML by page (no_group_by_page=False) leaves out elements lacking a page number."""
    unpaged = [Title("Page"), NarrativeText("Content")]
    html = base.create_file_from_elements(
        unpaged, output_format="html", no_group_by_page=False
    )
    # -- the document skeleton is still produced ... --
    for marker in ("<!DOCTYPE html", "<body>"):
        assert marker in html
    # -- ... but neither element has metadata.page_number, so neither one is rendered --
    for element_text in ("Page", "Content"):
        assert element_text not in html


@pytest.mark.parametrize(
    ("format_name", "expected_in_content"),
    [
        ("markdown", "# Heading\nSome body text."),
        ("text", "Heading\nSome body text."),
        ("html", "<!DOCTYPE html"),
    ],
)
def test_create_file_from_elements_filename_write(format_name: str, expected_in_content: str):
    """For each supported format, the file on disk matches the string that is returned."""
    suffix_by_format = {"markdown": ".md", "text": ".txt", "html": ".html"}
    doc_elements = [Title("Heading"), NarrativeText("Some body text.")]

    # -- reserve a path with a fitting extension; the handle is closed before writing --
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=suffix_by_format[format_name], delete=False
    ) as scratch:
        target_path = scratch.name
    try:
        returned = base.create_file_from_elements(
            doc_elements, output_format=format_name, filename=target_path
        )
        assert expected_in_content in returned
        with open(target_path) as fh:
            on_disk = fh.read()
        assert expected_in_content in on_disk
        assert returned == on_disk
    finally:
        # -- delete=False above means cleanup is this test's responsibility --
        if os.path.exists(target_path):
            os.unlink(target_path)


def test_create_file_from_elements_exclude_binary_image_data_markdown():
    """exclude_binary_image_data=True is passed through: markdown has no base64 payload."""
    image_meta = ElementMetadata(image_base64="abc123", image_mime_type="image/png")
    doc_elements = [Title("Doc"), Image("Alt", metadata=image_meta)]

    markdown = base.create_file_from_elements(
        doc_elements, output_format="markdown", exclude_binary_image_data=True
    )

    # -- no data-URI payload, yet the image's text is still present --
    assert "base64," not in markdown
    assert "Alt" in markdown


def test_create_file_from_elements_exclude_binary_image_data_html():
    """exclude_binary_image_data=True is passed through: HTML has no base64 payload."""
    image_meta = ElementMetadata(image_base64="abc123", image_mime_type="image/png")
    doc_elements = [Title("Doc"), Image("Alt", metadata=image_meta)]

    html = base.create_file_from_elements(
        doc_elements, output_format="html", exclude_binary_image_data=True
    )

    # -- the raw base64 string must not appear anywhere in the document --
    assert "abc123" not in html


@pytest.mark.parametrize(
    ("format_arg", "expected_substring"),
    [
        (" Markdown ", "# Heading"),
        ("HTML ", "<!DOCTYPE html"),
        (" TEXT ", "Heading"),
    ],
)
def test_create_file_from_elements_format_normalization(format_arg: str, expected_substring: str):
    """Surrounding whitespace and letter case in output_format do not matter."""
    rendered = base.create_file_from_elements(
        [Title("Heading"), NarrativeText("Body")], output_format=format_arg
    )
    assert expected_substring in rendered


def test_element_to_md_with_none_mime_type():
"""Test element_to_md handles None mime_type gracefully."""
from unstructured.documents.elements import ElementMetadata, Image
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.21.5" # pragma: no cover
__version__ = "0.21.6" # pragma: no cover
70 changes: 70 additions & 0 deletions unstructured/staging/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,76 @@ def elements_to_md(
return markdown_content


def create_file_from_elements(
    elements: Iterable[Element],
    output_format: str = "markdown",
    filename: Optional[str] = None,
    encoding: str = "utf-8",
    exclude_binary_image_data: bool = False,
    no_group_by_page: bool = True,
) -> str:
    """Re-create a document file from a list of elements (reverse of partition).

    Use this after partitioning a document, optionally modifying elements (e.g. replacing
    Image elements with NarrativeText using alt text), then writing back to a file.

    Supported formats: "markdown", "html", "text".

    Args:
        elements: Iterable of elements to convert (e.g. from partition_* or after editing).
            It is materialized into a list once, so one-shot iterators such as generators
            are safe to pass.
        output_format: Output format: "markdown", "html", or "text". Surrounding whitespace
            and letter case are ignored.
        filename: Optional path to write the document to.
        encoding: File encoding when writing to file (all formats).
        exclude_binary_image_data: If True, omit base64 image data. Applies only to
            **markdown** and **html**; ignored for text.
        no_group_by_page: If True (default), include all elements in output. If False,
            group **html** by page (elements without metadata.page_number are skipped).
            Applies only to **html**; ignored for markdown and text.

    Returns:
        The document content as a string.

    Raises:
        ValueError: If `output_format` is not one of the supported formats.

    Example:
        >>> from unstructured.partition.md import partition_md
        >>> from unstructured.staging.base import create_file_from_elements
        >>> elements = partition_md("README.md")
        >>> # ... modify elements (e.g. replace Image with NarrativeText) ...
        >>> create_file_from_elements(elements, output_format="markdown", filename="out.md")
    """
    format_lower = output_format.strip().lower()
    if format_lower not in ("markdown", "html", "text"):
        raise ValueError(
            f"Unsupported format: {output_format!r}. Supported formats: 'markdown', 'html', 'text'."
        )

    # -- Materialize once. `elements` may be a one-shot iterator and the text branch walks
    # -- it twice (render, then write); a second pass over an exhausted generator would
    # -- silently write an empty file while the returned string is complete.
    element_list = list(elements)

    if format_lower == "markdown":
        # -- elements_to_md handles the optional file write itself --
        return elements_to_md(
            element_list,
            filename=filename,
            exclude_binary_image_data=exclude_binary_image_data,
            encoding=encoding,
        )

    if format_lower == "html":
        # -- imported here rather than at module top, as in the original implementation --
        from unstructured.partition.html.convert import elements_to_html

        content = elements_to_html(
            element_list,
            exclude_binary_image_data=exclude_binary_image_data,
            no_group_by_page=no_group_by_page,
        )
        if filename is not None:
            with open(filename, "w", encoding=encoding) as f:
                f.write(content)
        return content

    # -- text: render for the return value, delegate the write to elements_to_text --
    content = convert_to_text(element_list)
    if filename is not None:
        elements_to_text(element_list, filename=filename, encoding=encoding)
    return content


def elements_to_json(
elements: Iterable[Element],
filename: Optional[str] = None,
Expand Down
Loading