Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.21.3

### Enhancements
- **Custom fallback for language detection (fixes #4091)**: Add optional `language_fallback` callable for short ASCII text (e.g. when detection would default to English). Callable receives the text and may return a list of ISO 639-3 codes or `None` to leave language unspecified; return value is validated and invalid entries are filtered out. `language_fallback` is passed through `partition()`, PDF/image partitioners, and `partition_html`; `partition_md` now accepts `languages` (use `[""]` to disable detection). Language-related parameters (`languages`, `detect_language_per_element`, `language_fallback`) are documented as top-level options and exposed explicitly on `partition_html`.

## 0.21.2

### Fixes
Expand Down
39 changes: 39 additions & 0 deletions test_unstructured/partition/common/test_lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,45 @@ def test_detect_languages_handles_spelled_out_languages():
assert languages == ["spa"]


def test_detect_languages_short_text_fallback_returns_none():
    """A fallback returning None makes detect_languages return None for short ASCII text."""

    def no_language(_text):
        # Signals "leave the language unspecified" for whatever text is passed in.
        return None

    detected = detect_languages(text="Hi there.", language_fallback=no_language)
    assert detected is None


def test_detect_languages_short_text_fallback_returns_custom():
    """The codes returned by language_fallback become the result for short ASCII text."""

    def always_french(_text):
        return ["fra"]

    # Any short (<5 word) ASCII text would hit the fallback; the wording is irrelevant.
    detected = detect_languages(text="Hi there.", language_fallback=always_french)
    assert detected == ["fra"]


def test_detect_languages_short_text_default_eng_without_fallback():
    """With no language_fallback supplied, short ASCII text is still reported as English."""
    # Backward compatibility: omitting the fallback must keep the historical default.
    expected = ["eng"]
    assert detect_languages(text="Hi there.") == expected


def test_apply_lang_metadata_with_language_fallback():
    """apply_lang_metadata accepts language_fallback; a None fallback leaves languages unset."""
    short_element = NarrativeText("Hi.")
    tagged = apply_lang_metadata(
        elements=[short_element],
        languages=["auto"],
        language_fallback=lambda _text: None,
    )
    # Materialize the returned iterable so its length and first item can be inspected.
    processed = list(tagged)
    assert len(processed) == 1
    assert processed[0].metadata.languages is None


@pytest.mark.parametrize(
("languages", "ocr_languages", "expected_langs"),
[
Expand Down
26 changes: 25 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
languages=None,
metadata_filename=None,
detect_language_per_element=False,
language_fallback=None,
infer_table_structure=False,
extract_images_in_pdf=False,
extract_image_block_types=None,
Expand Down Expand Up @@ -1074,11 +1075,34 @@ def test_auto_partition_respects_detect_language_per_element_arg():
)
def test_auto_partition_respects_language_arg(file_extension: str):
elements = partition(
example_doc_path(f"language-docs/eng_spa_mult.{file_extension}"), languages=["deu"]
example_doc_path(f"language-docs/eng_spa_mult.{file_extension}"),
languages=["deu"],
)
assert all(element.metadata.languages == ["deu"] for element in elements)


def test_auto_partition_language_fallback_flows_through_call_chain():
    """Integration test: language_fallback given to partition() must reach language detection.

    Intended chain: partition() -> partitioner -> apply_metadata -> apply_lang_metadata
    -> detect_languages(). With a fallback that returns None, no element should carry a
    language.
    """
    # delete=False keeps the file on disk after the handle closes so that partition() can
    # reopen it by name; the `finally` clause below removes it.
    scratch = tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    )
    with scratch:
        scratch.write("Hi.")
        path = scratch.name

    try:
        elements = partition(filename=path, language_fallback=lambda _text: None)
        assert elements, "expected at least one element"
        languages_seen = [element.metadata.languages for element in elements]
        assert all(langs is None for langs in languages_seen), (
            "language_fallback=lambda t: None should leave metadata.languages unset"
        )
    finally:
        os.unlink(path)


# -- include_page_breaks --------------------------------------------------


Expand Down
9 changes: 9 additions & 0 deletions test_unstructured/partition/test_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,15 @@ def test_partition_md_respects_detect_language_per_element():
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


def test_partition_md_languages_empty_disables_detection():
    """Passing languages=[""] disables language detection (metadata.languages stays None)."""
    # Resolve the fixture through example_doc_path, as the neighbouring tests do, so the
    # test does not depend on pytest being launched from the repository root.
    filename = example_doc_path("README.md")
    elements = partition_md(filename=filename, languages=[""])
    # When detection is disabled, no element should have metadata.languages populated.
    for element in elements:
        assert element.metadata.languages is None


def test_partition_md_parse_table():
filename = example_doc_path("simple-table.md")
elements = partition_md(filename=filename)
Expand Down
Loading
Loading