Skip to content

Commit 1d0c2bc

Browse files
feat: add language_fallback for short text and languages param to partition_md
1 parent 78e21ca commit 1d0c2bc

6 files changed

Lines changed: 90 additions & 12 deletions

File tree

test_unstructured/partition/common/test_lang.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,44 @@ def test_detect_languages_handles_spelled_out_languages():
196196
assert languages == ["spa"]
197197

198198

199+
def test_detect_languages_short_text_fallback_returns_none():
200+
"""Short ASCII text with language_fallback returning None leaves language unspecified."""
201+
result = detect_languages(
202+
text="Hi there.",
203+
language_fallback=lambda t: None,
204+
)
205+
assert result is None
206+
207+
208+
def test_detect_languages_short_text_fallback_returns_custom():
209+
"""Short ASCII text with language_fallback returns custom language."""
210+
result = detect_languages(
211+
text="Bonjour monde.",
212+
language_fallback=lambda t: ["fra"],
213+
)
214+
assert result == ["fra"]
215+
216+
217+
def test_detect_languages_short_text_default_eng_without_fallback():
218+
"""Short ASCII text without fallback still defaults to English (backward compat)."""
219+
result = detect_languages(text="Hi there.")
220+
assert result == ["eng"]
221+
222+
223+
def test_apply_lang_metadata_with_language_fallback():
224+
"""apply_lang_metadata passes language_fallback so short text can return None."""
225+
elements = [NarrativeText("Hi.")]
226+
result = list(
227+
apply_lang_metadata(
228+
elements=elements,
229+
languages=["auto"],
230+
language_fallback=lambda t: None,
231+
)
232+
)
233+
assert len(result) == 1
234+
assert result[0].metadata.languages is None
235+
236+
199237
@pytest.mark.parametrize(
200238
("languages", "ocr_languages", "expected_langs"),
201239
[

test_unstructured/partition/test_md.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,15 @@ def test_partition_md_respects_detect_language_per_element():
248248
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
249249

250250

251+
def test_partition_md_languages_empty_disables_detection():
252+
"""Passing languages=[\"\"] disables language detection (no metadata.languages set)."""
253+
filename = "example-docs/README.md"
254+
elements = partition_md(filename=filename, languages=[""])
255+
# When detection is disabled, metadata.languages should not be set (None)
256+
for el in elements:
257+
assert el.metadata.languages is None
258+
259+
251260
def test_partition_md_parse_table():
252261
filename = example_doc_path("simple-table.md")
253262
elements = partition_md(filename=filename)

unstructured/partition/auto.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def partition(
4242
ocr_languages: Optional[str] = None, # changing to optional for deprecation
4343
languages: Optional[list[str]] = None,
4444
detect_language_per_element: bool = False,
45+
language_fallback: Optional[Callable[[str], Optional[list[str]]]] = None,
4546
pdf_infer_table_structure: bool = False,
4647
extract_images_in_pdf: bool = False,
4748
extract_image_block_types: Optional[list[str]] = None,
@@ -98,6 +99,10 @@ def partition(
9899
Additional Parameters:
99100
detect_language_per_element
100101
Detect language per element instead of at the document level.
102+
language_fallback
103+
Optional callable for short text (e.g. when detection defaults to English).
104+
Called with the text; return a list of ISO 639-3 codes or None to leave
105+
language unspecified.
101106
pdf_infer_table_structure
102107
Deprecated! Use `skip_infer_table_types` to opt out of table extraction for any document
103108
type.
@@ -280,6 +285,7 @@ def augment_metadata(elements: list[Element]) -> list[Element]:
280285

281286
partitioning_kwargs = copy.deepcopy(kwargs)
282287
partitioning_kwargs["detect_language_per_element"] = detect_language_per_element
288+
partitioning_kwargs["language_fallback"] = language_fallback
283289
partitioning_kwargs["encoding"] = encoding
284290
partitioning_kwargs["infer_table_structure"] = infer_table_structure
285291
partitioning_kwargs["languages"] = languages

unstructured/partition/common/lang.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import re
44
from functools import lru_cache
5-
from typing import Iterable, Iterator, Optional
5+
from typing import Callable, Iterable, Iterator, Optional
66

77
import iso639 # pyright: ignore[reportMissingTypeStubs]
88
from langdetect import ( # pyright: ignore[reportMissingTypeStubs]
@@ -383,10 +383,16 @@ def _get_all_tesseract_langcodes_with_prefix(prefix: str) -> list[str]:
383383
def detect_languages(
384384
text: str,
385385
languages: Optional[list[str]] = None,
386+
language_fallback: Optional[Callable[[str], Optional[list[str]]]] = None,
386387
) -> Optional[list[str]]:
387388
"""
388389
Detects the list of languages present in the text (in the default "auto" mode),
389390
or formats and passes through the user inputted document languages if provided.
391+
392+
For short ASCII text (fewer than 5 words), language detection is unreliable. By default
393+
such text is assigned English (["eng"]). Use ``language_fallback`` to override:
394+
pass a callable that takes the text and returns a list of ISO 639-3 codes or None.
395+
Return None to leave language unspecified so the user can handle it.
390396
"""
391397
if languages is None:
392398
languages = ["auto"]
@@ -406,6 +412,9 @@ def detect_languages(
406412
# If text contains special characters (like ñ, å, or Korean/Mandarin/etc.) it will NOT default
407413
# to English. It will default to English if text is only ascii characters and is short.
408414
if _ASCII_RE.match(text) and len(text.split()) < 5:
415+
if language_fallback is not None:
416+
result = language_fallback(text)
417+
return result
409418
logger.debug(f'short text: "{text}". Defaulting to English.')
410419
return ["eng"]
411420

@@ -469,10 +478,12 @@ def apply_lang_metadata(
469478
elements: Iterable[Element],
470479
languages: Optional[list[str]],
471480
detect_language_per_element: bool = False,
481+
language_fallback: Optional[Callable[[str], Optional[list[str]]]] = None,
472482
) -> Iterator[Element]:
473483
"""Detect language and apply it to metadata.languages for each element in `elements`.
474484
If languages is None, default to auto detection.
475-
If languages is and empty string, skip."""
485+
If languages is an empty string, skip.
486+
language_fallback is used for short text when detection is unreliable; see detect_languages."""
476487
# -- Note this function has a stream interface, but reads the full `elements` stream into memory
477488
# -- before emitting the first updated element as output.
478489

@@ -493,8 +504,13 @@ def apply_lang_metadata(
493504
if not isinstance(elements, list):
494505
elements = list(elements)
495506

507+
def detect(text: str) -> Optional[list[str]]:
508+
return detect_languages(
509+
text=text, languages=languages, language_fallback=language_fallback
510+
)
511+
496512
full_text = " ".join(str(e.text) for e in elements if hasattr(e, "text") and e.text)
497-
detected_languages = detect_languages(text=full_text, languages=languages)
513+
detected_languages = detect(full_text)
498514
if (
499515
detected_languages is not None
500516
and len(detected_languages) == 1
@@ -508,7 +524,7 @@ def apply_lang_metadata(
508524
for e in elements:
509525
if hasattr(e, "text"):
510526
text_value = str(e.text) if e.text is not None else ""
511-
e.metadata.languages = detect_languages(text_value)
527+
e.metadata.languages = detect(text_value)
512528
yield e
513529
else:
514530
yield e

unstructured/partition/common/metadata.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,13 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
180180
# -- `language` - auto-detect language (e.g. eng, spa) --
181181
languages = call_args.get("languages")
182182
detect_language_per_element = call_args.get("detect_language_per_element", False)
183+
language_fallback = call_args.get("language_fallback")
183184
elements = list(
184185
apply_lang_metadata(
185186
elements=elements,
186187
languages=languages,
187188
detect_language_per_element=detect_language_per_element,
189+
language_fallback=language_fallback,
188190
)
189191
)
190192

unstructured/partition/md.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from typing import IO, Any
3+
from typing import IO, Any, Optional
44

55
import markdown
66
import requests
@@ -29,6 +29,7 @@ def partition_md(
2929
url: str | None = None,
3030
metadata_filename: str | None = None,
3131
metadata_last_modified: str | None = None,
32+
languages: Optional[list[str]] = None,
3233
**kwargs: Any,
3334
) -> list[Element]:
3435
"""Partitions a markdown file into its constituent elements
@@ -45,6 +46,9 @@ def partition_md(
4546
The URL of a webpage to parse. Only for URLs that return a markdown document.
4647
metadata_last_modified
4748
The last modified date for the document.
49+
languages
50+
The languages present in the document. Use ``["auto"]`` to detect (default when None).
51+
Use ``[""]`` to disable language detection.
4852
"""
4953
if text is None:
5054
text = ""
@@ -75,11 +79,14 @@ def partition_md(
7579

7680
html = markdown.markdown(text, extensions=["tables", "fenced_code"])
7781

78-
return partition_html(
79-
text=html,
80-
metadata_filename=metadata_filename or filename,
81-
metadata_file_type=FileType.MD,
82-
metadata_last_modified=metadata_last_modified or last_modified,
83-
detection_origin=DETECTION_ORIGIN,
82+
html_kwargs: dict[str, Any] = {
83+
"text": html,
84+
"metadata_filename": metadata_filename or filename,
85+
"metadata_file_type": FileType.MD,
86+
"metadata_last_modified": metadata_last_modified or last_modified,
87+
"detection_origin": DETECTION_ORIGIN,
8488
**kwargs,
85-
)
89+
}
90+
if languages is not None:
91+
html_kwargs["languages"] = languages
92+
return partition_html(**html_kwargs)

0 commit comments

Comments
 (0)