Skip to content

Commit 267533b

Browse files
fix: add language_fallback for short text and expose language params on partition_html/partition_md
1 parent 8fcbf39 commit 267533b

4 files changed

Lines changed: 55 additions & 14 deletions

File tree

test_unstructured/partition/common/test_lang.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,9 +206,10 @@ def test_detect_languages_short_text_fallback_returns_none():
206206

207207

208208
def test_detect_languages_short_text_fallback_returns_custom():
209-
"""Short ASCII text with language_fallback returns custom language."""
209+
"""Short ASCII text triggers fallback; we assert the fallback's return is used as-is."""
210+
# Any short (<5 word) ASCII text would hit the fallback; content is irrelevant.
210211
result = detect_languages(
211-
text="Bonjour monde.",
212+
text="Hi there.",
212213
language_fallback=lambda t: ["fra"],
213214
)
214215
assert result == ["fra"]

unstructured/partition/auto.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,12 @@ def partition(
9696
image or pdf documents with Tesseract, you'll first need to install the appropriate
9797
Tesseract language pack. For other partitions, language is detected using naive Bayesian
9898
filter via `langdetect`. Multiple languages indicate the text could be in any of them.
99-
Additional Parameters:
100-
detect_language_per_element
101-
Detect language per element instead of at the document level.
102-
language_fallback
103-
Optional callable for short text (e.g. when detection defaults to English).
104-
Called with the text; return a list of ISO 639-3 codes or None to leave
105-
language unspecified.
99+
detect_language_per_element
100+
Detect language per element instead of at the document level.
101+
language_fallback
102+
Optional callable for short text (e.g. when detection defaults to English).
103+
Called with the text; return a list of ISO 639-3 codes or None to leave
104+
language unspecified.
106105
pdf_infer_table_structure
107106
Deprecated! Use `skip_infer_table_types` to opt out of table extraction for any document
108107
type.
@@ -222,6 +221,7 @@ def augment_metadata(elements: list[Element]) -> list[Element]:
222221
strategy=strategy,
223222
languages=languages,
224223
detect_language_per_element=detect_language_per_element,
224+
language_fallback=language_fallback,
225225
hi_res_model_name=hi_res_model_name or model_name,
226226
extract_images_in_pdf=extract_images_in_pdf,
227227
extract_image_block_types=extract_image_block_types,
@@ -242,6 +242,7 @@ def augment_metadata(elements: list[Element]) -> list[Element]:
242242
strategy=strategy,
243243
languages=languages,
244244
detect_language_per_element=detect_language_per_element,
245+
language_fallback=language_fallback,
245246
hi_res_model_name=hi_res_model_name or model_name,
246247
extract_images_in_pdf=extract_images_in_pdf,
247248
extract_image_block_types=extract_image_block_types,

unstructured/partition/common/lang.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,34 @@ def _get_all_tesseract_langcodes_with_prefix(prefix: str) -> list[str]:
380380
return [langcode for langcode in PYTESSERACT_LANG_CODES if langcode.startswith(prefix)]
381381

382382

383+
def _validate_fallback_languages(
384+
value: Optional[list[str]],
385+
) -> Optional[list[str]]:
386+
"""Validate and normalize language_fallback return value to ISO 639-3 codes.
387+
Returns None for None, non-list, or when no valid codes remain (invalid entries
388+
are logged and skipped).
389+
"""
390+
if value is None:
391+
return None
392+
if not isinstance(value, list):
393+
logger.warning(
394+
f"language_fallback must return None or a list of strings, got {type(value).__name__}."
395+
)
396+
return None
397+
validated: list[str] = []
398+
for item in value:
399+
if not isinstance(item, str) or not item.strip():
400+
continue
401+
lang = item.strip()
402+
if lang == "zho":
403+
validated.append("zho")
404+
else:
405+
language = _get_iso639_language_object(lang[:3])
406+
if language is not None:
407+
validated.append(language.part3)
408+
return validated if validated else None
409+
410+
383411
def detect_languages(
384412
text: str,
385413
languages: Optional[list[str]] = None,
@@ -392,7 +420,9 @@ def detect_languages(
392420
For short ASCII text (fewer than 5 words), language detection is unreliable. By default
393421
such text is assigned English (["eng"]). Use ``language_fallback`` to override:
394422
pass a callable that takes the text and returns a list of ISO 639-3 codes or None.
395-
Return None to leave language unspecified so the user can handle it.
423+
Return None to leave language unspecified. The callable is responsible for returning
424+
valid ISO 639-3 codes (e.g. "eng", "fra"); invalid entries are filtered out and
425+
a warning is logged; if none remain, this function returns None.
396426
"""
397427
if languages is None:
398428
languages = ["auto"]
@@ -413,8 +443,7 @@ def detect_languages(
413443
# to English. It will default to English if text is only ascii characters and is short.
414444
if _ASCII_RE.match(text) and len(text.split()) < 5:
415445
if language_fallback is not None:
416-
result = language_fallback(text)
417-
return result
446+
return _validate_fallback_languages(language_fallback(text))
418447
logger.debug(f'short text: "{text}". Defaulting to English.')
419448
return ["eng"]
420449

@@ -482,7 +511,7 @@ def apply_lang_metadata(
482511
) -> Iterator[Element]:
483512
"""Detect language and apply it to metadata.languages for each element in `elements`.
484513
If languages is None, default to auto detection.
485-
If languages is and empty string, skip.
514+
If languages is an empty string, skip.
486515
language_fallback is used for short text when detection is unreliable; see detect_languages."""
487516
# -- Note this function has a stream interface, but reads the full `elements` stream into memory
488517
# -- before emitting the first updated element as output.

unstructured/partition/html/partition.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from __future__ import annotations
66

7-
from typing import IO, Any, Iterator, List, Literal, Optional, cast
7+
from typing import IO, Any, Callable, Iterator, List, Literal, Optional, cast
88

99
import requests
1010
from lxml import etree
@@ -39,6 +39,9 @@ def partition_html(
3939
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
4040
extract_image_block_to_payload: bool = False,
4141
extract_image_block_types: Optional[list[str]] = None,
42+
languages: Optional[list[str]] = None,
43+
detect_language_per_element: bool = False,
44+
language_fallback: Optional[Callable[[str], Optional[list[str]]]] = None,
4245
**kwargs: Any,
4346
) -> list[Element]:
4447
"""Partitions an HTML document into its constituent elements.
@@ -71,6 +74,13 @@ def partition_html(
7174
7275
image_alt_mode (Literal['to_text']):
7376
When set 'to_text', the v2 parser will include the alternative text of images in the output.
77+
78+
languages
79+
The languages present in the document. Use ``["auto"]`` to detect; use ``[""]`` to disable.
80+
detect_language_per_element
81+
Detect language per element instead of at the document level.
82+
language_fallback
83+
Optional callable for short text; called with the text, return ISO 639-3 codes or None.
7484
"""
7585
# -- parser rejects an empty str, nip that edge-case in the bud here --
7686
if text is not None and text.strip() == "" and not file and not filename and not url:

0 commit comments

Comments
 (0)