fix: add language_fallback for short text and expose language params on partition_html/partition_md

claytonlin1110 · claytonlin1110 · commit 267533b390c0 · 2026-02-19T12:52:04.000-06:00
diff --git a/test_unstructured/partition/common/test_lang.py b/test_unstructured/partition/common/test_lang.py
@@ -206,9 +206,10 @@ def test_detect_languages_short_text_fallback_returns_none():
 
 
 def test_detect_languages_short_text_fallback_returns_custom():
-    """Short ASCII text with language_fallback returns custom language."""
+    """Short ASCII text triggers fallback; the fallback's valid codes are returned unchanged."""
+    # Any short (<5 word) ASCII text would hit the fallback; content is irrelevant.
     result = detect_languages(
-        text="Bonjour monde.",
+        text="Hi there.",
         language_fallback=lambda t: ["fra"],
     )
     assert result == ["fra"]
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -96,13 +96,12 @@ def partition(
         image or pdf documents with Tesseract, you'll first need to install the appropriate
         Tesseract language pack. For other partitions, language is detected using naive Bayesian
         filter via `langdetect`. Multiple languages indicates text could be in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
-            language_fallback
-                Optional callable for short text (e.g. when detection defaults to English).
-                Called with the text; return a list of ISO 639-3 codes or None to leave
-                language unspecified.
+    detect_language_per_element
+        Detect language per element instead of at the document level.
+    language_fallback
+        Optional callable for short text (e.g. when detection defaults to English).
+        Called with the text; return a list of ISO 639-3 codes or None to leave
+        language unspecified.
     pdf_infer_table_structure
         Deprecated! Use `skip_infer_table_types` to opt out of table extraction for any document
         type.
@@ -222,6 +221,7 @@ def augment_metadata(elements: list[Element]) -> list[Element]:
             strategy=strategy,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            language_fallback=language_fallback,
             hi_res_model_name=hi_res_model_name or model_name,
             extract_images_in_pdf=extract_images_in_pdf,
             extract_image_block_types=extract_image_block_types,
@@ -242,6 +242,7 @@ def augment_metadata(elements: list[Element]) -> list[Element]:
             strategy=strategy,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            language_fallback=language_fallback,
             hi_res_model_name=hi_res_model_name or model_name,
             extract_images_in_pdf=extract_images_in_pdf,
             extract_image_block_types=extract_image_block_types,
diff --git a/unstructured/partition/common/lang.py b/unstructured/partition/common/lang.py
@@ -380,6 +380,34 @@ def _get_all_tesseract_langcodes_with_prefix(prefix: str) -> list[str]:
     return [langcode for langcode in PYTESSERACT_LANG_CODES if langcode.startswith(prefix)]
 
 
+def _validate_fallback_languages(
+    value: Optional[list[str]],
+) -> Optional[list[str]]:
+    """Validate and normalize language_fallback return value to ISO 639-3 codes.
+    Returns None for None, a non-list value (logged as a warning), or when no valid codes
+    remain; non-string, blank, and unrecognized entries are skipped.
+    """
+    if value is None:
+        return None
+    if not isinstance(value, list):
+        logger.warning(
+            f"language_fallback must return None or a list of strings, got {type(value).__name__}."
+        )
+        return None
+    validated: list[str] = []
+    for item in value:
+        if not isinstance(item, str) or not item.strip():
+            continue
+        lang = item.strip()
+        if lang == "zho":
+            validated.append("zho")
+        else:
+            language = _get_iso639_language_object(lang[:3])
+            if language is not None:
+                validated.append(language.part3)
+    return validated if validated else None
+
+
 def detect_languages(
     text: str,
     languages: Optional[list[str]] = None,
@@ -392,7 +420,9 @@ def detect_languages(
     For short ASCII text (fewer than 5 words), language detection is unreliable. By default
     such text is assigned English (["eng"]). Use ``language_fallback`` to override:
     pass a callable that takes the text and returns a list of ISO 639-3 codes or None.
-    Return None to leave language unspecified so the user can handle it.
+    Return None to leave language unspecified. The callable should return valid ISO 639-3
+    codes (e.g. "eng", "fra"); unresolvable entries are dropped and a non-list return value
+    is logged as a warning. If no valid code remains, this function returns None.
     """
     if languages is None:
         languages = ["auto"]
@@ -413,8 +443,7 @@ def detect_languages(
     # to English. It will default to English if text is only ascii characters and is short.
     if _ASCII_RE.match(text) and len(text.split()) < 5:
         if language_fallback is not None:
-            result = language_fallback(text)
-            return result
+            return _validate_fallback_languages(language_fallback(text))
         logger.debug(f'short text: "{text}". Defaulting to English.')
         return ["eng"]
 
@@ -482,7 +511,7 @@ def apply_lang_metadata(
 ) -> Iterator[Element]:
     """Detect language and apply it to metadata.languages for each element in `elements`.
     If languages is None, default to auto detection.
-    If languages is and empty string, skip.
+    If languages is an empty string, skip.
     language_fallback is used for short text when detection is unreliable; see detect_languages."""
     # -- Note this function has a stream interface, but reads the full `elements` stream into memory
     # -- before emitting the first updated element as output.
diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from typing import IO, Any, Iterator, List, Literal, Optional, cast
+from typing import IO, Any, Callable, Iterator, List, Literal, Optional, cast
 
 import requests
 from lxml import etree
@@ -39,6 +39,9 @@ def partition_html(
     image_alt_mode: Optional[Literal["to_text"]] = "to_text",
     extract_image_block_to_payload: bool = False,
     extract_image_block_types: Optional[list[str]] = None,
+    languages: Optional[list[str]] = None,
+    detect_language_per_element: bool = False,
+    language_fallback: Optional[Callable[[str], Optional[list[str]]]] = None,
     **kwargs: Any,
 ) -> list[Element]:
     """Partitions an HTML document into its constituent elements.
@@ -71,6 +74,13 @@ def partition_html(
 
     image_alt_mode (Literal['to_text']):
         When set 'to_text', the v2 parser will include the alternative text of images in the output.
+
+    languages
+        The languages present in the document. Use ``["auto"]`` to detect; use ``[""]`` to disable.
+    detect_language_per_element
+        Detect language per element instead of at the document level.
+    language_fallback
+        Optional callable for short text; called with the text, return ISO 639-3 codes or None.
     """
     # -- parser rejects an empty str, nip that edge-case in the bud here --
     if text is not None and text.strip() == "" and not file and not filename and not url: