@@ -380,6 +380,34 @@ def _get_all_tesseract_langcodes_with_prefix(prefix: str) -> list[str]:
380380 return [langcode for langcode in PYTESSERACT_LANG_CODES if langcode .startswith (prefix )]
381381
382382
383+ def _validate_fallback_languages (
384+ value : Optional [list [str ]],
385+ ) -> Optional [list [str ]]:
386+ """Validate and normalize language_fallback return value to ISO 639-3 codes.
387+ Returns None for None, non-list, or when no valid codes remain (invalid entries
388+ are logged and skipped).
389+ """
390+ if value is None :
391+ return None
392+ if not isinstance (value , list ):
393+ logger .warning (
394+ f"language_fallback must return None or a list of strings, got { type (value ).__name__ } ."
395+ )
396+ return None
397+ validated : list [str ] = []
398+ for item in value :
399+ if not isinstance (item , str ) or not item .strip ():
400+ continue
401+ lang = item .strip ()
402+ if lang == "zho" :
403+ validated .append ("zho" )
404+ else :
405+ language = _get_iso639_language_object (lang [:3 ])
406+ if language is not None :
407+ validated .append (language .part3 )
408+ return validated if validated else None
409+
410+
383411def detect_languages (
384412 text : str ,
385413 languages : Optional [list [str ]] = None ,
@@ -392,7 +420,9 @@ def detect_languages(
392420 For short ASCII text (fewer than 5 words), language detection is unreliable. By default
393421 such text is assigned English (["eng"]). Use ``language_fallback`` to override:
394422 pass a callable that takes the text and returns a list of ISO 639-3 codes or None.
395- Return None to leave language unspecified so the user can handle it.
423+ Return None to leave language unspecified. The caller is responsible for returning
424+ valid ISO 639-3 codes (e.g. "eng", "fra"); invalid entries are filtered out and
425+ a warning is logged; if none remain, this function returns None.
396426 """
397427 if languages is None :
398428 languages = ["auto" ]
@@ -413,8 +443,7 @@ def detect_languages(
413443 # to English. It will default to English if text is only ascii characters and is short.
414444 if _ASCII_RE .match (text ) and len (text .split ()) < 5 :
415445 if language_fallback is not None :
416- result = language_fallback (text )
417- return result
446+ return _validate_fallback_languages (language_fallback (text ))
418447 logger .debug (f'short text: "{ text } ". Defaulting to English.' )
419448 return ["eng" ]
420449
@@ -482,7 +511,7 @@ def apply_lang_metadata(
482511) -> Iterator [Element ]:
483512 """Detect language and apply it to metadata.languages for each element in `elements`.
484513 If languages is None, default to auto detection.
485- If languages is and empty string, skip.
514+ If languages is an empty string, skip.
486515 language_fallback is used for short text when detection is unreliable; see detect_languages."""
487516 # -- Note this function has a stream interface, but reads the full `elements` stream into memory
488517 # -- before emitting the first updated element as output.
0 commit comments