22
33import re
44from functools import lru_cache
5- from typing import Iterable , Iterator , Optional
5+ from typing import Callable , Iterable , Iterator , Optional
66
77import iso639 # pyright: ignore[reportMissingTypeStubs]
88from langdetect import ( # pyright: ignore[reportMissingTypeStubs]
@@ -383,10 +383,16 @@ def _get_all_tesseract_langcodes_with_prefix(prefix: str) -> list[str]:
383383def detect_languages (
384384 text : str ,
385385 languages : Optional [list [str ]] = None ,
386+ language_fallback : Optional [Callable [[str ], Optional [list [str ]]]] = None ,
386387) -> Optional [list [str ]]:
387388 """
388389 Detects the list of languages present in the text (in the default "auto" mode),
389390 or formats and passes through the user inputted document languages if provided.
391+
392+ For short ASCII text (fewer than 5 words), language detection is unreliable. By default
393+ such text is assigned English (["eng"]). Use ``language_fallback`` to override:
394+ pass a callable that takes the text and returns a list of ISO 639-3 codes or None.
395+ Return None to leave language unspecified so the user can handle it.
390396 """
391397 if languages is None :
392398 languages = ["auto" ]
@@ -406,6 +412,9 @@ def detect_languages(
406412 # If text contains special characters (like ñ, å, or Korean/Mandarin/etc.) it will NOT default
407413 # to English. It will default to English if text is only ascii characters and is short.
408414 if _ASCII_RE .match (text ) and len (text .split ()) < 5 :
415+ if language_fallback is not None :
416+ result = language_fallback (text )
417+ return result
409418 logger .debug (f'short text: "{ text } ". Defaulting to English.' )
410419 return ["eng" ]
411420
@@ -469,10 +478,12 @@ def apply_lang_metadata(
469478 elements : Iterable [Element ],
470479 languages : Optional [list [str ]],
471480 detect_language_per_element : bool = False ,
481+ language_fallback : Optional [Callable [[str ], Optional [list [str ]]]] = None ,
472482) -> Iterator [Element ]:
473483 """Detect language and apply it to metadata.languages for each element in `elements`.
474484 If languages is None, default to auto detection.
475- If languages is and empty string, skip."""
485+ If languages is an empty string, skip.
486+ language_fallback is used for short text when detection is unreliable; see detect_languages."""
476487 # -- Note this function has a stream interface, but reads the full `elements` stream into memory
477488 # -- before emitting the first updated element as output.
478489
@@ -493,8 +504,13 @@ def apply_lang_metadata(
493504 if not isinstance (elements , list ):
494505 elements = list (elements )
495506
507+ def detect (text : str ) -> Optional [list [str ]]:
508+ return detect_languages (
509+ text = text , languages = languages , language_fallback = language_fallback
510+ )
511+
496512 full_text = " " .join (str (e .text ) for e in elements if hasattr (e , "text" ) and e .text )
497- detected_languages = detect_languages ( text = full_text , languages = languages )
513+ detected_languages = detect ( full_text )
498514 if (
499515 detected_languages is not None
500516 and len (detected_languages ) == 1
@@ -508,7 +524,7 @@ def apply_lang_metadata(
508524 for e in elements :
509525 if hasattr (e , "text" ):
510526 text_value = str (e .text ) if e .text is not None else ""
511- e .metadata .languages = detect_languages (text_value )
527+ e .metadata .languages = detect (text_value )
512528 yield e
513529 else :
514530 yield e
0 commit comments