2121from docling .datamodel .base_models import InputFormat
2222from docling_core .types .doc .labels import DocItemLabel
2323
24+ import langid
25+
26+
# Lazily-built langid identifier that emits normalized probabilities.
# Cached at module level because building it parses the bundled model.
_NORMALIZED_LANG_IDENTIFIER = None


def _get_normalized_identifier():
    """Return a cached langid identifier configured with ``norm_probs=True``."""
    global _NORMALIZED_LANG_IDENTIFIER
    if _NORMALIZED_LANG_IDENTIFIER is None:
        # Local import: keeps this block self-contained and lets callers of
        # detect_language() degrade gracefully if langid cannot be loaded.
        from langid.langid import LanguageIdentifier, model

        _NORMALIZED_LANG_IDENTIFIER = LanguageIdentifier.from_modelstring(
            model, norm_probs=True
        )
    return _NORMALIZED_LANG_IDENTIFIER


def detect_language(text: str, min_chars: int = 50) -> tuple[str, float]:
    """Detect the language of *text* with langid.

    Args:
        text: Text to classify. Only the first 2000 characters are examined.
        min_chars: Minimum length of ``text`` required to attempt detection.

    Returns:
        A ``(lang_code, confidence)`` tuple where ``confidence`` is in
        ``[0.0, 1.0]``. ``("unknown", 0.0)`` is returned when the text is
        empty, whitespace-only, shorter than ``min_chars``, or when
        classification fails for any reason.
    """
    # Blank text carries no signal; classifying it would yield an arbitrary label.
    if not text or len(text) < min_chars or not text.strip():
        return "unknown", 0.0

    try:
        # langid's default scores are unnormalized log-probabilities whose
        # magnitude grows with input length, so they cannot be mapped onto
        # 0-1 with a fixed divisor. An identifier created with
        # norm_probs=True returns a real probability instead.
        lang, probability = _get_normalized_identifier().classify(text[:2000])
        # Coerce to plain Python types (langid may hand back numpy scalars)
        # and clamp against floating-point drift.
        confidence = max(0.0, min(1.0, float(probability)))
        return str(lang), confidence
    except Exception:
        # Language detection is best-effort metadata; never fail extraction on it.
        return "unknown", 0.0
40+
2441
2542@contextlib .contextmanager
2643def suppress_stderr ():
@@ -85,6 +102,9 @@ def extract(converter: DocumentConverter, file_path: str) -> dict:
85102 # Use smart extraction to avoid table padding bloat
86103 text = smart_extract_text (result .document )
87104
105+ # Detect language
106+ lang , lang_confidence = detect_language (text )
107+
88108 # Get full structured extraction (stripped of image data)
89109 extraction = result .document .export_to_dict ()
90110 extraction = strip_image_data (extraction )
@@ -95,6 +115,8 @@ def extract(converter: DocumentConverter, file_path: str) -> dict:
95115 "charCount" : len (text ),
96116 "tableCount" : len (extraction .get ("tables" , [])),
97117 "imageCount" : len (extraction .get ("pictures" , [])),
118+ "language" : lang ,
119+ "languageConfidence" : lang_confidence ,
98120 "extraction" : extraction ,
99121 }
100122
0 commit comments