@@ -263,11 +263,23 @@ def _walk_linear(
263263 for element in body :
264264 tag_name = etree .QName (element ).localname
265265 # Check for Inline Images (blip elements)
266- drawing_blip = self .blip_xpath_expr (element )
267- drawingml_els = element .findall (
266+ _raw_drawing_blip = self .blip_xpath_expr (element )
267+ _raw_drawingml_els = element .findall (
268268 ".//w:drawing" , namespaces = MsWordDocumentBackend ._BLIP_NAMESPACES
269269 )
270- vml_images = self .vml_imagedata_xpath_expr (element )
270+ _raw_vml_images = self .vml_imagedata_xpath_expr (element )
271+
272+ # Filter out images inside textboxes to prevent double-extraction
273+ # (they will be properly extracted by _handle_textbox_content instead)
274+ def _in_textbox (elem ):
275+ return any (
276+ etree .QName (anc ).localname in ["txbxContent" , "textbox" ]
277+ for anc in elem .iterancestors ()
278+ )
279+
280+ drawing_blip = [x for x in _raw_drawing_blip if not _in_textbox (x )]
281+ drawingml_els = [x for x in _raw_drawingml_els if not _in_textbox (x )]
282+ vml_images = [x for x in _raw_vml_images if not _in_textbox (x )]
271283
272284 # Check for textbox content - check multiple textbox formats
273285 # Only process if the element hasn't been processed before
@@ -1076,10 +1088,61 @@ def _handle_textbox_content(
10761088
10771089 elem_ref .extend (self ._handle_text_elements (p , doc ))
10781090
1091+ # Extract embedded images inside the text box
1092+ tb_drawing_blip = self .blip_xpath_expr (p )
1093+ tb_vml_images = self .vml_imagedata_xpath_expr (p )
1094+ tb_drawingml_els = p .findall (
1095+ ".//w:drawing" , namespaces = MsWordDocumentBackend ._BLIP_NAMESPACES
1096+ )
1097+
1098+ if tb_drawing_blip :
1099+ pics = self ._handle_pictures (tb_drawing_blip , doc )
1100+ elem_ref .extend (pics )
1101+ elif tb_vml_images :
1102+ vml_pics = self ._handle_vml_pictures (tb_vml_images , doc )
1103+ elem_ref .extend (vml_pics )
1104+ elif tb_drawingml_els :
1105+ self ._handle_drawingml (doc = doc , drawingml_els = tb_drawingml_els )
1106+
1107+ elem_ref .extend (self ._handle_text_elements (p , doc ))
1108+
10791109 # Restore original parent
10801110 self .parents [level ] = original_parent
10811111 return elem_ref
10821112
def _clean_omml_latex(self, latex_str: str) -> str:
    r"""Repair known quirks in LaTeX produced by the OMML converter.

    Three passes run in order:

    1. ``\texttimes`` (bare, or wrapped as ``\text{ \texttimes }``) is
       replaced by `` \times `` (padded with one space on each side).
    2. A brace group that directly follows a single alphanumeric character
       or a backslash command (optionally separated by whitespace) is
       turned into a subscript, e.g. ``\tau {max}`` -> ``\tau_{max}``.
       Commands that take a brace argument themselves (``\frac``,
       ``\sqrt``, ``\mathbb``, ``\begin``, ...) are left untouched.
    3. Whitespace immediately before an underscore is removed.

    Args:
        latex_str: LaTeX source to clean up.

    Returns:
        The cleaned LaTeX string; an empty input yields an empty string.
    """
    import re

    # Commands whose following ``{...}`` is an argument, not a subscript.
    # Rewriting these would corrupt the expression (``\frac_{a}{b}``).
    brace_arg_commands = frozenset(
        "\\" + name
        for name in (
            "frac", "dfrac", "tfrac", "binom", "sqrt",
            "text", "textbf", "textit", "textrm",
            "mathrm", "mathbf", "mathit", "mathsf", "mathtt",
            "mathbb", "mathcal", "mathfrak", "mathscr",
            "boldsymbol", "operatorname",
            "hat", "widehat", "tilde", "widetilde",
            "bar", "overline", "underline",
            "vec", "overrightarrow", "overleftarrow",
            "dot", "ddot", "acute", "grave", "check", "breve",
            "overbrace", "underbrace", "overset", "underset",
            "stackrel", "boxed", "begin", "end",
        )
    )

    # Pass 1: normalise every spelling of the text-mode times sign.
    cleaned = re.sub(
        r"\\text\{\s*\\texttimes\s*\}|\\texttimes", r" \\times ", latex_str
    )

    def to_subscript(match: "re.Match[str]") -> str:
        """Return the match as ``base_{content}`` unless base takes braces."""
        base = match.group(1)
        if base in brace_arg_commands:
            return match.group(0)
        return base + "_{" + match.group(2) + "}"

    # Pass 2: attach floating, non-nested brace groups as subscripts.
    cleaned = re.sub(
        r"([a-zA-Z0-9]|\\[a-zA-Z]+)\s*\{([^{}]+)\}", to_subscript, cleaned
    )

    # Pass 3: drop whitespace that separates a base from its underscore.
    return re.sub(r"\s+_", "_", cleaned)
1145+
10831146 def _handle_equations_in_text (self , element , text ):
10841147 only_texts = []
10851148 only_equations = []
@@ -1099,7 +1162,8 @@ def _handle_equations_in_text(self, element, text):
10991162 # processing nested oMath descendants of an already-converted node.
11001163 for child in element :
11011164 if "oMath" in child .tag and "oMathPara" not in child .tag :
1102- latex_equation = str (oMath2Latex (child )).strip ()
1165+ raw_latex = str (oMath2Latex (child )).strip ()
1166+ latex_equation = self ._clean_omml_latex (raw_latex )
11031167 if len (latex_equation ) > 0 :
11041168 only_equations .append (
11051169 self .equation_bookends .format (EQ = latex_equation )
@@ -1125,7 +1189,8 @@ def _handle_equations_in_text(self, element, text):
11251189 only_texts .append (subt .text )
11261190 texts_and_equations .append (subt .text )
11271191 elif "oMath" in subt .tag and "oMathPara" not in subt .tag :
1128- latex_equation = str (oMath2Latex (subt )).strip ()
1192+ raw_latex = str (oMath2Latex (subt )).strip ()
1193+ latex_equation = self ._clean_omml_latex (raw_latex )
11291194 if len (latex_equation ) > 0 :
11301195 only_equations .append (
11311196 self .equation_bookends .format (EQ = latex_equation )
@@ -2139,6 +2204,14 @@ def _handle_pictures(
21392204 _log .warning (f"Warning: image cannot be loaded by Pillow: { e } " )
21402205 pil_image = None
21412206
2207+ if pil_image is None and image is not None :
2208+ _log .debug (
2209+ "Direct PIL loading failed, trying DOCX conversion via LibreOffice"
2210+ )
2211+ pil_image = self ._convert_elements_via_docx (
2212+ image , ["drawing" , "pict" ]
2213+ )
2214+
21422215 elem_ref .append (self ._add_picture_to_doc (doc , parent , pil_image ))
21432216 return elem_ref
21442217
0 commit comments