oidlabs-com · notsooamit · May 27, 2026 · May 27, 2026
diff --git a/lexoid/api.py b/lexoid/api.py
@@ -154,6 +154,9 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
         logger.debug("Using LLM parser")
         result = parse_llm_doc(path, **kwargs)
 
+    if "error" in result:
+        raise RuntimeError(result["error"])
+
     result["parser_used"] = parser_type
 
     # Log page numbers that were parsed in this chunk
@@ -171,7 +174,9 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
         # Non-fatal: logging should not break parsing
         logger.warning(f"Failed to log parsed page numbers: {e}")
     return_bboxes = kwargs.get("return_bboxes", False)
-    has_bboxes = bool(result["segments"][0].get("bboxes"))
+    has_bboxes = bool(
+        result["segments"] and result["segments"][0].get("bboxes")
+    )
     bbox_framework = kwargs.get("bbox_framework", None)
     framework = kwargs.get("framework", DEFAULT_STATIC_FRAMEWORK)
     bbox_framework_different = bbox_framework and bbox_framework != framework

diff --git a/lexoid/core/conversion_utils.py b/lexoid/core/conversion_utils.py
@@ -3,9 +3,10 @@
 import io
 import mimetypes
 import os
+import shutil
 import subprocess
 import sys
-from typing import Any, Dict, List, Tuple, Type, Union, get_args, get_origin
+from typing import Any, Dict, List, Optional, Tuple, Type, Union, get_args, get_origin
 
 import cv2
 import docx2pdf
@@ -161,31 +162,94 @@ def handle_load_finished(status):
     return output_path
 
 
+def _is_valid_pdf(path: str) -> bool:
+    """Report whether *path* is an existing file that begins with ``%PDF-``."""
+    if os.path.isfile(path):
+        try:
+            with open(path, "rb") as handle:
+                return handle.read(5) == b"%PDF-"
+        except Exception:
+            return False
+    return False
+
+
+def _find_soffice_binary() -> Optional[str]:
+    """Return the first usable LibreOffice launcher, or None if none is found.
+
+    ``soffice`` and ``lowriter`` are tried first, then the default install
+    paths for macOS or Windows. A candidate is usable when ``shutil.which``
+    resolves it or it names an existing file.
+    """
+    platform_paths = {
+        "darwin": ["/Applications/LibreOffice.app/Contents/MacOS/soffice"],
+        "win32": [
+            os.path.expandvars(r"%ProgramFiles%\LibreOffice\program\soffice.exe"),
+            os.path.expandvars(r"%ProgramFiles(x86)%\LibreOffice\program\soffice.exe"),
+        ],
+    }
+    search_order = ["soffice", "lowriter"] + platform_paths.get(sys.platform, [])
+
+    # The candidate is returned as listed, not as the resolved path.
+    usable = (
+        name for name in search_order
+        if shutil.which(name) or os.path.isfile(name)
+    )
+    return next(usable, None)
+
+
+def _convert_with_soffice(input_path: str, output_dir: str) -> str:
+    """Convert a document to PDF using LibreOffice.
+
+    Returns the path where LibreOffice is expected to write the PDF
+    (``output_dir`` plus the input's base name with a ``.pdf`` extension).
+    Raises RuntimeError if no LibreOffice binary is found; a failed run
+    propagates subprocess.CalledProcessError after its stderr is logged.
+    """
+    binary = _find_soffice_binary()
+    if not binary:
+        raise RuntimeError(
+            "LibreOffice is not installed. Install it or ensure docx2pdf works."
+        )
+
+    command = [binary, "--headless", "--convert-to", "pdf"]
+    command += ["--outdir", output_dir, input_path]
+    try:
+        subprocess.run(command, check=True, capture_output=True)
+    except subprocess.CalledProcessError as exc:
+        # Output is captured, so log stderr before propagating the failure.
+        details = (exc.stderr or b"").decode(errors="replace").strip()
+        logger.error(f"LibreOffice conversion failed: {details}")
+        raise
+
+    stem = os.path.splitext(os.path.basename(input_path))[0]
+    return os.path.join(output_dir, stem + ".pdf")
+
+
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
+    """Convert a document to a PDF in temp_dir and return the validated PDF path."""
+    # Resolve to absolute paths — docx2pdf / COM / AppleScript require them
+    input_path = os.path.abspath(input_path)
+    temp_dir = os.path.abspath(temp_dir)
     temp_path = os.path.join(
         temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
     )
 
-    # Convert the document to PDF
-    # docx2pdf is not supported in linux. Use LibreOffice in linux instead.
-    # May need to install LibreOffice if not already installed.
-    if "linux" in sys.platform.lower():
-        subprocess.run(
-            [
-                "lowriter",
-                "--headless",
-                "--convert-to",
-                "pdf",
-                "--outdir",
-                temp_dir,
-                input_path,
-            ],
-            check=True,
-        )
+    if sys.platform.startswith("linux"):
+        temp_path = _convert_with_soffice(input_path, temp_dir)
     else:
-        docx2pdf.convert(input_path, temp_path)
+        try:
+            docx2pdf.convert(input_path, temp_path)
+        except Exception as exc:
+            logger.warning(
+                f"docx2pdf failed ({exc!r}), falling back to LibreOffice for conversion"
+            )
+            temp_path = _convert_with_soffice(input_path, temp_dir)
+
+    if not _is_valid_pdf(temp_path):
+        raise RuntimeError(
+            f"PDF conversion produced an invalid or missing file: {temp_path}"
+        )
 
-    # Return the path of the converted PDF
     return temp_path
 
 

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -26,36 +26,36 @@
 @pytest.mark.parametrize("model", models)
 async def test_llm_parse(model):
     input_data = "examples/inputs/test_1.pdf"
-    expected_ouput_path = "examples/outputs/test_1.md"
+    expected_output_path = "examples/outputs/test_1.md"
     config = {"parser_type": "LLM_PARSE", "model": model, "verbose": True}
     result = parse(input_data, **config)["raw"]
     assert isinstance(result, str)
 
     # Compare the result with the expected output
-    expected_ouput = open(expected_ouput_path, "r").read()
+    expected_output = open(expected_output_path, "r").read()
     # save the result to a file
     with open(f"{output_dir}/input_table_{model.replace('/', '_')}.md", "w") as f:
         f.write(result)
-    score = calculate_similarities(result, expected_ouput)["sequence_matcher"]
+    score = calculate_similarities(result, expected_output)["sequence_matcher"]
     assert round(score, 3) > 0.75
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", models)
 async def test_jpg_parse(model):
     input_data = "examples/inputs/test_4.jpg"
-    expected_ouput_path = "examples/outputs/test_4.md"
+    expected_output_path = "examples/outputs/test_4.md"
     config = {"parser_type": "LLM_PARSE", "model": model}
     result = parse(input_data, **config)["raw"]
     assert isinstance(result, str)
 
     # Compare the result with the expected output
-    expected_ouput = open(expected_ouput_path, "r").read()
+    expected_output = open(expected_output_path, "r").read()
     # save the result to a file
     m_name = model.replace("/", "_")
     with open(f"{output_dir}/input_image_{m_name}.md", "w") as f:
         f.write(result)
-    score = calculate_similarities(result, expected_ouput)["sequence_matcher"]
+    score = calculate_similarities(result, expected_output)["sequence_matcher"]
     assert round(score, 3) > 0.8