Skip to content

Commit 53dc0fd

Browse files
committed
fix: rewrite docx-to-PDF conversion with LibreOffice fallback
- Add _is_valid_pdf helper to validate converted output
- Add _convert_with_soffice using LibreOffice (soffice/lowriter)
- Use os.path.abspath to resolve paths for docx2pdf/COM/AppleScript
- Linux: use LibreOffice as primary; macOS/Windows: docx2pdf with LibreOffice fallback
- Add error propagation for failed LLM parses in parse_chunk
- Guard against empty segments in bbox check
- Fix expected_ouput -> expected_output typos in tests
1 parent ec22532 commit 53dc0fd

3 files changed

Lines changed: 95 additions & 26 deletions

File tree

lexoid/api.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -154,10 +154,15 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
154154
logger.debug("Using LLM parser")
155155
result = parse_llm_doc(path, **kwargs)
156156

157+
if "error" in result:
158+
raise RuntimeError(result["error"])
159+
157160
result["parser_used"] = parser_type
158161

159162
return_bboxes = kwargs.get("return_bboxes", False)
160-
has_bboxes = bool(result["segments"][0].get("bboxes"))
163+
has_bboxes = bool(
164+
result["segments"] and result["segments"][0].get("bboxes")
165+
)
161166
bbox_framework = kwargs.get("bbox_framework", None)
162167
framework = kwargs.get("framework", DEFAULT_STATIC_FRAMEWORK)
163168
bbox_framework_different = bbox_framework and bbox_framework != framework

lexoid/core/conversion_utils.py

Lines changed: 83 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import io
44
import mimetypes
55
import os
6+
import shutil
67
import subprocess
78
import sys
8-
from typing import Any, Dict, List, Tuple, Type, Union, get_args, get_origin
9+
from typing import Any, Dict, List, Optional, Tuple, Type, Union, get_args, get_origin
910

1011
import cv2
1112
import docx2pdf
@@ -158,31 +159,94 @@ def handle_load_finished(status):
158159
return output_path
159160

160161

162+
def _is_valid_pdf(path: str) -> bool:
    """Tell whether *path* is a regular file that begins with the PDF magic bytes.

    Returns ``False`` when the path is not an existing file, when the file
    cannot be opened or read, or when its first five bytes are not ``%PDF-``.
    """
    pdf_magic = b"%PDF-"
    if os.path.isfile(path):
        try:
            with open(path, "rb") as handle:
                header = handle.read(len(pdf_magic))
        except Exception:
            # Any failure while opening/reading counts as "not a valid PDF".
            return False
        return header == pdf_magic
    return False
171+
172+
173+
def _find_soffice_binary() -> Optional[str]:
    """Locate a LibreOffice executable on the system.

    The bare command names ``soffice`` and ``lowriter`` are tried first via
    ``shutil.which``; on macOS and Windows the default installation paths
    are tried afterwards.

    Returns:
        The resolved path of the first usable candidate, or ``None`` when
        no candidate is found.
    """
    candidates = ["soffice", "lowriter"]

    if sys.platform == "darwin":
        candidates.append("/Applications/LibreOffice.app/Contents/MacOS/soffice")
    elif sys.platform == "win32":
        candidates.extend(
            os.path.expandvars(template)
            for template in (
                r"%ProgramFiles%\LibreOffice\program\soffice.exe",
                r"%ProgramFiles(x86)%\LibreOffice\program\soffice.exe",
            )
        )

    for candidate in candidates:
        resolved = shutil.which(candidate)
        if resolved:
            # Hand back the resolved location so the caller launches exactly
            # the binary that was found.
            return resolved
        # Only absolute install locations get the plain "file exists" check.
        # A bare name must be found by shutil.which; otherwise an unrelated
        # file called "soffice" in the working directory would be reported
        # as the LibreOffice binary.
        if os.path.isabs(candidate) and os.path.isfile(candidate):
            return candidate
    return None
195+
196+
197+
def _convert_with_soffice(input_path: str, output_dir: str) -> str:
    """Convert a document to PDF by running LibreOffice in headless mode.

    Args:
        input_path: Path of the document to convert.
        output_dir: Directory handed to LibreOffice via ``--outdir``.

    Returns:
        The expected output path: ``output_dir`` joined with the input
        file's base name and a ``.pdf`` extension. The file itself is not
        checked here.

    Raises:
        RuntimeError: If no LibreOffice binary can be located.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status.
    """
    binary = _find_soffice_binary()
    if not binary:
        raise RuntimeError(
            "LibreOffice is not installed. Install it or ensure docx2pdf works."
        )

    command = [
        binary,
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        output_dir,
        input_path,
    ]
    try:
        subprocess.run(command, check=True, capture_output=True)
    except subprocess.CalledProcessError as exc:
        # The process output is captured, so without this log line the
        # reason for the failure would never be shown to anyone.
        stderr_text = (exc.stderr or b"").decode(errors="replace").strip()
        logger.error(
            "LibreOffice conversion failed (exit code %s): %s",
            exc.returncode,
            stderr_text,
        )
        raise

    pdf_name = os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
    return os.path.join(output_dir, pdf_name)
223+
224+
161225
def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
    """Convert a document to PDF and return the path of the result.

    On Linux the conversion is done with LibreOffice. On other platforms
    docx2pdf is tried first and LibreOffice is used as a fallback, both when
    docx2pdf raises and when it returns without leaving a valid PDF behind.

    Args:
        input_path: Path of the document to convert.
        temp_dir: Directory in which the PDF is written.

    Returns:
        Absolute path of the converted PDF.

    Raises:
        RuntimeError: If LibreOffice is needed but not found, or if the
            final output is missing or does not start with a PDF header.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status.
    """
    # Resolve to absolute paths — docx2pdf / COM / AppleScript require them
    input_path = os.path.abspath(input_path)
    temp_dir = os.path.abspath(temp_dir)

    temp_path = os.path.join(
        temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
    )

    if sys.platform.startswith("linux"):
        temp_path = _convert_with_soffice(input_path, temp_dir)
    else:
        try:
            docx2pdf.convert(input_path, temp_path)
        except Exception:
            # Keep the traceback in the log so the docx2pdf failure can
            # still be diagnosed after the fallback succeeds.
            logger.warning(
                "docx2pdf failed, falling back to LibreOffice for conversion",
                exc_info=True,
            )
            temp_path = _convert_with_soffice(input_path, temp_dir)
        else:
            # docx2pdf returned normally; still fall back if it did not
            # leave a usable PDF at the target path.
            if not _is_valid_pdf(temp_path):
                logger.warning(
                    "docx2pdf produced no valid PDF, "
                    "falling back to LibreOffice for conversion"
                )
                temp_path = _convert_with_soffice(input_path, temp_dir)

    if not _is_valid_pdf(temp_path):
        raise RuntimeError(
            f"PDF conversion produced an invalid or missing file: {temp_path}"
        )

    return temp_path
187251

188252

tests/test_parser.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -27,36 +27,36 @@
2727
@pytest.mark.parametrize("model", models)
async def test_llm_parse(model):
    """Parse the sample PDF with the LLM parser and compare it to the reference markdown."""
    input_data = "examples/inputs/test_1.pdf"
    expected_output_path = "examples/outputs/test_1.md"
    config = {"parser_type": "LLM_PARSE", "model": model, "verbose": True}
    result = parse(input_data, **config)["raw"]
    assert isinstance(result, str)

    # Compare the result with the expected output
    # (read through a context manager so the file handle is closed)
    with open(expected_output_path, "r") as f:
        expected_output = f.read()
    # save the result to a file
    with open(f"{output_dir}/input_table_{model.replace('/', '_')}.md", "w") as f:
        f.write(result)
    score = calculate_similarities(result, expected_output)["sequence_matcher"]
    assert round(score, 3) > 0.75
4242

4343

4444
@pytest.mark.asyncio
@pytest.mark.parametrize("model", models)
async def test_jpg_parse(model):
    """Parse the sample JPG with the LLM parser and compare it to the reference markdown."""
    input_data = "examples/inputs/test_4.jpg"
    expected_output_path = "examples/outputs/test_4.md"
    config = {"parser_type": "LLM_PARSE", "model": model}
    result = parse(input_data, **config)["raw"]
    assert isinstance(result, str)

    # Compare the result with the expected output
    # (read through a context manager so the file handle is closed)
    with open(expected_output_path, "r") as f:
        expected_output = f.read()
    # save the result to a file
    m_name = model.replace("/", "_")
    with open(f"{output_dir}/input_image_{m_name}.md", "w") as f:
        f.write(result)
    score = calculate_similarities(result, expected_output)["sequence_matcher"]
    assert round(score, 3) > 0.8
6161

6262

0 commit comments

Comments
 (0)