Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion lexoid/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
logger.debug("Using LLM parser")
result = parse_llm_doc(path, **kwargs)

if "error" in result:
raise RuntimeError(result["error"])

result["parser_used"] = parser_type

# Log page numbers that were parsed in this chunk
Expand All @@ -171,7 +174,9 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
# Non-fatal: logging should not break parsing
logger.warning(f"Failed to log parsed page numbers: {e}")
return_bboxes = kwargs.get("return_bboxes", False)
has_bboxes = bool(result["segments"][0].get("bboxes"))
has_bboxes = bool(
result["segments"] and result["segments"][0].get("bboxes")
)
bbox_framework = kwargs.get("bbox_framework", None)
framework = kwargs.get("framework", DEFAULT_STATIC_FRAMEWORK)
bbox_framework_different = bbox_framework and bbox_framework != framework
Expand Down
102 changes: 83 additions & 19 deletions lexoid/core/conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import io
import mimetypes
import os
import shutil
import subprocess
import sys
from typing import Any, Dict, List, Tuple, Type, Union, get_args, get_origin
from typing import Any, Dict, List, Optional, Tuple, Type, Union, get_args, get_origin

import cv2
import docx2pdf
Expand Down Expand Up @@ -161,31 +162,94 @@ def handle_load_finished(status):
return output_path


def _is_valid_pdf(path: str) -> bool:
    """Check that the file exists and starts with a PDF header.

    Only the first five bytes are inspected (``%PDF-``); the remainder of
    the file is not validated.

    Args:
        path: Filesystem path of the candidate PDF.

    Returns:
        True when ``path`` is a regular file whose first bytes are ``%PDF-``;
        False when it is missing, is not a regular file, cannot be read, or
        starts with anything else.
    """
    if not os.path.isfile(path):
        return False
    try:
        with open(path, "rb") as f:
            return f.read(5) == b"%PDF-"
    except OSError:
        # Unreadable file (permissions, removed after the isfile check, ...):
        # treat it the same as "not a valid PDF" instead of propagating.
        return False


def _find_soffice_binary() -> Optional[str]:
    """Locate the LibreOffice binary on the system.

    The command names ``soffice`` and ``lowriter`` are looked up on ``PATH``
    first. On macOS and Windows the default installation locations are then
    tried as absolute paths.

    Returns:
        The path of the first candidate found (as resolved by
        ``shutil.which`` when possible), or None if no candidate exists.
    """
    candidates = ["soffice", "lowriter"]

    if sys.platform == "darwin":
        candidates.extend(
            [
                "/Applications/LibreOffice.app/Contents/MacOS/soffice",
            ]
        )
    elif sys.platform == "win32":
        candidates.extend(
            [
                os.path.expandvars(r"%ProgramFiles%\LibreOffice\program\soffice.exe"),
                os.path.expandvars(r"%ProgramFiles(x86)%\LibreOffice\program\soffice.exe"),
            ]
        )

    for candidate in candidates:
        resolved = shutil.which(candidate)
        if resolved:
            return resolved
        # Only trust a plain file check for absolute install locations. A bare
        # name such as "soffice" that merely matches a file in the current
        # directory is not something subprocess could launch via PATH.
        if os.path.isabs(candidate) and os.path.isfile(candidate):
            return candidate
    return None


def _convert_with_soffice(input_path: str, output_dir: str) -> str:
    """Convert a document to PDF using LibreOffice.

    Runs the LibreOffice binary in headless mode with ``--convert-to pdf``
    and ``--outdir output_dir``.

    Args:
        input_path: Path of the document to convert.
        output_dir: Directory passed to LibreOffice as the output directory.

    Returns:
        ``output_dir`` joined with the input file's base name and a ``.pdf``
        extension. This function does not check that the file exists.

    Raises:
        RuntimeError: If no LibreOffice binary can be located.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status.
    """
    binary = _find_soffice_binary()
    if not binary:
        raise RuntimeError(
            "LibreOffice is not installed. Install it or ensure docx2pdf works."
        )

    try:
        subprocess.run(
            [
                binary,
                "--headless",
                "--convert-to",
                "pdf",
                "--outdir",
                output_dir,
                input_path,
            ],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as exc:
        # Output is captured, so LibreOffice's diagnostics would otherwise be
        # invisible: the exception message only carries the exit status.
        stderr = (exc.stderr or b"").decode(errors="replace").strip()
        logger.error("LibreOffice conversion of %s failed: %s", input_path, stderr)
        raise

    return os.path.join(
        output_dir,
        os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
    )


def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
    """Convert a document to PDF and return the path of the converted file.

    On Linux the conversion goes through LibreOffice. On other platforms
    ``docx2pdf`` is tried first and LibreOffice is used as a fallback if it
    raises.

    Args:
        input_path: Path of the document to convert.
        temp_dir: Directory in which the PDF is written.

    Returns:
        Path of the converted PDF.

    Raises:
        RuntimeError: If the resulting file is missing or does not start with
            a PDF header. Errors raised by the LibreOffice helper propagate
            unchanged.
    """
    # Resolve to absolute paths — docx2pdf / COM / AppleScript require them
    input_path = os.path.abspath(input_path)
    temp_dir = os.path.abspath(temp_dir)

    temp_path = os.path.join(
        temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
    )

    # Convert the document to PDF exactly once.
    # docx2pdf is not supported in linux. Use LibreOffice in linux instead.
    # May need to install LibreOffice if not already installed.
    if sys.platform.startswith("linux"):
        temp_path = _convert_with_soffice(input_path, temp_dir)
    else:
        try:
            docx2pdf.convert(input_path, temp_path)
        except Exception:
            # Deliberately broad: any docx2pdf failure triggers the fallback.
            # exc_info keeps the original cause visible in the logs.
            logger.warning(
                "docx2pdf failed, falling back to LibreOffice for conversion",
                exc_info=True,
            )
            temp_path = _convert_with_soffice(input_path, temp_dir)

    if not _is_valid_pdf(temp_path):
        raise RuntimeError(
            f"PDF conversion produced an invalid or missing file: {temp_path}"
        )

    # Return the path of the converted PDF
    return temp_path


Expand Down
12 changes: 6 additions & 6 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,36 +26,36 @@
@pytest.mark.parametrize("model", models)
async def test_llm_parse(model):
    """Parse a sample PDF with LLM_PARSE and compare it to the expected markdown.

    The parsed text is also written under ``output_dir`` for inspection, and
    the ``sequence_matcher`` similarity must exceed 0.75.
    """
    input_data = "examples/inputs/test_1.pdf"
    expected_output_path = "examples/outputs/test_1.md"
    config = {"parser_type": "LLM_PARSE", "model": model, "verbose": True}
    result = parse(input_data, **config)["raw"]
    assert isinstance(result, str)

    # Compare the result with the expected output
    with open(expected_output_path, "r") as f:
        expected_output = f.read()
    # save the result to a file
    with open(f"{output_dir}/input_table_{model.replace('/', '_')}.md", "w") as f:
        f.write(result)
    score = calculate_similarities(result, expected_output)["sequence_matcher"]
    assert round(score, 3) > 0.75


@pytest.mark.asyncio
@pytest.mark.parametrize("model", models)
async def test_jpg_parse(model):
    """Parse a sample JPG with LLM_PARSE and compare it to the expected markdown.

    The parsed text is also written under ``output_dir`` for inspection, and
    the ``sequence_matcher`` similarity must exceed 0.8.
    """
    input_data = "examples/inputs/test_4.jpg"
    expected_output_path = "examples/outputs/test_4.md"
    config = {"parser_type": "LLM_PARSE", "model": model}
    result = parse(input_data, **config)["raw"]
    assert isinstance(result, str)

    # Compare the result with the expected output
    with open(expected_output_path, "r") as f:
        expected_output = f.read()
    # save the result to a file
    m_name = model.replace("/", "_")
    with open(f"{output_dir}/input_image_{m_name}.md", "w") as f:
        f.write(result)
    score = calculate_similarities(result, expected_output)["sequence_matcher"]
    assert round(score, 3) > 0.8


Expand Down