Skip to content

Commit 526db30

Browse files
committed
fix(parsers): harden MinerU poll loop and anchor image-link rewrite (#77)
1 parent 33cee68 commit 526db30

2 files changed

Lines changed: 70 additions & 2 deletions

File tree

openkb/parsers/mineru.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import io
44
import os
5+
import re
56
import time
67
import zipfile
78
from pathlib import Path
@@ -38,7 +39,11 @@ def _result_from_zip(zip_bytes: bytes) -> ParseResult:
3839
# Markdown references images as 'images/<file>'; localize_images matches on
3940
# the bare filename, so rewrite 'images/fig.png' -> 'fig.png'.
4041
for fname in images:
41-
markdown = markdown.replace(f"images/{fname}", fname)
42+
# Rewrite only `![alt](images/<fname>)` links (anchored on markdown image
43+
# syntax) to the bare filename, for localize_images to canonicalize. A
44+
# replacement function avoids regex-escape injection from arbitrary names.
45+
pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape("images/" + fname) + r"(\))")
46+
markdown = pattern.sub(lambda m, f=fname: m.group(1) + f + m.group(2), markdown)
4247
return ParseResult(markdown=markdown, images=images)
4348

4449

@@ -51,7 +56,8 @@ def __init__(self, opts: dict[str, Any] | None = None):
5156
self.opts = opts or {}
5257
self.mode = self.opts.get("mode", "cloud")
5358
self.base_url = self.opts.get("base_url")
54-
self.poll_interval = self.opts.get("poll_interval", 3)
59+
pi = self.opts.get("poll_interval", 3)
60+
self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3
5561
self.timeout = self.opts.get("timeout", 600)
5662

5763
def supports(self, suffix: str) -> bool:
@@ -105,6 +111,10 @@ def _parse_cloud(self, src: Path) -> ParseResult:
105111
)
106112
pr.raise_for_status()
107113
results = pr.json()["data"]["extract_result"]
114+
if not results:
115+
time.sleep(self.poll_interval)
116+
elapsed += self.poll_interval
117+
continue
108118
state = results[0].get("state")
109119
if state == "done":
110120
zip_url = results[0]["full_zip_url"]

tests/test_parsers_mineru.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,61 @@ def _get(url, *a, **k):
128128
assert "![p](fig.png)" in result.markdown
129129
# drove the full poll loop: running once, then done
130130
assert _get.calls == 2
131+
132+
133+
def test_poll_interval_zero_is_clamped_to_positive():
    """Non-positive poll intervals end up positive; a valid one is kept as-is."""
    from openkb.parsers.mineru import MineruParser

    # Zero and negative values must never reach the poll loop unchanged.
    for rejected in (0, -5):
        parser = MineruParser({"poll_interval": rejected})
        assert parser.poll_interval > 0

    # A sane positive value passes through untouched.
    accepted = MineruParser({"poll_interval": 2})
    assert accepted.poll_interval == 2
138+
139+
140+
def test_image_prefix_rewrite_is_anchored(tmp_path):
    """Only the markdown image link is rewritten; look-alike prose is left alone.

    ``tmp_path`` is not used by the body; it is kept so the test signature
    stays unchanged.
    """
    import io
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    # The markdown has a real image link AND prose that merely contains the
    # 'images/fig.png' substring (inside 'other_images/fig.png').
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)")
        zf.writestr("images/fig.png", b"PNG")

    result = _result_from_zip(buf.getvalue())

    assert "![p](fig.png)" in result.markdown  # link rewritten
    assert "other_images/fig.png" in result.markdown  # unrelated prose untouched
    assert result.images["fig.png"] == b"PNG"
153+
154+
155+
def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path):
    """Cloud parse survives an empty ``extract_result`` poll followed by a done poll."""
    import io
    import sys
    import types
    import zipfile
    from unittest.mock import MagicMock

    monkeypatch.setenv("MINERU_API_KEY", "key")
    # Replace sleep with a no-op so the poll loop does not actually wait.
    monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None)

    # Minimal result archive served from the "done" zip URL.
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("full.md", "# Ok")
    zip_bytes = archive.getvalue()

    def _resp(json_data=None, content=None):
        # Build a fake HTTP response exposing only what the test configures.
        response = MagicMock()
        response.raise_for_status = MagicMock()
        if json_data is not None:
            response.json.return_value = json_data
        if content is not None:
            response.content = content
        return response

    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.post.return_value = _resp(json_data={"data": {"batch_id": "b1", "file_urls": ["https://up"]}})
    client.put.return_value = _resp()

    empty = _resp(json_data={"data": {"extract_result": []}})  # queued: empty list
    done = _resp(json_data={"data": {"extract_result": [{"state": "done", "full_zip_url": "https://zip"}]}})
    zipr = _resp(content=zip_bytes)

    poll_count = 0

    def _get(url, *a, **k):
        # The zip download is not a poll; every other GET counts as one.
        nonlocal poll_count
        if url == "https://zip":
            return zipr
        poll_count += 1
        if poll_count == 1:
            return empty
        return done

    client.get.side_effect = _get

    # Stub out httpx so no real network client is created.
    httpx_mod = types.ModuleType("httpx")
    httpx_mod.Client = MagicMock(return_value=client)
    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)

    from openkb.parsers.mineru import MineruParser

    src = tmp_path / "d.pdf"
    src.write_bytes(b"%PDF")
    result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src)

    assert "Ok" in result.markdown  # survived the empty-list poll without crashing

0 commit comments

Comments
 (0)