Skip to content

Commit 6e111fb

Browse files
committed
fix(parsers): harden MinerU cloud response handling, timeout, md selection; drop redundant image rewrite (#77)
1 parent 6287dea commit 6e111fb

2 files changed

Lines changed: 124 additions & 36 deletions

File tree

openkb/parsers/mineru.py

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
from __future__ import annotations
22

33
import io
4+
import logging
45
import os
5-
import re
66
import time
77
import zipfile
88
from pathlib import Path
99
from typing import Any
1010

1111
from openkb.parsers.base import ParseResult, Parser
1212

13+
logger = logging.getLogger(__name__)
14+
1315
_SUPPORTED = {".pdf", ".docx", ".pptx", ".xlsx", ".xls", ".html", ".htm"}
1416
_CLOUD_BASE = "https://mineru.net/api/v4"
1517

@@ -29,24 +31,32 @@ def _result_from_zip(zip_bytes: bytes) -> ParseResult:
2931
images: dict[str, bytes] = {}
3032
markdown = ""
3133
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
32-
md_names = [n for n in zf.namelist() if n.lower().endswith(".md")]
34+
names = zf.namelist()
35+
md_names = sorted(n for n in names if n.lower().endswith(".md"))
3336
if md_names:
34-
chosen = next((n for n in md_names if n.endswith("full.md")), md_names[0])
37+
chosen = next((n for n in md_names if Path(n).name == "full.md"), md_names[0])
3538
markdown = zf.read(chosen).decode("utf-8", errors="replace")
36-
for name in zf.namelist():
39+
for name in names:
3740
if name.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".webp")):
38-
images[Path(name).name] = zf.read(name)
39-
# Markdown references images as 'images/<file>'; localize_images matches on
40-
# the bare filename, so rewrite 'images/fig.png' -> 'fig.png'.
41-
for fname in images:
42-
# Rewrite only `![alt](images/<fname>)` links (anchored on markdown image
43-
# syntax) to the bare filename, for localize_images to canonicalize. A
44-
# replacement function avoids regex-escape injection from arbitrary names.
45-
pattern = re.compile(r"(!\[[^\]]*\]\()" + re.escape("images/" + fname) + r"(\))")
46-
markdown = pattern.sub(lambda m, f=fname: m.group(1) + f + m.group(2), markdown)
41+
base = Path(name).name
42+
if base in images:
43+
logger.warning(
44+
"MinerU result has multiple images named %r in different "
45+
"folders; keeping the last. Earlier one may be lost.", base
46+
)
47+
images[base] = zf.read(name)
4748
return ParseResult(markdown=markdown, images=images)
4849

4950

51+
def _mineru_body(resp):
    """Return the ``data`` dict from a MinerU v4 JSON response.

    Args:
        resp: Response-like object exposing ``.json()``.

    Returns:
        The envelope's ``data`` mapping, or ``{}`` when ``data`` is missing
        or falsy (e.g. ``null``).

    Raises:
        RuntimeError: If the decoded body is not a JSON object, if the
            envelope's ``code`` is anything other than ``0`` or absent, or
            if ``data`` is present but is not a JSON object.
    """
    body = resp.json()
    # A non-object body (list, string, ...) has no .get(); report it as an
    # API failure rather than letting an AttributeError escape.
    if not isinstance(body, dict):
        raise RuntimeError(f"MinerU API returned an unexpected response: {body!r}")
    code = body.get("code")
    if code not in (0, None):
        raise RuntimeError(f"MinerU API error (code={code}): {body.get('msg')}")
    data = body.get("data") or {}
    # Callers use data.get(...), so a truthy non-dict payload must fail here
    # with a clear message rather than later with an AttributeError.
    if not isinstance(data, dict):
        raise RuntimeError(f"MinerU API returned unexpected data: {data!r}")
    return data
58+
59+
5060
class MineruParser(Parser):
5161
"""MinerU via HTTP — self-hosted server or hosted cloud API."""
5262

@@ -58,7 +68,8 @@ def __init__(self, opts: dict[str, Any] | None = None):
5868
self.base_url = self.opts.get("base_url")
5969
pi = self.opts.get("poll_interval", 3)
6070
self.poll_interval = pi if isinstance(pi, (int, float)) and pi > 0 else 3
61-
self.timeout = self.opts.get("timeout", 600)
71+
t = self.opts.get("timeout", 600)
72+
self.timeout = t if isinstance(t, (int, float)) and t > 0 else 600
6273

6374
def supports(self, suffix: str) -> bool:
6475
return suffix.lower() in _SUPPORTED
@@ -92,37 +103,43 @@ def _parse_cloud(self, src: Path) -> ParseResult:
92103
)
93104
httpx = _httpx()
94105
headers = {"Authorization": f"Bearer {api_key}"}
95-
with httpx.Client(timeout=self.timeout) as client:
106+
with httpx.Client(timeout=min(self.timeout, 120)) as client:
96107
r = client.post(
97108
f"{_CLOUD_BASE}/file-urls/batch",
98109
headers=headers,
99110
json={"files": [{"name": src.name, "is_ocr": True}]},
100111
)
101112
r.raise_for_status()
102-
data = r.json()["data"]
103-
batch_id = data["batch_id"]
104-
upload_url = data["file_urls"][0]
113+
data = _mineru_body(r)
114+
batch_id = data.get("batch_id")
115+
file_urls = data.get("file_urls") or []
116+
if not batch_id or not file_urls:
117+
raise RuntimeError(f"MinerU returned no upload URL: {data}")
118+
upload_url = file_urls[0]
105119
client.put(upload_url, content=src.read_bytes()).raise_for_status()
106-
elapsed = 0
120+
deadline = time.monotonic() + self.timeout
107121
zip_url = None
108-
while elapsed < self.timeout:
122+
while time.monotonic() < deadline:
109123
pr = client.get(
110124
f"{_CLOUD_BASE}/extract-results/batch/{batch_id}", headers=headers
111125
)
112126
pr.raise_for_status()
113-
results = pr.json()["data"]["extract_result"]
127+
data = _mineru_body(pr)
128+
results = data.get("extract_result") or []
114129
if not results:
115130
time.sleep(self.poll_interval)
116-
elapsed += self.poll_interval
117131
continue
118132
state = results[0].get("state")
119133
if state == "done":
120-
zip_url = results[0]["full_zip_url"]
134+
zip_url = results[0].get("full_zip_url")
135+
if not zip_url:
136+
raise RuntimeError(
137+
f"MinerU reported done but no full_zip_url: {results[0]}"
138+
)
121139
break
122140
if state == "failed":
123141
raise RuntimeError(f"MinerU extraction failed: {results[0]}")
124142
time.sleep(self.poll_interval)
125-
elapsed += self.poll_interval
126143
if zip_url is None:
127144
raise RuntimeError("MinerU extraction timed out.")
128145
zr = client.get(zip_url)

tests/test_parsers_mineru.py

Lines changed: 83 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,12 @@ def test_self_hosted_parses_zip(monkeypatch, tmp_path):
6363
assert isinstance(result, ParseResult)
6464
assert "Mineru" in result.markdown
6565
assert result.images["fig.png"] == b"PNGBYTES"
66-
# the images/ prefix should be rewritten to the bare filename for localize_images
67-
assert "images/fig.png" not in result.markdown
68-
assert "![p](fig.png)" in result.markdown
66+
# _result_from_zip no longer rewrites links; the raw 'images/fig.png' survives
67+
assert "images/fig.png" in result.markdown
68+
# localize_images (which now rewrites by basename) canonicalizes it
69+
from openkb.images import localize_images
70+
md2 = localize_images(result.markdown, result.images, "d", tmp_path / "imgs")
71+
assert "sources/images/d/fig.png" in md2
6972

7073

7174
def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
@@ -74,7 +77,7 @@ def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
7477

7578
buf = io.BytesIO()
7679
with zipfile.ZipFile(buf, "w") as zf:
77-
zf.writestr("full.md", "# Cloud\n\n![p](images/fig.png)")
80+
zf.writestr("full.md", "# Cloud")
7881
zf.writestr("images/fig.png", b"ZBYTES")
7982
zip_bytes = buf.getvalue()
8083

@@ -124,8 +127,6 @@ def _get(url, *a, **k):
124127
assert isinstance(result, ParseResult)
125128
assert "Cloud" in result.markdown
126129
assert result.images["fig.png"] == b"ZBYTES"
127-
assert "images/fig.png" not in result.markdown
128-
assert "![p](fig.png)" in result.markdown
129130
# drove the full poll loop: running once, then done
130131
assert _get.calls == 2
131132

@@ -137,19 +138,19 @@ def test_poll_interval_zero_is_clamped_to_positive():
137138
assert MineruParser({"poll_interval": 2}).poll_interval == 2
138139

139140

140-
def test_image_prefix_rewrite_is_anchored(tmp_path):
141-
import io, sys, types, zipfile
142-
from unittest.mock import MagicMock
143-
# markdown has a real image link AND an unrelated 'images/fig.png' substring in prose
141+
def test_result_from_zip_does_not_rewrite_links(tmp_path):
142+
import io, zipfile
143+
# The images/ -> bare rewrite moved OUT of _result_from_zip into
144+
# localize_images; _result_from_zip must leave the markdown link text intact.
144145
buf = io.BytesIO()
145146
with zipfile.ZipFile(buf, "w") as zf:
146147
zf.writestr("full.md", "See path other_images/fig.png in text.\n\n![p](images/fig.png)")
147148
zf.writestr("images/fig.png", b"PNG")
148149
from openkb.parsers.mineru import _result_from_zip
149150
result = _result_from_zip(buf.getvalue())
150-
assert "![p](fig.png)" in result.markdown # link rewritten
151+
assert "![p](images/fig.png)" in result.markdown # link text unchanged
151152
assert "other_images/fig.png" in result.markdown # unrelated prose untouched
152-
assert result.images["fig.png"] == b"PNG"
153+
assert result.images["fig.png"] == b"PNG" # images keyed by basename
153154

154155

155156
def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path):
@@ -186,3 +187,73 @@ def _get(url, *a, **k):
186187
src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
187188
result = MineruParser({"mode": "cloud", "poll_interval": 1}).parse(src)
188189
assert "Ok" in result.markdown # survived the empty-list poll without crashing
190+
191+
192+
def test_timeout_invalid_is_clamped():
    """Non-numeric or non-positive ``timeout`` options fall back to 600 seconds."""
    from openkb.parsers.mineru import MineruParser

    # Invalid values: zero, negative, wrong type, explicit None.
    assert MineruParser({"timeout": 0}).timeout == 600
    assert MineruParser({"timeout": -5}).timeout == 600
    assert MineruParser({"timeout": "x"}).timeout == 600
    assert MineruParser({"timeout": None}).timeout == 600
    # Valid positive ints and floats are kept as given.
    assert MineruParser({"timeout": 30}).timeout == 30
    assert MineruParser({"timeout": 1.5}).timeout == 1.5
197+
198+
199+
def test_cloud_api_error_envelope_raises(monkeypatch, tmp_path):
    """A non-zero ``code`` in the MinerU envelope surfaces as ``RuntimeError``."""
    import sys
    import types
    from unittest.mock import MagicMock

    import pytest

    monkeypatch.setenv("MINERU_API_KEY", "key")

    # HTTP 200 whose JSON envelope nevertheless reports an API-level failure.
    r = MagicMock()
    r.raise_for_status = MagicMock()
    r.json.return_value = {"code": -10001, "msg": "token expired", "data": None}

    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.post.return_value = r

    # Stub the httpx module so no real network client is created.
    httpx_mod = types.ModuleType("httpx")
    httpx_mod.Client = MagicMock(return_value=client)
    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)

    from openkb.parsers.mineru import MineruParser

    src = tmp_path / "d.pdf"
    src.write_bytes(b"%PDF")

    with pytest.raises(RuntimeError) as exc:
        MineruParser({"mode": "cloud"}).parse(src)

    # The error must carry both the API code and its human-readable message.
    message = str(exc.value)
    assert "token expired" in message
    assert "-10001" in message
    # The flow must stop at the rejected request: no upload, no polling.
    client.put.assert_not_called()
    client.get.assert_not_called()
216+
217+
218+
def test_cloud_empty_file_urls_raises(monkeypatch, tmp_path):
    """A successful envelope with no upload URL is rejected before any upload."""
    import sys
    import types
    from unittest.mock import MagicMock

    import pytest

    monkeypatch.setenv("MINERU_API_KEY", "key")

    # Envelope reports success but provides an empty ``file_urls`` list.
    r = MagicMock()
    r.raise_for_status = MagicMock()
    r.json.return_value = {"code": 0, "data": {"batch_id": "b1", "file_urls": []}}

    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.post.return_value = r

    # Stub the httpx module so no real network client is created.
    httpx_mod = types.ModuleType("httpx")
    httpx_mod.Client = MagicMock(return_value=client)
    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)

    from openkb.parsers.mineru import MineruParser

    src = tmp_path / "d.pdf"
    src.write_bytes(b"%PDF")

    with pytest.raises(RuntimeError) as exc:
        MineruParser({"mode": "cloud"}).parse(src)

    assert "upload URL" in str(exc.value)
    # With nowhere to upload, neither the PUT nor the poll loop may run.
    client.put.assert_not_called()
    client.get.assert_not_called()
235+
236+
237+
def test_full_md_basename_preferred_over_endswith(tmp_path):
    """``full.md`` is chosen by exact basename, not by an ``endswith`` match."""
    import io
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        # Decoy really ends with 'full.md' and comes first both in archive
        # order and alphabetically, so a suffix-based match would pick it.
        zf.writestr("draft_full.md", "# WRONG")
        zf.writestr("full.md", "# RIGHT")
    result = _result_from_zip(buf.getvalue())
    assert "RIGHT" in result.markdown
    assert "WRONG" not in result.markdown
247+
248+
249+
def test_image_basename_collision_warns(tmp_path, caplog):
    """Two images sharing a basename log a warning and the later one is kept."""
    import io
    import logging as _logging
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("full.md", "# x")
        # Same basename in two folders; archive order is images/ then sub/.
        zf.writestr("images/fig.png", b"A")
        zf.writestr("sub/fig.png", b"B")
    with caplog.at_level(_logging.WARNING):
        result = _result_from_zip(buf.getvalue())
    # getMessage() formats the record itself, so this does not depend on a
    # handler having populated ``record.message`` beforehand.
    assert any("fig.png" in r.getMessage() for r in caplog.records)
    # The warning promises "keeping the last": the later archive entry wins.
    assert result.images["fig.png"] == b"B"

0 commit comments

Comments
 (0)