@@ -63,9 +63,12 @@ def test_self_hosted_parses_zip(monkeypatch, tmp_path):
6363 assert isinstance (result , ParseResult )
6464 assert "Mineru" in result .markdown
6565 assert result .images ["fig.png" ] == b"PNGBYTES"
66- # the images/ prefix should be rewritten to the bare filename for localize_images
67- assert "images/fig.png" not in result .markdown
68- assert "" in result .markdown
66+ # _result_from_zip no longer rewrites links; the raw 'images/fig.png' survives
67+ assert "images/fig.png" in result .markdown
68+ # localize_images (which now rewrites by basename) canonicalizes it
69+ from openkb .images import localize_images
70+ md2 = localize_images (result .markdown , result .images , "d" , tmp_path / "imgs" )
71+ assert "sources/images/d/fig.png" in md2
6972
7073
7174def test_cloud_flow_polls_then_downloads (monkeypatch , tmp_path ):
@@ -74,7 +77,7 @@ def test_cloud_flow_polls_then_downloads(monkeypatch, tmp_path):
7477
7578 buf = io .BytesIO ()
7679 with zipfile .ZipFile (buf , "w" ) as zf :
77- zf .writestr ("full.md" , "# Cloud\n \n  " )
80+ zf .writestr ("full.md" , "# Cloud" )
7881 zf .writestr ("images/fig.png" , b"ZBYTES" )
7982 zip_bytes = buf .getvalue ()
8083
@@ -124,8 +127,6 @@ def _get(url, *a, **k):
124127 assert isinstance (result , ParseResult )
125128 assert "Cloud" in result .markdown
126129 assert result .images ["fig.png" ] == b"ZBYTES"
127- assert "images/fig.png" not in result .markdown
128- assert "" in result .markdown
129130 # drove the full poll loop: running once, then done
130131 assert _get .calls == 2
131132
@@ -137,19 +138,19 @@ def test_poll_interval_zero_is_clamped_to_positive():
137138 assert MineruParser ({"poll_interval" : 2 }).poll_interval == 2
138139
139140
def test_result_from_zip_does_not_rewrite_links(tmp_path):
    """_result_from_zip must leave markdown link text untouched.

    The ``images/`` -> bare-filename rewrite moved out of ``_result_from_zip``
    into ``localize_images``, so the markdown returned here has to contain the
    image link exactly as it appears in ``full.md``. The extracted image bytes
    are still keyed by basename.
    """
    import io
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    # An explicit, non-empty link: asserting that "" is in the markdown would
    # pass for any output and could never detect a rewrite.
    link = "![](images/fig.png)"
    prose = "See path other_images/fig.png in text."

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("full.md", prose + "\n\n" + link)
        zf.writestr("images/fig.png", b"PNG")

    result = _result_from_zip(buf.getvalue())

    assert link in result.markdown  # link text unchanged
    assert "other_images/fig.png" in result.markdown  # unrelated prose untouched
    assert result.images["fig.png"] == b"PNG"  # images keyed by basename
153154
154155
155156def test_cloud_empty_extract_result_then_done (monkeypatch , tmp_path ):
@@ -186,3 +187,73 @@ def _get(url, *a, **k):
186187 src = tmp_path / "d.pdf" ; src .write_bytes (b"%PDF" )
187188 result = MineruParser ({"mode" : "cloud" , "poll_interval" : 1 }).parse (src )
188189 assert "Ok" in result .markdown # survived the empty-list poll without crashing
190+
191+
def test_timeout_invalid_is_clamped():
    """A zero or non-numeric ``timeout`` falls back to 600; a valid one is kept."""
    from openkb.parsers.mineru import MineruParser

    # (configured value, value the parser is expected to expose)
    expectations = [
        (0, 600),
        ("x", 600),
        (30, 30),
    ]
    for configured, effective in expectations:
        assert MineruParser({"timeout": configured}).timeout == effective
197+
198+
def test_cloud_api_error_envelope_raises(monkeypatch, tmp_path):
    """Cloud mode raises RuntimeError when the POST reply is an error envelope.

    The stubbed reply carries a non-zero ``code`` and ``data`` of ``None``; the
    raised message must mention either the server's ``msg`` or the code.
    """
    import sys
    import types
    from unittest.mock import MagicMock

    import pytest

    monkeypatch.setenv("MINERU_API_KEY", "key")

    # Reply stub: raise_for_status() is a no-op mock, the JSON body is an error.
    response = MagicMock()
    response.raise_for_status = MagicMock()
    response.json.return_value = {"code": -10001, "msg": "token expired", "data": None}

    # Client stub that works as a context manager yielding itself.
    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.post.return_value = response

    # Install a fake ``httpx`` module exposing only ``Client``.
    fake_httpx = types.ModuleType("httpx")
    fake_httpx.Client = MagicMock(return_value=client)
    monkeypatch.setitem(sys.modules, "httpx", fake_httpx)

    from openkb.parsers.mineru import MineruParser

    src = tmp_path / "d.pdf"
    src.write_bytes(b"%PDF")

    with pytest.raises(RuntimeError) as exc:
        MineruParser({"mode": "cloud"}).parse(src)

    message = str(exc.value)
    assert "token expired" in message or "-10001" in message
216+
217+
def test_cloud_empty_file_urls_raises(monkeypatch, tmp_path):
    """Cloud mode raises RuntimeError when the POST reply lists no file URLs.

    The stubbed reply has ``code`` 0 and a ``batch_id`` but an empty
    ``file_urls`` list; the raised message must contain ``"upload URL"``.
    """
    import sys
    import types
    from unittest.mock import MagicMock

    import pytest

    monkeypatch.setenv("MINERU_API_KEY", "key")

    # Reply stub: raise_for_status() is a no-op mock, file_urls is empty.
    response = MagicMock()
    response.raise_for_status = MagicMock()
    response.json.return_value = {"code": 0, "data": {"batch_id": "b1", "file_urls": []}}

    # Client stub that works as a context manager yielding itself.
    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.post.return_value = response

    # Install a fake ``httpx`` module exposing only ``Client``.
    fake_httpx = types.ModuleType("httpx")
    fake_httpx.Client = MagicMock(return_value=client)
    monkeypatch.setitem(sys.modules, "httpx", fake_httpx)

    from openkb.parsers.mineru import MineruParser

    src = tmp_path / "d.pdf"
    src.write_bytes(b"%PDF")

    with pytest.raises(RuntimeError) as exc:
        MineruParser({"mode": "cloud"}).parse(src)

    assert "upload URL" in str(exc.value)
235+
236+
def test_full_md_basename_preferred_over_endswith(tmp_path):
    """An entry whose basename is exactly ``full.md`` wins over a suffix match.

    The decoy entry is written first and its name ends with ``full.md`` without
    being ``full.md``, so an implementation that picks the first name matching
    ``endswith("full.md")`` would return the decoy's content.
    """
    import io
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    decoy = "notfull.md"
    # Guard the fixture itself: the decoy must really be a suffix match
    # (a name such as "careful.md" ends in "ful.md" and would test nothing).
    assert decoy.endswith("full.md") and decoy != "full.md"

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr(decoy, "# WRONG")  # ends with 'full.md' but isn't it
        zf.writestr("full.md", "# RIGHT")

    result = _result_from_zip(buf.getvalue())

    assert "RIGHT" in result.markdown
    assert "WRONG" not in result.markdown
247+
248+
def test_image_basename_collision_warns(tmp_path, caplog):
    """Two archive members sharing a basename must produce a log record.

    ``images/fig.png`` and ``sub/fig.png`` both have the basename ``fig.png``.
    While capturing at WARNING level, at least one captured record must mention
    that filename.
    """
    import io
    import logging
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("full.md", "# x")
        zf.writestr("images/fig.png", b"A")
        zf.writestr("sub/fig.png", b"B")

    with caplog.at_level(logging.WARNING):
        # Only the logging side effect matters; the return value is not used.
        _result_from_zip(buf.getvalue())

    # getMessage() renders msg % args directly; it does not depend on a handler
    # having formatted the record the way the ``message`` attribute does.
    assert any("fig.png" in record.getMessage() for record in caplog.records)
0 commit comments