@@ -128,3 +128,61 @@ def _get(url, *a, **k):
128128 assert "" in result .markdown
129129 # drove the full poll loop: running once, then done
130130 assert _get .calls == 2
131+
132+
def test_poll_interval_zero_is_clamped_to_positive():
    """Check how ``MineruParser`` exposes the configured ``poll_interval``.

    A zero or negative setting must still result in a strictly positive
    ``poll_interval`` attribute, while a valid positive setting is kept as-is.
    """
    from openkb.parsers.mineru import MineruParser

    # Non-positive settings: the parser must not keep them verbatim.
    for configured in (0, -5):
        parser = MineruParser({"poll_interval": configured})
        assert parser.poll_interval > 0

    # A sensible positive setting passes through unchanged.
    valid_parser = MineruParser({"poll_interval": 2})
    assert valid_parser.poll_interval == 2
138+
139+
def test_image_prefix_rewrite_is_anchored(tmp_path):
    """Check that the image-link rewrite only touches real image links.

    The fixture zip holds a ``full.md`` with two occurrences of the text
    ``images/fig.png``: one inside a genuine markdown image link and one as
    the tail of an unrelated path (``other_images/fig.png``) in prose.  Only
    the link may be rewritten; the prose must come back untouched, and the
    image bytes must be exposed under the bare file name.

    ``tmp_path`` is accepted for signature compatibility; the zip is built
    in memory, so nothing is written to disk.
    """
    import io
    import zipfile

    from openkb.parsers.mineru import _result_from_zip

    # Real image link AND an unrelated 'images/fig.png' substring in prose.
    markdown = "See path other_images/fig.png in text.\n\n![](images/fig.png)\n"

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("full.md", markdown)
        zf.writestr("images/fig.png", b"PNG")

    result = _result_from_zip(buf.getvalue())

    # NOTE(review): the exact rewritten link target is not visible from this
    # test file; tighten these two checks to the precise expected link once
    # confirmed against _result_from_zip.
    assert "](images/fig.png)" not in result.markdown  # link rewritten
    assert "fig.png)" in result.markdown  # link still references the image
    assert "other_images/fig.png" in result.markdown  # unrelated prose untouched
    assert result.images["fig.png"] == b"PNG"
153+
154+
def test_cloud_empty_extract_result_then_done(monkeypatch, tmp_path):
    """Cloud-mode parse must survive a poll whose ``extract_result`` is empty.

    A stub ``httpx`` module is installed whose ``Client`` returns a mock that
    answers ``post`` with a batch id and upload URL, accepts ``put``, and
    answers ``get`` with an empty ``extract_result`` list on the first poll,
    a ``done`` entry on later polls, and the zip bytes for the zip URL.  The
    parse must finish and return the markdown stored in the zip.
    """
    import io
    import sys
    import types
    import zipfile
    from unittest.mock import MagicMock

    monkeypatch.setenv("MINERU_API_KEY", "key")
    # Replace sleep with a no-op so polling does not slow the test down.
    monkeypatch.setattr("openkb.parsers.mineru.time.sleep", lambda *a, **k: None)

    # Minimal result archive the fake service will hand back.
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("full.md", "# Ok")
    zip_bytes = archive.getvalue()

    def _resp(json_data=None, content=None):
        """Build a mock HTTP response with optional JSON payload / raw body."""
        response = MagicMock()
        response.raise_for_status = MagicMock()
        if json_data is not None:
            response.json.return_value = json_data
        if content is not None:
            response.content = content
        return response

    queued = _resp(json_data={"data": {"extract_result": []}})  # queued: empty list
    finished = _resp(
        json_data={
            "data": {
                "extract_result": [{"state": "done", "full_zip_url": "https://zip"}]
            }
        }
    )
    zip_response = _resp(content=zip_bytes)

    # First status poll yields the empty result; every later poll yields "done".
    pending_polls = iter([queued])

    def _get(url, *a, **k):
        if url == "https://zip":
            return zip_response
        return next(pending_polls, finished)

    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)
    client.post.return_value = _resp(
        json_data={"data": {"batch_id": "b1", "file_urls": ["https://up"]}}
    )
    client.put.return_value = _resp()
    client.get.side_effect = _get

    # Stand-in for the real httpx package so no network client is created.
    httpx_mod = types.ModuleType("httpx")
    httpx_mod.Client = MagicMock(return_value=client)
    monkeypatch.setitem(sys.modules, "httpx", httpx_mod)

    from openkb.parsers.mineru import MineruParser

    src = tmp_path / "d.pdf"
    src.write_bytes(b"%PDF")

    parser = MineruParser({"mode": "cloud", "poll_interval": 1})
    result = parser.parse(src)

    assert "Ok" in result.markdown  # survived the empty-list poll without crashing
0 commit comments