Skip to content

Commit b243505

Browse files
committed
fix(parsers): delete uploaded Mistral OCR files; fix patch.stopall test hygiene (#77)
1 parent 02daf52 commit b243505

3 files changed

Lines changed: 72 additions & 30 deletions

File tree

openkb/parsers/mistral.py

Lines changed: 31 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -42,26 +42,37 @@ def parse(self, src: Path) -> ParseResult:
4242
) from exc
4343

4444
client = Mistral(api_key=api_key)
45-
uploaded = client.files.upload(
46-
file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr"
47-
)
48-
signed = client.files.get_signed_url(file_id=uploaded.id)
49-
resp = client.ocr.process(
50-
model=self.model,
51-
document={"type": "document_url", "document_url": signed.url},
52-
include_image_base64=True,
53-
)
45+
uploaded = None
46+
try:
47+
uploaded = client.files.upload(
48+
file={"file_name": src.name, "content": src.read_bytes()}, purpose="ocr"
49+
)
50+
signed = client.files.get_signed_url(file_id=uploaded.id)
51+
resp = client.ocr.process(
52+
model=self.model,
53+
document={"type": "document_url", "document_url": signed.url},
54+
include_image_base64=True,
55+
)
5456

55-
parts: list[str] = []
56-
images: dict[str, bytes] = {}
57-
for page in resp.pages:
58-
parts.append(page.markdown or "")
59-
for img in getattr(page, "images", None) or []:
60-
raw = img.image_base64 or ""
61-
raw = _DATA_URI_RE.sub("", raw)
57+
parts: list[str] = []
58+
images: dict[str, bytes] = {}
59+
for page in resp.pages:
60+
parts.append(page.markdown or "")
61+
for img in getattr(page, "images", None) or []:
62+
raw = img.image_base64 or ""
63+
raw = _DATA_URI_RE.sub("", raw)
64+
try:
65+
images[img.id] = base64.b64decode(raw, validate=True)
66+
except Exception:
67+
logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
68+
continue
69+
return ParseResult(markdown="\n\n".join(parts), images=images)
70+
finally:
71+
if uploaded is not None:
6272
try:
63-
images[img.id] = base64.b64decode(raw, validate=True)
73+
client.files.delete(file_id=uploaded.id)
6474
except Exception:
65-
logger.warning("Skipping undecodable Mistral image: %s", getattr(img, "id", "?"))
66-
continue
67-
return ParseResult(markdown="\n\n".join(parts), images=images)
75+
logger.warning(
76+
"Failed to delete uploaded Mistral OCR file %s",
77+
getattr(uploaded, "id", "?"),
78+
)

tests/test_parsers_local.py

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -39,13 +39,10 @@ def test_parse_other_uses_markitdown_and_extracts_base64(tmp_path):
3939
src = tmp_path / "deck.pptx"
4040
src.write_bytes(b"PK fake")
4141
images_dir = tmp_path / "img" / "deck"
42-
fake_mid = patch("openkb.parsers.local.MarkItDown").start()
43-
fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD"
44-
try:
45-
with patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex:
46-
p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path)
47-
result = p.parse(src)
48-
ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir)
49-
assert result.markdown == "CLEANED"
50-
finally:
51-
patch.stopall()
42+
with patch("openkb.parsers.local.MarkItDown") as fake_mid, \
43+
patch("openkb.parsers.local.extract_base64_images", return_value="CLEANED") as ex:
44+
fake_mid.return_value.convert.return_value.text_content = "MARKITDOWN MD"
45+
p = LocalParser(doc_name="deck", images_dir=images_dir, source_dir=tmp_path)
46+
result = p.parse(src)
47+
ex.assert_called_once_with("MARKITDOWN MD", "deck", images_dir)
48+
assert result.markdown == "CLEANED"

tests/test_parsers_mistral.py

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -86,3 +86,37 @@ def test_undecodable_image_logged_and_skipped(monkeypatch, tmp_path, caplog):
8686
result = MistralParser({}).parse(src)
8787
assert "bad.png" not in result.images
8888
assert any("bad.png" in r.message for r in caplog.records)
89+
90+
91+
def test_uploaded_file_is_deleted(monkeypatch, tmp_path):
92+
import sys, types
93+
from unittest.mock import MagicMock
94+
monkeypatch.setenv("MISTRAL_API_KEY", "k")
95+
client = MagicMock()
96+
client.files.upload.return_value = MagicMock(id="file-1")
97+
client.files.get_signed_url.return_value = MagicMock(url="https://signed")
98+
client.ocr.process.return_value = MagicMock(pages=[])
99+
mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client)
100+
monkeypatch.setitem(sys.modules, "mistralai", mod)
101+
from openkb.parsers.mistral import MistralParser
102+
src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
103+
MistralParser({}).parse(src)
104+
client.files.delete.assert_called_once_with(file_id="file-1")
105+
106+
107+
def test_uploaded_file_deleted_even_on_ocr_error(monkeypatch, tmp_path):
108+
import sys, types
109+
from unittest.mock import MagicMock
110+
import pytest
111+
monkeypatch.setenv("MISTRAL_API_KEY", "k")
112+
client = MagicMock()
113+
client.files.upload.return_value = MagicMock(id="file-2")
114+
client.files.get_signed_url.return_value = MagicMock(url="https://signed")
115+
client.ocr.process.side_effect = RuntimeError("ocr boom")
116+
mod = types.ModuleType("mistralai"); mod.Mistral = MagicMock(return_value=client)
117+
monkeypatch.setitem(sys.modules, "mistralai", mod)
118+
from openkb.parsers.mistral import MistralParser
119+
src = tmp_path / "d.pdf"; src.write_bytes(b"%PDF")
120+
with pytest.raises(RuntimeError):
121+
MistralParser({}).parse(src)
122+
client.files.delete.assert_called_once_with(file_id="file-2")

0 commit comments

Comments
 (0)