Skip to content

Commit 02daf52

Browse files
committed
fix(parsers): warn that VLM is text-only and on silent parser downgrade (#77)
1 parent 6e111fb commit 02daf52

5 files changed

Lines changed: 40 additions & 0 deletions

File tree

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,10 @@ Each parser handles a subset of formats — `mineru` covers PDF, Word, PPT, Exce
313313
and HTML; `mistral` and `vlm` cover PDF. `.md` and any unsupported format always
314314
fall back to the local parser.
315315

316+
The `vlm` parser is **text-only**: it transcribes a document's text via a vision
317+
LLM but does **not** extract embedded figures/images. Use `mineru`, `mistral`, or
318+
`local` if you need image extraction.
319+
316320
> **Note:** Long PDFs (≥ `pageindex_threshold` pages, default 20) continue to be
317321
> indexed with PageIndex and are **not** affected by the `parser` setting. The
318322
> parser governs the file → Markdown step for shorter documents and non-PDF files.

openkb/converter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ def convert_document(src: Path, kb_dir: Path, parser_override: str | None = None
103103
source_dir=src.parent,
104104
)
105105
if not parser.supports(src.suffix):
106+
if parser.name != "local":
107+
logger.warning(
108+
"Parser %r does not support %r; falling back to the local parser for %s.",
109+
parser.name, src.suffix, src.name,
110+
)
106111
parser = LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=src.parent)
107112

108113
parse_result = parser.parse(src)

openkb/parsers/vlm.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,9 @@ def supports(self, suffix: str) -> bool:
3434

3535
def parse(self, src: Path) -> ParseResult:
    """Transcribe *src* to Markdown and wrap it in a ``ParseResult``.

    The result carries only the transcribed Markdown; a warning is logged
    on every call to say that embedded figures/images are not extracted.
    """
    transcript = transcribe_to_markdown(src, model=self.model)
    # The warning is emitted after transcription, so it only appears when
    # the transcription call itself did not raise.
    logger.warning(
        "VLM parser transcribes %s to text only; embedded figures/images are "
        "not extracted. Use a parser like 'mineru' if you need figure extraction.",
        src.name,
    )
    return ParseResult(markdown=transcript)

tests/test_converter.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,18 @@ def test_local_parser_skips_redundant_localize(self, kb_dir):
177177
result = convert_document(src, kb_dir)
178178
li.assert_not_called() # local path skips localize_images
179179
assert result.source_path.read_text(encoding="utf-8") == "# md final"
180+
181+
def test_warns_on_silent_downgrade(self, kb_dir, caplog):
    """Falling back from a non-local parser must log a warning."""
    import logging as _logging

    md_file = kb_dir / "raw" / "notes.md"
    md_file.write_text("# md", encoding="utf-8")

    # Stand-in for a configured parser that rejects the file's suffix.
    unsupported = MagicMock()
    unsupported.name = "mistral"
    unsupported.supports.return_value = False

    with patch("openkb.converter.get_parser", return_value=unsupported):
        with patch("openkb.converter.LocalParser") as local_cls:
            fallback = local_cls.return_value
            fallback.name = "local"
            fallback.parse.return_value = ParseResult(markdown="# md")
            with caplog.at_level(_logging.WARNING):
                convert_document(md_file, kb_dir)

    messages = [record.message for record in caplog.records]
    assert any("falling back to the local parser" in text for text in messages)

tests/test_parsers_vlm.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,14 @@ def test_no_warning_when_vlm_model_set(caplog):
4545
with caplog.at_level(_logging.WARNING):
4646
VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini")
4747
assert not any("parsers.vlm.model" in r.message for r in caplog.records)
48+
49+
50+
def test_parse_warns_text_only(tmp_path, caplog):
    """``VLMParser.parse`` must log a warning mentioning "text only"."""
    import logging as _logging
    from unittest.mock import patch

    pdf_path = tmp_path / "d.pdf"
    pdf_path.write_bytes(b"%PDF")
    vlm = VLMParser({"model": "gemini/gemini-2.5-pro"})

    # Replace the transcription helper with a stub returning fixed Markdown,
    # then capture WARNING-level records emitted while parsing.
    with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="# md"), \
            caplog.at_level(_logging.WARNING):
        vlm.parse(pdf_path)

    messages = [record.message for record in caplog.records]
    assert any("text only" in text for text in messages)

0 commit comments

Comments
 (0)