Skip to content

Commit e424bb4

Browse files
committed
fix(parsers): warn on VLM global-model fallback; unify parser dispatch/VALID_PARSERS (#77)
1 parent 8af174f commit e424bb4

4 files changed

Lines changed: 63 additions & 16 deletions

File tree

openkb/parsers/registry.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,31 @@
66
from openkb.parsers.base import Parser
77
from openkb.parsers.local import LocalParser
88

9-
VALID_PARSERS = ("local", "mineru", "mistral", "vlm")
9+
10+
def _make_mistral(opts, config):
11+
from openkb.parsers.mistral import MistralParser
12+
return MistralParser(opts)
13+
14+
15+
def _make_vlm(opts, config):
16+
from openkb.parsers.vlm import VLMParser
17+
return VLMParser(opts, model=config.get("model"))
18+
19+
20+
def _make_mineru(opts, config):
21+
from openkb.parsers.mineru import MineruParser
22+
return MineruParser(opts)
23+
24+
25+
# Single source of truth: online-parser name -> lazy factory.
26+
_ONLINE_PARSERS = {
27+
"mineru": _make_mineru,
28+
"mistral": _make_mistral,
29+
"vlm": _make_vlm,
30+
}
31+
32+
# Valid parser names (drives the CLI --parser choice and error messages).
33+
VALID_PARSERS = ("local", *_ONLINE_PARSERS)
1034

1135

1236
def get_parser(
@@ -21,18 +45,10 @@ def get_parser(
2145
name = (override or config.get("parser") or "local").lower()
2246
if name == "local":
2347
return LocalParser(doc_name=doc_name, images_dir=images_dir, source_dir=source_dir)
24-
25-
parsers_cfg = config.get("parsers", {}) or {}
26-
opts = parsers_cfg.get(name, {}) or {}
27-
if name == "mistral":
28-
from openkb.parsers.mistral import MistralParser
29-
return MistralParser(opts)
30-
if name == "vlm":
31-
from openkb.parsers.vlm import VLMParser
32-
return VLMParser(opts, model=config.get("model"))
33-
if name == "mineru":
34-
from openkb.parsers.mineru import MineruParser
35-
return MineruParser(opts)
36-
raise ValueError(
37-
f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}."
38-
)
48+
factory = _ONLINE_PARSERS.get(name)
49+
if factory is None:
50+
raise ValueError(
51+
f"Unknown parser {name!r}. Valid options: {', '.join(VALID_PARSERS)}."
52+
)
53+
opts = (config.get("parsers", {}) or {}).get(name, {}) or {}
54+
return factory(opts, config)

openkb/parsers/vlm.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from __future__ import annotations
22

3+
import logging
34
from pathlib import Path
45
from typing import Any
56

67
from openkb.parsers.base import ParseResult, Parser
78
from openkb.parsers.vlm_client import transcribe_to_markdown
89

10+
logger = logging.getLogger(__name__)
11+
912
_SUPPORTED = {".pdf"}
1013

1114

@@ -18,6 +21,13 @@ def __init__(self, opts: dict[str, Any] | None = None, model: str | None = None)
1821
opts = opts or {}
1922
# parsers.vlm.model overrides the global model; else use the global model.
2023
self.model = opts.get("model") or model
24+
if not opts.get("model"):
25+
logger.warning(
26+
"VLM parser: 'parsers.vlm.model' is not set; using the global model "
27+
"%r for vision parsing. If that model is not vision-capable, set "
28+
"'parsers.vlm.model' to one (e.g. gemini/gemini-2.5-pro).",
29+
self.model,
30+
)
2131

2232
def supports(self, suffix: str) -> bool:
2333
return suffix.lower() in _SUPPORTED

tests/test_parsers_registry.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,10 @@ def test_unknown_name_raises_with_valid_options():
3131
get_parser({"parser": "nope"}, **_kwargs())
3232
assert "nope" in str(exc.value)
3333
assert "local" in str(exc.value)
34+
35+
36+
def test_valid_parsers_matches_dispatch():
37+
from openkb.parsers.registry import VALID_PARSERS, _ONLINE_PARSERS
38+
# local + every online factory key, no drift
39+
assert set(VALID_PARSERS) == {"local", *_ONLINE_PARSERS}
40+
assert VALID_PARSERS[0] == "local"

tests/test_parsers_vlm.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,17 @@ def test_parse_falls_back_to_global_model(tmp_path):
3131
with patch("openkb.parsers.vlm.transcribe_to_markdown", return_value="x") as t:
3232
p.parse(src)
3333
t.assert_called_once_with(src, model="global-model")
34+
35+
36+
def test_warns_when_falling_back_to_global_model(caplog):
37+
import logging as _logging
38+
with caplog.at_level(_logging.WARNING):
39+
VLMParser({}, model="gpt-5.4-mini")
40+
assert any("parsers.vlm.model" in r.message for r in caplog.records)
41+
42+
43+
def test_no_warning_when_vlm_model_set(caplog):
44+
import logging as _logging
45+
with caplog.at_level(_logging.WARNING):
46+
VLMParser({"model": "gemini/gemini-2.5-pro"}, model="gpt-5.4-mini")
47+
assert not any("parsers.vlm.model" in r.message for r in caplog.records)

0 commit comments

Comments
 (0)