33Engines:
44 * ``edgeparse`` — Rust binary built from this repository (always available)
55 * ``opendataloader`` — Published Java/Python package (opendataloader-pdf ≥ 2.0)
6+ * ``opendataloader_hybrid_docling_fast`` — OpenDataLoader hybrid with Docling Fast backend
7+ * ``opendataloader_hybrid_hancom`` — OpenDataLoader hybrid with Hancom backend
68 * ``pymupdf4llm`` — PyMuPDF4LLM (pip install pymupdf4llm)
79 * ``markitdown`` — Microsoft MarkItDown (pip install markitdown[all])
810 * ``liteparse`` — LlamaIndex LiteParse (@llamaindex/liteparse, Node.js CLI)
1416
1517Engine groups (for benchmark segmentation):
1618 NON_OCR_ENGINES — no ML models, no GPU; pure text/geometry extraction
19+ HYBRID_ENGINES — mixed local + backend routing for complex pages
1720 OCR_ENGINES — require deep-learning models; GPU optional but recommended
1821"""
1922
3538 "liteparse" ,
3639]
3740
41+ HYBRID_ENGINES : List [str ] = [
42+ "edgeparse" ,
43+ "opendataloader_hybrid_docling_fast" ,
44+ "opendataloader_hybrid_hancom" ,
45+ ]
46+
3847OCR_ENGINES : List [str ] = [
3948 "edgeparse" ,
4049 "docling" ,
5463
5564# Engine display metadata: name → (display_name, install_package, description); install_package is a pip name (npm for liteparse) or None
5665ENGINE_META : Dict [str , tuple ] = {
57- "edgeparse" : ("EdgeParse" , None , "Rust PDF engine (this repo)" ),
58- "opendataloader" : ("OpenDataLoader" , "opendataloader-pdf" , "Java/Python PDF engine" ),
59- "pymupdf4llm" : ("PyMuPDF4LLM" , "pymupdf4llm" , "PyMuPDF for LLM/RAG" ),
60- "markitdown" : ("MarkItDown" , "markitdown[all]" , "Microsoft multi-format converter" ),
61- "liteparse" : ("LiteParse" , "@llamaindex/liteparse" , "LlamaIndex local PDF parser" ),
66+ "edgeparse" : ("EdgeParse" , None , "Rust PDF engine (this repo)" ),
67+ "opendataloader" : ("OpenDataLoader" , "opendataloader-pdf" , "Java/Python PDF engine" ),
68+ "opendataloader_hybrid_docling_fast" : ("OpenDataLoader [hybrid/docling-fast]" , None , "OpenDataLoader hybrid with Docling Fast backend" ),
69+ "opendataloader_hybrid_hancom" : ("OpenDataLoader [hybrid/hancom]" , None , "OpenDataLoader hybrid with Hancom backend" ),
70+ "pymupdf4llm" : ("PyMuPDF4LLM" , "pymupdf4llm" , "PyMuPDF for LLM/RAG" ),
71+ "markitdown" : ("MarkItDown" , "markitdown[all]" , "Microsoft multi-format converter" ),
72+ "liteparse" : ("LiteParse" , "@llamaindex/liteparse" , "LlamaIndex local PDF parser" ),
6273 # OCR / ML engines
63- "docling" : ("Docling" , "docling" , "IBM Research document parser [OCR/ML]" ),
64- "marker" : ("Marker" , "marker-pdf" , "Marker PDF — Surya OCR [isolated venv]" ),
65- "mineru" : ("MinerU" , "mineru[all]" , "OpenDataLab PDF extractor [isolated venv]" ),
74+ "docling" : ("Docling" , "docling" , "IBM Research document parser [OCR/ML]" ),
75+ "marker" : ("Marker" , "marker-pdf" , "Marker PDF — Surya OCR [isolated venv]" ),
76+ "mineru" : ("MinerU" , "mineru[all]" , "OpenDataLab PDF extractor [isolated venv]" ),
6677}
6778
6879# ── Auto-register external engines ───────────────────────────────────────────
@@ -77,6 +88,8 @@ def _try_register(name: str, module_name: str, version_label: str = "installed")
7788 pass
7889
7990_try_register ("opendataloader" , "pdf_parser_opendataloader" , "published" )
91+ _try_register ("opendataloader_hybrid_docling_fast" , "pdf_parser_opendataloader_hybrid_docling_fast" , "local-hybrid" )
92+ _try_register ("opendataloader_hybrid_hancom" , "pdf_parser_opendataloader_hybrid_hancom" , "local-hybrid" )
8093_try_register ("docling" , "pdf_parser_docling" , "installed" )
8194_try_register ("pymupdf4llm" , "pdf_parser_pymupdf4llm" , "installed" )
8295_try_register ("markitdown" , "pdf_parser_markitdown" , "installed" )
0 commit comments