Skip to content

Commit e3fde1e

Browse files
chore: fix wasm release and benchmark grouping (#30)
1 parent 5b70a76 commit e3fde1e

13 files changed

Lines changed: 248 additions & 112 deletions

File tree

.github/workflows/release-wasm.yml

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -94,19 +94,9 @@ jobs:
9494
name: wasm-package
9595
path: crates/edgeparse-wasm/pkg/*.tgz
9696

97-
- name: Publish WASM package to npm
98-
env:
99-
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
97+
- name: Skip WASM npm publication
10098
run: |
101-
cd crates/edgeparse-wasm/pkg
102-
OUTPUT=$(npm publish --access public 2>&1) && echo "$OUTPUT" || {
103-
echo "$OUTPUT"
104-
if echo "$OUTPUT" | grep -Eq "cannot publish over the previously published versions|You cannot publish over the previously published version"; then
105-
echo "edgeparse-wasm already published at this version — skipping."
106-
else
107-
exit 1
108-
fi
109-
}
99+
echo "::warning::WASM npm publication is disabled. The package tarball will still be uploaded to the GitHub Release."
110100
111101
- name: Upload npm tarball to GitHub Release
112102
env:

benchmark/compare_all.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,17 @@
77
Engine groups:
88
Non-OCR (fast, no ML models): edgeparse, opendataloader, pymupdf4llm,
99
markitdown, liteparse
10+
Hybrid (backend-assisted): edgeparse, opendataloader_hybrid_docling_fast,
11+
opendataloader_hybrid_hancom
1012
OCR / ML (model-heavy): edgeparse, docling, marker, mineru
1113
1214
Usage:
1315
# Non-OCR comparison (fast, recommended first run):
1416
uv run python compare_all.py --group non-ocr --install
1517
18+
# Hybrid comparison (backend-assisted engines):
19+
uv run python compare_all.py --group hybrid
20+
1621
# OCR/ML comparison (slow — installs isolated venvs for marker & mineru):
1722
uv run python compare_all.py --group ocr --install
1823
@@ -30,6 +35,7 @@
3035
3136
Via Makefile:
3237
make bench-non-ocr
38+
make bench-hybrid
3339
make bench-ocr
3440
make bench-ocr OCR_ENGINES=docling
3541
make bench-compare-all
@@ -53,7 +59,7 @@
5359
sys.path.insert(0, str(Path(__file__).parent / "src"))
5460

5561
from engine_registry import (
56-
ENGINES, ENGINE_META, NON_OCR_ENGINES, OCR_ENGINES,
62+
ENGINES, ENGINE_META, NON_OCR_ENGINES, HYBRID_ENGINES, OCR_ENGINES,
5763
available_engines, display_name,
5864
)
5965
from evaluation_schema import missing_evaluation_requirements
@@ -79,6 +85,8 @@
7985
ALL_ENGINES = [
8086
# Non-OCR (fast)
8187
"edgeparse", "opendataloader", "pymupdf4llm", "markitdown", "liteparse",
88+
# Hybrid
89+
"opendataloader_hybrid_docling_fast", "opendataloader_hybrid_hancom",
8290
# OCR / ML
8391
"docling", "marker", "mineru",
8492
]
@@ -461,6 +469,7 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
461469
epilog="""
462470
Examples:
463471
uv run python compare_all.py --group non-ocr --install
472+
uv run python compare_all.py --group hybrid
464473
uv run python compare_all.py --group ocr --install
465474
uv run python compare_all.py --engines edgeparse,docling,pymupdf4llm --install
466475
uv run python compare_all.py --all --no-run
@@ -469,9 +478,9 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
469478
)
470479
parser.add_argument(
471480
"--group",
472-
choices=["non-ocr", "ocr", "all"],
481+
choices=["non-ocr", "hybrid", "ocr", "all"],
473482
default=None,
474-
help="Engine group to benchmark: non-ocr (fast), ocr (ML/model-heavy), all",
483+
help="Engine group to benchmark: non-ocr (fast), hybrid (backend-assisted), ocr (ML/model-heavy), all",
475484
)
476485
parser.add_argument(
477486
"--engines",
@@ -542,6 +551,9 @@ def main(argv: Optional[Sequence[str]] = None) -> None:
542551
elif args.group == "non-ocr":
543552
engines = list(NON_OCR_ENGINES)
544553
default_title = "EdgeParse Benchmark — Non-OCR Tools"
554+
elif args.group == "hybrid":
555+
engines = list(HYBRID_ENGINES)
556+
default_title = "EdgeParse Benchmark — Hybrid Tools"
545557
elif args.group == "ocr":
546558
engines = list(OCR_ENGINES)
547559
default_title = "EdgeParse Benchmark — OCR / ML Tools"

benchmark/src/engine_registry.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
Engines:
44
* ``edgeparse`` — Rust binary built from this repository (always available)
55
* ``opendataloader`` — Published Java/Python package (opendataloader-pdf ≥ 2.0)
6+
* ``opendataloader_hybrid_docling_fast`` — OpenDataLoader hybrid with Docling Fast backend
7+
* ``opendataloader_hybrid_hancom`` — OpenDataLoader hybrid with Hancom backend
68
* ``pymupdf4llm`` — PyMuPDF4LLM (pip install pymupdf4llm)
79
* ``markitdown`` — Microsoft MarkItDown (pip install markitdown[all])
810
* ``liteparse`` — LlamaIndex LiteParse (@llamaindex/liteparse, Node.js CLI)
@@ -14,6 +16,7 @@
1416
1517
Engine groups (for benchmark segmentation):
1618
NON_OCR_ENGINES — no ML models, no GPU; pure text/geometry extraction
19+
HYBRID_ENGINES — mixed local + backend routing for complex pages
1720
OCR_ENGINES — require deep-learning models; GPU optional but recommended
1821
"""
1922

@@ -35,6 +38,12 @@
3538
"liteparse",
3639
]
3740

41+
HYBRID_ENGINES: List[str] = [
42+
"edgeparse",
43+
"opendataloader_hybrid_docling_fast",
44+
"opendataloader_hybrid_hancom",
45+
]
46+
3847
OCR_ENGINES: List[str] = [
3948
"edgeparse",
4049
"docling",
@@ -54,15 +63,17 @@
5463

5564
# Engine display metadata: name → (display_name, pip_package, description)
5665
ENGINE_META: Dict[str, tuple] = {
57-
"edgeparse": ("EdgeParse", None, "Rust PDF engine (this repo)"),
58-
"opendataloader": ("OpenDataLoader", "opendataloader-pdf", "Java/Python PDF engine"),
59-
"pymupdf4llm": ("PyMuPDF4LLM", "pymupdf4llm", "PyMuPDF for LLM/RAG"),
60-
"markitdown": ("MarkItDown", "markitdown[all]", "Microsoft multi-format converter"),
61-
"liteparse": ("LiteParse", "@llamaindex/liteparse", "LlamaIndex local PDF parser"),
66+
"edgeparse": ("EdgeParse", None, "Rust PDF engine (this repo)"),
67+
"opendataloader": ("OpenDataLoader", "opendataloader-pdf", "Java/Python PDF engine"),
68+
"opendataloader_hybrid_docling_fast": ("OpenDataLoader [hybrid/docling-fast]", None, "OpenDataLoader hybrid with Docling Fast backend"),
69+
"opendataloader_hybrid_hancom": ("OpenDataLoader [hybrid/hancom]", None, "OpenDataLoader hybrid with Hancom backend"),
70+
"pymupdf4llm": ("PyMuPDF4LLM", "pymupdf4llm", "PyMuPDF for LLM/RAG"),
71+
"markitdown": ("MarkItDown", "markitdown[all]", "Microsoft multi-format converter"),
72+
"liteparse": ("LiteParse", "@llamaindex/liteparse", "LlamaIndex local PDF parser"),
6273
# OCR / ML engines
63-
"docling": ("Docling", "docling", "IBM Research document parser [OCR/ML]"),
64-
"marker": ("Marker", "marker-pdf", "Marker PDF — Surya OCR [isolated venv]"),
65-
"mineru": ("MinerU", "mineru[all]", "OpenDataLab PDF extractor [isolated venv]"),
74+
"docling": ("Docling", "docling", "IBM Research document parser [OCR/ML]"),
75+
"marker": ("Marker", "marker-pdf", "Marker PDF — Surya OCR [isolated venv]"),
76+
"mineru": ("MinerU", "mineru[all]", "OpenDataLab PDF extractor [isolated venv]"),
6677
}
6778

6879
# ── Auto-register external engines ───────────────────────────────────────────
@@ -77,6 +88,8 @@ def _try_register(name: str, module_name: str, version_label: str = "installed")
7788
pass
7889

7990
_try_register("opendataloader", "pdf_parser_opendataloader", "published")
91+
_try_register("opendataloader_hybrid_docling_fast", "pdf_parser_opendataloader_hybrid_docling_fast", "local-hybrid")
92+
_try_register("opendataloader_hybrid_hancom", "pdf_parser_opendataloader_hybrid_hancom", "local-hybrid")
8093
_try_register("docling", "pdf_parser_docling", "installed")
8194
_try_register("pymupdf4llm", "pdf_parser_pymupdf4llm", "installed")
8295
_try_register("markitdown", "pdf_parser_markitdown", "installed")

benchmark/src/pdf_parser_edgeparse.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,14 @@ def to_markdown(document_paths: List[Path], _input_path, output_dir: Path):
4141
"--quiet",
4242
]
4343

44+
env = dict(**__import__("os").environ)
45+
env["EDGEPARSE_RASTER_TABLE_OCR"] = "off"
46+
4447
result = subprocess.run(
4548
command,
4649
capture_output=True,
4750
text=True,
51+
env=env,
4852
)
4953

5054
if result.returncode != 0:

crates/edgeparse-cli/src/main.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ struct Cli {
8484
#[arg(long = "image-dir")]
8585
image_dir: Option<String>,
8686

87+
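/// Raster table OCR recovery: `off`, `false`, or `0` disables it, any other value enables it; the EDGEPARSE_RASTER_TABLE_OCR environment variable, when set, takes precedence over this flag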
88+
#[arg(long = "raster-table-ocr", default_value = "on")]
89+
raster_table_ocr: String,
90+
8791
/// Pages to extract (e.g., "1,3,5-7")
8892
#[arg(long = "pages")]
8993
pages: Option<String>,
@@ -206,6 +210,11 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
206210
use edgeparse_core::api::config::*;
207211
use edgeparse_core::api::filter::FilterConfig;
208212

213+
let raster_table_ocr = std::env::var("EDGEPARSE_RASTER_TABLE_OCR")
214+
.ok()
215+
.map(|value| !matches!(value.as_str(), "off" | "false" | "0"))
216+
.unwrap_or_else(|| !matches!(cli.raster_table_ocr.as_str(), "off" | "false" | "0"));
217+
209218
let formats = if let Some(ref fmt) = cli.format {
210219
fmt.split(',')
211220
.filter_map(|s| match s.trim() {
@@ -258,6 +267,7 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
258267
_ => ImageFormat::Png,
259268
},
260269
image_dir: cli.image_dir.clone(),
270+
raster_table_ocr,
261271
pages: cli.pages.clone(),
262272
include_header_footer: cli.include_header_footer,
263273
hybrid: match cli.hybrid.as_str() {

crates/edgeparse-core/src/api/config.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ pub struct ProcessingConfig {
114114
pub image_format: ImageFormat,
115115
/// Directory for extracted images
116116
pub image_dir: Option<String>,
117+
/// Enable raster table OCR recovery on image-based tables
118+
pub raster_table_ocr: bool,
117119
/// Pages to extract (e.g., "1,3,5-7")
118120
pub pages: Option<String>,
119121
/// Include headers/footers in output
@@ -150,6 +152,7 @@ impl Default for ProcessingConfig {
150152
image_output: ImageOutput::External,
151153
image_format: ImageFormat::Png,
152154
image_dir: None,
155+
raster_table_ocr: true,
153156
pages: None,
154157
include_header_footer: false,
155158
hybrid: HybridBackend::Off,
@@ -175,6 +178,7 @@ mod tests {
175178
assert_eq!(config.table_method, TableMethod::Default);
176179
assert_eq!(config.image_output, ImageOutput::External);
177180
assert_eq!(config.image_format, ImageFormat::Png);
181+
assert!(config.raster_table_ocr);
178182
assert_eq!(config.hybrid, HybridBackend::Off);
179183
assert_eq!(config.hybrid_timeout, 30000);
180184
}

crates/edgeparse-core/src/lib.rs

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,19 @@ pub fn convert(
5858
for (&page_num, &page_id) in &pages_map {
5959
let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
6060
let mut recovered_tables = Vec::new();
61-
if let Some(page_info) = page_info_list
62-
.iter()
63-
.find(|info| info.page_number == page_num)
64-
{
65-
recovered_tables = recover_raster_table_borders(
66-
input_path,
67-
&page_info.crop_box,
68-
page_num,
69-
&page_chunks.text_chunks,
70-
&page_chunks.image_chunks,
71-
);
61+
if config.raster_table_ocr {
62+
if let Some(page_info) = page_info_list
63+
.iter()
64+
.find(|info| info.page_number == page_num)
65+
{
66+
recovered_tables = recover_raster_table_borders(
67+
input_path,
68+
&page_info.crop_box,
69+
page_num,
70+
&page_chunks.text_chunks,
71+
&page_chunks.image_chunks,
72+
);
73+
}
7274
}
7375
let mut elements: Vec<ContentElement> = page_chunks
7476
.text_chunks
@@ -124,14 +126,16 @@ pub fn convert(
124126
doc.creation_date = raw_doc.metadata.creation_date;
125127
doc.modification_date = raw_doc.metadata.modification_date;
126128

127-
for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
128-
if let Some(page_info) = page_info_list.get(page_idx) {
129-
recover_page_raster_table_cell_text(
130-
input_path,
131-
&page_info.crop_box,
132-
page_info.page_number,
133-
page,
134-
);
129+
if config.raster_table_ocr {
130+
for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
131+
if let Some(page_info) = page_info_list.get(page_idx) {
132+
recover_page_raster_table_cell_text(
133+
input_path,
134+
&page_info.crop_box,
135+
page_info.page_number,
136+
page,
137+
);
138+
}
135139
}
136140
}
137141

docs/07-cicd-publishing.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ fast on mismatches.
235235

236236
- Builds the browser-targeted WASM package with `wasm-pack`
237237
- Syncs the npm package version from the tag
238-
- Publishes `edgeparse-wasm`
238+
- Skips npm publication of `edgeparse-wasm` (currently disabled)
239239
- Uploads the generated npm tarball to the GitHub Release
240240

241241
### `release-cli.yml`
@@ -278,7 +278,7 @@ and Homebrew. Docker publishing remains CI-driven through `release-docker.yml`.
278278

279279
Crates.io versions are immutable. Bump the version and retag.
280280

281-
### npm publish fails on platform packages or the WASM package
281+
### npm publish fails on platform packages
282282

283283
Use a Classic Automation token for `NPM_TOKEN`. Granular tokens often miss one
284284
or more package names and produce `E403 Forbidden`.

0 commit comments

Comments
 (0)