Skip to content

Commit 6c75f73

Browse files
authored
Merge branch 'main' into dependabot/cargo/calamine-0.34
2 parents 133d653 + 069b717 commit 6c75f73

35 files changed

Lines changed: 2648 additions & 53 deletions

.gitattributes

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Enforce byte-exact checkout for binary fixtures.
2+
# Without this, git's autodetection heuristic can misclassify uncompressed
3+
# PDFs (pageCompression=0) as text on Windows (core.autocrlf=true), which
4+
# rewrites LF to CRLF inside the PDF and corrupts xref/trailer offsets.
5+
*.pdf binary
6+
*.wasm binary
7+
*.ttf binary
8+
*.otf binary
9+
*.woff binary
10+
*.woff2 binary
11+
*.ico binary
12+
*.png binary
13+
*.jpg binary
14+
*.jpeg binary

.github/workflows/ci.yml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,51 @@ jobs:
102102
with:
103103
name: test-results-${{ matrix.os }}-py${{ matrix.python-version }}
104104
path: results.xml
105+
106+
accuracy:
107+
name: Table Extraction Accuracy
108+
needs: lint
109+
runs-on: ubuntu-latest
110+
steps:
111+
- uses: actions/checkout@v6
112+
113+
- name: Set up Python 3.13
114+
uses: actions/setup-python@v6
115+
with:
116+
python-version: "3.13"
117+
118+
- name: Install Rust toolchain
119+
uses: dtolnay/rust-toolchain@stable
120+
121+
- uses: Swatinem/rust-cache@v2.9.1
122+
with:
123+
save-if: false
124+
125+
- uses: astral-sh/setup-uv@v7
126+
with:
127+
cache-dependency-glob: pyproject.toml
128+
129+
- name: Install dependencies
130+
run: uv sync --extra dev
131+
132+
- name: Build native extension
133+
uses: PyO3/maturin-action@v1
134+
with:
135+
command: develop
136+
args: --release
137+
138+
- name: Verify fixtures are up to date
139+
run: |
140+
cp -r tests/fixtures/tables /tmp/tables_snapshot
141+
uv run python scripts/generate_table_fixtures.py
142+
diff -r /tmp/tables_snapshot tests/fixtures/tables
143+
144+
- name: Run accuracy harness
145+
run: uv run pytest tests/python/ -m accuracy -v --tb=short
146+
147+
- name: Upload accuracy report
148+
if: always()
149+
uses: actions/upload-artifact@v7
150+
with:
151+
name: accuracy-report
152+
path: tests/output/table_accuracy.json

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,15 @@ build/
1414
*.dll
1515
.venv/
1616
venv/
17-
.claude/
1817
junk/
1918
.mypy_cache/
2019
.pytest_cache/
2120
.ruff_cache/
2221

22+
# Claude Code
23+
CLAUDE.md
24+
.claude/
25+
2326
# IDE
2427
.idea/
2528
.vscode/
@@ -36,6 +39,9 @@ python/paperjam/libpdfium.so
3639
# Test fixtures (generated)
3740
tests/fixtures/large_*.pdf
3841

42+
# Per-session test artifacts (accuracy reports, etc.)
43+
tests/output/
44+
3945
# Sphinx
4046
_build
4147

.pre-commit-config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,11 @@ repos:
4747
language: system
4848
files: ^docs-site/src/
4949
pass_filenames: false
50+
51+
- repo: local
52+
hooks:
53+
- id: strip-ai-attribution
54+
name: strip AI attribution from commit messages
55+
entry: .claude/hooks/strip-ai-attribution.sh
56+
language: script
57+
stages: [commit-msg]

crates/paperjam-core/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,10 @@ render = ["dep:pdfium-render", "dep:image"]
4545
signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"]
4646
ltv = ["signatures", "dep:ureq", "dep:rustls"]
4747
validation = ["dep:roxmltree"]
48+
49+
[dev-dependencies]
50+
criterion = { version = "0.5", features = ["html_reports"] }
51+
52+
[[bench]]
53+
name = "table_extraction"
54+
harness = false
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
//! Criterion microbench for table extraction over the fixture corpus.
2+
//!
3+
//! Run with: `cargo bench -p paperjam-core --bench table_extraction`
4+
//!
5+
//! Each synthetic PDF fixture under `tests/fixtures/tables/` becomes one benchmark in the `table_extraction` group.
6+
//! The document and page are parsed outside the measured section so the bench only
7+
//! measures `table::extract_tables` itself — that's the code subsequent phases will
8+
//! change.
9+
10+
use std::path::PathBuf;
11+
12+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
13+
use paperjam_core::document::Document;
14+
use paperjam_core::table::{extract_tables, TableExtractionOptions};
15+
16+
fn fixtures_dir() -> PathBuf {
17+
let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
18+
manifest.join("../../tests/fixtures/tables")
19+
}
20+
21+
fn collect_fixtures() -> Vec<PathBuf> {
22+
let mut out = Vec::new();
23+
let dir = fixtures_dir();
24+
let Ok(rd) = std::fs::read_dir(&dir) else {
25+
eprintln!(
26+
"skipping bench: fixtures dir not found at {}",
27+
dir.display()
28+
);
29+
return out;
30+
};
31+
for entry in rd.flatten() {
32+
let path = entry.path();
33+
if path.extension().and_then(|s| s.to_str()) == Some("pdf") {
34+
out.push(path);
35+
}
36+
}
37+
out.sort();
38+
out
39+
}
40+
41+
fn bench_extract(c: &mut Criterion) {
42+
let fixtures = collect_fixtures();
43+
let opts = TableExtractionOptions::default();
44+
45+
let mut group = c.benchmark_group("table_extraction");
46+
for path in &fixtures {
47+
let name = path
48+
.file_stem()
49+
.and_then(|s| s.to_str())
50+
.unwrap_or("?")
51+
.to_string();
52+
// Parse the document once and keep its pages alive for the whole bench run.
53+
let doc = match Document::open(path) {
54+
Ok(d) => d,
55+
Err(e) => {
56+
eprintln!("skipping {name}: failed to open ({e:?})");
57+
continue;
58+
}
59+
};
60+
let mut pages = Vec::new();
61+
for n in 1..=doc.page_count() as u32 {
62+
if let Ok(p) = doc.page(n) {
63+
pages.push(p);
64+
}
65+
}
66+
group.throughput(Throughput::Elements(pages.len() as u64));
67+
group.bench_with_input(BenchmarkId::from_parameter(&name), &pages, |b, pages| {
68+
b.iter(|| {
69+
for page in pages.iter() {
70+
let _ = extract_tables(page, &opts);
71+
}
72+
});
73+
});
74+
}
75+
group.finish();
76+
}
77+
78+
criterion_group!(benches, bench_extract);
79+
criterion_main!(benches);

crates/paperjam-core/src/forms/mod.rs

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -93,22 +93,14 @@ pub fn fill_form_fields(
9393

9494
for (name, value) in values {
9595
match field_name_map.get(name) {
96-
None => {
97-
not_found.push(name.clone());
96+
Some((false, field_type)) if set_field_value(&mut inner, name, value, field_type)? => {
97+
filled += 1;
98+
filled_fields.push((name.clone(), value.clone(), field_type.clone()));
9899
}
99-
Some((true, _)) => {
100-
// Read-only field, skip
100+
// Not in the map, read-only, or set_field_value returned false: record as not-found.
101+
_ => {
101102
not_found.push(name.clone());
102103
}
103-
Some((false, field_type)) => {
104-
// Find and update the field object
105-
if set_field_value(&mut inner, name, value, field_type)? {
106-
filled += 1;
107-
filled_fields.push((name.clone(), value.clone(), field_type.clone()));
108-
} else {
109-
not_found.push(name.clone());
110-
}
111-
}
112104
}
113105
}
114106

crates/paperjam-core/src/manipulation/insert.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ pub fn insert_blank_pages(doc: &Document, positions: &[(u32, f64, f64)]) -> Resu
4040

4141
// Sort positions in reverse order so insertions don't shift indices
4242
let mut sorted_positions = positions.to_vec();
43-
sorted_positions.sort_by(|a, b| b.0.cmp(&a.0));
43+
sorted_positions.sort_by_key(|p| std::cmp::Reverse(p.0));
4444

4545
for (after_page, width, height) in sorted_positions {
4646
// Create an empty content stream

crates/paperjam-epub/src/parser.rs

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -218,22 +218,20 @@ fn parse_opf(xml: &str) -> Result<(OpfMetadata, HashMap<String, String>, Vec<Str
218218
}
219219
}
220220
}
221-
Ok(Event::Text(ref e)) => {
222-
if section == Section::Metadata {
223-
let text = e.unescape().unwrap_or_default().trim().to_string();
224-
if !text.is_empty() {
225-
match current_tag.as_str() {
226-
"title" => metadata.title = Some(text),
227-
"creator" => metadata.creator = Some(text),
228-
"subject" => metadata.subject = Some(text),
229-
"description" => metadata.description = Some(text),
230-
"publisher" => metadata.publisher = Some(text),
231-
"date" => metadata.date = Some(text),
232-
"language" => metadata.language = Some(text),
233-
"identifier" => metadata.identifier = Some(text),
234-
"rights" => metadata.rights = Some(text),
235-
_ => {}
236-
}
221+
Ok(Event::Text(ref e)) if section == Section::Metadata => {
222+
let text = e.unescape().unwrap_or_default().trim().to_string();
223+
if !text.is_empty() {
224+
match current_tag.as_str() {
225+
"title" => metadata.title = Some(text),
226+
"creator" => metadata.creator = Some(text),
227+
"subject" => metadata.subject = Some(text),
228+
"description" => metadata.description = Some(text),
229+
"publisher" => metadata.publisher = Some(text),
230+
"date" => metadata.date = Some(text),
231+
"language" => metadata.language = Some(text),
232+
"identifier" => metadata.identifier = Some(text),
233+
"rights" => metadata.rights = Some(text),
234+
_ => {}
237235
}
238236
}
239237
}

crates/paperjam-epub/src/toc.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,8 @@ fn parse_nav_point(reader: &mut Reader<&[u8]>) -> Option<TocEntry> {
7676
}
7777
}
7878
}
79-
Ok(Event::Text(ref e)) => {
80-
if in_text {
81-
title = e.unescape().unwrap_or_default().trim().to_string();
82-
}
79+
Ok(Event::Text(ref e)) if in_text => {
80+
title = e.unescape().unwrap_or_default().trim().to_string();
8381
}
8482
Ok(Event::End(ref e)) => {
8583
let local = local_name(e.name().as_ref());

0 commit comments

Comments
 (0)