Skip to content

Commit b9576c7

Browse files
authored
Merge pull request #6 from atsyplenkov/feat-normalize
feat: normalize OCR tables
2 parents fd78353 + e224f0a commit b9576c7

7 files changed

Lines changed: 1060 additions & 47 deletions

File tree

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ If you work with academic papers, you know that the OCR process itself is not th
2323

2424
I used to rely on [`marker`](https://github.com/datalab-to/marker) for PDF parsing and thought it was great. However, after converting the [Batista et al. (2022)](https://hess.copernicus.org/articles/26/3753/2022/) article one day, I discovered that Table 4 was missing, regardless of the settings or LLMs I used (via the `--use-llm` flag). I then switched to [`docling`](https://github.com/docling-project/docling), and Table 4 reappeared, but all the formulas were gone. Furthermore, both tools require a GPU, and even on a Google Colab T4 instance, processing one article takes 4 to 5 minutes.
2525

26-
Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a single Markdown file I can trust, a local `figures/` folder, and the ability to process my entire library quickly on my laptop.
26+
Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a Markdown index file I can trust, along with local `figures/` and optional `tables/` folders, and that lets me process my entire library quickly on my laptop.
2727

2828
## Features
2929

@@ -71,6 +71,7 @@ paperdown converts one PDF or a directory of PDFs into markdown output folders.
7171
For each PDF, it creates:
7272
- <output>/<pdf_stem>/index.md
7373
- <output>/<pdf_stem>/figures/
74+
- <output>/<pdf_stem>/tables/ (when `--normalize-tables` is enabled)
7475
- <output>/<pdf_stem>/log.jsonl
7576
7677
API key lookup order:
@@ -87,7 +88,8 @@ Options:
8788
--max-download-bytes <MAX_DOWNLOAD_BYTES> Maximum allowed size (bytes) for each downloaded figure file. [default: 20971520]
8889
--workers <WORKERS> Maximum number of PDFs processed concurrently in batch mode. [default: 32]
8990
-v, --verbose Enable verbose progress messages on stderr.
90-
--overwrite Replace existing managed output artifacts (index.md and figures/).
91+
--overwrite Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled).
92+
--normalize-tables Normalize OCR HTML tables into Markdown and store raw HTML under tables/.
9193
-h, --help Print help (see a summary with '-h')
9294
-V, --version Print version
9395
```

src/cli.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,19 @@ use std::path::PathBuf;
1010
For each PDF, it creates:\n\
1111
- <output>/<pdf_stem>/index.md\n\
1212
- <output>/<pdf_stem>/figures/\n\
13+
- <output>/<pdf_stem>/tables/ (when --normalize-tables is enabled)\n\
1314
- <output>/<pdf_stem>/log.jsonl\n\n\
1415
API key lookup order:\n\
1516
1) ZAI_API_KEY from --env-file\n\
1617
2) ZAI_API_KEY from environment",
1718
after_help = "Examples:\n \
1819
paperdown --input pdf/paper.pdf\n \
1920
paperdown --input pdf/ --output md/ --workers 4\n \
20-
paperdown --input pdf/ --output md/ --overwrite\n\n\
21+
paperdown --input pdf/ --output md/ --overwrite\n \
22+
paperdown --input pdf/ --output md/ --normalize-tables\n\n\
2123
Notes:\n \
2224
Without --overwrite, existing index.md or figures/ causes a failure.\n \
25+
When --normalize-tables is enabled, existing tables/ also causes a failure.\n \
2326
Progress bars are shown on stderr only when running in a TTY."
2427
)]
2528
pub struct Cli {
@@ -83,6 +86,13 @@ pub struct Cli {
8386
help = "Replace existing managed output artifacts (index.md and figures/)."
8487
)]
8588
pub overwrite: bool,
89+
90+
#[arg(
91+
long = "normalize-tables",
92+
action = ArgAction::SetTrue,
93+
help = "Normalize OCR HTML tables into Markdown and store raw HTML under tables/."
94+
)]
95+
pub normalize_tables: bool,
8696
}
8797

8898
pub fn default_workers() -> usize {
@@ -124,6 +134,7 @@ mod tests {
124134
assert_eq!(cli.workers, default_workers());
125135
assert!(!cli.verbose);
126136
assert!(!cli.overwrite);
137+
assert!(!cli.normalize_tables);
127138
}
128139

129140
#[test]
@@ -148,6 +159,7 @@ mod tests {
148159
let help = cmd.render_long_help().to_string();
149160
assert!(help.contains("Examples:"));
150161
assert!(help.contains("--overwrite"));
162+
assert!(help.contains("--normalize-tables"));
151163
let file_first = help.find("1) ZAI_API_KEY from --env-file");
152164
let env_second = help.find("2) ZAI_API_KEY from environment");
153165
assert!(file_first.is_some());

src/core.rs

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ mod input;
1212
mod markdown;
1313
mod ocr;
1414
mod output;
15+
mod table_normalization;
1516

1617
pub fn collect_pdfs(input_path: &Path) -> Result<Vec<std::path::PathBuf>> {
1718
input::collect_pdfs(input_path)
@@ -29,6 +30,15 @@ pub enum ProgressEvent {
2930

3031
pub type ProgressCallback = Arc<dyn Fn(ProgressEvent) + Send + Sync>;
3132

33+
#[derive(Clone)]
34+
pub struct ProcessPdfOptions {
35+
pub timeout: Duration,
36+
pub max_download_bytes: u64,
37+
pub overwrite: bool,
38+
pub normalize_tables: bool,
39+
pub progress: Option<ProgressCallback>,
40+
}
41+
3242
#[derive(Debug, Serialize, Clone)]
3343
pub struct PdfSummary {
3444
pub pdf: String,
@@ -45,10 +55,7 @@ pub async fn process_pdf(
4555
pdf_path: &Path,
4656
output_root: &Path,
4757
env_file: &Path,
48-
timeout: Duration,
49-
max_download_bytes: u64,
50-
overwrite: bool,
51-
progress: Option<ProgressCallback>,
58+
options: ProcessPdfOptions,
5259
) -> Result<PdfSummary> {
5360
let run_started = Instant::now();
5461
let pdf_path = pdf_path
@@ -57,16 +64,23 @@ pub async fn process_pdf(
5764
if !pdf_path.is_file() || !input::is_pdf_path(&pdf_path) {
5865
return Err(anyhow!("Input must be a PDF: {}", pdf_path.display()));
5966
}
60-
let prepared = output::prepare_output_paths(output_root, &pdf_path, overwrite)?;
61-
let client = reqwest::Client::builder().timeout(timeout).build()?;
67+
let prepared = output::prepare_output_paths(
68+
output_root,
69+
&pdf_path,
70+
options.overwrite,
71+
options.normalize_tables,
72+
)?;
73+
let client = reqwest::Client::builder()
74+
.timeout(options.timeout)
75+
.build()?;
6276

6377
let api_key = input::load_api_key(env_file)?;
6478
let payload = ocr::build_payload(&pdf_path).await?;
65-
fire(&progress, ProgressEvent::OcrStarted);
79+
fire(&options.progress, ProgressEvent::OcrStarted);
6680
let ocr_started = Instant::now();
6781
let response = ocr::call_layout_parsing(&client, &api_key, payload).await?;
6882
let ocr_seconds = ocr_started.elapsed();
69-
fire(&progress, ProgressEvent::OcrFinished);
83+
fire(&options.progress, ProgressEvent::OcrFinished);
7084

7185
let (markdown, layout_details, usage) = ocr::validate_layout_response(response)?;
7286

@@ -77,22 +91,31 @@ pub async fn process_pdf(
7791
&layout_details,
7892
&client,
7993
&prepared.figures_dir,
80-
max_download_bytes,
81-
progress.clone(),
94+
options.max_download_bytes,
95+
options.progress.clone(),
8296
)
8397
.await?;
8498
let figure_seconds = figure_started.elapsed();
8599
let markdown = markdown::strip_html_img_alt_attributes(&markdown);
100+
let (markdown, table_stats) = if options.normalize_tables {
101+
let tables_dir = prepared
102+
.tables_dir
103+
.as_ref()
104+
.expect("tables_dir must exist when normalize_tables is enabled");
105+
table_normalization::normalize_tables(&markdown, tables_dir).await?
106+
} else {
107+
(markdown, table_normalization::TableStats::default())
108+
};
86109

87110
fire(
88-
&progress,
111+
&options.progress,
89112
ProgressEvent::MarkdownWriteStarted {
90113
bytes: markdown.len(),
91114
},
92115
);
93116
let write_started = Instant::now();
94117
output::atomic_write_text(&prepared.markdown_path, &markdown).await?;
95-
fire(&progress, ProgressEvent::MarkdownWriteFinished);
118+
fire(&options.progress, ProgressEvent::MarkdownWriteFinished);
96119

97120
output::append_log(
98121
&prepared.log_path,
@@ -104,6 +127,14 @@ pub async fn process_pdf(
104127
"downloaded_figures": downloaded_figures,
105128
"remote_figure_links": remote_figure_links,
106129
"image_blocks": image_blocks,
130+
"tables_found": table_stats.tables_found,
131+
"tables_raw_written": table_stats.tables_raw_written,
132+
"tables_normalized": table_stats.tables_normalized,
133+
"tables_skipped_in_code": table_stats.tables_skipped_in_code,
134+
"tables_skipped_nested": table_stats.tables_skipped_nested,
135+
"tables_skipped_too_large": table_stats.tables_skipped_too_large,
136+
"tables_failed_extract": table_stats.tables_failed_extract,
137+
"tables_failed_parse": table_stats.tables_failed_parse,
107138
"usage": usage,
108139
"timing": {
109140
"ocr_call_s": round3(ocr_seconds),
@@ -122,6 +153,7 @@ pub async fn process_pdf(
122153
downloaded_figures,
123154
remote_figure_links,
124155
image_blocks,
156+
// Table stats are logged but not surfaced in the summary.
125157
usage,
126158
log_path: prepared.log_path.display().to_string(),
127159
})
@@ -140,9 +172,11 @@ fn round3(duration: Duration) -> f64 {
140172
#[cfg(feature = "internal-testing")]
141173
#[doc(hidden)]
142174
pub mod testing {
175+
pub use super::ProcessPdfOptions;
143176
pub use super::ProgressCallback;
144177
pub use super::ProgressEvent;
145178
pub use super::process_pdf;
179+
pub use super::table_normalization::TableStats;
146180
use anyhow::Result;
147181
use serde_json::Value;
148182
use std::collections::HashMap;
@@ -153,6 +187,7 @@ pub mod testing {
153187
pub struct PreparedOutputPaths {
154188
pub output_dir: std::path::PathBuf,
155189
pub figures_dir: std::path::PathBuf,
190+
pub tables_dir: Option<std::path::PathBuf>,
156191
pub markdown_path: std::path::PathBuf,
157192
pub log_path: std::path::PathBuf,
158193
}
@@ -228,16 +263,30 @@ pub mod testing {
228263
output_root: &Path,
229264
pdf_path: &Path,
230265
overwrite: bool,
266+
normalize_tables: bool,
231267
) -> Result<PreparedOutputPaths> {
232-
let prepared = super::output::prepare_output_paths(output_root, pdf_path, overwrite)?;
268+
let prepared = super::output::prepare_output_paths(
269+
output_root,
270+
pdf_path,
271+
overwrite,
272+
normalize_tables,
273+
)?;
233274
Ok(PreparedOutputPaths {
234275
output_dir: prepared.output_dir,
235276
figures_dir: prepared.figures_dir,
277+
tables_dir: prepared.tables_dir,
236278
markdown_path: prepared.markdown_path,
237279
log_path: prepared.log_path,
238280
})
239281
}
240282

283+
pub async fn normalize_tables(
284+
markdown: &str,
285+
tables_dir: &Path,
286+
) -> Result<(String, TableStats)> {
287+
super::table_normalization::normalize_tables(markdown, tables_dir).await
288+
}
289+
241290
pub async fn append_log(log_path: &Path, entry: Value) -> Result<()> {
242291
super::output::append_log(log_path, entry).await
243292
}

src/core/output.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use tokio::io::AsyncWriteExt;
99
pub(crate) struct PreparedOutput {
1010
pub(crate) output_dir: PathBuf,
1111
pub(crate) figures_dir: PathBuf,
12+
pub(crate) tables_dir: Option<PathBuf>,
1213
pub(crate) markdown_path: PathBuf,
1314
pub(crate) log_path: PathBuf,
1415
}
@@ -17,6 +18,7 @@ pub(crate) fn prepare_output_paths(
1718
output_root: &Path,
1819
pdf_path: &Path,
1920
overwrite: bool,
21+
normalize_tables: bool,
2022
) -> Result<PreparedOutput> {
2123
let stem = pdf_path
2224
.file_stem()
@@ -28,6 +30,7 @@ pub(crate) fn prepare_output_paths(
2830

2931
let markdown_path = output_dir.join("index.md");
3032
let figures_dir = output_dir.join("figures");
33+
let tables_dir = output_dir.join("tables");
3134
let log_path = output_dir.join("log.jsonl");
3235

3336
if !overwrite {
@@ -43,6 +46,12 @@ pub(crate) fn prepare_output_paths(
4346
figures_dir.display()
4447
));
4548
}
49+
if normalize_tables && tables_dir.exists() {
50+
return Err(anyhow::anyhow!(
51+
"Output already exists: {}. Re-run with --overwrite",
52+
tables_dir.display()
53+
));
54+
}
4655
} else {
4756
if markdown_path.exists() {
4857
std::fs::remove_file(&markdown_path)?;
@@ -54,13 +63,27 @@ pub(crate) fn prepare_output_paths(
5463
std::fs::remove_file(&figures_dir)?;
5564
}
5665
}
66+
if normalize_tables && tables_dir.exists() {
67+
if tables_dir.is_dir() {
68+
std::fs::remove_dir_all(&tables_dir)?;
69+
} else {
70+
std::fs::remove_file(&tables_dir)?;
71+
}
72+
}
5773
}
5874

5975
std::fs::create_dir_all(&figures_dir)?;
76+
let tables_dir = if normalize_tables {
77+
std::fs::create_dir_all(&tables_dir)?;
78+
Some(tables_dir)
79+
} else {
80+
None
81+
};
6082

6183
Ok(PreparedOutput {
6284
output_dir,
6385
figures_dir,
86+
tables_dir,
6487
markdown_path,
6588
log_path,
6689
})

0 commit comments

Comments
 (0)