atsyplenkov
diff --git a/README.md b/README.md
Lines changed: 4 additions & 2 deletions
diff --git a/src/cli.rs b/src/cli.rs
Lines changed: 13 additions & 1 deletion
diff --git a/src/core.rs b/src/core.rs
Lines changed: 62 additions & 13 deletions
diff --git a/src/core/output.rs b/src/core/output.rs
Lines changed: 23 additions & 0 deletions
@@ -23,7 +23,7 @@ If you work with academic papers, you know that the OCR process itself is not th
 
 I used to rely on [`marker`](https://github.com/datalab-to/marker) for PDF parsing and thought it was great. However, after converting the [Batista et al. (2022)](https://hess.copernicus.org/articles/26/3753/2022/) article one day, I discovered that Table 4 was missing, regardless of the settings or LLMs I used (via the `--use-llm` flag). I then switched to [`docling`](https://github.com/docling-project/docling), and Table 4 reappeared, but all the formulas were gone. Furthermore, both tools require a GPU, and even on a Google Colab T4 instance, processing one article takes 4 to 5 minutes.
 
-Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a single Markdown file I can trust, a local `figures/` folder, and the ability to process my entire library quickly on my laptop.
+Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a Markdown index file I can trust, local `figures/` and optional `tables/` folders, and the ability to process my entire library quickly on my laptop.
 
 ## Features
 
@@ -71,6 +71,7 @@ paperdown converts one PDF or a directory of PDFs into markdown output folders.
 For each PDF, it creates:
 - <output>/<pdf_stem>/index.md
 - <output>/<pdf_stem>/figures/
+- <output>/<pdf_stem>/tables/ (when `--normalize-tables` is enabled)
 - <output>/<pdf_stem>/log.jsonl
 
 API key lookup order:
@@ -87,7 +88,8 @@ Options:
       --max-download-bytes <MAX_DOWNLOAD_BYTES>  Maximum allowed size (bytes) for each downloaded figure file. [default: 20971520]
       --workers <WORKERS>                        Maximum number of PDFs processed concurrently in batch mode. [default: 32]
   -v, --verbose                                  Enable verbose progress messages on stderr.
-      --overwrite                                Replace existing managed output artifacts (index.md and figures/).
+      --overwrite                                Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled).
+      --normalize-tables                         Normalize OCR HTML tables into Markdown and store raw HTML under tables/.
   -h, --help                                     Print help (see a summary with '-h')
   -V, --version                                  Print version
 ```
 
@@ -10,16 +10,19 @@ use std::path::PathBuf;
 For each PDF, it creates:\n\
 - <output>/<pdf_stem>/index.md\n\
 - <output>/<pdf_stem>/figures/\n\
+- <output>/<pdf_stem>/tables/ (when --normalize-tables is enabled)\n\
 - <output>/<pdf_stem>/log.jsonl\n\n\
 API key lookup order:\n\
 1) ZAI_API_KEY from --env-file\n\
 2) ZAI_API_KEY from environment",
     after_help = "Examples:\n  \
 paperdown --input pdf/paper.pdf\n  \
 paperdown --input pdf/ --output md/ --workers 4\n  \
-paperdown --input pdf/ --output md/ --overwrite\n\n\
+paperdown --input pdf/ --output md/ --overwrite\n  \
+paperdown --input pdf/ --output md/ --normalize-tables\n\n\
 Notes:\n  \
 Without --overwrite, existing index.md or figures/ causes a failure.\n  \
+When --normalize-tables is enabled, existing tables/ also causes a failure.\n  \
 Progress bars are shown on stderr only when running in a TTY."
 )]
 pub struct Cli {
@@ -83,6 +86,13 @@ pub struct Cli {
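-        help = "Replace existing managed output artifacts (index.md and figures/)."
+        help = "Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled)."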
     )]
     pub overwrite: bool,
+
+    #[arg(
+        long = "normalize-tables",
+        action = ArgAction::SetTrue,
+        help = "Normalize OCR HTML tables into Markdown and store raw HTML under tables/."
+    )]
+    pub normalize_tables: bool,
 }
 
 pub fn default_workers() -> usize {
@@ -124,6 +134,7 @@ mod tests {
         assert_eq!(cli.workers, default_workers());
         assert!(!cli.verbose);
         assert!(!cli.overwrite);
+        assert!(!cli.normalize_tables);
     }
 
     #[test]
@@ -148,6 +159,7 @@ mod tests {
         let help = cmd.render_long_help().to_string();
         assert!(help.contains("Examples:"));
         assert!(help.contains("--overwrite"));
+        assert!(help.contains("--normalize-tables"));
         let file_first = help.find("1) ZAI_API_KEY from --env-file");
         let env_second = help.find("2) ZAI_API_KEY from environment");
         assert!(file_first.is_some());
 
@@ -12,6 +12,7 @@ mod input;
 mod markdown;
 mod ocr;
 mod output;
+mod table_normalization;
 
 pub fn collect_pdfs(input_path: &Path) -> Result<Vec<std::path::PathBuf>> {
     input::collect_pdfs(input_path)
@@ -29,6 +30,15 @@ pub enum ProgressEvent {
 
 pub type ProgressCallback = Arc<dyn Fn(ProgressEvent) + Send + Sync>;
 
+#[derive(Clone)]
+pub struct ProcessPdfOptions { // Per-run settings passed by value to `process_pdf`.
+    pub timeout: Duration, // Timeout applied to the HTTP client built for this run.
+    pub max_download_bytes: u64, // Size cap, in bytes, handed to the figure download step.
+    pub overwrite: bool, // When true, existing managed outputs are removed instead of causing an error.
+    pub normalize_tables: bool, // When true, a tables/ directory is prepared and table normalization runs on the markdown.
+    pub progress: Option<ProgressCallback>, // Optional callback that receives `ProgressEvent`s as stages start and finish.
+}
+
 #[derive(Debug, Serialize, Clone)]
 pub struct PdfSummary {
     pub pdf: String,
@@ -45,10 +55,7 @@ pub async fn process_pdf(
     pdf_path: &Path,
     output_root: &Path,
     env_file: &Path,
-    timeout: Duration,
-    max_download_bytes: u64,
-    overwrite: bool,
-    progress: Option<ProgressCallback>,
+    options: ProcessPdfOptions,
 ) -> Result<PdfSummary> {
     let run_started = Instant::now();
     let pdf_path = pdf_path
@@ -57,16 +64,23 @@ pub async fn process_pdf(
     if !pdf_path.is_file() || !input::is_pdf_path(&pdf_path) {
         return Err(anyhow!("Input must be a PDF: {}", pdf_path.display()));
     }
-    let prepared = output::prepare_output_paths(output_root, &pdf_path, overwrite)?;
-    let client = reqwest::Client::builder().timeout(timeout).build()?;
+    let prepared = output::prepare_output_paths(
+        output_root,
+        &pdf_path,
+        options.overwrite,
+        options.normalize_tables,
+    )?;
+    let client = reqwest::Client::builder()
+        .timeout(options.timeout)
+        .build()?;
 
     let api_key = input::load_api_key(env_file)?;
     let payload = ocr::build_payload(&pdf_path).await?;
-    fire(&progress, ProgressEvent::OcrStarted);
+    fire(&options.progress, ProgressEvent::OcrStarted);
     let ocr_started = Instant::now();
     let response = ocr::call_layout_parsing(&client, &api_key, payload).await?;
     let ocr_seconds = ocr_started.elapsed();
-    fire(&progress, ProgressEvent::OcrFinished);
+    fire(&options.progress, ProgressEvent::OcrFinished);
 
     let (markdown, layout_details, usage) = ocr::validate_layout_response(response)?;
 
@@ -77,22 +91,31 @@ pub async fn process_pdf(
             &layout_details,
             &client,
             &prepared.figures_dir,
-            max_download_bytes,
-            progress.clone(),
+            options.max_download_bytes,
+            options.progress.clone(),
         )
         .await?;
     let figure_seconds = figure_started.elapsed();
     let markdown = markdown::strip_html_img_alt_attributes(&markdown);
+    let (markdown, table_stats) = if options.normalize_tables {
+        let tables_dir = prepared
+            .tables_dir
+            .as_ref()
+            .expect("tables_dir must exist when normalize_tables is enabled");
+        table_normalization::normalize_tables(&markdown, tables_dir).await?
+    } else {
+        (markdown, table_normalization::TableStats::default())
+    };
 
     fire(
-        &progress,
+        &options.progress,
         ProgressEvent::MarkdownWriteStarted {
             bytes: markdown.len(),
         },
     );
     let write_started = Instant::now();
     output::atomic_write_text(&prepared.markdown_path, &markdown).await?;
-    fire(&progress, ProgressEvent::MarkdownWriteFinished);
+    fire(&options.progress, ProgressEvent::MarkdownWriteFinished);
 
     output::append_log(
         &prepared.log_path,
@@ -104,6 +127,14 @@ pub async fn process_pdf(
             "downloaded_figures": downloaded_figures,
             "remote_figure_links": remote_figure_links,
             "image_blocks": image_blocks,
+            "tables_found": table_stats.tables_found,
+            "tables_raw_written": table_stats.tables_raw_written,
+            "tables_normalized": table_stats.tables_normalized,
+            "tables_skipped_in_code": table_stats.tables_skipped_in_code,
+            "tables_skipped_nested": table_stats.tables_skipped_nested,
+            "tables_skipped_too_large": table_stats.tables_skipped_too_large,
+            "tables_failed_extract": table_stats.tables_failed_extract,
+            "tables_failed_parse": table_stats.tables_failed_parse,
             "usage": usage,
             "timing": {
                 "ocr_call_s": round3(ocr_seconds),
@@ -122,6 +153,7 @@ pub async fn process_pdf(
         downloaded_figures,
         remote_figure_links,
         image_blocks,
+        // Table stats are logged but not surfaced in the summary.
         usage,
         log_path: prepared.log_path.display().to_string(),
     })
@@ -140,9 +172,11 @@ fn round3(duration: Duration) -> f64 {
 #[cfg(feature = "internal-testing")]
 #[doc(hidden)]
 pub mod testing {
+    pub use super::ProcessPdfOptions;
     pub use super::ProgressCallback;
     pub use super::ProgressEvent;
     pub use super::process_pdf;
+    pub use super::table_normalization::TableStats;
     use anyhow::Result;
     use serde_json::Value;
     use std::collections::HashMap;
@@ -153,6 +187,7 @@ pub mod testing {
     pub struct PreparedOutputPaths {
         pub output_dir: std::path::PathBuf,
         pub figures_dir: std::path::PathBuf,
+        pub tables_dir: Option<std::path::PathBuf>, // Copied from `PreparedOutput::tables_dir`; `None` unless normalize_tables was requested.
         pub markdown_path: std::path::PathBuf,
         pub log_path: std::path::PathBuf,
     }
@@ -228,16 +263,30 @@ pub mod testing {
         output_root: &Path,
         pdf_path: &Path,
         overwrite: bool,
+        normalize_tables: bool,
     ) -> Result<PreparedOutputPaths> {
-        let prepared = super::output::prepare_output_paths(output_root, pdf_path, overwrite)?;
+        let prepared = super::output::prepare_output_paths(
+            output_root,
+            pdf_path,
+            overwrite,
+            normalize_tables,
+        )?;
         Ok(PreparedOutputPaths {
             output_dir: prepared.output_dir,
             figures_dir: prepared.figures_dir,
+            tables_dir: prepared.tables_dir,
             markdown_path: prepared.markdown_path,
             log_path: prepared.log_path,
         })
     }
 
+    pub async fn normalize_tables(
+        markdown: &str,
+        tables_dir: &Path,
+    ) -> Result<(String, TableStats)> {
+        super::table_normalization::normalize_tables(markdown, tables_dir).await // Test-facing passthrough: forwards both arguments unchanged to the private `table_normalization` module.
+    }
+
     pub async fn append_log(log_path: &Path, entry: Value) -> Result<()> {
         super::output::append_log(log_path, entry).await
     }
 
@@ -9,6 +9,7 @@ use tokio::io::AsyncWriteExt;
 pub(crate) struct PreparedOutput {
     pub(crate) output_dir: PathBuf,
     pub(crate) figures_dir: PathBuf,
+    pub(crate) tables_dir: Option<PathBuf>, // `Some(<output_dir>/tables)` only when normalize_tables is set; the directory is created before returning. `None` otherwise.
     pub(crate) markdown_path: PathBuf,
     pub(crate) log_path: PathBuf,
 }
@@ -17,6 +18,7 @@ pub(crate) fn prepare_output_paths(
     output_root: &Path,
     pdf_path: &Path,
     overwrite: bool,
+    normalize_tables: bool,
 ) -> Result<PreparedOutput> {
     let stem = pdf_path
         .file_stem()
@@ -28,6 +30,7 @@ pub(crate) fn prepare_output_paths(
 
     let markdown_path = output_dir.join("index.md");
     let figures_dir = output_dir.join("figures");
+    let tables_dir = output_dir.join("tables");
     let log_path = output_dir.join("log.jsonl");
 
     if !overwrite {
@@ -43,6 +46,12 @@ pub(crate) fn prepare_output_paths(
                 figures_dir.display()
             ));
         }
+        if normalize_tables && tables_dir.exists() {
+            return Err(anyhow::anyhow!(
+                "Output already exists: {}. Re-run with --overwrite",
+                tables_dir.display()
+            ));
+        }
     } else {
         if markdown_path.exists() {
             std::fs::remove_file(&markdown_path)?;
@@ -54,13 +63,27 @@ pub(crate) fn prepare_output_paths(
                 std::fs::remove_file(&figures_dir)?;
             }
         }
+        if normalize_tables && tables_dir.exists() {
+            if tables_dir.is_dir() {
+                std::fs::remove_dir_all(&tables_dir)?;
+            } else {
+                std::fs::remove_file(&tables_dir)?;
+            }
+        }
     }
 
     std::fs::create_dir_all(&figures_dir)?;
+    let tables_dir = if normalize_tables {
+        std::fs::create_dir_all(&tables_dir)?;
+        Some(tables_dir)
+    } else {
+        None
+    };
 
     Ok(PreparedOutput {
         output_dir,
         figures_dir,
+        tables_dir,
         markdown_path,
         log_path,
     })