Skip to content

Commit e224f0a

Browse files
committed
fix: satisfy clippy
1 parent 5ee7b94 commit e224f0a

4 files changed

Lines changed: 63 additions & 61 deletions

File tree

src/core.rs

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,15 @@ pub enum ProgressEvent {
3030

3131
pub type ProgressCallback = Arc<dyn Fn(ProgressEvent) + Send + Sync>;
3232

33+
/// Per-run settings for `process_pdf`, bundled into one value so the function
/// takes a single `options` argument instead of five separate parameters.
///
/// `Clone` is derived; the `progress` callback is an `Arc`, so cloning the
/// options shares the same callback rather than duplicating it.
#[derive(Clone)]
34+
pub struct ProcessPdfOptions {
35+
/// Timeout applied to the `reqwest::Client` that `process_pdf` builds.
pub timeout: Duration,
36+
/// Byte limit forwarded to the figure-handling step.
// NOTE(review): presumably a per-download cap — confirm against the callee.
pub max_download_bytes: u64,
37+
/// Forwarded to `output::prepare_output_paths`.
// NOTE(review): presumably allows replacing existing output — confirm there.
pub overwrite: bool,
38+
/// Forwarded to `output::prepare_output_paths` and also selects the
/// table-normalization branch in `process_pdf`.
pub normalize_tables: bool,
39+
/// Optional callback fired with `ProgressEvent` values as processing
/// advances; `None` means no progress events are delivered.
pub progress: Option<ProgressCallback>,
40+
}
41+
3342
#[derive(Debug, Serialize, Clone)]
3443
pub struct PdfSummary {
3544
pub pdf: String,
@@ -46,11 +55,7 @@ pub async fn process_pdf(
4655
pdf_path: &Path,
4756
output_root: &Path,
4857
env_file: &Path,
49-
timeout: Duration,
50-
max_download_bytes: u64,
51-
overwrite: bool,
52-
normalize_tables: bool,
53-
progress: Option<ProgressCallback>,
58+
options: ProcessPdfOptions,
5459
) -> Result<PdfSummary> {
5560
let run_started = Instant::now();
5661
let pdf_path = pdf_path
@@ -59,17 +64,23 @@ pub async fn process_pdf(
5964
if !pdf_path.is_file() || !input::is_pdf_path(&pdf_path) {
6065
return Err(anyhow!("Input must be a PDF: {}", pdf_path.display()));
6166
}
62-
let prepared =
63-
output::prepare_output_paths(output_root, &pdf_path, overwrite, normalize_tables)?;
64-
let client = reqwest::Client::builder().timeout(timeout).build()?;
67+
let prepared = output::prepare_output_paths(
68+
output_root,
69+
&pdf_path,
70+
options.overwrite,
71+
options.normalize_tables,
72+
)?;
73+
let client = reqwest::Client::builder()
74+
.timeout(options.timeout)
75+
.build()?;
6576

6677
let api_key = input::load_api_key(env_file)?;
6778
let payload = ocr::build_payload(&pdf_path).await?;
68-
fire(&progress, ProgressEvent::OcrStarted);
79+
fire(&options.progress, ProgressEvent::OcrStarted);
6980
let ocr_started = Instant::now();
7081
let response = ocr::call_layout_parsing(&client, &api_key, payload).await?;
7182
let ocr_seconds = ocr_started.elapsed();
72-
fire(&progress, ProgressEvent::OcrFinished);
83+
fire(&options.progress, ProgressEvent::OcrFinished);
7384

7485
let (markdown, layout_details, usage) = ocr::validate_layout_response(response)?;
7586

@@ -80,13 +91,13 @@ pub async fn process_pdf(
8091
&layout_details,
8192
&client,
8293
&prepared.figures_dir,
83-
max_download_bytes,
84-
progress.clone(),
94+
options.max_download_bytes,
95+
options.progress.clone(),
8596
)
8697
.await?;
8798
let figure_seconds = figure_started.elapsed();
8899
let markdown = markdown::strip_html_img_alt_attributes(&markdown);
89-
let (markdown, table_stats) = if normalize_tables {
100+
let (markdown, table_stats) = if options.normalize_tables {
90101
let tables_dir = prepared
91102
.tables_dir
92103
.as_ref()
@@ -97,14 +108,14 @@ pub async fn process_pdf(
97108
};
98109

99110
fire(
100-
&progress,
111+
&options.progress,
101112
ProgressEvent::MarkdownWriteStarted {
102113
bytes: markdown.len(),
103114
},
104115
);
105116
let write_started = Instant::now();
106117
output::atomic_write_text(&prepared.markdown_path, &markdown).await?;
107-
fire(&progress, ProgressEvent::MarkdownWriteFinished);
118+
fire(&options.progress, ProgressEvent::MarkdownWriteFinished);
108119

109120
output::append_log(
110121
&prepared.log_path,
@@ -161,6 +172,7 @@ fn round3(duration: Duration) -> f64 {
161172
#[cfg(feature = "internal-testing")]
162173
#[doc(hidden)]
163174
pub mod testing {
175+
pub use super::ProcessPdfOptions;
164176
pub use super::ProgressCallback;
165177
pub use super::ProgressEvent;
166178
pub use super::process_pdf;

src/core/table_normalization.rs

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -192,19 +192,14 @@ fn render_normalized_table(
192192

193193
let mut out = String::new();
194194
write!(&mut out, "\n\n##### OCR Table {ordinal}\n").unwrap();
195-
write!(&mut out, "Source (OCR HTML): {artifact_rel}\n").unwrap();
195+
writeln!(&mut out, "Source (OCR HTML): {artifact_rel}").unwrap();
196196
write!(&mut out, "Columns: {}\n\n", parsed.columns.join(", ")).unwrap();
197197

198198
for (row_index, row) in parsed.rows.iter().enumerate() {
199199
if row_index > 0 && row_index % ROW_GROUP_SIZE == 0 {
200200
out.push('\n');
201201
}
202-
write!(
203-
&mut out,
204-
"Row: {}\n",
205-
render_row_json(&parsed.columns, row)?
206-
)
207-
.unwrap();
202+
writeln!(&mut out, "Row: {}", render_row_json(&parsed.columns, row)?).unwrap();
208203
}
209204
out.push('\n');
210205

@@ -275,11 +270,8 @@ fn build_columns(grid: &[Vec<Option<String>>], header_rows: usize) -> Vec<String
275270

276271
for col in 0..width {
277272
let mut parts = Vec::new();
278-
for row in 0..header_rows.min(grid.len()) {
279-
let value = grid[row][col]
280-
.as_ref()
281-
.map(|value| value.trim())
282-
.unwrap_or("");
273+
for row in grid.iter().take(header_rows.min(grid.len())) {
274+
let value = row[col].as_ref().map(|value| value.trim()).unwrap_or("");
283275
if value.is_empty() {
284276
continue;
285277
}
@@ -396,7 +388,7 @@ fn ensure_row(
396388
}
397389
}
398390

399-
fn resize_width(grid: &mut Vec<Vec<Option<String>>>, occupied: &mut Vec<Vec<bool>>, width: usize) {
391+
fn resize_width(grid: &mut [Vec<Option<String>>], occupied: &mut [Vec<bool>], width: usize) {
400392
for row in grid.iter_mut() {
401393
if row.len() < width {
402394
row.resize(width, None);
@@ -489,9 +481,8 @@ fn html_fragment_to_text(fragment: &str) -> String {
489481
if let Some(tag_end) = find_tag_end(fragment, pos) {
490482
let tag = fragment[pos + 1..tag_end - 1].trim();
491483
let lower = tag.to_ascii_lowercase();
492-
if lower.starts_with("br") {
493-
out.push('\n');
494-
} else if lower.starts_with("/p")
484+
if lower.starts_with("br")
485+
|| lower.starts_with("/p")
495486
|| lower.starts_with("/div")
496487
|| lower.starts_with("/tr")
497488
|| lower.starts_with("/td")

src/main.rs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ use anyhow::Result;
44
use clap::Parser;
55
use futures::stream::{self, StreamExt};
66
use indicatif::{MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle};
7-
use paperdown::core::{self, PdfSummary, ProgressCallback, ProgressEvent, collect_pdfs};
7+
use paperdown::core::{
8+
self, PdfSummary, ProcessPdfOptions, ProgressCallback, ProgressEvent, collect_pdfs,
9+
};
810
use std::io::IsTerminal;
911
use std::path::Path;
1012
use std::sync::Arc;
@@ -42,11 +44,13 @@ async fn run() -> Result<i32> {
4244
&pdfs[0],
4345
&args.output,
4446
&args.env_file,
45-
Duration::from_secs(args.timeout),
46-
args.max_download_bytes,
47-
args.overwrite,
48-
args.normalize_tables,
49-
progress_callback(&pdfs[0], progress.clone()),
47+
ProcessPdfOptions {
48+
timeout: Duration::from_secs(args.timeout),
49+
max_download_bytes: args.max_download_bytes,
50+
overwrite: args.overwrite,
51+
normalize_tables: args.normalize_tables,
52+
progress: progress_callback(&pdfs[0], progress.clone()),
53+
},
5054
)
5155
.await?;
5256
print_single_summary_stdout(&summary);
@@ -61,24 +65,17 @@ async fn run() -> Result<i32> {
6165
let permit_pool = semaphore.clone();
6266
let output = args.output.clone();
6367
let env_file = args.env_file.clone();
64-
let timeout = Duration::from_secs(args.timeout);
65-
let max_download_bytes = args.max_download_bytes;
66-
let overwrite = args.overwrite;
67-
let normalize_tables = args.normalize_tables;
6868
let progress = progress.clone();
69+
let options = ProcessPdfOptions {
70+
timeout: Duration::from_secs(args.timeout),
71+
max_download_bytes: args.max_download_bytes,
72+
overwrite: args.overwrite,
73+
normalize_tables: args.normalize_tables,
74+
progress: progress_callback(&pdf, progress),
75+
};
6976
async move {
7077
let _permit = permit_pool.acquire_owned().await.expect("semaphore");
71-
let res = core::process_pdf(
72-
&pdf,
73-
&output,
74-
&env_file,
75-
timeout,
76-
max_download_bytes,
77-
overwrite,
78-
normalize_tables,
79-
progress_callback(&pdf, progress),
80-
)
81-
.await;
78+
let res = core::process_pdf(&pdf, &output, &env_file, options).await;
8279
(pdf, res)
8380
}
8481
}))

tests/core_internal.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
use httpmock::prelude::*;
55
use paperdown::core::collect_pdfs;
66
use paperdown::core::testing::{
7-
ProgressCallback, ProgressEvent, append_log, atomic_write_text, build_payload,
8-
content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, load_api_key,
9-
normalize_tables, prepare_output_paths, process_pdf, replace_image_urls, round3_for_test,
10-
strip_html_img_alt_attributes, url_suffix, validate_layout_response,
7+
ProcessPdfOptions, ProgressCallback, ProgressEvent, append_log, atomic_write_text,
8+
build_payload, content_type_to_suffix, extract_image_url, fire_for_test, is_http_url,
9+
load_api_key, normalize_tables, prepare_output_paths, process_pdf, replace_image_urls,
10+
round3_for_test, strip_html_img_alt_attributes, url_suffix, validate_layout_response,
1111
};
1212
#[cfg(feature = "net-tests")]
1313
use paperdown::core::testing::{download_figure, localize_figures};
@@ -947,11 +947,13 @@ fn process_pdf_checks_output_conflict_before_env_lookup() {
947947
&pdf,
948948
&output_root,
949949
&missing_env,
950-
Duration::from_secs(1),
951-
1024,
952-
false,
953-
false,
954-
None,
950+
ProcessPdfOptions {
951+
timeout: Duration::from_secs(1),
952+
max_download_bytes: 1024,
953+
overwrite: false,
954+
normalize_tables: false,
955+
progress: None,
956+
},
955957
))
956958
.unwrap_err()
957959
.to_string();

0 commit comments

Comments
 (0)