|
| 1 | +use anyhow::Result; |
| 2 | +use futures::stream::{self, StreamExt}; |
| 3 | +use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE}; |
| 4 | +use serde_json::Value; |
| 5 | +use std::collections::HashMap; |
| 6 | +use std::path::Path; |
| 7 | + |
| 8 | +use super::markdown::replace_image_urls; |
| 9 | +use super::{ProgressCallback, ProgressEvent, fire}; |
| 10 | + |
/// Number of figure downloads `localize_figures` allows to be in flight at the
/// same time (it is passed to `buffer_unordered` after being clamped to at least 1).
const INTERNAL_FIGURE_CONCURRENCY: usize = 16;
| 12 | + |
| 13 | +pub(crate) async fn localize_figures( |
| 14 | + markdown: String, |
| 15 | + layout_details: &[Value], |
| 16 | + client: &reqwest::Client, |
| 17 | + figures_dir: &Path, |
| 18 | + max_download_bytes: u64, |
| 19 | + progress: Option<ProgressCallback>, |
| 20 | +) -> Result<(String, usize, usize, usize)> { |
| 21 | + let mut remote_figure_links = 0usize; |
| 22 | + let mut image_blocks = 0usize; |
| 23 | + let mut first_url_order: Vec<(String, String)> = Vec::new(); |
| 24 | + let mut seen: HashMap<String, String> = HashMap::new(); |
| 25 | + |
| 26 | + for (page_index, page_blocks) in layout_details.iter().enumerate() { |
| 27 | + let Some(blocks) = page_blocks.as_array() else { |
| 28 | + continue; |
| 29 | + }; |
| 30 | + for (block_index, block) in blocks.iter().enumerate() { |
| 31 | + if block.get("label").and_then(Value::as_str) != Some("image") { |
| 32 | + continue; |
| 33 | + } |
| 34 | + image_blocks += 1; |
| 35 | + let Some(remote_url) = extract_image_url(block) else { |
| 36 | + continue; |
| 37 | + }; |
| 38 | + remote_figure_links += 1; |
| 39 | + if !seen.contains_key(&remote_url) { |
| 40 | + let base = format!("fig-{:03}-{:03}", page_index + 1, block_index + 1); |
| 41 | + seen.insert(remote_url.clone(), base.clone()); |
| 42 | + first_url_order.push((remote_url, base)); |
| 43 | + } |
| 44 | + } |
| 45 | + } |
| 46 | + |
| 47 | + fire( |
| 48 | + &progress, |
| 49 | + ProgressEvent::FigureScanStarted { |
| 50 | + total: first_url_order.len(), |
| 51 | + }, |
| 52 | + ); |
| 53 | + |
| 54 | + let figure_cap = INTERNAL_FIGURE_CONCURRENCY.max(1); |
| 55 | + let tasks = first_url_order.iter().map(|(url, base)| { |
| 56 | + let url = url.clone(); |
| 57 | + let base = base.clone(); |
| 58 | + let figures_dir = figures_dir.to_path_buf(); |
| 59 | + let client = client.clone(); |
| 60 | + let progress = progress.clone(); |
| 61 | + async move { |
| 62 | + let downloaded = |
| 63 | + download_figure(&client, &url, &figures_dir, &base, max_download_bytes).await; |
| 64 | + if downloaded.is_some() { |
| 65 | + fire(&progress, ProgressEvent::FigureDownloadFinished); |
| 66 | + } |
| 67 | + (url, downloaded) |
| 68 | + } |
| 69 | + }); |
| 70 | + |
| 71 | + let results = stream::iter(tasks) |
| 72 | + .buffer_unordered(figure_cap) |
| 73 | + .collect::<Vec<_>>() |
| 74 | + .await; |
| 75 | + |
| 76 | + let mut replacements: HashMap<String, String> = HashMap::new(); |
| 77 | + let mut downloaded_figures = 0usize; |
| 78 | + for (url, local) in results { |
| 79 | + if let Some(local_path) = local { |
| 80 | + downloaded_figures += 1; |
| 81 | + replacements.insert(url, format!("figures/{}", local_path)); |
| 82 | + } |
| 83 | + } |
| 84 | + |
| 85 | + let rewritten = replace_image_urls(&markdown, &replacements); |
| 86 | + Ok(( |
| 87 | + rewritten, |
| 88 | + downloaded_figures, |
| 89 | + remote_figure_links, |
| 90 | + image_blocks, |
| 91 | + )) |
| 92 | +} |
| 93 | + |
| 94 | +pub(crate) fn extract_image_url(block: &Value) -> Option<String> { |
| 95 | + for key in ["content", "image_url", "crop_image_url", "url", "file_url"] { |
| 96 | + if let Some(value) = block.get(key) |
| 97 | + && let Some(found) = find_http_url(value) |
| 98 | + { |
| 99 | + return Some(found); |
| 100 | + } |
| 101 | + } |
| 102 | + None |
| 103 | +} |
| 104 | + |
| 105 | +pub(crate) fn find_http_url(value: &Value) -> Option<String> { |
| 106 | + if let Some(s) = value.as_str() { |
| 107 | + if is_http_url(s) { |
| 108 | + return Some(s.to_string()); |
| 109 | + } |
| 110 | + return None; |
| 111 | + } |
| 112 | + |
| 113 | + if let Some(array) = value.as_array() { |
| 114 | + for item in array { |
| 115 | + if let Some(found) = find_http_url(item) { |
| 116 | + return Some(found); |
| 117 | + } |
| 118 | + } |
| 119 | + } |
| 120 | + |
| 121 | + if let Some(map) = value.as_object() { |
| 122 | + for item in map.values() { |
| 123 | + if let Some(found) = find_http_url(item) { |
| 124 | + return Some(found); |
| 125 | + } |
| 126 | + } |
| 127 | + } |
| 128 | + None |
| 129 | +} |
| 130 | + |
/// Reports whether `value` begins with the literal scheme prefix `http://` or
/// `https://`. The comparison is case-sensitive and no further validation of the
/// remainder is performed.
pub(crate) fn is_http_url(value: &str) -> bool {
    ["http://", "https://"]
        .iter()
        .any(|scheme| value.starts_with(scheme))
}
| 134 | + |
| 135 | +pub(crate) async fn download_figure( |
| 136 | + client: &reqwest::Client, |
| 137 | + url: &str, |
| 138 | + figures_dir: &Path, |
| 139 | + base: &str, |
| 140 | + max_download_bytes: u64, |
| 141 | +) -> Option<String> { |
| 142 | + let response = client.get(url).send().await.ok()?; |
| 143 | + if !response.status().is_success() { |
| 144 | + return None; |
| 145 | + } |
| 146 | + |
| 147 | + if let Some(length) = response |
| 148 | + .headers() |
| 149 | + .get(CONTENT_LENGTH) |
| 150 | + .and_then(|v| v.to_str().ok()) |
| 151 | + .and_then(|s| s.parse::<u64>().ok()) |
| 152 | + && length > max_download_bytes |
| 153 | + { |
| 154 | + return None; |
| 155 | + } |
| 156 | + |
| 157 | + let content_type = response |
| 158 | + .headers() |
| 159 | + .get(CONTENT_TYPE) |
| 160 | + .and_then(|v| v.to_str().ok()); |
| 161 | + if let Some(ctype) = content_type |
| 162 | + && !ctype.to_lowercase().starts_with("image/") |
| 163 | + { |
| 164 | + return None; |
| 165 | + } |
| 166 | + |
| 167 | + let suffix = content_type_to_suffix(content_type) |
| 168 | + .or_else(|| url_suffix(url)) |
| 169 | + .unwrap_or_else(|| ".img".to_string()); |
| 170 | + |
| 171 | + let filename = format!("{base}{suffix}"); |
| 172 | + let output = figures_dir.join(&filename); |
| 173 | + let mut bytes = Vec::new(); |
| 174 | + let mut stream = response.bytes_stream(); |
| 175 | + while let Some(chunk) = stream.next().await { |
| 176 | + let chunk = chunk.ok()?; |
| 177 | + let next_len = bytes.len() as u64 + chunk.len() as u64; |
| 178 | + if next_len > max_download_bytes { |
| 179 | + return None; |
| 180 | + } |
| 181 | + bytes.extend_from_slice(&chunk); |
| 182 | + } |
| 183 | + |
| 184 | + tokio::fs::create_dir_all(figures_dir).await.ok()?; |
| 185 | + if tokio::fs::write(&output, &bytes).await.is_err() { |
| 186 | + return None; |
| 187 | + } |
| 188 | + |
| 189 | + Some(filename) |
| 190 | +} |
| 191 | + |
/// Maps a `Content-Type` header value to a file suffix (leading dot included).
///
/// Any parameters after the first `;` are ignored, surrounding whitespace is
/// trimmed and the comparison is ASCII case-insensitive. Returns `None` when no
/// header value is supplied or the media type is not one of the known image types.
pub(crate) fn content_type_to_suffix(content_type: Option<&str>) -> Option<String> {
    const KNOWN_TYPES: [(&str, &str); 8] = [
        ("image/jpeg", ".jpg"),
        ("image/jpg", ".jpg"),
        ("image/png", ".png"),
        ("image/webp", ".webp"),
        ("image/gif", ".gif"),
        ("image/svg+xml", ".svg"),
        ("image/bmp", ".bmp"),
        ("image/tiff", ".tif"),
    ];

    let header = content_type?;
    let media_type = header.split(';').next()?.trim().to_ascii_lowercase();
    KNOWN_TYPES
        .iter()
        .find(|(name, _)| *name == media_type.as_str())
        .map(|(_, suffix)| (*suffix).to_string())
}
| 207 | + |
| 208 | +pub(crate) fn url_suffix(url: &str) -> Option<String> { |
| 209 | + let parsed = url::Url::parse(url).ok()?; |
| 210 | + let path = parsed.path(); |
| 211 | + let ext = Path::new(path).extension()?.to_str()?; |
| 212 | + if ext.is_empty() { |
| 213 | + return None; |
| 214 | + } |
| 215 | + Some(format!(".{ext}")) |
| 216 | +} |
0 commit comments