Skip to content

Commit 4f262c2

Browse files
committed
feat(core): add assets module
1 parent 0ba0c5b commit 4f262c2

1 file changed

Lines changed: 216 additions & 0 deletions

File tree

src/core/assets.rs

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
use anyhow::Result;
2+
use futures::stream::{self, StreamExt};
3+
use reqwest::header::{CONTENT_LENGTH, CONTENT_TYPE};
4+
use serde_json::Value;
5+
use std::collections::HashMap;
6+
use std::path::Path;
7+
8+
use super::markdown::replace_image_urls;
9+
use super::{ProgressCallback, ProgressEvent, fire};
10+
11+
/// Maximum number of figure downloads that `localize_figures` keeps in flight
/// at once (it is passed, clamped to at least 1, to `buffer_unordered`).
const INTERNAL_FIGURE_CONCURRENCY: usize = 16;
12+
13+
pub(crate) async fn localize_figures(
14+
markdown: String,
15+
layout_details: &[Value],
16+
client: &reqwest::Client,
17+
figures_dir: &Path,
18+
max_download_bytes: u64,
19+
progress: Option<ProgressCallback>,
20+
) -> Result<(String, usize, usize, usize)> {
21+
let mut remote_figure_links = 0usize;
22+
let mut image_blocks = 0usize;
23+
let mut first_url_order: Vec<(String, String)> = Vec::new();
24+
let mut seen: HashMap<String, String> = HashMap::new();
25+
26+
for (page_index, page_blocks) in layout_details.iter().enumerate() {
27+
let Some(blocks) = page_blocks.as_array() else {
28+
continue;
29+
};
30+
for (block_index, block) in blocks.iter().enumerate() {
31+
if block.get("label").and_then(Value::as_str) != Some("image") {
32+
continue;
33+
}
34+
image_blocks += 1;
35+
let Some(remote_url) = extract_image_url(block) else {
36+
continue;
37+
};
38+
remote_figure_links += 1;
39+
if !seen.contains_key(&remote_url) {
40+
let base = format!("fig-{:03}-{:03}", page_index + 1, block_index + 1);
41+
seen.insert(remote_url.clone(), base.clone());
42+
first_url_order.push((remote_url, base));
43+
}
44+
}
45+
}
46+
47+
fire(
48+
&progress,
49+
ProgressEvent::FigureScanStarted {
50+
total: first_url_order.len(),
51+
},
52+
);
53+
54+
let figure_cap = INTERNAL_FIGURE_CONCURRENCY.max(1);
55+
let tasks = first_url_order.iter().map(|(url, base)| {
56+
let url = url.clone();
57+
let base = base.clone();
58+
let figures_dir = figures_dir.to_path_buf();
59+
let client = client.clone();
60+
let progress = progress.clone();
61+
async move {
62+
let downloaded =
63+
download_figure(&client, &url, &figures_dir, &base, max_download_bytes).await;
64+
if downloaded.is_some() {
65+
fire(&progress, ProgressEvent::FigureDownloadFinished);
66+
}
67+
(url, downloaded)
68+
}
69+
});
70+
71+
let results = stream::iter(tasks)
72+
.buffer_unordered(figure_cap)
73+
.collect::<Vec<_>>()
74+
.await;
75+
76+
let mut replacements: HashMap<String, String> = HashMap::new();
77+
let mut downloaded_figures = 0usize;
78+
for (url, local) in results {
79+
if let Some(local_path) = local {
80+
downloaded_figures += 1;
81+
replacements.insert(url, format!("figures/{}", local_path));
82+
}
83+
}
84+
85+
let rewritten = replace_image_urls(&markdown, &replacements);
86+
Ok((
87+
rewritten,
88+
downloaded_figures,
89+
remote_figure_links,
90+
image_blocks,
91+
))
92+
}
93+
94+
pub(crate) fn extract_image_url(block: &Value) -> Option<String> {
95+
for key in ["content", "image_url", "crop_image_url", "url", "file_url"] {
96+
if let Some(value) = block.get(key)
97+
&& let Some(found) = find_http_url(value)
98+
{
99+
return Some(found);
100+
}
101+
}
102+
None
103+
}
104+
105+
pub(crate) fn find_http_url(value: &Value) -> Option<String> {
106+
if let Some(s) = value.as_str() {
107+
if is_http_url(s) {
108+
return Some(s.to_string());
109+
}
110+
return None;
111+
}
112+
113+
if let Some(array) = value.as_array() {
114+
for item in array {
115+
if let Some(found) = find_http_url(item) {
116+
return Some(found);
117+
}
118+
}
119+
}
120+
121+
if let Some(map) = value.as_object() {
122+
for item in map.values() {
123+
if let Some(found) = find_http_url(item) {
124+
return Some(found);
125+
}
126+
}
127+
}
128+
None
129+
}
130+
131+
/// Reports whether `value` begins with the literal prefix `http://` or
/// `https://`.
///
/// The comparison is case-sensitive and no trimming is performed, so
/// `"HTTP://x"` and `" http://x"` are both rejected.
pub(crate) fn is_http_url(value: &str) -> bool {
    ["http://", "https://"]
        .iter()
        .any(|scheme| value.starts_with(scheme))
}
134+
135+
pub(crate) async fn download_figure(
136+
client: &reqwest::Client,
137+
url: &str,
138+
figures_dir: &Path,
139+
base: &str,
140+
max_download_bytes: u64,
141+
) -> Option<String> {
142+
let response = client.get(url).send().await.ok()?;
143+
if !response.status().is_success() {
144+
return None;
145+
}
146+
147+
if let Some(length) = response
148+
.headers()
149+
.get(CONTENT_LENGTH)
150+
.and_then(|v| v.to_str().ok())
151+
.and_then(|s| s.parse::<u64>().ok())
152+
&& length > max_download_bytes
153+
{
154+
return None;
155+
}
156+
157+
let content_type = response
158+
.headers()
159+
.get(CONTENT_TYPE)
160+
.and_then(|v| v.to_str().ok());
161+
if let Some(ctype) = content_type
162+
&& !ctype.to_lowercase().starts_with("image/")
163+
{
164+
return None;
165+
}
166+
167+
let suffix = content_type_to_suffix(content_type)
168+
.or_else(|| url_suffix(url))
169+
.unwrap_or_else(|| ".img".to_string());
170+
171+
let filename = format!("{base}{suffix}");
172+
let output = figures_dir.join(&filename);
173+
let mut bytes = Vec::new();
174+
let mut stream = response.bytes_stream();
175+
while let Some(chunk) = stream.next().await {
176+
let chunk = chunk.ok()?;
177+
let next_len = bytes.len() as u64 + chunk.len() as u64;
178+
if next_len > max_download_bytes {
179+
return None;
180+
}
181+
bytes.extend_from_slice(&chunk);
182+
}
183+
184+
tokio::fs::create_dir_all(figures_dir).await.ok()?;
185+
if tokio::fs::write(&output, &bytes).await.is_err() {
186+
return None;
187+
}
188+
189+
Some(filename)
190+
}
191+
192+
/// Maps a `Content-Type` header value to a file suffix (including the dot).
///
/// Only the media type before the first `;` is considered; it is trimmed and
/// compared case-insensitively (ASCII) against a fixed table of image types.
/// Returns `None` when `content_type` is `None` or the media type is not in
/// the table.
pub(crate) fn content_type_to_suffix(content_type: Option<&str>) -> Option<String> {
    const SUFFIX_BY_MIME: [(&str, &str); 8] = [
        ("image/jpeg", ".jpg"),
        ("image/jpg", ".jpg"),
        ("image/png", ".png"),
        ("image/webp", ".webp"),
        ("image/gif", ".gif"),
        ("image/svg+xml", ".svg"),
        ("image/bmp", ".bmp"),
        ("image/tiff", ".tif"),
    ];

    let header = content_type?;
    let mime = header.split(';').next()?.trim().to_ascii_lowercase();

    SUFFIX_BY_MIME
        .iter()
        .find(|(known, _)| *known == mime.as_str())
        .map(|(_, suffix)| (*suffix).to_string())
}
207+
208+
pub(crate) fn url_suffix(url: &str) -> Option<String> {
209+
let parsed = url::Url::parse(url).ok()?;
210+
let path = parsed.path();
211+
let ext = Path::new(path).extension()?.to_str()?;
212+
if ext.is_empty() {
213+
return None;
214+
}
215+
Some(format!(".{ext}"))
216+
}

0 commit comments

Comments
 (0)