Skip to content

Commit e79fc70

Browse files
zackeesclaude
andauthored
fix(packages): retry transient download failures (closes nightly STM32 flake) (#452)
The nightly Acceptance gates run on main hit a `dl.registry.platformio.org` hiccup downloading framework-cmsis tarball, which failed `stm32f103c8_blink_with_spi_auto_discovers_library_205_ac4` instantly: > failed to download .../framework-cmsis-2.50700.210515.tar.gz: > error sending request for url (...) The downloader had no retry — a single network blip flipped a green nightly red. Add transient-failure retry to both `download_file` and `download_file_with_progress`: - Up to 3 GET attempts total (1 initial + 2 retries). - 1 s backoff before attempt 2, 3 s before attempt 3. - Retry on connection / timeout / request-build / body-recv errors and on 5xx server errors. Skip retry on 4xx (those are deterministic). - Streaming variant retries only the initial fetch — mid-stream chunk errors stay terminal because restarting a partial multi-MB download from byte 0 is a bigger lever than the transient errors call for. Run: https://github.com/FastLED/fbuild/actions/runs/27053629497/job/79853635977 Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 385377d commit e79fc70

1 file changed

Lines changed: 250 additions & 28 deletions

File tree

crates/fbuild-packages/src/downloader.rs

Lines changed: 250 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,39 @@
55
66
use std::fmt::Write;
77
use std::path::{Path, PathBuf};
8-
use std::time::Instant;
8+
use std::time::{Duration, Instant};
99

1010
use fbuild_core::{FbuildError, Result};
1111
use sha2::{Digest, Sha256};
1212

13+
/// Number of GET attempts before giving up on a transient failure.
14+
/// One initial attempt + two retries. Worst-case wall time at the
15+
/// default backoff schedule is ~4 s of sleep before the third
16+
/// attempt — barely registers on a healthy run, large enough to ride
17+
/// out the `dl.registry.platformio.org` hiccups we keep seeing in the
18+
/// nightly STM32 acceptance gate.
19+
const MAX_ATTEMPTS: u32 = 3;
20+
21+
/// Per-attempt backoff sleeps: 1 s before attempt 2, 3 s before
22+
/// attempt 3.
23+
const RETRY_BACKOFFS: &[Duration] = &[Duration::from_secs(1), Duration::from_secs(3)];
24+
25+
/// Classify a `reqwest::Error` as worth retrying — anything that
26+
/// could plausibly succeed on a retry (connect timeout, request /
27+
/// body recv error, server-side 5xx). Deterministic-looking failures
28+
/// (URL parse, 4xx) are NOT retried; they'd just waste time.
29+
fn is_transient(err: &reqwest::Error) -> bool {
30+
if err.is_timeout() || err.is_connect() || err.is_request() || err.is_body() {
31+
return true;
32+
}
33+
if let Some(status) = err.status() {
34+
return status.is_server_error();
35+
}
36+
// No HTTP status, not classified above → most likely a
37+
// network-stack transient (DNS, TLS handshake). Retry.
38+
true
39+
}
40+
1341
fn hex_encode(bytes: &[u8]) -> String {
1442
bytes
1543
.iter()
@@ -60,22 +88,7 @@ pub async fn download_file(url: &str, dest_dir: &Path) -> Result<PathBuf> {
6088
let filename = url.rsplit('/').next().unwrap_or("download").to_string();
6189
let dest_path = dest_dir.join(&filename);
6290

63-
let response = reqwest::get(url)
64-
.await
65-
.map_err(|e| FbuildError::PackageError(format!("failed to download {}: {}", url, e)))?;
66-
67-
if !response.status().is_success() {
68-
return Err(FbuildError::PackageError(format!(
69-
"download failed for {}: HTTP {}",
70-
url,
71-
response.status()
72-
)));
73-
}
74-
75-
let bytes = response
76-
.bytes()
77-
.await
78-
.map_err(|e| FbuildError::PackageError(format!("failed to read response body: {}", e)))?;
91+
let bytes = get_with_retry(url).await?;
7992

8093
tokio::fs::write(&dest_path, &bytes).await.map_err(|e| {
8194
FbuildError::PackageError(format!(
@@ -89,6 +102,132 @@ pub async fn download_file(url: &str, dest_dir: &Path) -> Result<PathBuf> {
89102
Ok(dest_path)
90103
}
91104

105+
/// GET `url` and return the body bytes, retrying transient failures
106+
/// up to [`MAX_ATTEMPTS`] times with [`RETRY_BACKOFFS`] between
107+
/// attempts. A non-2xx HTTP status is treated as a hard failure
108+
/// (only server-side 5xx is retried).
109+
async fn get_with_retry(url: &str) -> Result<Vec<u8>> {
110+
let mut attempt: u32 = 0;
111+
loop {
112+
attempt += 1;
113+
match reqwest::get(url).await {
114+
Ok(response) => {
115+
let status = response.status();
116+
if !status.is_success() {
117+
// 4xx is deterministic, 5xx is retryable.
118+
if status.is_server_error() && attempt < MAX_ATTEMPTS {
119+
let sleep = RETRY_BACKOFFS
120+
.get(attempt as usize - 1)
121+
.copied()
122+
.unwrap_or(Duration::from_secs(5));
123+
tracing::warn!(
124+
"download {}: HTTP {} on attempt {}/{}, retrying after {:?}",
125+
url,
126+
status,
127+
attempt,
128+
MAX_ATTEMPTS,
129+
sleep
130+
);
131+
tokio::time::sleep(sleep).await;
132+
continue;
133+
}
134+
return Err(FbuildError::PackageError(format!(
135+
"download failed for {}: HTTP {}",
136+
url, status
137+
)));
138+
}
139+
return response.bytes().await.map(|b| b.to_vec()).map_err(|e| {
140+
FbuildError::PackageError(format!("failed to read response body: {}", e))
141+
});
142+
}
143+
Err(e) => {
144+
if is_transient(&e) && attempt < MAX_ATTEMPTS {
145+
let sleep = RETRY_BACKOFFS
146+
.get(attempt as usize - 1)
147+
.copied()
148+
.unwrap_or(Duration::from_secs(5));
149+
tracing::warn!(
150+
"download {}: transient error on attempt {}/{} ({}), retrying after {:?}",
151+
url,
152+
attempt,
153+
MAX_ATTEMPTS,
154+
e,
155+
sleep
156+
);
157+
tokio::time::sleep(sleep).await;
158+
continue;
159+
}
160+
return Err(FbuildError::PackageError(format!(
161+
"failed to download {}: {}",
162+
url, e
163+
)));
164+
}
165+
}
166+
}
167+
}
168+
169+
/// GET `url` and return the `Response` for streaming, retrying
170+
/// transient failures on the initial fetch. Once the response body
171+
/// has started streaming, errors are terminal (we don't restart
172+
/// large downloads from byte 0 — that's a bigger lever than the
173+
/// transient `dl.registry.platformio.org` errors call for).
174+
async fn open_with_retry(url: &str) -> Result<reqwest::Response> {
175+
let mut attempt: u32 = 0;
176+
loop {
177+
attempt += 1;
178+
match reqwest::get(url).await {
179+
Ok(response) => {
180+
let status = response.status();
181+
if status.is_success() {
182+
return Ok(response);
183+
}
184+
if status.is_server_error() && attempt < MAX_ATTEMPTS {
185+
let sleep = RETRY_BACKOFFS
186+
.get(attempt as usize - 1)
187+
.copied()
188+
.unwrap_or(Duration::from_secs(5));
189+
tracing::warn!(
190+
"download {}: HTTP {} on attempt {}/{}, retrying after {:?}",
191+
url,
192+
status,
193+
attempt,
194+
MAX_ATTEMPTS,
195+
sleep
196+
);
197+
tokio::time::sleep(sleep).await;
198+
continue;
199+
}
200+
return Err(FbuildError::PackageError(format!(
201+
"download failed for {}: HTTP {}",
202+
url, status
203+
)));
204+
}
205+
Err(e) => {
206+
if is_transient(&e) && attempt < MAX_ATTEMPTS {
207+
let sleep = RETRY_BACKOFFS
208+
.get(attempt as usize - 1)
209+
.copied()
210+
.unwrap_or(Duration::from_secs(5));
211+
tracing::warn!(
212+
"download {}: transient error on attempt {}/{} ({}), retrying after {:?}",
213+
url,
214+
attempt,
215+
MAX_ATTEMPTS,
216+
e,
217+
sleep
218+
);
219+
tokio::time::sleep(sleep).await;
220+
continue;
221+
}
222+
return Err(FbuildError::PackageError(format!(
223+
"failed to download {}: {}",
224+
url, e
225+
)));
226+
}
227+
}
228+
}
229+
}
230+
92231
/// Download a file with streaming progress reporting.
93232
///
94233
/// The `on_progress` callback is called periodically during the download
@@ -101,17 +240,7 @@ pub async fn download_file_with_progress(
101240
let filename = url.rsplit('/').next().unwrap_or("download").to_string();
102241
let dest_path = dest_dir.join(&filename);
103242

104-
let response = reqwest::get(url)
105-
.await
106-
.map_err(|e| FbuildError::PackageError(format!("failed to download {}: {}", url, e)))?;
107-
108-
if !response.status().is_success() {
109-
return Err(FbuildError::PackageError(format!(
110-
"download failed for {}: HTTP {}",
111-
url,
112-
response.status()
113-
)));
114-
}
243+
let response = open_with_retry(url).await?;
115244

116245
let total_bytes = response.content_length();
117246
let mut downloaded: u64 = 0;
@@ -263,6 +392,99 @@ mod tests {
263392
.contains("checksum mismatch"));
264393
}
265394

395+
// ---- transient-retry tests ----
396+
397+
/// Stand up a tiny raw-TCP HTTP server on a loopback port. Reads
398+
/// one request, drops the body, writes whatever 4-line HTTP
399+
/// response the caller queued for that attempt, and closes the
400+
/// connection. The caller pre-queues a Vec of responses, one per
401+
/// attempt; the server pops the next one as each connection
402+
/// comes in. Keeps the deps to tokio (already required).
403+
async fn run_flaky_server(
404+
responses: std::sync::Arc<std::sync::Mutex<Vec<&'static str>>>,
405+
) -> u16 {
406+
use tokio::io::{AsyncReadExt, AsyncWriteExt};
407+
use tokio::net::TcpListener;
408+
409+
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
410+
let port = listener.local_addr().unwrap().port();
411+
412+
tokio::spawn(async move {
413+
loop {
414+
let (mut stream, _) = match listener.accept().await {
415+
Ok(p) => p,
416+
Err(_) => break,
417+
};
418+
let resp = {
419+
let mut guard = responses.lock().unwrap();
420+
if guard.is_empty() {
421+
break;
422+
}
423+
guard.remove(0)
424+
};
425+
let mut buf = [0u8; 1024];
426+
// Read just the request headers — don't care about the
427+
// body for these tests.
428+
let _ =
429+
tokio::time::timeout(Duration::from_millis(200), stream.read(&mut buf)).await;
430+
let _ = stream.write_all(resp.as_bytes()).await;
431+
let _ = stream.shutdown().await;
432+
}
433+
});
434+
port
435+
}
436+
437+
/// #205 nightly STM32 acceptance gate started flaking on
438+
/// `dl.registry.platformio.org` transient errors. A 5xx must
439+
/// trigger a retry, and the retry must succeed.
440+
#[tokio::test]
441+
async fn get_with_retry_retries_on_5xx() {
442+
let responses = std::sync::Arc::new(std::sync::Mutex::new(vec![
443+
"HTTP/1.1 503 Service Unavailable\r\nContent-Length: 0\r\n\r\n",
444+
"HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello",
445+
]));
446+
let port = run_flaky_server(responses.clone()).await;
447+
let url = format!("http://127.0.0.1:{port}/file");
448+
let bytes = get_with_retry(&url).await.expect("retry should succeed");
449+
assert_eq!(bytes, b"hello");
450+
}
451+
452+
/// 4xx is deterministic — it must NOT retry. The test queues a
453+
/// single 404; if the implementation retried we'd hit the server's
454+
/// empty-queue branch and the test would hang or panic.
455+
#[tokio::test]
456+
async fn get_with_retry_does_not_retry_on_4xx() {
457+
let responses = std::sync::Arc::new(std::sync::Mutex::new(vec![
458+
"HTTP/1.1 404 Not Found\r\nContent-Length: 0\r\n\r\n",
459+
]));
460+
let port = run_flaky_server(responses.clone()).await;
461+
let url = format!("http://127.0.0.1:{port}/missing");
462+
let err = get_with_retry(&url).await.expect_err("should error");
463+
assert!(
464+
err.to_string().contains("404"),
465+
"expected 404 in error, got: {err}"
466+
);
467+
}
468+
469+
/// Repeated 5xx exhausts the budget and surfaces the last
470+
/// response.
471+
#[tokio::test]
472+
async fn get_with_retry_gives_up_after_max_attempts() {
473+
let responses = std::sync::Arc::new(std::sync::Mutex::new(vec![
474+
"HTTP/1.1 500 Internal Server Error\r\nContent-Length: 0\r\n\r\n",
475+
"HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\n\r\n",
476+
"HTTP/1.1 503 Service Unavailable\r\nContent-Length: 0\r\n\r\n",
477+
]));
478+
let port = run_flaky_server(responses.clone()).await;
479+
let url = format!("http://127.0.0.1:{port}/file");
480+
let err = get_with_retry(&url).await.expect_err("should give up");
481+
// Last attempt was a 503; that's what gets surfaced.
482+
assert!(
483+
err.to_string().contains("503"),
484+
"expected last-attempt 503 in error, got: {err}"
485+
);
486+
}
487+
266488
#[test]
267489
fn format_download_progress_with_total() {
268490
let p = DownloadProgress {

0 commit comments

Comments
 (0)