55
66use std:: fmt:: Write ;
77use std:: path:: { Path , PathBuf } ;
8- use std:: time:: Instant ;
8+ use std:: time:: { Duration , Instant } ;
99
1010use fbuild_core:: { FbuildError , Result } ;
1111use sha2:: { Digest , Sha256 } ;
1212
13+ /// Number of GET attempts before giving up on a transient failure.
14+ /// One initial attempt + two retries. Worst-case wall time at the
15+ /// default backoff schedule is ~4 s of sleep before the third
16+ /// attempt — barely registers on a healthy run, large enough to ride
17+ /// out the `dl.registry.platformio.org` hiccups we keep seeing in the
18+ /// nightly STM32 acceptance gate.
19+ const MAX_ATTEMPTS : u32 = 3 ;
20+
21+ /// Per-attempt backoff sleeps: 1 s before attempt 2, 3 s before
22+ /// attempt 3.
23+ const RETRY_BACKOFFS : & [ Duration ] = & [ Duration :: from_secs ( 1 ) , Duration :: from_secs ( 3 ) ] ;
24+
25+ /// Classify a `reqwest::Error` as worth retrying — anything that
26+ /// could plausibly succeed on a retry (connect timeout, request /
27+ /// body recv error, server-side 5xx). Deterministic-looking failures
28+ /// (URL parse, 4xx) are NOT retried; they'd just waste time.
29+ fn is_transient ( err : & reqwest:: Error ) -> bool {
30+ if err. is_timeout ( ) || err. is_connect ( ) || err. is_request ( ) || err. is_body ( ) {
31+ return true ;
32+ }
33+ if let Some ( status) = err. status ( ) {
34+ return status. is_server_error ( ) ;
35+ }
36+ // No HTTP status, not classified above → most likely a
37+ // network-stack transient (DNS, TLS handshake). Retry.
38+ true
39+ }
40+
1341fn hex_encode ( bytes : & [ u8 ] ) -> String {
1442 bytes
1543 . iter ( )
@@ -60,22 +88,7 @@ pub async fn download_file(url: &str, dest_dir: &Path) -> Result<PathBuf> {
6088 let filename = url. rsplit ( '/' ) . next ( ) . unwrap_or ( "download" ) . to_string ( ) ;
6189 let dest_path = dest_dir. join ( & filename) ;
6290
63- let response = reqwest:: get ( url)
64- . await
65- . map_err ( |e| FbuildError :: PackageError ( format ! ( "failed to download {}: {}" , url, e) ) ) ?;
66-
67- if !response. status ( ) . is_success ( ) {
68- return Err ( FbuildError :: PackageError ( format ! (
69- "download failed for {}: HTTP {}" ,
70- url,
71- response. status( )
72- ) ) ) ;
73- }
74-
75- let bytes = response
76- . bytes ( )
77- . await
78- . map_err ( |e| FbuildError :: PackageError ( format ! ( "failed to read response body: {}" , e) ) ) ?;
91+ let bytes = get_with_retry ( url) . await ?;
7992
8093 tokio:: fs:: write ( & dest_path, & bytes) . await . map_err ( |e| {
8194 FbuildError :: PackageError ( format ! (
@@ -89,6 +102,132 @@ pub async fn download_file(url: &str, dest_dir: &Path) -> Result<PathBuf> {
89102 Ok ( dest_path)
90103}
91104
105+ /// GET `url` and return the body bytes, retrying transient failures
106+ /// up to [`MAX_ATTEMPTS`] times with [`RETRY_BACKOFFS`] between
107+ /// attempts. A non-2xx HTTP status is treated as a hard failure
108+ /// (only server-side 5xx is retried).
109+ async fn get_with_retry ( url : & str ) -> Result < Vec < u8 > > {
110+ let mut attempt: u32 = 0 ;
111+ loop {
112+ attempt += 1 ;
113+ match reqwest:: get ( url) . await {
114+ Ok ( response) => {
115+ let status = response. status ( ) ;
116+ if !status. is_success ( ) {
117+ // 4xx is deterministic, 5xx is retryable.
118+ if status. is_server_error ( ) && attempt < MAX_ATTEMPTS {
119+ let sleep = RETRY_BACKOFFS
120+ . get ( attempt as usize - 1 )
121+ . copied ( )
122+ . unwrap_or ( Duration :: from_secs ( 5 ) ) ;
123+ tracing:: warn!(
124+ "download {}: HTTP {} on attempt {}/{}, retrying after {:?}" ,
125+ url,
126+ status,
127+ attempt,
128+ MAX_ATTEMPTS ,
129+ sleep
130+ ) ;
131+ tokio:: time:: sleep ( sleep) . await ;
132+ continue ;
133+ }
134+ return Err ( FbuildError :: PackageError ( format ! (
135+ "download failed for {}: HTTP {}" ,
136+ url, status
137+ ) ) ) ;
138+ }
139+ return response. bytes ( ) . await . map ( |b| b. to_vec ( ) ) . map_err ( |e| {
140+ FbuildError :: PackageError ( format ! ( "failed to read response body: {}" , e) )
141+ } ) ;
142+ }
143+ Err ( e) => {
144+ if is_transient ( & e) && attempt < MAX_ATTEMPTS {
145+ let sleep = RETRY_BACKOFFS
146+ . get ( attempt as usize - 1 )
147+ . copied ( )
148+ . unwrap_or ( Duration :: from_secs ( 5 ) ) ;
149+ tracing:: warn!(
150+ "download {}: transient error on attempt {}/{} ({}), retrying after {:?}" ,
151+ url,
152+ attempt,
153+ MAX_ATTEMPTS ,
154+ e,
155+ sleep
156+ ) ;
157+ tokio:: time:: sleep ( sleep) . await ;
158+ continue ;
159+ }
160+ return Err ( FbuildError :: PackageError ( format ! (
161+ "failed to download {}: {}" ,
162+ url, e
163+ ) ) ) ;
164+ }
165+ }
166+ }
167+ }
168+
169+ /// GET `url` and return the `Response` for streaming, retrying
170+ /// transient failures on the initial fetch. Once the response body
171+ /// has started streaming, errors are terminal (we don't restart
172+ /// large downloads from byte 0 — that's a bigger lever than the
173+ /// transient `dl.registry.platformio.org` errors call for).
174+ async fn open_with_retry ( url : & str ) -> Result < reqwest:: Response > {
175+ let mut attempt: u32 = 0 ;
176+ loop {
177+ attempt += 1 ;
178+ match reqwest:: get ( url) . await {
179+ Ok ( response) => {
180+ let status = response. status ( ) ;
181+ if status. is_success ( ) {
182+ return Ok ( response) ;
183+ }
184+ if status. is_server_error ( ) && attempt < MAX_ATTEMPTS {
185+ let sleep = RETRY_BACKOFFS
186+ . get ( attempt as usize - 1 )
187+ . copied ( )
188+ . unwrap_or ( Duration :: from_secs ( 5 ) ) ;
189+ tracing:: warn!(
190+ "download {}: HTTP {} on attempt {}/{}, retrying after {:?}" ,
191+ url,
192+ status,
193+ attempt,
194+ MAX_ATTEMPTS ,
195+ sleep
196+ ) ;
197+ tokio:: time:: sleep ( sleep) . await ;
198+ continue ;
199+ }
200+ return Err ( FbuildError :: PackageError ( format ! (
201+ "download failed for {}: HTTP {}" ,
202+ url, status
203+ ) ) ) ;
204+ }
205+ Err ( e) => {
206+ if is_transient ( & e) && attempt < MAX_ATTEMPTS {
207+ let sleep = RETRY_BACKOFFS
208+ . get ( attempt as usize - 1 )
209+ . copied ( )
210+ . unwrap_or ( Duration :: from_secs ( 5 ) ) ;
211+ tracing:: warn!(
212+ "download {}: transient error on attempt {}/{} ({}), retrying after {:?}" ,
213+ url,
214+ attempt,
215+ MAX_ATTEMPTS ,
216+ e,
217+ sleep
218+ ) ;
219+ tokio:: time:: sleep ( sleep) . await ;
220+ continue ;
221+ }
222+ return Err ( FbuildError :: PackageError ( format ! (
223+ "failed to download {}: {}" ,
224+ url, e
225+ ) ) ) ;
226+ }
227+ }
228+ }
229+ }
230+
92231/// Download a file with streaming progress reporting.
93232///
94233/// The `on_progress` callback is called periodically during the download
@@ -101,17 +240,7 @@ pub async fn download_file_with_progress(
101240 let filename = url. rsplit ( '/' ) . next ( ) . unwrap_or ( "download" ) . to_string ( ) ;
102241 let dest_path = dest_dir. join ( & filename) ;
103242
104- let response = reqwest:: get ( url)
105- . await
106- . map_err ( |e| FbuildError :: PackageError ( format ! ( "failed to download {}: {}" , url, e) ) ) ?;
107-
108- if !response. status ( ) . is_success ( ) {
109- return Err ( FbuildError :: PackageError ( format ! (
110- "download failed for {}: HTTP {}" ,
111- url,
112- response. status( )
113- ) ) ) ;
114- }
243+ let response = open_with_retry ( url) . await ?;
115244
116245 let total_bytes = response. content_length ( ) ;
117246 let mut downloaded: u64 = 0 ;
@@ -263,6 +392,99 @@ mod tests {
263392 . contains( "checksum mismatch" ) ) ;
264393 }
265394
395+ // ---- transient-retry tests ----
396+
397+ /// Stand up a tiny raw-TCP HTTP server on a loopback port. Reads
398+ /// one request, drops the body, writes whatever 4-line HTTP
399+ /// response the caller queued for that attempt, and closes the
400+ /// connection. The caller pre-queues a Vec of responses, one per
401+ /// attempt; the server pops the next one as each connection
402+ /// comes in. Keeps the deps to tokio (already required).
403+ async fn run_flaky_server (
404+ responses : std:: sync:: Arc < std:: sync:: Mutex < Vec < & ' static str > > > ,
405+ ) -> u16 {
406+ use tokio:: io:: { AsyncReadExt , AsyncWriteExt } ;
407+ use tokio:: net:: TcpListener ;
408+
409+ let listener = TcpListener :: bind ( "127.0.0.1:0" ) . await . unwrap ( ) ;
410+ let port = listener. local_addr ( ) . unwrap ( ) . port ( ) ;
411+
412+ tokio:: spawn ( async move {
413+ loop {
414+ let ( mut stream, _) = match listener. accept ( ) . await {
415+ Ok ( p) => p,
416+ Err ( _) => break ,
417+ } ;
418+ let resp = {
419+ let mut guard = responses. lock ( ) . unwrap ( ) ;
420+ if guard. is_empty ( ) {
421+ break ;
422+ }
423+ guard. remove ( 0 )
424+ } ;
425+ let mut buf = [ 0u8 ; 1024 ] ;
426+ // Read just the request headers — don't care about the
427+ // body for these tests.
428+ let _ =
429+ tokio:: time:: timeout ( Duration :: from_millis ( 200 ) , stream. read ( & mut buf) ) . await ;
430+ let _ = stream. write_all ( resp. as_bytes ( ) ) . await ;
431+ let _ = stream. shutdown ( ) . await ;
432+ }
433+ } ) ;
434+ port
435+ }
436+
437+ /// #205 nightly STM32 acceptance gate started flaking on
438+ /// `dl.registry.platformio.org` transient errors. A 5xx must
439+ /// trigger a retry, and the retry must succeed.
440+ #[ tokio:: test]
441+ async fn get_with_retry_retries_on_5xx ( ) {
442+ let responses = std:: sync:: Arc :: new ( std:: sync:: Mutex :: new ( vec ! [
443+ "HTTP/1.1 503 Service Unavailable\r \n Content-Length: 0\r \n \r \n " ,
444+ "HTTP/1.1 200 OK\r \n Content-Length: 5\r \n \r \n hello" ,
445+ ] ) ) ;
446+ let port = run_flaky_server ( responses. clone ( ) ) . await ;
447+ let url = format ! ( "http://127.0.0.1:{port}/file" ) ;
448+ let bytes = get_with_retry ( & url) . await . expect ( "retry should succeed" ) ;
449+ assert_eq ! ( bytes, b"hello" ) ;
450+ }
451+
452+ /// 4xx is deterministic — it must NOT retry. The test queues a
453+ /// single 404; if the implementation retried we'd hit the server's
454+ /// empty-queue branch and the test would hang or panic.
455+ #[ tokio:: test]
456+ async fn get_with_retry_does_not_retry_on_4xx ( ) {
457+ let responses = std:: sync:: Arc :: new ( std:: sync:: Mutex :: new ( vec ! [
458+ "HTTP/1.1 404 Not Found\r \n Content-Length: 0\r \n \r \n " ,
459+ ] ) ) ;
460+ let port = run_flaky_server ( responses. clone ( ) ) . await ;
461+ let url = format ! ( "http://127.0.0.1:{port}/missing" ) ;
462+ let err = get_with_retry ( & url) . await . expect_err ( "should error" ) ;
463+ assert ! (
464+ err. to_string( ) . contains( "404" ) ,
465+ "expected 404 in error, got: {err}"
466+ ) ;
467+ }
468+
469+ /// Repeated 5xx exhausts the budget and surfaces the last
470+ /// response.
471+ #[ tokio:: test]
472+ async fn get_with_retry_gives_up_after_max_attempts ( ) {
473+ let responses = std:: sync:: Arc :: new ( std:: sync:: Mutex :: new ( vec ! [
474+ "HTTP/1.1 500 Internal Server Error\r \n Content-Length: 0\r \n \r \n " ,
475+ "HTTP/1.1 502 Bad Gateway\r \n Content-Length: 0\r \n \r \n " ,
476+ "HTTP/1.1 503 Service Unavailable\r \n Content-Length: 0\r \n \r \n " ,
477+ ] ) ) ;
478+ let port = run_flaky_server ( responses. clone ( ) ) . await ;
479+ let url = format ! ( "http://127.0.0.1:{port}/file" ) ;
480+ let err = get_with_retry ( & url) . await . expect_err ( "should give up" ) ;
481+ // Last attempt was a 503; that's what gets surfaced.
482+ assert ! (
483+ err. to_string( ) . contains( "503" ) ,
484+ "expected last-attempt 503 in error, got: {err}"
485+ ) ;
486+ }
487+
266488 #[ test]
267489 fn format_download_progress_with_total ( ) {
268490 let p = DownloadProgress {
0 commit comments