@@ -5,13 +5,15 @@ use std::fs::File;
55use std:: io:: Read ;
66use std:: io:: Write ;
77use std:: path:: PathBuf ;
8+ use std:: sync:: LazyLock ;
89use std:: time:: Duration ;
910
1011use anyhow:: Context ;
1112use anyhow:: Error ;
1213use anyhow:: Result ;
1314use bzip2:: read:: BzDecoder ;
1415use futures:: StreamExt ;
16+ use futures:: stream;
1517use indicatif:: ProgressBar ;
1618use indicatif:: ProgressStyle ;
1719use parking_lot:: RwLock ;
@@ -25,6 +27,29 @@ use tracing::warn;
2527use crate :: utils:: file:: idempotent;
2628use crate :: utils:: file:: idempotent_async;
2729
/// Default number of simultaneous transfers used by bulk downloads when the caller
/// passes `0` as the concurrency cap.
///
/// Sixteen in-flight requests keeps us polite to the upstream while still saturating a
/// typical 10 Gb link on a parquet-per-shard benchmark.
pub const DEFAULT_DOWNLOAD_CONCURRENCY: usize = 16;
33+
34+ /// Shared HTTP client used by every dataset download.
35+ ///
36+ /// Reusing a single client gives us connection pooling, DNS caching, and consistent timeouts
37+ /// across all callers. Each benchmark used to build its own `reqwest::Client` on every download,
38+ /// which both wasted TLS handshakes and made it hard to reason about total in-flight concurrency.
39+ static HTTP_CLIENT : LazyLock < Client > = LazyLock :: new ( || {
40+ Client :: builder ( )
41+ . read_timeout ( Duration :: from_secs ( 60 ) )
42+ . timeout ( Duration :: from_secs ( 60 * 15 ) )
43+ . build ( )
44+ . expect ( "failed to build shared benchmark HTTP client" )
45+ } ) ;
46+
47+ /// Access the shared HTTP client. Exposed for callers that need custom request shapes
48+ /// (e.g. streaming VCF parsing) while still benefitting from pooled connections.
49+ pub fn http_client ( ) -> & ' static Client {
50+ & HTTP_CLIENT
51+ }
52+
2853async fn retry_get < F : Future < Output = Result < Response > > , R : Fn ( ) -> F > (
2954 make_req : R ,
3055 tmp_path : PathBuf ,
@@ -97,12 +122,13 @@ async fn retry_get<F: Future<Output = Result<Response>>, R: Fn() -> F>(
97122 Err ( last_err. unwrap_or_else ( || anyhow:: anyhow!( "retry_get exhausted with no recorded error" ) ) )
98123}
99124
125+ /// Idempotently download a single URL to `fname`.
126+ ///
127+ /// Uses the shared HTTP client, a 3-attempt exponential backoff retry loop, and an `indicatif`
128+ /// progress bar. If `fname` already exists, the download is skipped.
129+ #[ tracing:: instrument( skip_all, fields( url = %data_url. as_ref( ) , path = %fname. display( ) ) ) ]
100130pub async fn download_data ( fname : PathBuf , data_url : impl AsRef < str > ) -> Result < PathBuf > {
101- let client = Client :: builder ( )
102- . read_timeout ( Duration :: from_secs ( 60 ) )
103- . timeout ( Duration :: from_secs ( 60 * 15 ) )
104- . build ( )
105- . context ( "Failed to build HTTP client" ) ?;
131+ let client = http_client ( ) ;
106132
107133 idempotent_async ( & fname, async |path| {
108134 let url = data_url. as_ref ( ) ;
@@ -123,6 +149,64 @@ pub async fn download_data(fname: PathBuf, data_url: impl AsRef<str>) -> Result<
123149 . await
124150}
125151
152+ /// Idempotently download many `(path, url)` pairs with bounded parallelism.
153+ ///
154+ /// This is the preferred way to fetch multi-shard datasets (ClickBench partitioned, vector
155+ /// dataset train shards, Public BI tables, etc.) because it:
156+ ///
157+ /// - caps in-flight HTTP requests at `max_concurrency` so we don't overwhelm the upstream
158+ /// or our own network stack,
159+ /// - reuses the shared HTTP client across every shard,
160+ /// - short-circuits on the first error (the remaining in-flight downloads are dropped when
161+ /// the returned future is dropped),
162+ /// - returns the resolved on-disk paths in the same order they were submitted.
163+ ///
164+ /// Pass `0` as `max_concurrency` to use [`DEFAULT_DOWNLOAD_CONCURRENCY`].
165+ #[ tracing:: instrument( skip_all, fields( count = tracing:: field:: Empty , max_concurrency) ) ]
166+ pub async fn download_many < I > ( downloads : I , max_concurrency : usize ) -> Result < Vec < PathBuf > >
167+ where
168+ I : IntoIterator ,
169+ I :: Item : IntoDownload ,
170+ {
171+ let downloads: Vec < ( PathBuf , String ) > = downloads
172+ . into_iter ( )
173+ . map ( IntoDownload :: into_download)
174+ . collect ( ) ;
175+ tracing:: Span :: current ( ) . record ( "count" , downloads. len ( ) ) ;
176+
177+ let concurrency = if max_concurrency == 0 {
178+ DEFAULT_DOWNLOAD_CONCURRENCY
179+ } else {
180+ max_concurrency
181+ } ;
182+
183+ let results: Vec < Result < PathBuf > > = stream:: iter ( downloads)
184+ . map ( |( path, url) | async move { download_data ( path, url) . await } )
185+ . buffered ( concurrency)
186+ . collect ( )
187+ . await ;
188+
189+ results. into_iter ( ) . collect ( )
190+ }
191+
/// Anything that can be described as a `(target_path, url)` pair accepted by [`download_many`].
pub trait IntoDownload {
    /// Consume `self`, yielding the on-disk target path and the source URL.
    fn into_download(self) -> (PathBuf, String);
}

/// Blanket impl over path-like/url-like tuples. Covers the previous concrete impls for
/// `(PathBuf, String)` and `(PathBuf, &str)`, and additionally accepts `(&Path, &str)`,
/// `(String, String)`, and any other combination convertible into the canonical pair.
impl<P: Into<PathBuf>, S: Into<String>> IntoDownload for (P, S) {
    fn into_download(self) -> (PathBuf, String) {
        (self.0.into(), self.1.into())
    }
}
208+
209+ #[ tracing:: instrument( skip_all, fields( input = %input_path. display( ) , output = %output_path. display( ) ) ) ]
126210pub fn decompress_bz2 ( input_path : PathBuf , output_path : PathBuf ) -> Result < PathBuf > {
127211 idempotent ( & output_path, |path| {
128212 info ! (
0 commit comments