Skip to content

Commit 8577d92

Browse files
committed
unify download management for benchmarks
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 91b4c75 commit 8577d92

8 files changed

Lines changed: 167 additions & 143 deletions

File tree

vortex-bench/src/clickbench/benchmark.rs

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ use std::path::Path;
77

88
use anyhow::Result;
99
use url::Url;
10-
use vortex::error::VortexExpect;
1110

1211
use crate::Benchmark;
1312
use crate::BenchmarkDataset;
1413
use crate::IdempotentPath;
1514
use crate::TableSpec;
1615
use crate::clickbench::*;
16+
use crate::utils::file::resolve_data_url;
1717

1818
/// ClickBench benchmark implementation
1919
pub struct ClickBenchBenchmark {
@@ -37,31 +37,7 @@ impl ClickBenchBenchmark {
3737
}
3838

3939
fn create_data_url(remote_data_dir: &Option<String>, flavor: Flavor) -> Result<Url> {
40-
match remote_data_dir {
41-
None => {
42-
let basepath = format!("clickbench_{flavor}").to_data_path();
43-
Ok(Url::parse(&format!(
44-
"file:{}/",
45-
basepath.to_str().vortex_expect("path should be utf8")
46-
))?)
47-
}
48-
Some(remote_data_dir) => {
49-
if !remote_data_dir.ends_with("/") {
50-
tracing::warn!(
51-
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/parquet/"
52-
);
53-
}
54-
tracing::info!(
55-
concat!(
56-
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
57-
"If it does not, you should kill this command, locally generate the files (by running without\n",
58-
"--use-remote-data-dir) and upload data/clickbench/ to some remote location.",
59-
),
60-
remote_data_dir,
61-
);
62-
Ok(Url::parse(remote_data_dir)?)
63-
}
64-
}
40+
resolve_data_url(remote_data_dir.as_deref(), &format!("clickbench_{flavor}"))
6541
}
6642
}
6743

vortex-bench/src/clickbench/data.rs

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ use arrow_schema::TimeUnit;
1414
use clap::ValueEnum;
1515
use serde::Deserialize;
1616
use serde::Serialize;
17-
use tokio::task::JoinSet;
1817
use tracing::info;
1918
use vortex::error::VortexExpect;
2019

2120
use crate::Format;
2221
// Re-export for use by clickbench_benchmark
2322
pub use crate::conversions::convert_parquet_directory_to_vortex;
23+
use crate::datasets::data_downloads::DEFAULT_DOWNLOAD_CONCURRENCY;
2424
use crate::datasets::data_downloads::download_data;
25+
use crate::datasets::data_downloads::download_many;
2526

2627
pub static HITS_SCHEMA: LazyLock<Schema> = LazyLock::new(|| {
2728
use DataType::*;
@@ -193,18 +194,14 @@ impl Flavor {
193194
Flavor::Partitioned => {
194195
// The clickbench-provided file is missing some higher-level type info, so we reprocess it
195196
// to add that info, see https://github.com/ClickHouse/ClickBench/issues/7.
196-
197-
let mut tasks = (0_u32..100).map(|idx| {
198-
let output_path = basepath.join(Format::Parquet.name()).join(format!("hits_{idx}.parquet"));
199-
200-
info!("Downloading file {idx}");
197+
info!("Downloading 100 ClickBench parquet shards");
198+
let parquet_dir = basepath.join(Format::Parquet.name());
199+
let downloads = (0_u32..100).map(|idx| {
200+
let output_path = parquet_dir.join(format!("hits_{idx}.parquet"));
201201
let url = format!("https://pub-3ba949c0f0354ac18db1f0f14f0a2c52.r2.dev/clickbench/parquet_many/hits_{idx}.parquet");
202-
download_data(output_path, url)
203-
}).collect::<JoinSet<_>>();
204-
205-
while let Some(task) = tasks.join_next().await {
206-
task??;
207-
}
202+
(output_path, url)
203+
});
204+
download_many(downloads, DEFAULT_DOWNLOAD_CONCURRENCY).await?;
208205
}
209206
}
210207
Ok(())

vortex-bench/src/datasets/data_downloads.rs

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@ use std::fs::File;
55
use std::io::Read;
66
use std::io::Write;
77
use std::path::PathBuf;
8+
use std::sync::LazyLock;
89
use std::time::Duration;
910

1011
use anyhow::Context;
1112
use anyhow::Error;
1213
use anyhow::Result;
1314
use bzip2::read::BzDecoder;
1415
use futures::StreamExt;
16+
use futures::stream;
1517
use indicatif::ProgressBar;
1618
use indicatif::ProgressStyle;
1719
use parking_lot::RwLock;
@@ -25,6 +27,29 @@ use tracing::warn;
2527
use crate::utils::file::idempotent;
2628
use crate::utils::file::idempotent_async;
2729

30+
/// Default concurrency limit for bulk downloads. Keeps us polite to the upstream while still
31+
/// saturating a typical 10 Gb link on a parquet-per-shard benchmark.
32+
pub const DEFAULT_DOWNLOAD_CONCURRENCY: usize = 16;
33+
34+
/// Shared HTTP client used by every dataset download.
35+
///
36+
/// Reusing a single client gives us connection pooling, DNS caching, and consistent timeouts
37+
/// across all callers. Each benchmark used to build its own `reqwest::Client` on every download,
38+
/// which both wasted TLS handshakes and made it hard to reason about total in-flight concurrency.
39+
static HTTP_CLIENT: LazyLock<Client> = LazyLock::new(|| {
40+
Client::builder()
41+
.read_timeout(Duration::from_secs(60))
42+
.timeout(Duration::from_secs(60 * 15))
43+
.build()
44+
.expect("failed to build shared benchmark HTTP client")
45+
});
46+
47+
/// Access the shared HTTP client. Exposed for callers that need custom request shapes
48+
/// (e.g. streaming VCF parsing) while still benefitting from pooled connections.
49+
pub fn http_client() -> &'static Client {
50+
&HTTP_CLIENT
51+
}
52+
2853
async fn retry_get<F: Future<Output = Result<Response>>, R: Fn() -> F>(
2954
make_req: R,
3055
tmp_path: PathBuf,
@@ -97,12 +122,13 @@ async fn retry_get<F: Future<Output = Result<Response>>, R: Fn() -> F>(
97122
Err(last_err.unwrap_or_else(|| anyhow::anyhow!("retry_get exhausted with no recorded error")))
98123
}
99124

125+
/// Idempotently download a single URL to `fname`.
126+
///
127+
/// Uses the shared HTTP client, a 3-attempt exponential backoff retry loop, and an `indicatif`
128+
/// progress bar. If `fname` already exists, the download is skipped.
129+
#[tracing::instrument(skip_all, fields(url = %data_url.as_ref(), path = %fname.display()))]
100130
pub async fn download_data(fname: PathBuf, data_url: impl AsRef<str>) -> Result<PathBuf> {
101-
let client = Client::builder()
102-
.read_timeout(Duration::from_secs(60))
103-
.timeout(Duration::from_secs(60 * 15))
104-
.build()
105-
.context("Failed to build HTTP client")?;
131+
let client = http_client();
106132

107133
idempotent_async(&fname, async |path| {
108134
let url = data_url.as_ref();
@@ -123,6 +149,64 @@ pub async fn download_data(fname: PathBuf, data_url: impl AsRef<str>) -> Result<
123149
.await
124150
}
125151

152+
/// Idempotently download many `(path, url)` pairs with bounded parallelism.
153+
///
154+
/// This is the preferred way to fetch multi-shard datasets (ClickBench partitioned, vector
155+
/// dataset train shards, Public BI tables, etc.) because it:
156+
///
157+
/// - caps in-flight HTTP requests at `max_concurrency` so we don't overwhelm the upstream
158+
/// or our own network stack,
159+
/// - reuses the shared HTTP client across every shard,
160+
/// - short-circuits on the first error (the remaining in-flight downloads are dropped when
161+
/// the returned future is dropped),
162+
/// - returns the resolved on-disk paths in the same order they were submitted.
163+
///
164+
/// Pass `0` as `max_concurrency` to use [`DEFAULT_DOWNLOAD_CONCURRENCY`].
165+
#[tracing::instrument(skip_all, fields(count = tracing::field::Empty, max_concurrency))]
166+
pub async fn download_many<I>(downloads: I, max_concurrency: usize) -> Result<Vec<PathBuf>>
167+
where
168+
I: IntoIterator,
169+
I::Item: IntoDownload,
170+
{
171+
let downloads: Vec<(PathBuf, String)> = downloads
172+
.into_iter()
173+
.map(IntoDownload::into_download)
174+
.collect();
175+
tracing::Span::current().record("count", downloads.len());
176+
177+
let concurrency = if max_concurrency == 0 {
178+
DEFAULT_DOWNLOAD_CONCURRENCY
179+
} else {
180+
max_concurrency
181+
};
182+
183+
let results: Vec<Result<PathBuf>> = stream::iter(downloads)
184+
.map(|(path, url)| async move { download_data(path, url).await })
185+
.buffered(concurrency)
186+
.collect()
187+
.await;
188+
189+
results.into_iter().collect()
190+
}
191+
192+
/// Anything that can be described as a `(target_path, url)` pair accepted by [`download_many`].
193+
pub trait IntoDownload {
194+
fn into_download(self) -> (PathBuf, String);
195+
}
196+
197+
impl IntoDownload for (PathBuf, String) {
198+
fn into_download(self) -> (PathBuf, String) {
199+
self
200+
}
201+
}
202+
203+
impl IntoDownload for (PathBuf, &str) {
204+
fn into_download(self) -> (PathBuf, String) {
205+
(self.0, self.1.to_owned())
206+
}
207+
}
208+
209+
#[tracing::instrument(skip_all, fields(input = %input_path.display(), output = %output_path.display()))]
126210
pub fn decompress_bz2(input_path: PathBuf, output_path: PathBuf) -> Result<PathBuf> {
127211
idempotent(&output_path, |path| {
128212
info!(

vortex-bench/src/fineweb/mod.rs

Lines changed: 4 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@
33

44
use std::path::PathBuf;
55

6-
use tokio::io::AsyncWriteExt;
76
use tracing::info;
87
use url::Url;
98

109
use crate::Benchmark;
1110
use crate::BenchmarkDataset;
12-
use crate::IdempotentPath;
1311
use crate::TableSpec;
14-
use crate::idempotent_async;
12+
use crate::datasets::data_downloads::download_data;
13+
use crate::utils::file::resolve_data_url;
1514

1615
/// URL to the sample file
1716
const SAMPLE_URL: &str = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/v1.4.0/sample/10BT/001_00000.parquet";
@@ -56,30 +55,7 @@ impl FinewebBenchmark {
5655
}
5756

5857
fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> {
59-
match remote_data_dir {
60-
None => {
61-
let data_dir = "fineweb".to_data_path();
62-
Url::from_directory_path(&data_dir).map_err(|_| {
63-
anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir)
64-
})
65-
}
66-
Some(remote_data_dir) => {
67-
if !remote_data_dir.ends_with("/") {
68-
tracing::warn!(
69-
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/fineweb/"
70-
);
71-
}
72-
tracing::info!(
73-
concat!(
74-
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
75-
"If it does not, you should kill this command, locally generate the files (by running without\n",
76-
"--use-remote-data-dir) and upload data/fineweb/ to some remote location.",
77-
),
78-
remote_data_dir,
79-
);
80-
Ok(Url::parse(remote_data_dir)?)
81-
}
82-
}
58+
resolve_data_url(remote_data_dir.as_deref(), "fineweb")
8359
}
8460
}
8561

@@ -104,27 +80,7 @@ impl Benchmark for FinewebBenchmark {
10480
return Ok(());
10581
}
10682

107-
let parquet = idempotent_async(&self.parquet_path()?, |parquet_path| async move {
108-
info!("Downloading FineWeb Parquet source from HuggingFace");
109-
110-
let response = reqwest::get(SAMPLE_URL)
111-
.await?
112-
.error_for_status()
113-
.map_err(|err| {
114-
anyhow::anyhow!("error fetching fineweb sample from HuggingFace: {err}")
115-
})?;
116-
117-
let bytes = response.bytes().await?;
118-
let mut w = tokio::fs::File::create(parquet_path).await?;
119-
120-
w.write_all(&bytes).await?;
121-
122-
w.flush().await?;
123-
124-
Ok(())
125-
})
126-
.await?;
127-
83+
let parquet = download_data(self.parquet_path()?, SAMPLE_URL).await?;
12884
info!("fineweb base data generated in {}", parquet.display());
12985

13086
Ok(())

vortex-bench/src/public_bi.rs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,9 @@ use crate::SESSION;
4242
use crate::TableSpec;
4343
use crate::conversions::parquet_to_vortex_chunks;
4444
use crate::datasets::Dataset;
45+
use crate::datasets::data_downloads::DEFAULT_DOWNLOAD_CONCURRENCY;
4546
use crate::datasets::data_downloads::decompress_bz2;
46-
use crate::datasets::data_downloads::download_data;
47+
use crate::datasets::data_downloads::download_many;
4748
use crate::idempotent_async;
4849
use crate::workspace_root;
4950

@@ -289,16 +290,13 @@ pub struct PBIData {
289290

290291
impl PBIData {
291292
async fn download_bzips(&self) -> anyhow::Result<()> {
292-
let download_futures = self.tables.iter().map(|table| {
293-
download_data(
293+
let downloads = self.tables.iter().map(|table| {
294+
(
294295
self.get_file_path(&table.name, FileType::CsvBzip2),
295-
table.data_url.as_str(),
296+
table.data_url.as_str().to_owned(),
296297
)
297298
});
298-
let results = join_all(download_futures).await;
299-
for result in results {
300-
result?;
301-
}
299+
download_many(downloads, DEFAULT_DOWNLOAD_CONCURRENCY).await?;
302300
Ok(())
303301
}
304302

vortex-bench/src/realnest/gharchive.rs

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ use url::Url;
1414

1515
use crate::Benchmark;
1616
use crate::BenchmarkDataset;
17-
use crate::IdempotentPath;
1817
use crate::TableSpec;
1918
use crate::idempotent;
2019
use crate::idempotent_async;
20+
use crate::utils::file::resolve_data_url;
2121

2222
/// Template URL for raw JSON dataset
2323
fn raw_json_url(hour: usize) -> String {
@@ -48,30 +48,7 @@ impl GithubArchiveBenchmark {
4848
}
4949

5050
fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> {
51-
match remote_data_dir {
52-
None => {
53-
let data_dir = "gharchive".to_data_path();
54-
Url::from_directory_path(&data_dir).map_err(|_| {
55-
anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir)
56-
})
57-
}
58-
Some(remote_data_dir) => {
59-
if !remote_data_dir.ends_with("/") {
60-
tracing::warn!(
61-
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/gharchive/"
62-
);
63-
}
64-
tracing::info!(
65-
concat!(
66-
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
67-
"If it does not, you should kill this command, locally generate the files (by running without\n",
68-
"--use-remote-data-dir) and upload data/gharchive/ to some remote location.",
69-
),
70-
remote_data_dir,
71-
);
72-
Ok(Url::parse(remote_data_dir)?)
73-
}
74-
}
51+
resolve_data_url(remote_data_dir.as_deref(), "gharchive")
7552
}
7653
}
7754

0 commit comments

Comments (0)