apache
diff --git a/‎Cargo.lock‎
Lines changed: 30 additions & 46 deletions b/‎Cargo.lock‎
Lines changed: 30 additions & 46 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 28 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎datafusion-cli/src/main.rs‎
Lines changed: 6 additions & 6 deletions b/‎datafusion-cli/src/main.rs‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎datafusion-examples/examples/data_io/json_shredding.rs‎
Lines changed: 10 additions & 0 deletions b/‎datafusion-examples/examples/data_io/json_shredding.rs‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎datafusion/common/src/config.rs‎
Lines changed: 23 additions & 0 deletions b/‎datafusion/common/src/config.rs‎
Lines changed: 23 additions & 0 deletions
@@ -204,6 +204,34 @@ url = "2.5.7"
 uuid = "1.23"
 zstd = { version = "0.13", default-features = false }
 
+# Override arrow / parquet with the `adaptive-strategy-swap` branch of
+# pydantic's arrow-rs fork, which adds the `swap_strategy` API on
+# `ParquetPushDecoder` that in-decoder adaptive filter scheduling needs.
+#
+# Every arrow-rs crate we depend on is listed so transitive deps (e.g.
+# `arrow-cast` pulled in via `arrow`) resolve to the patched version and
+# we don't link two copies into one binary.
+#
+# NOTE(review): a branch ref is mutable; `Cargo.lock` alone pins the commit.
+# Branch: https://github.com/pydantic/arrow-rs/tree/adaptive-strategy-swap
+[patch.crates-io]
+arrow = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-arith = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-array = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-buffer = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-cast = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-csv = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-data = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-flight = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-ipc = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-json = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-ord = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-row = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-schema = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-select = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+arrow-string = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+parquet = { git = "https://github.com/pydantic/arrow-rs.git", branch = "adaptive-strategy-swap" }
+
 [workspace.lints.clippy]
 # Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml)
 large_futures = "warn"
 
@@ -616,9 +616,9 @@ mod tests {
         +-----------------------------------+-----------------+---------------------+------+------------------+
         | filename                          | file_size_bytes | metadata_size_bytes | hits | extra            |
         +-----------------------------------+-----------------+---------------------+------+------------------+
-        | alltypes_plain.parquet            | 1851            | 8882                | 2    | page_index=false |
-        | alltypes_tiny_pages.parquet       | 454233          | 269074              | 2    | page_index=true  |
-        | lz4_raw_compressed_larger.parquet | 380836          | 1339                | 2    | page_index=false |
+        | alltypes_plain.parquet            | 1851            | 8794                | 2    | page_index=false |
+        | alltypes_tiny_pages.parquet       | 454233          | 268970              | 2    | page_index=true  |
+        | lz4_raw_compressed_larger.parquet | 380836          | 1331                | 2    | page_index=false |
         +-----------------------------------+-----------------+---------------------+------+------------------+
         ");
 
@@ -647,9 +647,9 @@ mod tests {
         +-----------------------------------+-----------------+---------------------+------+------------------+
         | filename                          | file_size_bytes | metadata_size_bytes | hits | extra            |
         +-----------------------------------+-----------------+---------------------+------+------------------+
-        | alltypes_plain.parquet            | 1851            | 8882                | 5    | page_index=false |
-        | alltypes_tiny_pages.parquet       | 454233          | 269074              | 2    | page_index=true  |
-        | lz4_raw_compressed_larger.parquet | 380836          | 1339                | 3    | page_index=false |
+        | alltypes_plain.parquet            | 1851            | 8794                | 5    | page_index=false |
+        | alltypes_tiny_pages.parquet       | 454233          | 268970              | 2    | page_index=true  |
+        | lz4_raw_compressed_larger.parquet | 380836          | 1331                | 3    | page_index=false |
         +-----------------------------------+-----------------+---------------------+------+------------------+
         ");
 
 
@@ -92,6 +92,16 @@ pub async fn json_shredding() -> Result<()> {
     // Set up query execution
     let mut cfg = SessionConfig::new();
     cfg.options_mut().execution.parquet.pushdown_filters = true;
+    // Force every filter to row-level so the example's
+    // `pushdown_rows_pruned=1` assertion is deterministic. The default
+    // adaptive scheduler keeps small-file filters on the post-scan path
+    // (via the byte-ratio heuristic), where `pushdown_rows_pruned` stays
+    // 0; setting `filter_pushdown_min_bytes_per_sec = 0` forces row-level
+    // placement regardless of that heuristic.
+    cfg.options_mut()
+        .execution
+        .parquet
+        .filter_pushdown_min_bytes_per_sec = 0.0;
     let ctx = SessionContext::new_with_config(cfg);
     ctx.runtime_env().register_object_store(
         ObjectStoreUrl::parse("memory://")?.as_ref(),
 
@@ -919,6 +919,29 @@ config_namespace! {
         /// parquet reader setting. 0 means no caching.
         pub max_predicate_cache_size: Option<usize>, default = None
 
+        /// (reading) Minimum throughput, in bytes per second, that an adaptive
+        /// row-level filter must sustain to remain at row-level. Filters that
+        /// drop below this threshold (with statistical confidence — see
+        /// `filter_confidence_z`) are demoted to post-scan, or dropped entirely
+        /// if they were optional (e.g. a hash-join build-side dynamic filter).
+        /// Set to `0` to force every filter to row-level (skip the threshold
+        /// check); set to `f64::INFINITY` to keep every filter post-scan.
+        pub filter_pushdown_min_bytes_per_sec: f64, default = 100.0 * 1024.0 * 1024.0
+
+        /// (reading) Initial-placement heuristic for adaptive filters: when a
+        /// filter is first observed, place it at row-level if its column bytes
+        /// are this fraction or less of the total projection's column bytes.
+        /// Above this ratio, the filter starts as post-scan and only gets
+        /// promoted later if measured throughput crosses
+        /// `filter_pushdown_min_bytes_per_sec`.
+        pub filter_collecting_byte_ratio_threshold: f64, default = 0.20
+
+        /// (reading) Z-score for the one-sided confidence interval the adaptive
+        /// filter scheduler uses when promoting / demoting / dropping filters.
+        /// Default `2.0` (≈ 97.7%) keeps strategy moves conservative; lower the
+        /// value for snappier adaptation, raise it for more stable placements.
+        pub filter_confidence_z: f64, default = 2.0
+
         // The following options affect writing to parquet files
         // and map to parquet::file::properties::WriterProperties