Commit 74914e0

adriangb and claude committed
feat: SamplePushdown rule + Sample logical/physical nodes for parquet
Adds the infrastructure for pushing TABLESAMPLE-shaped sampling into file sources, with parquet as the first absorbing source. There is no SQL surface yet; this commit only ships the primitives. Wiring a RelationPlanner / ExtensionPlanner so it works out of the box from SQL is a follow-up.

- `Sample` `UserDefinedLogicalNodeCore` extension node in `datafusion-expr` (`logical_plan/sample.rs`). Schema-preserving; validates `fraction ∈ (0, 1]`. Currently encodes `SampleMethod::System` only.
- `SampleExec` placeholder in `datafusion-physical-plan`. Errors at `execute` (it is a marker; the `SamplePushdown` rule is expected to remove it). Implements filter / sort pushdown passthrough so unrelated optimizer rules see straight through it.
- New `try_push_sample` method on `ExecutionPlan` and `FileSource`, returning `Absorbed { inner }` / `Passthrough` / `Unsupported { reason }`. The default is `Unsupported`; per-node `Passthrough` overrides exist on filter, projection, coalesce_batches, coalesce_partitions, repartition, and non-fetch sort.
- `ParquetSource::try_push_sample` runs the (intentionally private) hierarchical block-level reduction across files / row groups / rows, with adaptive collapse when an axis can't reduce. It coordinates with the opener via a `pub(crate)` `system_target_remaining` field on `ParquetSampling`. Single-file, single-row-group inputs hit ~`p × N` rows instead of undershooting at `p^(1/3) × N`.
- `SamplePushdown` optimizer rule (between `PushdownSort` and `EnsureCooperative`) walks top-down. On `Absorbed` it replaces `SampleExec` with the rebuilt source; on `Passthrough` it pushes through the single-child node and recurses; on `Unsupported` it errors at planning time with `"TABLESAMPLE is not supported for this source"`. There is intentionally no generic post-scan `SampleExec` yet.
- EXPLAIN visibility: `ParquetSource::fmt_extra` surfaces `sample_system_target_remaining` when set.
- `optimizer_rule_reference.md` updated to list `SamplePushdown` in the documented rule order.
- `explain.slt` updated with `physical_plan after SamplePushdown SAME TEXT AS ABOVE` lines under each verbose-explain test.

Tests: 7 unit tests on `ParquetSource::try_push_sample` covering the pushdown contract (full / single-file / multi-file / target clamping / REPEATABLE determinism / multi-file rounding compensation), and 2 opener end-to-end tests covering the adaptive split for single- vs multi-row-group inputs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
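
For orientation, here is a minimal self-contained sketch of the `try_push_sample` contract the message describes. The three variant names and the `Unsupported` default come from the commit message; the trait, the `fraction` parameter, and all type details are illustrative stand-ins, not the actual DataFusion signatures:

```rust
use std::sync::Arc;

// Stand-in for DataFusion's `ExecutionPlan` so the sketch compiles alone.
trait Plan {}

// Hypothetical result shape; variant names follow the commit message.
enum SamplePushdownResult {
    Absorbed { inner: Arc<dyn Plan> }, // node rebuilt with the sample baked in
    Passthrough,                       // sample may move past this node to its child
    Unsupported { reason: String },    // the rule turns this into a planning error
}

trait TryPushSample {
    // Default per the commit message: refuse unless a node opts in.
    fn try_push_sample(&self, _fraction: f64) -> SamplePushdownResult {
        SamplePushdownResult::Unsupported {
            reason: "TABLESAMPLE is not supported for this source".into(),
        }
    }
}

fn main() {
    struct SomeExec;
    impl TryPushSample for SomeExec {} // inherits the `Unsupported` default
    match SomeExec.try_push_sample(0.1) {
        SamplePushdownResult::Unsupported { reason } => println!("{reason}"),
        _ => unreachable!(),
    }
}
```

The `Unsupported` default means sampling support is strictly opt-in per node, which is what lets the `SamplePushdown` rule fail loudly instead of silently returning unsampled data.
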
1 parent 3d0dc4a commit 74914e0

21 files changed

Lines changed: 1414 additions & 30 deletions

datafusion/core/src/optimizer_rule_reference.md

Lines changed: 4 additions & 3 deletions
```diff
@@ -88,6 +88,7 @@ in multiple phases.
 | 17 | `TopKRepartition` | - | Pushes TopK below hash repartition when the partition key is a prefix of the sort key. |
 | 18 | `ProjectionPushdown` | late pass | Runs projection pushdown again after limit and TopK rewrites expose new pruning opportunities. |
 | 19 | `PushdownSort` | - | Pushes sort requirements into data sources that can already return sorted output. |
-| 20 | `EnsureCooperative` | - | Wraps non-cooperative plan parts so long-running tasks yield fairly. |
-| 21 | `FilterPushdown(Post)` | post-optimization phase | Pushes dynamic filters at the end of optimization, after plan references stop moving. |
-| 22 | `SanityCheckPlan` | - | Validates that the final physical plan meets ordering, distribution, and infinite-input safety requirements. |
+| 20 | `SamplePushdown` | - | Pushes `TABLESAMPLE` into the source; errors at planning time if the sample can't be absorbed. |
+| 21 | `EnsureCooperative` | - | Wraps non-cooperative plan parts so long-running tasks yield fairly. |
+| 22 | `FilterPushdown(Post)` | post-optimization phase | Pushes dynamic filters at the end of optimization, after plan references stop moving. |
+| 23 | `SanityCheckPlan` | - | Validates that the final physical plan meets ordering, distribution, and infinite-input safety requirements. |
```
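
The table places the rule between `PushdownSort` and `EnsureCooperative`; the commit message describes its behavior as a top-down walk where `Absorbed` replaces `SampleExec` with the rebuilt source, `Passthrough` recurses through a single-child node, and `Unsupported` fails planning. A self-contained sketch of that walk follows; `Node`, `Push`, and `push_down` are illustrative stand-ins, not DataFusion's actual types (the real rule operates on `Arc<dyn ExecutionPlan>`):

```rust
// Minimal stand-ins for the plan tree.
enum Node {
    Filter(Box<Node>), // passthrough node with a single child
    ParquetScan,       // absorbing source
    CsvScan,           // source with no sampling support
}

enum Push {
    Absorbed(Node),
    Passthrough,
    Unsupported,
}

fn try_push_sample(node: &Node, fraction: f64) -> Push {
    match node {
        Node::Filter(_) => Push::Passthrough,
        Node::ParquetScan => {
            // The real ParquetSource rebuilds itself with the residual
            // fraction recorded (`system_target_remaining`); here we just
            // acknowledge it.
            let _ = fraction;
            Push::Absorbed(Node::ParquetScan)
        }
        Node::CsvScan => Push::Unsupported,
    }
}

// Top-down walk per the commit description: Absorbed replaces the node with
// the rebuilt source, Passthrough recurses into the single child, and
// Unsupported becomes a planning-time error.
fn push_down(node: Node, fraction: f64) -> Result<Node, String> {
    match try_push_sample(&node, fraction) {
        Push::Absorbed(rebuilt) => Ok(rebuilt),
        Push::Passthrough => match node {
            Node::Filter(child) => {
                Ok(Node::Filter(Box::new(push_down(*child, fraction)?)))
            }
            _ => unreachable!("only single-child nodes return Passthrough"),
        },
        Push::Unsupported => {
            Err("TABLESAMPLE is not supported for this source".to_string())
        }
    }
}

fn main() {
    assert!(push_down(Node::Filter(Box::new(Node::ParquetScan)), 0.1).is_ok());
    assert!(push_down(Node::Filter(Box::new(Node::CsvScan)), 0.1).is_err());
}
```

Erroring on `Unsupported` rather than falling back to a generic post-scan operator matches the commit's explicit choice not to ship a generic `SampleExec` execution path yet.
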

datafusion/datasource-parquet/src/opener.rs

Lines changed: 167 additions & 25 deletions
```diff
@@ -893,32 +893,70 @@ impl FiltersPreparedParquetOpen {
             rg_metadata.len(),
         )?;

-        // Apply optional row-group level sampling now that we know the
-        // actual row-group count. Selection is deterministic per
-        // `(file_name, row_group_count, fraction)` so re-runs match.
-        if let Some(fraction) = prepared.sampling.row_group_fraction {
-            apply_row_group_sampling(
-                &mut initial_plan,
-                rg_metadata.len(),
-                fraction,
-                &prepared.file_name,
-            );
-        }
+        // SYSTEM-mode adaptive split: when the SamplePushdown rule
+        // hands us a residual fraction `remaining_p`, choose the
+        // row-group / row split based on the row-group count we just
+        // observed. With ≥ 2 row groups we split as `sqrt(remaining)`
+        // at both axes; with 1 row group we apply the full residual at
+        // the row level (the row-group axis can't reduce). This keeps
+        // the expected output close to `p × N_total` even for tiny
+        // scans where the cube-root math otherwise undershoots
+        // (single-file / single-row-group inputs would read
+        // `cbrt(p)` of the rows, ~46% for SYSTEM(10)).
+        if let Some(remaining) = prepared.sampling.system_target_remaining {
+            let n_rg = rg_metadata.len();
+            if n_rg >= 2 {
+                let q = remaining.sqrt();
+                apply_row_group_sampling(&mut initial_plan, n_rg, q, &prepared.file_name);
+                apply_row_fraction_sampling(
+                    &mut initial_plan,
+                    rg_metadata,
+                    q,
+                    prepared.sampling.row_cluster_size,
+                    &prepared.file_name,
+                );
+            } else {
+                apply_row_fraction_sampling(
+                    &mut initial_plan,
+                    rg_metadata,
+                    remaining,
+                    prepared.sampling.row_cluster_size,
+                    &prepared.file_name,
+                );
+            }
+        } else {
+            // Legacy direct-builder path (no SYSTEM target set):
+            // honour the explicit per-axis fractions independently.
+
+            // Apply optional row-group level sampling now that we know
+            // the actual row-group count. Selection is deterministic
+            // per `(file_name, row_group_count, fraction)` so re-runs
+            // match.
+            if let Some(fraction) = prepared.sampling.row_group_fraction {
+                apply_row_group_sampling(
+                    &mut initial_plan,
+                    rg_metadata.len(),
+                    fraction,
+                    &prepared.file_name,
+                );
+            }

-        // Apply optional row-range sampling within each kept row group.
-        // Each row group still marked `Scan` is downgraded to a
-        // `Selection` covering ~`fraction` of the rows in K spread-out
-        // windows. The parquet reader uses the page index to read only
-        // the data pages that overlap the selection, giving page-level
-        // IO savings without requiring per-column page alignment.
-        if let Some(fraction) = prepared.sampling.row_fraction {
-            apply_row_fraction_sampling(
-                &mut initial_plan,
-                rg_metadata,
-                fraction,
-                prepared.sampling.row_cluster_size,
-                &prepared.file_name,
-            );
+            // Apply optional row-range sampling within each kept row
+            // group. Each row group still marked `Scan` is downgraded
+            // to a `Selection` covering ~`fraction` of the rows in K
+            // spread-out windows. The parquet reader uses the page
+            // index to read only the data pages that overlap the
+            // selection, giving page-level IO savings without
+            // requiring per-column page alignment.
+            if let Some(fraction) = prepared.sampling.row_fraction {
+                apply_row_fraction_sampling(
+                    &mut initial_plan,
+                    rg_metadata,
+                    fraction,
+                    prepared.sampling.row_cluster_size,
+                    &prepared.file_name,
+                );
+            }
         }

         let mut row_groups = RowGroupAccessPlanFilter::new(initial_plan);
```
```diff
@@ -3056,4 +3094,108 @@ mod test {
             "row_fraction=0.1 should yield ~10-12 rows; got {num_rows}"
         );
     }
+
+    /// End-to-end for the SYSTEM-mode adaptive split on a *single*
+    /// row group: the file axis is fixed (handled at try_push_sample),
+    /// the row-group axis can't reduce, so the opener should apply the
+    /// full residual fraction at the row level. A naïve implementation
+    /// that always splits as `sqrt(remaining)` between row-group and
+    /// row would only reduce to `sqrt(0.1) ≈ 32%` of the rows here.
+    #[tokio::test]
+    async fn system_target_remaining_single_row_group() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let values: Vec<Option<i32>> = (0..1000).map(Some).collect();
+        let batch = record_batch!(("a", Int32, values)).unwrap();
+        let schema = batch.schema();
+        let data_len =
+            write_parquet(Arc::clone(&store), "tr_1rg.parquet", batch.clone()).await;
+        let file = PartitionedFile::new(
+            "tr_1rg.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        );
+
+        let sampling = crate::source::ParquetSampling {
+            system_target_remaining: Some(0.1),
+            row_cluster_size: 4,
+            ..Default::default()
+        };
+
+        let opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(Arc::clone(&schema))
+            .with_projection_indices(&[0])
+            .with_sampling(sampling)
+            .build();
+
+        let stream = open_file(&opener, file).await.unwrap();
+        let (_num_batches, num_rows) = count_batches_and_rows(stream).await;
+
+        // 10% of 1000 with small clusters = ~100 rows (window padding
+        // can push it slightly higher). The key invariant: NOT 32%
+        // (sqrt) and NOT 46% (cbrt).
+        assert!(
+            (50..=150).contains(&num_rows),
+            "single-RG SYSTEM(0.1) should hit ~100 rows; got {num_rows} \
+             (would be ~316 if split as sqrt, ~464 if split as cbrt)"
+        );
+    }
+
+    /// End-to-end for the SYSTEM-mode adaptive split on multiple row
+    /// groups: the residual is split as `sqrt` between the row-group
+    /// and row axes, so the result is `sqrt(p) × sqrt(p) = p` of the
+    /// rows in expectation.
+    #[tokio::test]
+    async fn system_target_remaining_multi_row_group() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // 4 row groups × 250 rows = 1000 rows.
+        let batches = (0..4)
+            .map(|g| {
+                let vals: Vec<Option<i32>> =
+                    ((g * 250)..(g * 250 + 250)).map(Some).collect();
+                record_batch!(("a", Int32, vals)).unwrap()
+            })
+            .collect::<Vec<_>>();
+        let schema = batches[0].schema();
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(250))
+            .build();
+        let data_len = write_parquet_batches(
+            Arc::clone(&store),
+            "tr_4rg.parquet",
+            batches,
+            Some(props),
+        )
+        .await;
+        let file = PartitionedFile::new(
+            "tr_4rg.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        );
+
+        let sampling = crate::source::ParquetSampling {
+            system_target_remaining: Some(0.25),
+            row_cluster_size: 16,
+            ..Default::default()
+        };
+
+        let opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(Arc::clone(&schema))
+            .with_projection_indices(&[0])
+            .with_sampling(sampling)
+            .build();
+
+        let stream = open_file(&opener, file).await.unwrap();
+        let (_num_batches, num_rows) = count_batches_and_rows(stream).await;
+
+        // sqrt(0.25) = 0.5 at each axis: keep 2 of 4 row groups
+        // (= 500 rows under scrutiny), then 50% of those rows
+        // = ~250 rows. Window padding can push it higher; assert the
+        // upper bound is well below the unsplit 1000.
+        assert!(
+            (100..=400).contains(&num_rows),
+            "multi-RG SYSTEM remaining=0.25 should hit ~250 rows; got {num_rows}"
+        );
+    }
 }
```
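
For a concrete sense of the numbers these two tests assert, here is a small standalone sketch of the residual-split arithmetic, mirroring the branch in the opener hunk above (`split_residual` is a hypothetical helper for illustration, not a function in the patch):

```rust
/// With >= 2 row groups both axes take sqrt(remaining); with a single row
/// group the row axis absorbs the full residual.
/// Returns (row_group_fraction, row_fraction).
fn split_residual(remaining: f64, n_row_groups: usize) -> (f64, f64) {
    if n_row_groups >= 2 {
        let q = remaining.sqrt();
        (q, q) // q * q == remaining, so the expected kept share is `remaining`
    } else {
        (1.0, remaining) // row-group axis can't reduce below one group
    }
}

fn main() {
    // Single row group, residual 0.1: the full 10% is applied at the row
    // level, not sqrt(0.1) ~ 0.316 (the naive two-axis split) and not
    // cbrt(0.1) ~ 0.464 (a three-axis split when only one axis can reduce).
    assert_eq!(split_residual(0.1, 1), (1.0, 0.1));

    // Four row groups, residual 0.25: keep sqrt(0.25) = 0.5 of the row
    // groups, then 0.5 of their rows, for 0.25 of all rows in expectation,
    // i.e. ~250 of the 1000 rows in the multi-row-group test.
    let (rg, rows) = split_residual(0.25, 4);
    assert_eq!((rg, rows), (0.5, 0.5));
    assert!((rg * rows - 0.25).abs() < f64::EPSILON);
}
```
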
