
Commit 1c7630c

adriangb and claude committed
feat: SamplePushdown rule + Sample logical/physical nodes
Adds the cross-cutting infrastructure for pushing TABLESAMPLE-shaped sampling into file sources, with parquet as the first absorbing source. There is no SQL surface yet; this commit only ships the primitives. Wiring a RelationPlanner / ExtensionPlanner so it works out of the box from SQL is the next commit in this stack.

- `Sample` `UserDefinedLogicalNodeCore` extension node in `datafusion-expr` (`logical_plan/sample.rs`). Schema-preserving; validates `fraction ∈ (0, 1]`. Currently encodes `SampleMethod::System` only.
- `SampleExec` placeholder in `datafusion-physical-plan`. Errors at `execute` (it's a marker — the `SamplePushdown` rule is expected to remove it). Implements filter / sort pushdown passthrough so unrelated optimizer rules see straight through it.
- New `try_push_sample` method on `ExecutionPlan` and `FileSource`, returning `Absorbed { inner }` / `Passthrough` / `Unsupported { reason }`. Default is `Unsupported`; per-node `Passthrough` overrides on filter, projection, coalesce_batches, coalesce_partitions, repartition, and non-fetch sort.
- `ParquetSource::try_push_sample` runs the (intentionally private) hierarchical block-level reduction across files / row groups / rows, with adaptive collapse when an axis can't reduce. Coordinates with the opener via `pub(crate)` `system_target_remaining` and `seed` fields on `ParquetSampling`. Single-file, single-row-group inputs hit ~p × N rows instead of undershooting at p^(1/3) × N.
- `REPEATABLE(seed)` is plumbed all the way through: when set, `ParquetSampling::apply_row_group_sampling` and `apply_row_fraction_sampling` key only on `(seed, ...)` and ignore the file path, so the same query is reproducible across environments.
- `SamplePushdown` optimizer rule (between `PushdownSort` and `EnsureCooperative`) walks top-down. On `Absorbed` it replaces `SampleExec` with the rebuilt source; on `Passthrough` it pushes through the single-child node and recurses; on `Unsupported` it errors at planning time with `"TABLESAMPLE is not supported for this source"`. There is intentionally no generic post-scan `SampleExec` yet.
- EXPLAIN visibility: `ParquetSource::fmt_extra` surfaces `sample_system_target_remaining` when set.
- `optimizer_rule_reference.md` updated to list `SamplePushdown` in the documented rule order.
- `explain.slt` updated with `physical_plan after SamplePushdown SAME TEXT AS ABOVE` lines under each verbose-explain test.

Tests: 7 unit tests on `ParquetSource::try_push_sample` covering the pushdown contract (full / single-file / multi-file / target clamping / REPEATABLE determinism / multi-file rounding compensation), and 3 opener end-to-end tests covering the adaptive split for single vs multi row group inputs and REPEATABLE-seed reproducibility across file paths.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
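The `Absorbed` / `Passthrough` / `Unsupported` contract described above can be sketched as a toy standalone model. This is illustrative only: the `PushSampleResult` enum and string-keyed `try_push_sample` / `push_down` helpers below are simplified stand-ins, not the actual DataFusion `ExecutionPlan` API, and the node names are hypothetical.

```rust
// Toy model of the pushdown contract: each node in a plan path
// either absorbs the sample, lets it pass through, or rejects it.
#[derive(Debug, PartialEq)]
enum PushSampleResult {
    Absorbed,            // the source rebuilt itself with sampling baked in
    Passthrough,         // safe to push the sample below this node
    Unsupported(String), // planning-time error with a reason
}

fn try_push_sample(node: &str) -> PushSampleResult {
    match node {
        // Per-node Passthrough overrides, mirroring the list in the
        // commit message (filter, projection, coalesce, repartition,
        // non-fetch sort).
        "filter" | "projection" | "coalesce_batches" | "coalesce_partitions"
        | "repartition" | "sort" => PushSampleResult::Passthrough,
        // Parquet is the first absorbing source in this stack.
        "parquet" => PushSampleResult::Absorbed,
        // Default: reject at planning time.
        other => PushSampleResult::Unsupported(format!(
            "TABLESAMPLE is not supported for this source: {other}"
        )),
    }
}

// Top-down walk: keep pushing through Passthrough nodes until a
// source absorbs the sample or something reports Unsupported.
fn push_down(path: &[&str]) -> Result<(), String> {
    for node in path {
        match try_push_sample(node) {
            PushSampleResult::Absorbed => return Ok(()),
            PushSampleResult::Passthrough => continue,
            PushSampleResult::Unsupported(reason) => return Err(reason),
        }
    }
    Err("sample was never absorbed".to_string())
}

fn main() {
    // A sample above filter → repartition → parquet is absorbed.
    assert_eq!(push_down(&["filter", "repartition", "parquet"]), Ok(()));
    // A path ending in a non-absorbing source errors at planning time.
    assert!(push_down(&["projection", "csv_source"]).is_err());
}
```

The real rule walks a plan tree rather than a flat path, but the decision at each node is the same three-way branch.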
1 parent 2026412 commit 1c7630c

22 files changed

Lines changed: 1570 additions & 26 deletions

File tree

datafusion/core/src/optimizer_rule_reference.md

Lines changed: 4 additions & 3 deletions
```diff
@@ -88,6 +88,7 @@ in multiple phases.
 | 17 | `TopKRepartition` | - | Pushes TopK below hash repartition when the partition key is a prefix of the sort key. |
 | 18 | `ProjectionPushdown` | late pass | Runs projection pushdown again after limit and TopK rewrites expose new pruning opportunities. |
 | 19 | `PushdownSort` | - | Pushes sort requirements into data sources that can already return sorted output. |
-| 20 | `EnsureCooperative` | - | Wraps non-cooperative plan parts so long-running tasks yield fairly. |
-| 21 | `FilterPushdown(Post)` | post-optimization phase | Pushes dynamic filters at the end of optimization, after plan references stop moving. |
-| 22 | `SanityCheckPlan` | - | Validates that the final physical plan meets ordering, distribution, and infinite-input safety requirements. |
+| 20 | `SamplePushdown` | - | Pushes `TABLESAMPLE` into the source; errors at planning time if the sample can't be absorbed. |
+| 21 | `EnsureCooperative` | - | Wraps non-cooperative plan parts so long-running tasks yield fairly. |
+| 22 | `FilterPushdown(Post)` | post-optimization phase | Pushes dynamic filters at the end of optimization, after plan references stop moving. |
+| 23 | `SanityCheckPlan` | - | Validates that the final physical plan meets ordering, distribution, and infinite-input safety requirements. |
```

datafusion/datasource-parquet/src/opener.rs

Lines changed: 287 additions & 15 deletions
```diff
@@ -893,21 +893,58 @@ impl FiltersPreparedParquetOpen {
             rg_metadata.len(),
         )?;
 
-        // Apply optional row-group and row-range sampling now that we
-        // know the actual row-group count. Both calls are no-ops when
-        // their respective fraction is `None`. Selection is
-        // deterministic per `(file_name, row_group_index, fraction,
-        // cluster_size)` so re-runs match.
-        prepared.sampling.apply_row_group_sampling(
-            &mut initial_plan,
-            rg_metadata.len(),
-            &prepared.file_name,
-        );
-        prepared.sampling.apply_row_fraction_sampling(
-            &mut initial_plan,
-            rg_metadata,
-            &prepared.file_name,
-        );
+        // SYSTEM-mode adaptive split: when the SamplePushdown rule
+        // hands us a residual fraction `remaining`, choose the
+        // row-group / row split based on the row-group count we just
+        // observed. With ≥ 2 row groups we split as `sqrt(remaining)`
+        // at both axes; with 1 row group we skip row-group sampling
+        // and apply the full residual at the row level (the row-group
+        // axis can't reduce). This keeps the expected output close to
+        // `p × N_total` even for tiny scans where the cube-root math
+        // otherwise undershoots (single-file / single-row-group
+        // inputs would read `cbrt(p)` of the rows, ~46% for
+        // SYSTEM(10)).
+        if let Some(remaining) = prepared.sampling.system_target_remaining {
+            let n_rg = rg_metadata.len();
+            let mut adapted = prepared.sampling.clone();
+            if n_rg >= 2 {
+                let q = remaining.sqrt();
+                adapted.row_group_fraction = Some(q);
+                adapted.row_fraction = Some(q);
+            } else {
+                adapted.row_group_fraction = None;
+                adapted.row_fraction = Some(remaining);
+            }
+            adapted.apply_row_group_sampling(
+                &mut initial_plan,
+                n_rg,
+                &prepared.file_name,
+            );
+            adapted.apply_row_fraction_sampling(
+                &mut initial_plan,
+                rg_metadata,
+                &prepared.file_name,
+            );
+        } else {
+            // Legacy direct-builder path: each method is a no-op when
+            // its corresponding fraction is `None`.
+            prepared.sampling.apply_row_group_sampling(
+                &mut initial_plan,
+                rg_metadata.len(),
+                &prepared.file_name,
+            );
+            prepared.sampling.apply_row_fraction_sampling(
+                &mut initial_plan,
+                rg_metadata,
+                &prepared.file_name,
+            );
+        }
 
         let mut row_groups = RowGroupAccessPlanFilter::new(initial_plan);
```
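The arithmetic behind the adaptive split can be checked in isolation. The sketch below is a hypothetical, self-contained re-derivation, not the crate's actual `ParquetSampling` code: with two usable axes each keeping `sqrt(remaining)`, the surviving fraction is `remaining` in expectation, while naively splitting a fraction `p` across three axes when only one axis can actually reduce keeps `cbrt(p)` of the rows.

```rust
// Hypothetical standalone version of the split decision: returns
// (row_group_fraction, row_fraction) for a residual fraction
// `remaining`, given the observed row-group count.
fn adaptive_split(remaining: f64, n_row_groups: usize) -> (Option<f64>, f64) {
    if n_row_groups >= 2 {
        // Two usable axes: sqrt(remaining) each, so the expected
        // surviving fraction is sqrt * sqrt = remaining.
        let q = remaining.sqrt();
        (Some(q), q)
    } else {
        // The row-group axis can't reduce: apply the full residual
        // at the row level instead of wasting it.
        (None, remaining)
    }
}

fn main() {
    // Multi row group: sqrt(0.25) = 0.5 at each axis.
    assert_eq!(adaptive_split(0.25, 4), (Some(0.5), 0.5));

    // Single row group: full residual at the row axis.
    assert_eq!(adaptive_split(0.1, 1), (None, 0.1));

    // Why a fixed cube-root split undershoots: if only one of the
    // three axes (file / row group / row) can actually reduce, a
    // per-axis fraction of cbrt(0.1) keeps ~46% of rows, not 10%.
    let naive = 0.1_f64.cbrt();
    assert!((naive - 0.464).abs() < 0.005);
}
```

This mirrors the if/else in the opener hunk above; the real code additionally threads the result through `apply_row_group_sampling` / `apply_row_fraction_sampling`.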

```diff
@@ -1586,6 +1623,10 @@ fn create_initial_plan(
     Ok(ParquetAccessPlan::new_all(row_group_count))
 }
 
+// `apply_row_group_sampling` and `apply_row_fraction_sampling` live
+// in `crate::sampling` so this file stays focused on the opener
+// pipeline.
+
 /// Build a page pruning predicate from an optional predicate expression.
 /// If the predicate is None or the predicate cannot be converted to a page pruning
 /// predicate, return None.
```
```diff
@@ -2753,6 +2794,9 @@ mod test {
     );
 }
 
+// -- Sampling end-to-end (unit tests of the helpers themselves
+// live in `crate::sampling`) -----------------------------------
+
 /// End-to-end: a parquet file with 4 row groups, scanned with
 /// `row_group_fraction = 0.5`, should return rows from exactly 2
 /// of the 4 row groups.
```
```diff
@@ -2860,4 +2904,232 @@ mod test {
             "row_fraction=0.1 should yield ~10-12 rows; got {num_rows}"
         );
     }
+
+    /// End-to-end for the SYSTEM-mode adaptive split on a *single*
+    /// row group: the file axis is fixed (handled at try_push_sample),
+    /// the row-group axis can't reduce, so the opener should apply the
+    /// full residual fraction at the row level. A naïve implementation
+    /// that always splits as `sqrt(remaining)` between row-group and
+    /// row would only achieve `sqrt(0.1) ≈ 32%` here.
+    #[tokio::test]
+    async fn system_target_remaining_single_row_group() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let values: Vec<Option<i32>> = (0..1000).map(Some).collect();
+        let batch = record_batch!(("a", Int32, values)).unwrap();
+        let schema = batch.schema();
+        let data_len =
+            write_parquet(Arc::clone(&store), "tr_1rg.parquet", batch.clone()).await;
+        let file = PartitionedFile::new(
+            "tr_1rg.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        );
+
+        let sampling = crate::sampling::ParquetSampling {
+            system_target_remaining: Some(0.1),
+            row_cluster_size: 4,
+            ..Default::default()
+        };
+
+        let opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(Arc::clone(&schema))
+            .with_projection_indices(&[0])
+            .with_sampling(sampling)
+            .build();
+
+        let stream = open_file(&opener, file).await.unwrap();
+        let (_num_batches, num_rows) = count_batches_and_rows(stream).await;
+
+        // 10% of 1000 with small clusters = ~100 rows (window padding
+        // can push it slightly higher). The key invariant: NOT 32%
+        // (sqrt) and NOT 46% (cbrt).
+        assert!(
+            (50..=150).contains(&num_rows),
+            "single-RG SYSTEM(0.1) should hit ~100 rows; got {num_rows} \
+             (would be ~316 if split as sqrt, ~464 if split as cbrt)"
+        );
+    }
+
+    /// End-to-end for the SYSTEM-mode adaptive split on multiple row
+    /// groups: the residual is split as `sqrt` between the row-group
+    /// and row axes, so the result is `sqrt(p) × sqrt(p) = p` of the
+    /// rows in expectation.
+    #[tokio::test]
+    async fn system_target_remaining_multi_row_group() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        // 4 row groups × 250 rows = 1000 rows.
+        let batches = (0..4)
+            .map(|g| {
+                let vals: Vec<Option<i32>> =
+                    ((g * 250)..(g * 250 + 250)).map(Some).collect();
+                record_batch!(("a", Int32, vals)).unwrap()
+            })
+            .collect::<Vec<_>>();
+        let schema = batches[0].schema();
+        let props = WriterProperties::builder()
+            .set_max_row_group_row_count(Some(250))
+            .build();
+        let data_len = write_parquet_batches(
+            Arc::clone(&store),
+            "tr_4rg.parquet",
+            batches,
+            Some(props),
+        )
+        .await;
+        let file = PartitionedFile::new(
+            "tr_4rg.parquet".to_string(),
+            u64::try_from(data_len).unwrap(),
+        );
+
+        let sampling = crate::sampling::ParquetSampling {
+            system_target_remaining: Some(0.25),
+            row_cluster_size: 16,
+            ..Default::default()
+        };
+
+        let opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(Arc::clone(&schema))
+            .with_projection_indices(&[0])
+            .with_sampling(sampling)
+            .build();
+
+        let stream = open_file(&opener, file).await.unwrap();
+        let (_num_batches, num_rows) = count_batches_and_rows(stream).await;
+
+        // sqrt(0.25) = 0.5 at each axis: keep 2 of 4 row groups
+        // (= 500 rows under scrutiny), then 50% of those rows
+        // = ~250 rows. Window padding can push it higher; assert the
+        // upper bound is well below the unsplit 1000.
+        assert!(
+            (100..=400).contains(&num_rows),
+            "multi-RG SYSTEM remaining=0.25 should hit ~250 rows; got {num_rows}"
+        );
+    }
+
+    /// REPEATABLE(seed) must produce the same selection regardless of
+    /// where the parquet file lives. This is the SQL semantics users
+    /// expect from `TABLESAMPLE ... REPEATABLE(n)`.
+    #[tokio::test]
+    async fn system_target_remaining_repeatable_seed_ignores_file_name() {
+        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;
+
+        let values: Vec<Option<i32>> = (0..100).map(Some).collect();
+        let batch = record_batch!(("a", Int32, values)).unwrap();
+        let schema = batch.schema();
+
+        // Two files with different names but identical content.
+        let len_a =
+            write_parquet(Arc::clone(&store), "first.parquet", batch.clone()).await;
+        let len_b =
+            write_parquet(Arc::clone(&store), "second_path.parquet", batch.clone()).await;
+
+        let sampling = crate::sampling::ParquetSampling {
+            system_target_remaining: Some(0.5),
+            row_cluster_size: 4,
+            seed: Some(42),
+            ..Default::default()
+        };
+
+        let opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(Arc::clone(&schema))
+            .with_projection_indices(&[0])
+            .with_sampling(sampling)
+            .build();
+
+        let stream_a = open_file(
+            &opener,
+            PartitionedFile::new(
+                "first.parquet".to_string(),
+                u64::try_from(len_a).unwrap(),
+            ),
+        )
+        .await
+        .unwrap();
+        let stream_b = open_file(
+            &opener,
+            PartitionedFile::new(
+                "second_path.parquet".to_string(),
+                u64::try_from(len_b).unwrap(),
+            ),
+        )
+        .await
+        .unwrap();
+
+        let rows_a = collect_values(stream_a).await;
+        let rows_b = collect_values(stream_b).await;
+
+        assert_eq!(
+            rows_a, rows_b,
+            "REPEATABLE(seed) must select the same rows regardless of file path"
+        );
+        assert!(
+            !rows_a.is_empty() && rows_a.len() < 100,
+            "expected a strict subset; got {} rows",
+            rows_a.len()
+        );
+
+        // Without a seed the selection must depend on the file name —
+        // otherwise unrelated parquet files in the same scan would all
+        // produce correlated samples, defeating the purpose of file-
+        // axis randomisation.
+        let unseeded_sampling = crate::sampling::ParquetSampling {
+            system_target_remaining: Some(0.5),
+            row_cluster_size: 4,
+            ..Default::default()
+        };
+        let unseeded_opener = ParquetMorselizerBuilder::new()
+            .with_store(Arc::clone(&store))
+            .with_schema(Arc::clone(&schema))
+            .with_projection_indices(&[0])
+            .with_sampling(unseeded_sampling)
+            .build();
+        let stream_a2 = open_file(
+            &unseeded_opener,
+            PartitionedFile::new(
+                "first.parquet".to_string(),
+                u64::try_from(len_a).unwrap(),
+            ),
+        )
+        .await
+        .unwrap();
+        let stream_b2 = open_file(
+            &unseeded_opener,
+            PartitionedFile::new(
+                "second_path.parquet".to_string(),
+                u64::try_from(len_b).unwrap(),
+            ),
+        )
+        .await
+        .unwrap();
+        let rows_a2 = collect_values(stream_a2).await;
+        let rows_b2 = collect_values(stream_b2).await;
+        assert_ne!(
+            rows_a2, rows_b2,
+            "without a seed, different file names should produce different samples"
+        );
+    }
+
+    /// Helper: pull an `i32` column out of a sampled stream.
+    async fn collect_values(
+        mut stream: BoxStream<'static, Result<RecordBatch>>,
+    ) -> Vec<i32> {
+        use futures::StreamExt;
+        let mut out = Vec::new();
+        while let Some(batch) = stream.next().await {
+            let batch = batch.unwrap();
+            let col = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<arrow::array::Int32Array>()
+                .unwrap();
+            for i in 0..col.len() {
+                out.push(col.value(i));
+            }
+        }
+        out
+    }
 }
```
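The seed semantics exercised by the last test can be sketched with a toy keep/drop decision. This is a hypothetical helper, not the crate's actual hashing scheme: the point is only that with a seed the hash key excludes the file path (so identical data samples identically wherever it lives), while without one the path participates (so co-scanned files decorrelate).

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Toy selection decision: hash a key to a value in [0, 1) and keep
// the unit (row group, cluster, ...) if it falls below `fraction`.
fn keep(seed: Option<u64>, file_path: &str, index: u64, fraction: f64) -> bool {
    let mut h = DefaultHasher::new();
    match seed {
        // REPEATABLE(seed): key on (seed, index) only, ignoring the
        // file path, so the selection is stable across environments.
        Some(s) => (s, index).hash(&mut h),
        // Unseeded: key on (path, index) so different files in the
        // same scan produce independent selections.
        None => (file_path, index).hash(&mut h),
    }
    (h.finish() as f64) / (u64::MAX as f64) < fraction
}

fn main() {
    // Seeded: the file path must not matter.
    for i in 0..100 {
        assert_eq!(
            keep(Some(42), "first.parquet", i, 0.5),
            keep(Some(42), "second_path.parquet", i, 0.5),
        );
    }

    // Unseeded: the two paths should disagree somewhere over 100 units.
    let differs = (0..100).any(|i| {
        keep(None, "first.parquet", i, 0.5) != keep(None, "second_path.parquet", i, 0.5)
    });
    assert!(differs);
}
```

`DefaultHasher::new()` is deterministic across runs, which is what makes the seeded branch reproducible; a production scheme would pick a hash with stronger stability guarantees across library versions.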
