pydantic
Lines changed: 21 additions & 21 deletions
diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs
--- a/datafusion/datasource-parquet/src/opener.rs
+++ b/datafusion/datasource-parquet/src/opener.rs
@@ -893,24 +893,24 @@ impl FiltersPreparedParquetOpen {
             rg_metadata.len(),
         )?;
 
-        // SYSTEM-mode adaptive split: when the SamplePushdown rule
-        // hands us a residual fraction `remaining_p`, choose the
-        // row-group / row split based on the row-group count we just
-        // observed. With ≥ 2 row groups we split as `sqrt(remaining)`
-        // at both axes; with 1 row group we apply the full residual at
-        // the row level (the row-group axis can't reduce). This keeps
-        // the expected output close to `p × N_total` even for tiny
-        // scans where the cube-root math otherwise undershoots
-        // (single-file / single-row-group inputs would read
-        // `cbrt(p)` of the rows, ~46% for SYSTEM(10)).
+        // Apply optional row-group and row-range sampling now that we
+        // know the actual row-group count. Selection is deterministic
+        // per `(partition_index, row_group_index, fraction,
+        // cluster_size)` so re-runs match. The execution
+        // `partition_index` is the stable per-file id we plumb in: it
+        // makes sampling reproducible across environments without
+        // depending on object-store paths, and decorrelates files
+        // assigned to different partitions.
         if let Some(remaining) = prepared.sampling.system_target_remaining {
-            // SYSTEM-mode adaptive split: choose the row-group / row
-            // axes based on the row-group count we just observed. With
-            // ≥ 2 row groups split as `sqrt(remaining)` at both axes;
-            // with 1 row group skip row-group sampling and apply the
-            // full residual at the row level. Without this adaptation
-            // a single-file / single-row-group scan would only reach
-            // `cbrt(remaining)` of the rows (~46% for SYSTEM(10)).
+            // SYSTEM-mode adaptive split: when the SamplePushdown rule
+            // hands us a residual fraction `remaining`, choose the
+            // row-group / row split based on the row-group count we
+            // just observed. With ≥ 2 row groups split as
+            // `sqrt(remaining)` at both axes; with 1 row group skip
+            // row-group sampling and apply the full residual at the
+            // row level. Without this adaptation a single-file /
+            // single-row-group scan would only reach `cbrt(remaining)`
+            // of the rows (~46% for SYSTEM(10)).
             let n_rg = rg_metadata.len();
             let mut adapted = prepared.sampling.clone();
             if n_rg >= 2 {
@@ -924,25 +924,25 @@ impl FiltersPreparedParquetOpen {
             adapted.apply_row_group_sampling(
                 &mut initial_plan,
                 n_rg,
-                &prepared.file_name,
+                prepared.partition_index,
             );
             adapted.apply_row_fraction_sampling(
                 &mut initial_plan,
                 rg_metadata,
-                &prepared.file_name,
+                prepared.partition_index,
             );
         } else {
             // Legacy direct-builder path: each method is a no-op when
             // its corresponding fraction is `None`.
             prepared.sampling.apply_row_group_sampling(
                 &mut initial_plan,
                 rg_metadata.len(),
-                &prepared.file_name,
+                prepared.partition_index,
             );
             prepared.sampling.apply_row_fraction_sampling(
                 &mut initial_plan,
                 rg_metadata,
-                &prepared.file_name,
+                prepared.partition_index,
             );
         }