Skip to content

Commit fe8dbfa

Browse files
authored
Support distinct-from predicates in Parquet pruning (#22084)
## Which issue does this PR close?

- Closes #.

## Rationale for this change

Parquet statistics pruning did not rewrite `IS DISTINCT FROM` or `IS NOT DISTINCT FROM` predicates, so row groups that could be proven irrelevant from min/max and null-count statistics were still read.

## What changes are included in this PR?

- Adds null-aware pruning rewrites for `IS DISTINCT FROM` and `IS NOT DISTINCT FROM`.
- Treats the distinct-from operators as symmetric when normalizing predicates that have the scalar on the left.
- Refactors the shared min/max and null-count pruning expression builders.
- Adds unit tests for pruning predicate evaluation and Parquet row-group regression coverage.

## Are these changes tested?

Yes. This PR adds unit tests for pruning predicate evaluation and a Parquet row-group pruning regression test.

## Are there any user-facing changes?

No API changes. Queries using `IS DISTINCT FROM` and `IS NOT DISTINCT FROM` can now benefit from Parquet statistics pruning.
1 parent 04fbade commit fe8dbfa

3 files changed

Lines changed: 318 additions & 52 deletions

File tree

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,6 +1777,15 @@ fn make_i32_batch(
17771777
RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from)
17781778
}
17791779

1780+
fn make_nullable_i32_batch(
1781+
name: &str,
1782+
values: Vec<Option<i32>>,
1783+
) -> datafusion_common::error::Result<RecordBatch> {
1784+
let schema = Arc::new(Schema::new(vec![Field::new(name, DataType::Int32, true)]));
1785+
let array: ArrayRef = Arc::new(Int32Array::from(values));
1786+
RecordBatch::try_new(schema, vec![array]).map_err(DataFusionError::from)
1787+
}
1788+
17801789
// Helper function to create a batch with two Int32 columns
17811790
fn make_two_col_i32_batch(
17821791
name_a: &str,
@@ -1793,6 +1802,72 @@ fn make_two_col_i32_batch(
17931802
RecordBatch::try_new(schema, vec![array_a, array_b]).map_err(DataFusionError::from)
17941803
}
17951804

1805+
#[tokio::test]
1806+
async fn prune_is_not_distinct_from_i32() -> datafusion_common::error::Result<()> {
1807+
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
1808+
let batches = vec![
1809+
make_nullable_i32_batch("a", vec![None, None])?,
1810+
make_nullable_i32_batch("a", vec![Some(0), Some(0)])?,
1811+
make_nullable_i32_batch("a", vec![Some(0), Some(1)])?,
1812+
make_nullable_i32_batch("a", vec![Some(2), Some(3)])?,
1813+
make_nullable_i32_batch("a", vec![None, Some(5)])?,
1814+
];
1815+
1816+
RowGroupPruningTest::new()
1817+
.with_scenario(Scenario::Int)
1818+
.with_query("SELECT a FROM t WHERE a IS NOT DISTINCT FROM 0")
1819+
.with_expected_errors(Some(0))
1820+
.with_expected_rows(3)
1821+
.with_pruned_files(Some(0))
1822+
.with_matched_by_stats(Some(2))
1823+
.with_fully_matched_by_stats(Some(1))
1824+
.with_pruned_by_stats(Some(3))
1825+
.with_limit_pruned_row_groups(Some(0))
1826+
.test_row_group_prune_with_custom_data(schema.clone(), batches.clone(), 2)
1827+
.await;
1828+
1829+
RowGroupPruningTest::new()
1830+
.with_scenario(Scenario::Int)
1831+
.with_query("SELECT a FROM t WHERE a IS NOT DISTINCT FROM NULL")
1832+
.with_expected_errors(Some(0))
1833+
.with_expected_rows(3)
1834+
.with_pruned_files(Some(0))
1835+
.with_matched_by_stats(Some(2))
1836+
.with_fully_matched_by_stats(Some(0))
1837+
.with_pruned_by_stats(Some(3))
1838+
.with_limit_pruned_row_groups(Some(0))
1839+
.test_row_group_prune_with_custom_data(schema.clone(), batches.clone(), 2)
1840+
.await;
1841+
1842+
RowGroupPruningTest::new()
1843+
.with_scenario(Scenario::Int)
1844+
.with_query("SELECT a FROM t WHERE a IS DISTINCT FROM 0")
1845+
.with_expected_errors(Some(0))
1846+
.with_expected_rows(7)
1847+
.with_pruned_files(Some(0))
1848+
.with_matched_by_stats(Some(4))
1849+
.with_fully_matched_by_stats(Some(1))
1850+
.with_pruned_by_stats(Some(1))
1851+
.with_limit_pruned_row_groups(Some(0))
1852+
.test_row_group_prune_with_custom_data(schema.clone(), batches.clone(), 2)
1853+
.await;
1854+
1855+
RowGroupPruningTest::new()
1856+
.with_scenario(Scenario::Int)
1857+
.with_query("SELECT a FROM t WHERE a IS DISTINCT FROM NULL")
1858+
.with_expected_errors(Some(0))
1859+
.with_expected_rows(7)
1860+
.with_pruned_files(Some(0))
1861+
.with_matched_by_stats(Some(4))
1862+
.with_fully_matched_by_stats(Some(3))
1863+
.with_pruned_by_stats(Some(1))
1864+
.with_limit_pruned_row_groups(Some(0))
1865+
.test_row_group_prune_with_custom_data(schema, batches, 2)
1866+
.await;
1867+
1868+
Ok(())
1869+
}
1870+
17961871
#[tokio::test]
17971872
async fn test_limit_pruning_basic() -> datafusion_common::error::Result<()> {
17981873
// Scenario: Simple integer column, multiple row groups

datafusion/expr-common/src/operator.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,9 +255,9 @@ impl Operator {
255255
Operator::GtEq => Some(Operator::LtEq),
256256
Operator::AtArrow => Some(Operator::ArrowAt),
257257
Operator::ArrowAt => Some(Operator::AtArrow),
258-
Operator::IsDistinctFrom
259-
| Operator::IsNotDistinctFrom
260-
| Operator::Plus
258+
Operator::IsDistinctFrom => Some(Operator::IsDistinctFrom),
259+
Operator::IsNotDistinctFrom => Some(Operator::IsNotDistinctFrom),
260+
Operator::Plus
261261
| Operator::Minus
262262
| Operator::Multiply
263263
| Operator::Divide

0 commit comments

Comments
 (0)