Skip to content

Commit 40c3bef

Browse files
committed
Extend `NOT` expression handling to support `not(expr)` predicates
1 parent 295c46d commit 40c3bef

5 files changed

Lines changed: 113 additions & 7 deletions

File tree

datafusion/core/tests/parquet/mod.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use datafusion::{
3737
prelude::{ParquetReadOptions, SessionConfig, SessionContext},
3838
};
3939
use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder};
40+
use datafusion_physical_plan::execute_stream;
4041
use parquet::arrow::ArrowWriter;
4142
use parquet::file::properties::{EnabledStatistics, WriterProperties};
4243
use std::sync::Arc;
@@ -225,6 +226,7 @@ impl ContextWithParquet {
225226
) -> Self {
226227
// Use a single partition for deterministic results no matter how many CPUs the host has
227228
config = config.with_target_partitions(1);
229+
config.options_mut().execution.parquet.pushdown_filters = true;
228230
let file = match unit {
229231
Unit::RowGroup(row_per_group) => {
230232
config = config.with_parquet_bloom_filter_pruning(true);
@@ -308,6 +310,15 @@ impl ContextWithParquet {
308310
.await
309311
.expect("creating physical plan");
310312

313+
/*
314+
use arrow::util::pretty::print_batches;
315+
use futures::TryStreamExt;
316+
let res =
317+
execute_stream(physical_plan.clone(), self.ctx.task_ctx().clone()).unwrap();
318+
let batches = res.try_collect::<Vec<_>>().await.unwrap();
319+
print_batches(&batches).unwrap();
320+
*/
321+
311322
let task_ctx = state.task_ctx();
312323
let results = datafusion::physical_plan::collect(physical_plan.clone(), task_ctx)
313324
.await

datafusion/core/tests/parquet/row_group_pruning.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1745,7 +1745,7 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
17451745
// So 3 row groups are effectively pruned due to limit pruning.
17461746

17471747
let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
1748-
let query = "explain verbose SELECT c1 FROM t WHERE c1 > 0 LIMIT 2";
1748+
let query = "SELECT c1 FROM t WHERE c1 > 0 LIMIT 2";
17491749

17501750
let batches = vec![
17511751
make_i32_batch("c1", vec![1, 2])?, // RG0: Fully matched, 2 rows
@@ -1764,7 +1764,7 @@ async fn test_limit_pruning() -> datafusion_common::error::Result<()> {
17641764
.with_pruned_by_bloom_filter(Some(0))
17651765
.with_matched_by_stats(Some(3)) // RG0, RG1, RG2 are matched by stats (c1 > 0)
17661766
.with_pruned_by_stats(Some(1)) // RG3 is pruned by stats (c1 = [-1, 0] does not satisfy c1 > 0)
1767-
// .with_limit_pruned_row_groups(Some(2)) // RG1, RG2 are pruned by limit. (RG3 is already pruned by stats)
1767+
.with_limit_pruned_row_groups(Some(2)) // RG1, RG2 are pruned by limit. (RG3 is already pruned by stats)
17681768
.test_row_group_prune_with_custom_data(schema, batches)
17691769
.await;
17701770

datafusion/datasource-parquet/src/opener.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,8 @@ impl FileOpener for ParquetOpener {
361361
);
362362
}
363363

364+
dbg!(&row_groups);
365+
364366
if enable_bloom_filter && !row_groups.is_empty() {
365367
row_groups
366368
.prune_by_bloom_filters(
@@ -373,6 +375,7 @@ impl FileOpener for ParquetOpener {
373375
}
374376
}
375377

378+
dbg!(limit);
376379
// Prune by limit
377380
if let Some(limit) = limit {
378381
row_groups.prune_by_limit(limit, rg_metadata, &file_metrics);

datafusion/datasource-parquet/src/row_group_filter.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ impl RowGroupAccessPlanFilter {
8888
let mut fully_matched_rows_count: usize = 0;
8989

9090
// Iterate through the currently accessible row groups
91+
dbg!(&self.is_fully_matched());
9192
for &idx in self.access_plan.row_group_indexes().iter() {
9293
if self.is_fully_matched[idx] {
9394
let row_group_row_count = rg_metadata[idx].num_rows() as usize;
@@ -200,6 +201,8 @@ impl RowGroupAccessPlanFilter {
200201
}
201202
}
202203

204+
dbg!(&fully_contained_candidates_original_idx);
205+
203206
if !fully_contained_candidates_original_idx.is_empty() {
204207
// Use NotExpr to create the inverted predicate
205208
let inverted_expr =
@@ -216,10 +219,12 @@ impl RowGroupAccessPlanFilter {
216219
.collect::<Vec<_>>(),
217220
arrow_schema,
218221
};
219-
222+
dbg!(&inverted_pruning_stats.row_group_metadatas);
223+
dbg!(&inverted_predicate);
220224
if let Ok(inverted_values) =
221225
inverted_predicate.prune(&inverted_pruning_stats)
222226
{
227+
dbg!(&inverted_values);
223228
for (i, &original_row_group_idx) in
224229
fully_contained_candidates_original_idx.iter().enumerate()
225230
{

datafusion/pruning/src/pruning_predicate.rs

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1415,14 +1415,39 @@ fn build_predicate_expression(
14151415
.unwrap_or_else(|| unhandled_hook.handle(expr));
14161416
}
14171417
if let Some(not) = expr_any.downcast_ref::<phys_expr::NotExpr>() {
1418-
// match !col (don't do so recursively)
14191418
if let Some(col) = not.arg().as_any().downcast_ref::<phys_expr::Column>() {
14201419
return build_single_column_expr(col, schema, required_columns, true)
14211420
.unwrap_or_else(|| unhandled_hook.handle(expr));
1422-
} else {
1421+
}
1422+
1423+
let inner_expr = build_predicate_expression(
1424+
not.arg(),
1425+
schema,
1426+
required_columns,
1427+
unhandled_hook,
1428+
);
1429+
1430+
// Only apply NOT if the inner expression is NOT a true literal
1431+
// (because true literals may come from unhandled cases)
1432+
if is_always_true(&inner_expr) {
1433+
// Conservative approach: if inner returns true (possibly unhandled),
1434+
// then NOT should also return true (unhandled) to be safe
14231435
return unhandled_hook.handle(expr);
14241436
}
1437+
1438+
// Handle other boolean literals
1439+
if let Some(literal) = inner_expr.as_any().downcast_ref::<phys_expr::Literal>() {
1440+
if let ScalarValue::Boolean(Some(val)) = literal.value() {
1441+
return Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(
1442+
!val,
1443+
))));
1444+
}
1445+
}
1446+
1447+
// Apply NOT to the result
1448+
return Arc::new(phys_expr::NotExpr::new(inner_expr));
14251449
}
1450+
14261451
if let Some(in_list) = expr_any.downcast_ref::<phys_expr::InListExpr>() {
14271452
if !in_list.list().is_empty()
14281453
&& in_list.list().len() <= MAX_LIST_VALUE_SIZE_REWRITE
@@ -1868,7 +1893,7 @@ mod tests {
18681893

18691894
use super::*;
18701895
use datafusion_common::test_util::batches_to_string;
1871-
use datafusion_expr::{and, col, lit, or};
1896+
use datafusion_expr::{and, col, lit, not, or};
18721897
use insta::assert_snapshot;
18731898

18741899
use arrow::array::Decimal128Array;
@@ -4422,7 +4447,7 @@ mod tests {
44224447
true,
44234448
// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
44244449
true,
4425-
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match. (min, max) maybe truncate
4450+
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match. (min, max) maybe truncate
44264451
// original (min, max) maybe ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}")
44274452
true,
44284453
];
@@ -5175,4 +5200,66 @@ mod tests {
51755200
"c1_null_count@2 != row_count@3 AND c1_min@0 <= a AND a <= c1_max@1";
51765201
assert_eq!(res.to_string(), expected);
51775202
}
5203+
5204+
#[test]
5205+
fn test_not_expression_unhandled_inner_true() -> Result<()> {
5206+
// Test case: when inner expression returns true (unhandled),
5207+
// NOT should also return true (unhandled) for safety
5208+
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
5209+
5210+
// NOT(c1) for Int32 returns true because build_single_column_expr
5211+
// only handles boolean columns, so non-boolean columns fall back to unhandled_hook
5212+
let expr = not(col("c1"));
5213+
let predicate_expr =
5214+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
5215+
assert_eq!(predicate_expr.to_string(), "true");
5216+
Ok(())
5217+
}
5218+
5219+
#[test]
5220+
fn test_not_expression_boolean_literal_handling() -> Result<()> {
5221+
let schema = Schema::empty();
5222+
5223+
// NOT(false) -> true
5224+
let expr = not(lit(false));
5225+
let predicate_expr =
5226+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
5227+
assert_eq!(predicate_expr.to_string(), "true");
5228+
5229+
// NOT(true) -> true (conservatively)
5230+
let expr = not(lit(true));
5231+
let predicate_expr =
5232+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
5233+
assert_eq!(predicate_expr.to_string(), "true");
5234+
5235+
Ok(())
5236+
}
5237+
5238+
#[test]
5239+
fn test_not_expression_wraps_complex_expressions() -> Result<()> {
5240+
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
5241+
5242+
let expr = not(col("c1").gt(lit(5)));
5243+
let predicate_expr =
5244+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
5245+
5246+
let result_str = predicate_expr.to_string();
5247+
assert_eq!(
5248+
result_str,
5249+
"NOT c1_null_count@1 != row_count@2 AND c1_max@0 > 5"
5250+
);
5251+
5252+
// NOT(c1 = 10)
5253+
let expr = not(col("c1").eq(lit(10)));
5254+
let predicate_expr =
5255+
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
5256+
5257+
let result_str = predicate_expr.to_string();
5258+
assert_eq!(
5259+
result_str,
5260+
"NOT c1_null_count@2 != row_count@3 AND c1_min@0 <= 10 AND 10 <= c1_max@1"
5261+
);
5262+
5263+
Ok(())
5264+
}
51785265
}

0 commit comments

Comments
 (0)