[Minor] Remove redundant ProjectionExec nodes in sort-based plans (#20780)

Dandandan · claude · web-flow · commit 86cb815e03e5 · 2026-03-11T06:52:04.000Z
## Which issue does this PR close?

- Closes #.

## Rationale for this change

ClickBench quueries (Q7, Q15, Q16, Q18) have some redundant projections
for sorting based on count.
Probably not a (measurable) improvement, but the plan looks better (in
case of non-TopK it could probably be measurable).

## What changes are included in this PR?

## Are these changes tested?
Existing tests.

## Are there any user-facing changes?

---------

Co-authored-by: Claude &lt;noreply@anthropic.com&gt;
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
@@ -3001,24 +3001,22 @@ async fn test_count_wildcard_on_sort() -> Result<()> {
     assert_snapshot!(
         pretty_format_batches(&sql_results).unwrap(),
         @r"
-    +---------------+------------------------------------------------------------------------------------------------------------+
-    | plan_type     | plan                                                                                                       |
-    +---------------+------------------------------------------------------------------------------------------------------------+
-    | logical_plan  | Projection: t1.b, count(*)                                                                                 |
-    |               |   Sort: count(Int64(1)) AS count(*) AS count(*) ASC NULLS LAST                                             |
-    |               |     Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1))                                         |
-    |               |       Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]]                                                |
-    |               |         TableScan: t1 projection=[b]                                                                       |
-    | physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)]                                                    |
-    |               |   SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST]                                              |
-    |               |     SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true]                               |
-    |               |       ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] |
-    |               |         AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))]                       |
-    |               |           RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1                                 |
-    |               |             AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))]                            |
-    |               |               DataSourceExec: partitions=1, partition_sizes=[1]                                            |
-    |               |                                                                                                            |
-    +---------------+------------------------------------------------------------------------------------------------------------+
+    +---------------+------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                               |
+    +---------------+------------------------------------------------------------------------------------+
+    | logical_plan  | Sort: count(*) AS count(*) ASC NULLS LAST                                          |
+    |               |   Projection: t1.b, count(Int64(1)) AS count(*)                                    |
+    |               |     Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]]                          |
+    |               |       TableScan: t1 projection=[b]                                                 |
+    | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST]                               |
+    |               |   SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true]         |
+    |               |     ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*)]                 |
+    |               |       AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] |
+    |               |         RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=1           |
+    |               |           AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))]      |
+    |               |             DataSourceExec: partitions=1, partition_sizes=[1]                      |
+    |               |                                                                                    |
+    +---------------+------------------------------------------------------------------------------------+
     "
     );
 
@@ -3028,7 +3026,7 @@ async fn test_count_wildcard_on_sort() -> Result<()> {
     +---------------+----------------------------------------------------------------------------+
     | plan_type     | plan                                                                       |
     +---------------+----------------------------------------------------------------------------+
-    | logical_plan  | Sort: count(*) ASC NULLS LAST                                              |
+    | logical_plan  | Sort: count(*) AS count(*) ASC NULLS LAST                                  |
     |               |   Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]]        |
     |               |     TableScan: t1 projection=[b]                                           |
     | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST]                       |
diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs
@@ -77,8 +77,10 @@ fn rewrite_in_terms_of_projection(
     // assumption is that each item in exprs, such as "b + c" is
     // available as an output column named "b + c"
     expr.transform(|expr| {
-        // search for unnormalized names first such as "c1" (such as aliases)
-        if let Some(found) = proj_exprs.iter().find(|a| (**a) == expr) {
+        // search for unnormalized names first such as "c1" (such as aliases).
+        // Also look inside aliases so e.g. `count(Int64(1))` matches
+        // `count(Int64(1)) AS count(*)`.
+        if let Some(found) = proj_exprs.iter().find(|a| expr_match(&expr, a)) {
             let (qualifier, field_name) = found.qualified_name();
             let col = Expr::Column(Column::new(qualifier, field_name));
             return Ok(Transformed::yes(col));
@@ -235,18 +237,22 @@ mod test {
             TestCase {
                 desc: r#"min(c2) --> "min(c2)" -- (column *named* "min(t.c2)"!)"#,
                 input: sort(min(col("c2"))),
-                expected: sort(col("min(t.c2)")),
+                expected: sort(Expr::Column(Column::new_unqualified("min(t.c2)"))),
             },
             TestCase {
                 desc: r#"c1 + min(c2) --> "c1 + min(c2)" -- (column *named* "min(t.c2)"!)"#,
                 input: sort(col("c1") + min(col("c2"))),
                 // should be "c1" not t.c1
-                expected: sort(col("c1") + col("min(t.c2)")),
+                expected: sort(
+                    col("c1") + Expr::Column(Column::new_unqualified("min(t.c2)")),
+                ),
             },
             TestCase {
                 desc: r#"avg(c3) --> "avg(t.c3)" as average (column *named* "avg(t.c3)", aliased)"#,
                 input: sort(avg(col("c3"))),
-                expected: sort(col("avg(t.c3)").alias("average")),
+                expected: sort(
+                    Expr::Column(Column::new_unqualified("avg(t.c3)")).alias("average"),
+                ),
             },
         ];
 
diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs
@@ -1984,7 +1984,7 @@ fn test_complex_order_by_with_grouping() -> Result<()> {
     }, {
         assert_snapshot!(
             sql,
-            @r#"SELECT j1.j1_id, j1.j1_string, lochierarchy FROM (SELECT j1.j1_id, j1.j1_string, (grouping(j1.j1_id) + grouping(j1.j1_string)) AS lochierarchy, grouping(j1.j1_string), grouping(j1.j1_id) FROM j1 GROUP BY ROLLUP (j1.j1_id, j1.j1_string)) ORDER BY lochierarchy DESC NULLS FIRST, CASE WHEN (("grouping(j1.j1_id)" + "grouping(j1.j1_string)") = 0) THEN j1.j1_id END ASC NULLS LAST LIMIT 100"#
+            @"SELECT j1.j1_id, j1.j1_string, (grouping(j1.j1_id) + grouping(j1.j1_string)) AS lochierarchy FROM j1 GROUP BY ROLLUP (j1.j1_id, j1.j1_string) ORDER BY lochierarchy DESC NULLS FIRST, CASE WHEN (lochierarchy = 0) THEN j1.j1_id END ASC NULLS LAST LIMIT 100"
         );
     });
 
diff --git a/datafusion/sqllogictest/test_files/clickbench.slt b/datafusion/sqllogictest/test_files/clickbench.slt