Skip to content

Commit eb5dae5

Browse files
committed
fix: preserve duplicate GROUPING SETS rows
1 parent 4010a55 commit eb5dae5

File tree

13 files changed

+427
-110
lines changed

13 files changed

+427
-110
lines changed

datafusion/core/src/dataframe/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -654,14 +654,14 @@ impl DataFrame {
654654
.aggregate(group_expr, aggr_expr)?
655655
.build()?;
656656
let plan = if is_grouping_set {
657-
let grouping_id_pos = plan.schema().fields().len() - 1 - aggr_expr_len;
657+
let grouping_id_pos = plan.schema().fields().len() - 2 - aggr_expr_len;
658658
// For grouping sets we do a project to not expose the internal grouping id
659659
let exprs = plan
660660
.schema()
661661
.columns()
662662
.into_iter()
663663
.enumerate()
664-
.filter(|(idx, _)| *idx != grouping_id_pos)
664+
.filter(|(idx, _)| *idx < grouping_id_pos || *idx >= grouping_id_pos + 2)
665665
.map(|(_, column)| Expr::Column(column))
666666
.collect::<Vec<_>>();
667667
LogicalPlanBuilder::from(plan).project(exprs)?.build()?

datafusion/core/tests/sql/aggregates/basic.rs

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,83 @@ async fn count_aggregated_cube() -> Result<()> {
175175
Ok(())
176176
}
177177

178+
#[tokio::test]
179+
async fn duplicate_grouping_sets_are_preserved() -> Result<()> {
180+
let ctx = SessionContext::new();
181+
let schema = Arc::new(Schema::new(vec![
182+
Field::new("deptno", DataType::Int32, false),
183+
Field::new("job", DataType::Utf8, true),
184+
Field::new("sal", DataType::Int32, true),
185+
Field::new("comm", DataType::Int32, true),
186+
]));
187+
let batch = RecordBatch::try_new(
188+
Arc::clone(&schema),
189+
vec![
190+
Arc::new(Int32Array::from(vec![10, 20])),
191+
Arc::new(StringArray::from(vec![Some("CLERK"), Some("MANAGER")])),
192+
Arc::new(Int32Array::from(vec![1300, 3000])),
193+
Arc::new(Int32Array::from(vec![None, None])),
194+
],
195+
)?;
196+
let provider = MemTable::try_new(Arc::clone(&schema), vec![vec![batch]])?;
197+
ctx.register_table("dup_grouping_sets", Arc::new(provider))?;
198+
199+
let results = plan_and_collect(
200+
&ctx,
201+
"
202+
SELECT deptno, job, sal, sum(comm) AS sum_comm,
203+
grouping(deptno) AS deptno_flag,
204+
grouping(job) AS job_flag,
205+
grouping(sal) AS sal_flag
206+
FROM dup_grouping_sets
207+
GROUP BY GROUPING SETS ((deptno, job), (deptno, sal), (deptno, job))
208+
ORDER BY deptno, job, sal, deptno_flag, job_flag, sal_flag
209+
",
210+
)
211+
.await?;
212+
213+
assert_eq!(results.len(), 1);
214+
assert_snapshot!(batches_to_string(&results), @r"
215+
+--------+---------+------+----------+-------------+----------+----------+
216+
| deptno | job | sal | sum_comm | deptno_flag | job_flag | sal_flag |
217+
+--------+---------+------+----------+-------------+----------+----------+
218+
| 10 | CLERK | | | 0 | 0 | 1 |
219+
| 10 | CLERK | | | 0 | 0 | 1 |
220+
| 10 | | 1300 | | 0 | 1 | 0 |
221+
| 20 | MANAGER | | | 0 | 0 | 1 |
222+
| 20 | MANAGER | | | 0 | 0 | 1 |
223+
| 20 | | 3000 | | 0 | 1 | 0 |
224+
+--------+---------+------+----------+-------------+----------+----------+
225+
");
226+
227+
let results = plan_and_collect(
228+
&ctx,
229+
"
230+
SELECT deptno, job, sal,
231+
grouping(deptno, job, sal) AS grouping_id
232+
FROM dup_grouping_sets
233+
GROUP BY GROUPING SETS ((deptno, job), (deptno, sal), (deptno, job))
234+
ORDER BY deptno, job, sal, grouping_id
235+
",
236+
)
237+
.await?;
238+
239+
assert_eq!(results.len(), 1);
240+
assert_snapshot!(batches_to_string(&results), @r"
241+
+--------+---------+------+-------------+
242+
| deptno | job | sal | grouping_id |
243+
+--------+---------+------+-------------+
244+
| 10 | CLERK | | 1 |
245+
| 10 | CLERK | | 1 |
246+
| 10 | | 1300 | 2 |
247+
| 20 | MANAGER | | 1 |
248+
| 20 | MANAGER | | 1 |
249+
| 20 | | 3000 | 2 |
250+
+--------+---------+------+-------------+
251+
");
252+
Ok(())
253+
}
254+
178255
async fn run_count_distinct_integers_aggregated_scenario(
179256
partitions: Vec<Vec<(&str, u64)>>,
180257
) -> Result<Vec<RecordBatch>> {

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3528,6 +3528,11 @@ impl Aggregate {
35283528
)
35293529
.into(),
35303530
));
3531+
qualified_fields.push((
3532+
None,
3533+
Field::new(Self::INTERNAL_GROUPING_ORDINAL, DataType::UInt32, false)
3534+
.into(),
3535+
));
35313536
}
35323537

35333538
qualified_fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?);
@@ -3592,9 +3597,13 @@ impl Aggregate {
35923597
static INTERNAL_ID_EXPR: LazyLock<Expr> = LazyLock::new(|| {
35933598
Expr::Column(Column::from_name(Aggregate::INTERNAL_GROUPING_ID))
35943599
});
3600+
static INTERNAL_ORDINAL_EXPR: LazyLock<Expr> = LazyLock::new(|| {
3601+
Expr::Column(Column::from_name(Aggregate::INTERNAL_GROUPING_ORDINAL))
3602+
});
35953603
let mut exprs = grouping_set_to_exprlist(self.group_expr.as_slice())?;
35963604
if self.is_grouping_set() {
35973605
exprs.push(&INTERNAL_ID_EXPR);
3606+
exprs.push(&INTERNAL_ORDINAL_EXPR);
35983607
}
35993608
exprs.extend(self.aggr_expr.iter());
36003609
debug_assert!(exprs.len() == self.schema.fields().len());
@@ -3642,6 +3651,13 @@ impl Aggregate {
36423651
/// with `NULL` values. To handle these cases correctly, we must distinguish
36433652
/// between an actual `NULL` value in a column and a column being excluded from the set.
36443653
pub const INTERNAL_GROUPING_ID: &'static str = "__grouping_id";
3654+
3655+
/// Internal column used when duplicate grouping sets are present.
3656+
///
3657+
/// This column stores the ordinal of the grouping set among all grouping sets
3658+
/// with the same semantic grouping mask, allowing the physical aggregation key
3659+
/// to distinguish duplicate grouping sets without overloading `__grouping_id`.
3660+
pub const INTERNAL_GROUPING_ORDINAL: &'static str = "__grouping_ordinal";
36453661
}
36463662

36473663
// Manual implementation needed because of `schema` field. Comparison excludes this field.

datafusion/expr/src/utils.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
5959
"Invalid group by expressions, GroupingSet must be the only expression"
6060
);
6161
}
62-
// Groupings sets have an additional integral column for the grouping id
63-
Ok(grouping_set.distinct_expr().len() + 1)
62+
// Grouping sets have additional internal columns for grouping semantics and
63+
// duplicate-grouping-set ordinals.
64+
Ok(grouping_set.distinct_expr().len() + 2)
6465
} else {
6566
grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len())
6667
}

datafusion/optimizer/src/analyzer/resolve_grouping_function.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ fn replace_grouping_exprs(
8484
let columns = schema.columns();
8585
let mut new_agg_expr = Vec::new();
8686
let mut projection_exprs = Vec::new();
87-
let grouping_id_len = if is_grouping_set { 1 } else { 0 };
87+
let grouping_id_len = if is_grouping_set { 2 } else { 0 };
8888
let group_expr_len = columns.len() - aggr_expr.len() - grouping_id_len;
8989
projection_exprs.extend(
9090
columns
@@ -204,7 +204,6 @@ fn grouping_function_on_id(
204204
Expr::Literal(ScalarValue::from(value as u64), None)
205205
}
206206
};
207-
208207
let grouping_id_column = Expr::Column(Column::from(Aggregate::INTERNAL_GROUPING_ID));
209208
// The grouping call is exactly our internal grouping id
210209
if args.len() == group_by_expr_count

datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ fn aggregate_output_exprs(group_expr: &[Expr]) -> Result<Vec<Expr>> {
238238
output_exprs.push(Expr::Column(Column::from_name(
239239
Aggregate::INTERNAL_GROUPING_ID,
240240
)));
241+
output_exprs.push(Expr::Column(Column::from_name(
242+
Aggregate::INTERNAL_GROUPING_ORDINAL,
243+
)));
241244
}
242245

243246
Ok(output_exprs)

datafusion/optimizer/src/single_distinct_to_groupby.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ mod tests {
380380
assert_optimized_plan_equal!(
381381
plan,
382382
@r"
383-
Aggregate: groupBy=[[GROUPING SETS ((test.a), (test.b))]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]
383+
Aggregate: groupBy=[[GROUPING SETS ((test.a), (test.b))]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, __grouping_ordinal:UInt32, count(DISTINCT test.c):Int64]
384384
TableScan: test [a:UInt32, b:UInt32, c:UInt32]
385385
"
386386
)
@@ -401,7 +401,7 @@ mod tests {
401401
assert_optimized_plan_equal!(
402402
plan,
403403
@r"
404-
Aggregate: groupBy=[[CUBE (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]
404+
Aggregate: groupBy=[[CUBE (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, __grouping_ordinal:UInt32, count(DISTINCT test.c):Int64]
405405
TableScan: test [a:UInt32, b:UInt32, c:UInt32]
406406
"
407407
)
@@ -423,7 +423,7 @@ mod tests {
423423
assert_optimized_plan_equal!(
424424
plan,
425425
@r"
426-
Aggregate: groupBy=[[ROLLUP (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]
426+
Aggregate: groupBy=[[ROLLUP (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, __grouping_ordinal:UInt32, count(DISTINCT test.c):Int64]
427427
TableScan: test [a:UInt32, b:UInt32, c:UInt32]
428428
"
429429
)

datafusion/physical-optimizer/src/enforce_distribution.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,25 +1280,31 @@ pub fn ensure_distribution(
12801280
// Allow subset satisfaction when:
12811281
// 1. Current partition count >= threshold
12821282
// 2. Not a partitioned join since must use exact hash matching for joins
1283-
// 3. Not a grouping set aggregate (requires exact hash including __grouping_id)
1283+
// 3. Not a grouping set aggregate (requires exact hash including internal grouping columns)
12841284
let current_partitions = child.plan.output_partitioning().partition_count();
12851285

1286-
// Check if the hash partitioning requirement includes __grouping_id column.
1286+
// Check if the hash partitioning requirement includes internal grouping columns.
12871287
// Grouping set aggregates (ROLLUP, CUBE, GROUPING SETS) require exact hash
1288-
// partitioning on all group columns including __grouping_id to ensure partial
1289-
// aggregates from different partitions are correctly combined.
1290-
let requires_grouping_id = matches!(&requirement, Distribution::HashPartitioned(exprs)
1288+
// partitioning on all group columns including the internal grouping columns
1289+
// to ensure partial aggregates from different partitions are correctly combined.
1290+
let requires_grouping_key = matches!(&requirement, Distribution::HashPartitioned(exprs)
12911291
if exprs.iter().any(|expr| {
12921292
expr.as_any()
12931293
.downcast_ref::<Column>()
1294-
.is_some_and(|col| col.name() == Aggregate::INTERNAL_GROUPING_ID)
1294+
.is_some_and(|col| {
1295+
matches!(
1296+
col.name(),
1297+
Aggregate::INTERNAL_GROUPING_ID
1298+
| Aggregate::INTERNAL_GROUPING_ORDINAL
1299+
)
1300+
})
12951301
})
12961302
);
12971303

12981304
let allow_subset_satisfy_partitioning = current_partitions
12991305
>= subset_satisfaction_threshold
13001306
&& !is_partitioned_join
1301-
&& !requires_grouping_id;
1307+
&& !requires_grouping_key;
13021308

13031309
// When `repartition_file_scans` is set, attempt to increase
13041310
// parallelism at the source.

0 commit comments

Comments (0)