fix: Preserve quoted mixed-case identifiers in the pivot_unpivot example (#21432)

niebayes · web-flow · commit d4e629fc67a4 · 2026-04-11T09:09:42.000Z
## Summary

This PR fixes a bug in the
`datafusion-examples/examples/relation_planner/pivot_unpivot.rs`
example implementation.

The example planner rewrites `PIVOT` into a `GROUP BY` plus `CASE`
expressions. During that
rewrite, it rebuilt column references using unquoted `col("...")`
expressions. That loses the
original identifier quoting and case-sensitivity, which breaks queries
that pivot on quoted
mixed-case columns such as `"pointNumber"`.

This PR preserves the original parsed column expression instead of
reconstructing it from a plain
string.

## Problem

Consider a query like:

```sql
SELECT *
FROM point_stats
PIVOT (
  MAX(max_value)
  FOR "pointNumber" IN ('16951' AS p16951, '16952' AS p16952)
)
ORDER BY ts
```

Before this change, the example planner:

1. Parsed `"pointNumber"` correctly as a quoted, case-sensitive
identifier.
2. Extracted its name as `pointNumber`.
3. Reconstructed new expressions with `col(&pivot_col_name)` for the `CASE`
expression and `col(...)` for `GROUP BY`.

That reconstruction treated the name as an unquoted column reference,
and `col(...)` normalizes unquoted identifiers to lowercase, so the
rebuilt expression no longer matched the case-sensitive schema field.
In practice, this means the planner could end up looking for
`pointnumber` while the schema field was actually named `pointNumber`.

## Root Cause

Two places in the example rewrite logic rebuilt column references from
bare strings:

1. The generated `CASE` expression for the pivot column
2. The inferred `GROUP BY` expressions

That is fine for simple lowercase identifiers, but it is not correct for
quoted identifiers or
qualified fields because the reconstructed expression no longer carries
the original identifier
semantics.

## Fix

The fix is minimal:

1. Reuse the already planned `pivot_col` expression when building the
`CASE` expression.
2. Build `GROUP BY` expressions directly from the input schema via
`Expr::from(...)` rather than
   re-creating them with `col(field_name)`.

This preserves:

- quoted mixed-case identifiers
- qualifiers
- original field resolution semantics

## Code Changes

File changed:

- `datafusion-examples/examples/relation_planner/pivot_unpivot.rs`

Main changes:

- Replace:

```rust
case(col(&pivot_col_name))
```

with:

```rust
case(pivot_col.clone())
```

- Replace string-based `GROUP BY` reconstruction:

```rust
schema
    .fields()
    .iter()
    .map(|f| f.name().as_str())
    ...
    .map(col)
```

with schema-derived expressions:

```rust
schema
    .iter()
    .filter(...)
    .map(Expr::from)
```

## Example Coverage

This PR also adds an additional example scenario to the same file:

- a `point_stats` input table
- a `PIVOT` query using quoted mixed-case column `"pointNumber"`
- a snapshot asserting the expected output

This makes the bug and the fix visible directly in the example itself.
diff --git a/datafusion-examples/examples/relation_planner/pivot_unpivot.rs b/datafusion-examples/examples/relation_planner/pivot_unpivot.rs
@@ -217,6 +217,25 @@ async fn run_examples(ctx: &SessionContext) -> Result<()> {
     +--------+
     ");
 
+    // Example 7: PIVOT on a quoted mixed-case column
+    // Reuses the parsed column expression so quoted identifiers keep their case.
+    let results = run_example(
+        ctx,
+        "Example 7: PIVOT with quoted mixed-case column",
+        r#"SELECT * FROM point_stats
+           PIVOT (MAX(max_value) FOR "pointNumber" IN ('16951' AS p16951, '16952' AS p16952)) AS p
+           ORDER BY ts"#,
+    )
+    .await?;
+    assert_snapshot!(results, @r"
+    +----------------------+------+--------+--------+
+    | ts                   | port | p16951 | p16952 |
+    +----------------------+------+--------+--------+
+    | 2024-09-01T10:00:00Z | 2411 | 10     | 20     |
+    | 2024-09-01T10:01:00Z | 2411 | 30     | 40     |
+    +----------------------+------+--------+--------+
+    ");
+
     Ok(())
 }
 
@@ -288,6 +307,34 @@ fn register_sample_data(ctx: &SessionContext) -> Result<()> {
         ])?,
     )?;
 
+    // point_stats: grouped data with a quoted mixed-case pivot column.
+    ctx.register_batch(
+        "point_stats",
+        RecordBatch::try_from_iter(vec![
+            (
+                "ts",
+                Arc::new(StringArray::from(vec![
+                    "2024-09-01T10:00:00Z",
+                    "2024-09-01T10:00:00Z",
+                    "2024-09-01T10:01:00Z",
+                    "2024-09-01T10:01:00Z",
+                ])) as ArrayRef,
+            ),
+            (
+                "pointNumber",
+                Arc::new(StringArray::from(vec!["16951", "16952", "16951", "16952"])),
+            ),
+            (
+                "port",
+                Arc::new(StringArray::from(vec!["2411", "2411", "2411", "2411"])),
+            ),
+            (
+                "max_value",
+                Arc::new(Int64Array::from(vec![10, 20, 30, 40])),
+            ),
+        ])?,
+    )?;
+
     Ok(())
 }
 
@@ -415,11 +462,12 @@ fn plan_pivot(
         .collect();
 
     let group_by_cols: Vec<Expr> = schema
-        .fields()
         .iter()
-        .map(|f| f.name().as_str())
-        .filter(|name| *name != pivot_col_name.as_str() && !agg_input_cols.contains(name))
-        .map(col)
+        .filter(|(_, field)| {
+            let name = field.name();
+            name != pivot_col_name.as_str() && !agg_input_cols.contains(&name.as_str())
+        })
+        .map(Expr::from)
         .collect();
 
     // Build CASE expressions for each (aggregate, pivot_value) pair
@@ -434,7 +482,7 @@ fn plan_pivot(
 
         for (value_alias, pivot_value) in &pivot_values {
             // CASE pivot_col WHEN pivot_value THEN agg_input END
-            let case_expr = case(col(&pivot_col_name))
+            let case_expr = case(pivot_col.clone())
                 .when(pivot_value.clone(), agg_input.clone())
                 .end()?;