Skip to content

Commit 4d3dc91

Browse files
authored
Merge pull request #136 from DataDog/pablo.abadrubio/cherry-pick/apache-pr-22620-20260602
[branch-53] Cherry-pick apache#22620
2 parents c96c285 + b502200 commit 4d3dc91

2 files changed

Lines changed: 95 additions & 0 deletions

File tree

datafusion/optimizer/src/extract_leaf_expressions.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,6 +1157,13 @@ fn try_push_into_inputs(
11571157
return Ok(None);
11581158
}
11591159

1160+
// Unnest may output a column with the same name but a different value/type
1161+
// than its input column. Name-based routing cannot distinguish those.
1162+
// On top of that, Unnest can't go through the `node.with_new_exprs(node.expressions(), new_inputs)` rebuild: its `with_new_exprs` asserts that no expressions are passed.
1163+
if matches!(node, LogicalPlan::Unnest(_)) {
1164+
return Ok(None);
1165+
}
1166+
11601167
// SubqueryAlias remaps qualifiers between input and output.
11611168
// Rewrite pairs/columns from alias-space to input-space before routing.
11621169
let remapped = if let LogicalPlan::SubqueryAlias(sa) = node {
@@ -3050,4 +3057,48 @@ mod tests {
30503057

30513058
Ok(())
30523059
}
3060+
3061+
/// Regression test for the `Assertion failed: expr.is_empty(): Unnest`
3062+
/// internal error.
3063+
///
3064+
/// `try_push_into_inputs` rebuilds the parent node via
3065+
/// `node.with_new_exprs(node.expressions(), new_inputs)`. For `Unnest`,
3066+
/// `apply_expressions` exposes the `exec_columns` as `Expr::Column`s
3067+
/// (so `expressions()` is **non-empty**), but `with_new_exprs` for
3068+
/// `Unnest` immediately calls `assert_no_expressions(expr)?` and errors
3069+
/// out. The optimizer should treat `Unnest` as a barrier and bail
3070+
/// instead of attempting to push through it.
3071+
#[test]
3072+
fn test_no_push_through_unnest() -> Result<()> {
3073+
use arrow::datatypes::{DataType, Field, Schema};
3074+
3075+
let schema = Schema::new(vec![
3076+
Field::new("list_col", DataType::new_list(DataType::Int32, true), true),
3077+
Field::new("other_col", DataType::Int32, true),
3078+
]);
3079+
let table_scan =
3080+
datafusion_expr::logical_plan::table_scan(Some("t"), &schema, None)?
3081+
.build()?;
3082+
let plan = LogicalPlanBuilder::from(table_scan)
3083+
.unnest_column("list_col")?
3084+
.filter(leaf_udf(col("list_col"), "x").eq(lit(1i32)))?
3085+
.build()?;
3086+
3087+
let ctx = OptimizerContext::new().with_max_passes(1);
3088+
let optimizer = Optimizer::with_rules(vec![
3089+
Arc::new(ExtractLeafExpressions::new()),
3090+
Arc::new(PushDownLeafProjections::new()),
3091+
]);
3092+
let optimized = optimizer.optimize(plan, &ctx, |_, _| {})?;
3093+
3094+
insta::assert_snapshot!(format!("{optimized}"), @r#"
3095+
Projection: list_col, t.other_col
3096+
Filter: __datafusion_extracted_1 = Int32(1)
3097+
Projection: leaf_udf(list_col, Utf8("x")) AS __datafusion_extracted_1, list_col, t.other_col
3098+
Unnest: lists[t.list_col|depth=1] structs[]
3099+
TableScan: t
3100+
"#);
3101+
3102+
Ok(())
3103+
}
30533104
}

datafusion/sqllogictest/test_files/unnest.slt

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,3 +1233,47 @@ physical_plan
12331233
# cleanup
12341234
statement ok
12351235
drop table t;
1236+
1237+
## Regression: pushing a leaf-extracted projection (containing get_field,
1238+
## which has MoveTowardsLeafNodes placement) through an `Unnest` used to
1239+
## trip `Assertion failed: expr.is_empty(): Unnest` inside
1240+
## `PushDownLeafProjections`. The optimizer must not try to push down these
1241+
## projections through an `Unnest` and should produce a valid plan.
1242+
1243+
statement ok
1244+
CREATE TABLE struct_and_list_table
1245+
AS VALUES
1246+
(struct(1, 2), [10, 20, 30]),
1247+
(struct(3, 4), [40, 50]);
1248+
1249+
query I
1250+
SELECT sum(get_field(s, 'c0'))
1251+
FROM (SELECT s, unnest(arr)
1252+
FROM (SELECT column1 AS s, column2 AS arr
1253+
FROM struct_and_list_table));
1254+
----
1255+
9
1256+
1257+
statement ok
1258+
DROP TABLE struct_and_list_table;
1259+
1260+
## Regression: get_field directly references the struct produced by unnest.
1261+
## This covers the case where the leaf-extracted expression depends on the
1262+
## unnested column itself rather than a sibling input column below the Unnest.
1263+
1264+
statement ok
1265+
CREATE TABLE list_struct_table
1266+
AS VALUES
1267+
([struct(1, 'a'), struct(2, 'b')]),
1268+
([struct(3, 'c')]);
1269+
1270+
query IT
1271+
SELECT get_field(unnest(column1), 'c0'), get_field(unnest(column1), 'c1')
1272+
FROM list_struct_table;
1273+
----
1274+
1 a
1275+
2 b
1276+
3 c
1277+
1278+
statement ok
1279+
DROP TABLE list_struct_table;

0 commit comments

Comments
 (0)