Skip to content

Commit 4bf37a0

Browse files
authored
fix: keep name-based validation for column expressions with struct (delta-io#2440)
## What changes are proposed in this pull request? Partially reverts delta-io#2160. The Column expression path in `evaluate_expression` now uses name-based validation (`TypesAndNames`) via `validate_array_type` instead of ordinal-based (`TypesOnly`). The `ValidationMode::TypesOnly` variant and its `ensure_data_types` tests are kept for future use. ## How was this change tested? Updated existing tests to expect name-mismatch errors instead of success.
1 parent ac9dc19 commit 4bf37a0

2 files changed

Lines changed: 13 additions & 82 deletions

File tree

kernel/src/engine/arrow_expression/evaluate_expression.rs

Lines changed: 11 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -265,15 +265,7 @@ pub fn evaluate_expression(
265265
(Literal(scalar), _) => {
266266
validate_array_type(scalar.to_array(batch.num_rows())?, result_type)
267267
}
268-
(Column(name), _) => {
269-
// Column extraction uses ordinal-based struct validation because column mapping
270-
// can cause physical/logical name mismatches. apply_schema handles renaming.
271-
let arr = extract_column(batch, name)?;
272-
if let Some(expected) = result_type {
273-
ensure_data_types(expected, arr.data_type(), ValidationMode::TypesOnly)?;
274-
}
275-
Ok(arr)
276-
}
268+
(Column(name), _) => validate_array_type(extract_column(batch, name)?, result_type),
277269
(Struct(fields, nullability), Some(DataType::Struct(output_schema))) => {
278270
evaluate_struct_expression(fields, batch, output_schema, nullability.as_ref())
279271
}
@@ -2396,7 +2388,7 @@ mod tests {
23962388
}
23972389

23982390
#[test]
2399-
fn column_extract_struct_with_mismatched_field_names() {
2391+
fn column_extract_struct_rejects_mismatched_field_names() {
24002392
let batch = make_struct_batch(
24012393
vec![
24022394
ArrowField::new("col-abc-001", ArrowDataType::Int64, true),
@@ -2408,7 +2400,6 @@ mod tests {
24082400
],
24092401
);
24102402

2411-
// Logical names differ from physical names due to column mapping
24122403
let logical_type = DataType::try_struct_type([
24132404
StructField::nullable("my_column", DataType::LONG),
24142405
StructField::nullable("other_column", DataType::LONG),
@@ -2417,47 +2408,22 @@ mod tests {
24172408

24182409
let expr = column_expr!("stats");
24192410
let result = evaluate_expression(&expr, &batch, Some(&logical_type));
2420-
2421-
// Ordinal-based validation passes: same field count and types by position.
2422-
// The downstream apply_schema transformation handles renaming.
2423-
let arr = result.expect("should succeed with mismatched names but matching types");
2424-
let struct_arr = arr.as_any().downcast_ref::<StructArray>().unwrap();
2425-
assert_eq!(struct_arr.num_columns(), 2);
2426-
assert_eq!(struct_arr.len(), 2);
2427-
}
2428-
2429-
#[test]
2430-
fn column_extract_struct_rejects_mismatched_field_count() {
2431-
let batch = make_struct_batch(
2432-
vec![ArrowField::new("col-abc-001", ArrowDataType::Int64, true)],
2433-
vec![Arc::new(Int64Array::from(vec![Some(1), Some(2)]))],
2434-
);
2435-
2436-
let logical_type = DataType::try_struct_type([
2437-
StructField::nullable("a", DataType::LONG),
2438-
StructField::nullable("b", DataType::LONG),
2439-
])
2440-
.unwrap();
2441-
2442-
let expr = column_expr!("stats");
2443-
let result = evaluate_expression(&expr, &batch, Some(&logical_type));
2444-
assert_result_error_with_message(result, "Struct field count mismatch");
2411+
assert_result_error_with_message(result, "Missing Struct fields");
24452412
}
24462413

24472414
#[test]
24482415
fn column_extract_struct_rejects_mismatched_child_types() {
24492416
let batch = make_struct_batch(
24502417
vec![
2451-
ArrowField::new("col-abc-001", ArrowDataType::Int64, true),
2452-
ArrowField::new("col-abc-002", ArrowDataType::Utf8, true),
2418+
ArrowField::new("a", ArrowDataType::Int64, true),
2419+
ArrowField::new("b", ArrowDataType::Utf8, true),
24532420
],
24542421
vec![
24552422
Arc::new(Int64Array::from(vec![Some(1)])),
24562423
Arc::new(StringArray::from(vec![Some("x")])),
24572424
],
24582425
);
24592426

2460-
// Expect two LONG columns, but the second arrow field is Utf8
24612427
let logical_type = DataType::try_struct_type([
24622428
StructField::nullable("a", DataType::LONG),
24632429
StructField::nullable("b", DataType::LONG),
@@ -2470,7 +2436,7 @@ mod tests {
24702436
}
24712437

24722438
#[test]
2473-
fn column_extract_struct_with_matching_names_still_works() {
2439+
fn column_extract_struct_with_matching_names_works() {
24742440
let batch = make_struct_batch(
24752441
vec![
24762442
ArrowField::new("a", ArrowDataType::Int64, true),
@@ -2493,15 +2459,11 @@ mod tests {
24932459
assert!(result.is_ok());
24942460
}
24952461

2496-
/// Exercises the exact code path from `get_add_transform_expr` where a `struct_from`
2497-
/// expression wraps `column_expr!("add.stats_parsed")`. When the checkpoint parquet has
2498-
/// stats_parsed with physical column names (e.g. `col-abc-001`) but the output schema
2499-
/// uses logical names (e.g. `id`), `evaluate_struct_expression` calls
2500-
/// `evaluate_expression(Column, struct_result_type)` with mismatched field names.
2501-
/// Without ordinal-based validation this fails with a name mismatch error.
2462+
/// When a `struct_from` expression wraps a `Column` referencing stats_parsed, and the
2463+
/// checkpoint parquet has physical column names (e.g. `col-abc-001`) but the output schema
2464+
/// uses logical names (e.g. `id`), name-based validation correctly rejects the mismatch.
25022465
#[test]
2503-
fn struct_from_with_column_tolerates_nested_name_mismatch() {
2504-
// Build a batch mimicking checkpoint data: add.stats_parsed uses physical names
2466+
fn struct_from_with_column_rejects_nested_name_mismatch() {
25052467
let stats_fields: Vec<ArrowField> = vec![
25062468
ArrowField::new("col-abc-001", ArrowDataType::Int64, true),
25072469
ArrowField::new("col-abc-002", ArrowDataType::Int64, true),
@@ -2541,7 +2503,6 @@ mod tests {
25412503
)]);
25422504
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(add_struct)]).unwrap();
25432505

2544-
// struct_from mimicking get_add_transform_expr: wraps a Column referencing stats_parsed
25452506
let expr = Expr::struct_from([
25462507
column_expr_ref!("add.path"),
25472508
column_expr_ref!("add.stats_parsed"),
@@ -2561,36 +2522,6 @@ mod tests {
25612522
.unwrap();
25622523

25632524
let result = evaluate_expression(&expr, &batch, Some(&output_type));
2564-
result.expect("struct_from with Column sub-expression should tolerate field name mismatch");
2565-
}
2566-
2567-
#[test]
2568-
fn column_extract_nested_struct_with_mismatched_names() {
2569-
let inner_fields = vec![ArrowField::new("phys-inner", ArrowDataType::Int64, true)];
2570-
let inner_struct = ArrowDataType::Struct(inner_fields.clone().into());
2571-
let batch = make_struct_batch(
2572-
vec![ArrowField::new("phys-outer", inner_struct, true)],
2573-
vec![Arc::new(
2574-
StructArray::try_new(
2575-
inner_fields.into(),
2576-
vec![Arc::new(Int64Array::from(vec![Some(42)]))],
2577-
None,
2578-
)
2579-
.unwrap(),
2580-
)],
2581-
);
2582-
2583-
let logical_type = DataType::try_struct_type([StructField::nullable(
2584-
"logical_outer",
2585-
DataType::struct_type_unchecked([StructField::nullable(
2586-
"logical_inner",
2587-
DataType::LONG,
2588-
)]),
2589-
)])
2590-
.unwrap();
2591-
2592-
let expr = column_expr!("stats");
2593-
let result = evaluate_expression(&expr, &batch, Some(&logical_type));
2594-
assert!(result.is_ok());
2525+
assert_result_error_with_message(result, "Missing Struct fields");
25952526
}
25962527
}

kernel/src/engine/ensure_data_types.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ use crate::{DeltaResult, Error};
1818
#[internal_api]
1919
pub(crate) enum ValidationMode {
2020
/// Check types only. Struct fields are matched by ordinal position, not by name.
21-
/// Nullability and metadata are not checked. Used by the expression evaluator where
22-
/// column mapping can cause physical/logical name mismatches.
21+
/// Nullability and metadata are not checked.
22+
#[allow(dead_code)]
2323
TypesOnly,
2424
/// Check types and match struct fields by name, but skip nullability and metadata.
2525
/// Used by the parquet reader where fields are already resolved by name upstream.

0 commit comments

Comments
 (0)