|
17 | 17 |
|
18 | 18 | use std::sync::Arc; |
19 | 19 |
|
20 | | -use arrow::array::{RecordBatch, record_batch}; |
21 | | -use arrow_schema::{DataType, Field, Schema, SchemaRef}; |
| 20 | +use arrow::array::{ |
| 21 | + Array, ArrayRef, BooleanArray, Int32Array, Int64Array, RecordBatch, StringArray, |
| 22 | + StructArray, record_batch, |
| 23 | +}; |
| 24 | +use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef}; |
22 | 25 | use bytes::{BufMut, BytesMut}; |
23 | 26 | use datafusion::assert_batches_eq; |
24 | 27 | use datafusion::common::Result; |
@@ -320,6 +323,145 @@ async fn test_physical_expr_adapter_with_non_null_defaults() { |
320 | 323 | assert_batches_eq!(expected, &batches); |
321 | 324 | } |
322 | 325 |
|
| 326 | +#[tokio::test] |
| 327 | +async fn test_struct_schema_evolution_projection_and_filter() -> Result<()> { |
| 328 | + use std::collections::HashMap; |
| 329 | + |
| 330 | + // Physical struct: {id: Int32, name: Utf8} |
| 331 | + let physical_struct_fields: Fields = vec![ |
| 332 | + Arc::new(Field::new("id", DataType::Int32, false)), |
| 333 | + Arc::new(Field::new("name", DataType::Utf8, true)), |
| 334 | + ] |
| 335 | + .into(); |
| 336 | + |
| 337 | + let struct_array = StructArray::new( |
| 338 | + physical_struct_fields.clone(), |
| 339 | + vec![ |
| 340 | + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, |
| 341 | + Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef, |
| 342 | + ], |
| 343 | + None, |
| 344 | + ); |
| 345 | + |
| 346 | + let physical_schema = Arc::new(Schema::new(vec![Field::new( |
| 347 | + "s", |
| 348 | + DataType::Struct(physical_struct_fields), |
| 349 | + true, |
| 350 | + )])); |
| 351 | + |
| 352 | + let batch = |
| 353 | + RecordBatch::try_new(Arc::clone(&physical_schema), vec![Arc::new(struct_array)])?; |
| 354 | + |
| 355 | + let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>; |
| 356 | + let store_url = ObjectStoreUrl::parse("memory://").unwrap(); |
| 357 | + write_parquet(batch, store.clone(), "struct_evolution.parquet").await; |
| 358 | + |
| 359 | + // Logical struct: {id: Int64?, name: Utf8?, extra: Boolean?} + metadata |
| 360 | + let logical_struct_fields: Fields = vec![ |
| 361 | + Arc::new(Field::new("id", DataType::Int64, true)), |
| 362 | + Arc::new(Field::new("name", DataType::Utf8, true)), |
| 363 | + Arc::new(Field::new("extra", DataType::Boolean, true).with_metadata( |
| 364 | + HashMap::from([("nested_meta".to_string(), "1".to_string())]), |
| 365 | + )), |
| 366 | + ] |
| 367 | + .into(); |
| 368 | + |
| 369 | + let table_schema = Arc::new(Schema::new(vec![ |
| 370 | + Field::new("s", DataType::Struct(logical_struct_fields), false) |
| 371 | + .with_metadata(HashMap::from([("top_meta".to_string(), "1".to_string())])), |
| 372 | + ])); |
| 373 | + |
| 374 | + let mut cfg = SessionConfig::new() |
| 375 | + .with_collect_statistics(false) |
| 376 | + .with_parquet_pruning(false) |
| 377 | + .with_parquet_page_index_pruning(false); |
| 378 | + cfg.options_mut().execution.parquet.pushdown_filters = true; |
| 379 | + |
| 380 | + let ctx = SessionContext::new_with_config(cfg); |
| 381 | + ctx.register_object_store(store_url.as_ref(), Arc::clone(&store)); |
| 382 | + |
| 383 | + let listing_table_config = |
| 384 | + ListingTableConfig::new(ListingTableUrl::parse("memory:///").unwrap()) |
| 385 | + .infer_options(&ctx.state()) |
| 386 | + .await |
| 387 | + .unwrap() |
| 388 | + .with_schema(table_schema.clone()) |
| 389 | + .with_expr_adapter_factory(Arc::new(DefaultPhysicalExprAdapterFactory)); |
| 390 | + |
| 391 | + let table = ListingTable::try_new(listing_table_config).unwrap(); |
| 392 | + ctx.register_table("t", Arc::new(table)).unwrap(); |
| 393 | + |
| 394 | + let batches = ctx |
| 395 | + .sql("SELECT s FROM t") |
| 396 | + .await |
| 397 | + .unwrap() |
| 398 | + .collect() |
| 399 | + .await |
| 400 | + .unwrap(); |
| 401 | + assert_eq!(batches.len(), 1); |
| 402 | + |
| 403 | + // Verify top-level metadata propagation |
| 404 | + let output_schema = batches[0].schema(); |
| 405 | + let s_field = output_schema.field_with_name("s").unwrap(); |
| 406 | + assert_eq!( |
| 407 | + s_field.metadata().get("top_meta").map(String::as_str), |
| 408 | + Some("1") |
| 409 | + ); |
| 410 | + |
| 411 | + // Verify nested struct type/field propagation + values |
| 412 | + let s_array = batches[0] |
| 413 | + .column(0) |
| 414 | + .as_any() |
| 415 | + .downcast_ref::<StructArray>() |
| 416 | + .expect("expected struct array"); |
| 417 | + |
| 418 | + let id_array = s_array |
| 419 | + .column_by_name("id") |
| 420 | + .expect("id column") |
| 421 | + .as_any() |
| 422 | + .downcast_ref::<Int64Array>() |
| 423 | + .expect("id should be cast to Int64"); |
| 424 | + assert_eq!(id_array.values(), &[1, 2, 3]); |
| 425 | + |
| 426 | + let extra_array = s_array.column_by_name("extra").expect("extra column"); |
| 427 | + assert_eq!(extra_array.null_count(), 3); |
| 428 | + |
| 429 | + // Verify nested field metadata propagation |
| 430 | + let extra_field = match s_field.data_type() { |
| 431 | + DataType::Struct(fields) => fields |
| 432 | + .iter() |
| 433 | + .find(|f| f.name() == "extra") |
| 434 | + .expect("extra field"), |
| 435 | + other => panic!("expected struct type for s, got {other:?}"), |
| 436 | + }; |
| 437 | + assert_eq!( |
| 438 | + extra_field |
| 439 | + .metadata() |
| 440 | + .get("nested_meta") |
| 441 | + .map(String::as_str), |
| 442 | + Some("1") |
| 443 | + ); |
| 444 | + |
| 445 | + // Smoke test: filtering on a missing nested field evaluates correctly |
| 446 | + let filtered = ctx |
| 447 | + .sql("SELECT get_field(s, 'extra') AS extra FROM t WHERE get_field(s, 'extra') IS NULL") |
| 448 | + .await |
| 449 | + .unwrap() |
| 450 | + .collect() |
| 451 | + .await |
| 452 | + .unwrap(); |
| 453 | + assert_eq!(filtered.len(), 1); |
| 454 | + assert_eq!(filtered[0].num_rows(), 3); |
| 455 | + let extra = filtered[0] |
| 456 | + .column(0) |
| 457 | + .as_any() |
| 458 | + .downcast_ref::<BooleanArray>() |
| 459 | + .expect("extra should be a boolean array"); |
| 460 | + assert_eq!(extra.null_count(), 3); |
| 461 | + |
| 462 | + Ok(()) |
| 463 | +} |
| 464 | + |
323 | 465 | /// Test demonstrating that a single PhysicalExprAdapterFactory instance can be |
324 | 466 | /// reused across multiple ListingTable instances. |
325 | 467 | /// |
|
0 commit comments