Skip to content

Commit ccaf802

Browse files
Add flat vs. struct field projection benchmarks (#21257)
## Rationale for this change This PR adds a benchmark comparing top-level column access against struct field access for the same logical data. #20925 introduced leaf-level projection masking so that projecting a single struct field skips decoding its siblings. #21180 added benchmarks measuring that improvement across different struct shapes. But neither benchmark answers how struct field access compares to reading the same column at the top level. Without that baseline, it's hard to know how much overhead the struct access path itself adds.
1 parent 14a85fa commit ccaf802

File tree

1 file changed

+84
-1
lines changed

1 file changed

+84
-1
lines changed

datafusion/core/benches/parquet_struct_projection.rs

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,10 +404,93 @@ fn nested_benchmarks(c: &mut Criterion) {
404404
drop(temp_file);
405405
}
406406

407+
fn flat_schema() -> SchemaRef {
408+
Arc::new(Schema::new(vec![
409+
Field::new("id", DataType::Int32, false),
410+
Field::new("large_string", DataType::Utf8, false),
411+
Field::new("small_int", DataType::Int32, false),
412+
]))
413+
}
414+
415+
fn flat_batch(batch_id: usize) -> RecordBatch {
416+
let schema = flat_schema();
417+
let len = WRITE_RECORD_BATCH_SIZE;
418+
419+
let base_id = (batch_id * len) as i32;
420+
let id_values: Vec<i32> = (0..len).map(|i| base_id + i as i32).collect();
421+
let id_array = Arc::new(Int32Array::from(id_values.clone()));
422+
let small_int_array = Arc::new(Int32Array::from(id_values));
423+
424+
let large_string: String = "x".repeat(LARGE_STRING_LEN);
425+
let mut string_builder = StringBuilder::new();
426+
for _ in 0..len {
427+
string_builder.append_value(&large_string);
428+
}
429+
let large_string_array = Arc::new(string_builder.finish());
430+
431+
RecordBatch::try_new(
432+
schema,
433+
vec![id_array, large_string_array as ArrayRef, small_int_array],
434+
)
435+
.unwrap()
436+
}
437+
438+
/// Compare selecting a small field from a flat (top-level) schema vs from
439+
/// inside a struct. Both files contain the same logical data — the only
440+
/// difference is whether `small_int` lives at the top level or nested inside
441+
/// a struct column.
442+
fn flat_vs_struct_benchmarks(c: &mut Criterion) {
443+
let flat_file = generate_file(flat_schema(), flat_batch, "flat");
444+
let flat_path = flat_file.path().display().to_string();
445+
assert!(Path::new(&flat_path).exists(), "path not found");
446+
447+
let struct_file = generate_file(narrow_schema(), narrow_batch, "narrow_struct_cmp");
448+
let struct_path = struct_file.path().display().to_string();
449+
assert!(Path::new(&struct_path).exists(), "path not found");
450+
451+
let rt = Runtime::new().unwrap();
452+
let flat_ctx = create_context(&rt, &flat_path, "t");
453+
let struct_ctx = create_context(&rt, &struct_path, "t");
454+
455+
let mut group = c.benchmark_group("flat_vs_struct");
456+
group.sample_size(10);
457+
group.warm_up_time(Duration::from_secs(1));
458+
group.measurement_time(Duration::from_secs(2));
459+
460+
// small int: top-level vs struct field
461+
group.bench_function("flat_select_small_int", |b| {
462+
b.iter(|| query(&flat_ctx, &rt, "SELECT small_int FROM t"))
463+
});
464+
group.bench_function("struct_select_small_int", |b| {
465+
b.iter(|| query(&struct_ctx, &rt, "SELECT s['small_int'] FROM t"))
466+
});
467+
468+
// large string: top-level vs struct field
469+
group.bench_function("flat_select_large_string", |b| {
470+
b.iter(|| query(&flat_ctx, &rt, "SELECT large_string FROM t"))
471+
});
472+
group.bench_function("struct_select_large_string", |b| {
473+
b.iter(|| query(&struct_ctx, &rt, "SELECT s['large_string'] FROM t"))
474+
});
475+
476+
// aggregation: SUM of small int
477+
group.bench_function("flat_sum_small_int", |b| {
478+
b.iter(|| query(&flat_ctx, &rt, "SELECT SUM(small_int) FROM t"))
479+
});
480+
group.bench_function("struct_sum_small_int", |b| {
481+
b.iter(|| query(&struct_ctx, &rt, "SELECT SUM(s['small_int']) FROM t"))
482+
});
483+
484+
group.finish();
485+
drop(flat_file);
486+
drop(struct_file);
487+
}
488+
407489
criterion_group!(
408490
benches,
409491
narrow_benchmarks,
410492
wide_benchmarks,
411-
nested_benchmarks
493+
nested_benchmarks,
494+
flat_vs_struct_benchmarks,
412495
);
413496
criterion_main!(benches);

0 commit comments

Comments
 (0)