Skip to content

Commit 4ec33f6

Browse files
committed
Add comments, per review
1 parent e6c07fc commit 4ec33f6

1 file changed

Lines changed: 25 additions & 8 deletions

File tree

datafusion/functions/src/string/common.rs

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -461,10 +461,11 @@ fn case_conversion_utf8view_ascii(
461461
}
462462
}
463463

464-
/// Walks the views once: inline rows (length ≤ 12) convert their inline bytes
465-
/// in place; long rows copy their referenced bytes into a single packed output
466-
/// buffer while converting, then rewrite the view (`buffer_index = 0`, new
467-
/// offset, new 4-byte prefix) to point at it.
464+
/// Walks the views once and produces a new `StringViewArray` with
465+
/// case-converted bytes. Inline strings (<= 12 bytes) are converted in-place;
466+
/// long strings copy-and-convert into output buffers and have their view fields
467+
/// rewritten to address the new bytes. ASCII case conversion preserves is byte
468+
/// length, so no row migrates between the inline and long layouts.
468469
fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
469470
array: &StringViewArray,
470471
convert: F,
@@ -475,31 +476,41 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
475476
let nulls = array.nulls();
476477

477478
let mut new_views: Vec<u128> = Vec::with_capacity(item_len);
479+
// Long values are packed into `in_progress`; when full it is sealed into
480+
// `completed` and a new, larger block is started — same block-doubling
481+
// scheme as Arrow's `GenericByteViewBuilder`.
478482
let mut in_progress: Vec<u8> = Vec::new();
479483
let mut completed: Vec<Buffer> = Vec::new();
480484
let mut block_size: u32 = STRING_VIEW_INIT_BLOCK_SIZE;
481485

482486
for i in 0..item_len {
483487
if nulls.is_some_and(|n| n.is_null(i)) {
488+
// Zero view = empty, no buffer reference; the null buffer is what
489+
// marks the row null, so the view's value is irrelevant.
484490
new_views.push(0);
485491
continue;
486492
}
487493
let view = views[i];
494+
// Length is the low 32 bits; `as u32` discards the rest of the view.
488495
let len = view as u32 as usize;
489496
if len == 0 {
490497
new_views.push(0);
491498
continue;
492499
}
493500
let mut bytes = view.to_le_bytes();
494501
if len <= 12 {
495-
// Inline row: convert the inline data bytes; layout unchanged.
502+
// Inline: value is in bytes[4..4+len], no buffer reference. Convert
503+
// in place; nothing else in the view needs to change.
496504
for b in &mut bytes[4..4 + len] {
497505
*b = convert(b);
498506
}
499507
new_views.push(u128::from_le_bytes(bytes));
500508
} else {
501-
// Make sure the current data block has room for this value;
502-
// otherwise flush and start a new, larger block.
509+
// Long: input view points into shared `data_buffers` we can't
510+
// mutate, so copy-convert into our own buffer and rewrite the
511+
// view's prefix/buffer_index/offset (length is preserved).
512+
513+
// Ensure the current block has room; otherwise flush and grow.
503514
let required_cap = in_progress.len() + len;
504515
if in_progress.capacity() < required_cap {
505516
if !in_progress.is_empty() {
@@ -512,12 +523,16 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
512523
in_progress.reserve(to_reserve);
513524
}
514525

526+
// The in-progress block will be sealed at index `completed.len()`,
527+
// and our value starts at the current write position within it.
515528
let buffer_index: u32 = i32::try_from(completed.len())
516529
.expect("buffer count exceeds i32::MAX")
517530
as u32;
518531
let new_offset: u32 =
519532
i32::try_from(in_progress.len()).expect("offset exceeds i32::MAX") as u32;
520533

534+
// Source location from the input view: bytes 8..12 are buffer
535+
// index, bytes 12..16 are the offset within it.
521536
let src_buffer_index =
522537
u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize;
523538
let src_offset =
@@ -528,7 +543,9 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
528543
let prefix_start = in_progress.len();
529544
in_progress.extend(src.iter().map(&convert));
530545

531-
// Prefix is the first 4 bytes of the converted data we just wrote.
546+
// Rewrite the three long-view fields; bytes[0..4] (length) is
547+
// left untouched. The prefix is read back from the bytes we just
548+
// wrote so the converted value has a single source of truth.
532549
let prefix: [u8; 4] = in_progress[prefix_start..prefix_start + 4]
533550
.try_into()
534551
.unwrap();

0 commit comments

Comments
 (0)