@@ -461,10 +461,11 @@ fn case_conversion_utf8view_ascii(
461461 }
462462}
463463
464- /// Walks the views once: inline rows (length ≤ 12) convert their inline bytes
465- /// in place; long rows copy their referenced bytes into a single packed output
466- /// buffer while converting, then rewrite the view (`buffer_index = 0`, new
467- /// offset, new 4-byte prefix) to point at it.
464+ /// Walks the views once and produces a new `StringViewArray` with
465+ /// case-converted bytes. Inline strings (<= 12 bytes) are converted in-place;
466+ /// long strings copy-and-convert into output buffers and have their view fields
467+ /// rewritten to address the new bytes. ASCII case conversion preserves is byte
468+ /// length, so no row migrates between the inline and long layouts.
468469fn case_conversion_utf8view_ascii_inner < F : Fn ( & u8 ) -> u8 > (
469470 array : & StringViewArray ,
470471 convert : F ,
@@ -475,31 +476,41 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
475476 let nulls = array. nulls ( ) ;
476477
477478 let mut new_views: Vec < u128 > = Vec :: with_capacity ( item_len) ;
479+ // Long values are packed into `in_progress`; when full it is sealed into
480+ // `completed` and a new, larger block is started — same block-doubling
481+ // scheme as Arrow's `GenericByteViewBuilder`.
478482 let mut in_progress: Vec < u8 > = Vec :: new ( ) ;
479483 let mut completed: Vec < Buffer > = Vec :: new ( ) ;
480484 let mut block_size: u32 = STRING_VIEW_INIT_BLOCK_SIZE ;
481485
482486 for i in 0 ..item_len {
483487 if nulls. is_some_and ( |n| n. is_null ( i) ) {
488+ // Zero view = empty, no buffer reference; the null buffer is what
489+ // marks the row null, so the view's value is irrelevant.
484490 new_views. push ( 0 ) ;
485491 continue ;
486492 }
487493 let view = views[ i] ;
494+ // Length is the low 32 bits; `as u32` discards the rest of the view.
488495 let len = view as u32 as usize ;
489496 if len == 0 {
490497 new_views. push ( 0 ) ;
491498 continue ;
492499 }
493500 let mut bytes = view. to_le_bytes ( ) ;
494501 if len <= 12 {
495- // Inline row: convert the inline data bytes; layout unchanged.
502+ // Inline: value is in bytes[4..4+len], no buffer reference. Convert
503+ // in place; nothing else in the view needs to change.
496504 for b in & mut bytes[ 4 ..4 + len] {
497505 * b = convert ( b) ;
498506 }
499507 new_views. push ( u128:: from_le_bytes ( bytes) ) ;
500508 } else {
501- // Make sure the current data block has room for this value;
502- // otherwise flush and start a new, larger block.
509+ // Long: input view points into shared `data_buffers` we can't
510+ // mutate, so copy-convert into our own buffer and rewrite the
511+ // view's prefix/buffer_index/offset (length is preserved).
512+
513+ // Ensure the current block has room; otherwise flush and grow.
503514 let required_cap = in_progress. len ( ) + len;
504515 if in_progress. capacity ( ) < required_cap {
505516 if !in_progress. is_empty ( ) {
@@ -512,12 +523,16 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
512523 in_progress. reserve ( to_reserve) ;
513524 }
514525
526+ // The in-progress block will be sealed at index `completed.len()`,
527+ // and our value starts at the current write position within it.
515528 let buffer_index: u32 = i32:: try_from ( completed. len ( ) )
516529 . expect ( "buffer count exceeds i32::MAX" )
517530 as u32 ;
518531 let new_offset: u32 =
519532 i32:: try_from ( in_progress. len ( ) ) . expect ( "offset exceeds i32::MAX" ) as u32 ;
520533
534+ // Source location from the input view: bytes 8..12 are buffer
535+ // index, bytes 12..16 are the offset within it.
521536 let src_buffer_index =
522537 u32:: from_le_bytes ( bytes[ 8 ..12 ] . try_into ( ) . unwrap ( ) ) as usize ;
523538 let src_offset =
@@ -528,7 +543,9 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
528543 let prefix_start = in_progress. len ( ) ;
529544 in_progress. extend ( src. iter ( ) . map ( & convert) ) ;
530545
531- // Prefix is the first 4 bytes of the converted data we just wrote.
546+ // Rewrite the three long-view fields; bytes[0..4] (length) is
547+ // left untouched. The prefix is read back from the bytes we just
548+ // wrote so the converted value has a single source of truth.
532549 let prefix: [ u8 ; 4 ] = in_progress[ prefix_start..prefix_start + 4 ]
533550 . try_into ( )
534551 . unwrap ( ) ;
0 commit comments