@@ -347,9 +347,22 @@ struct ArrowConverter<AT, LT, is_nullable, is_strict, BinaryATGuard<AT>, StringO
347347 }
348348};
349349
350- // StringView converter: uses per-element GetView(i) instead of bulk offset copy.
351- // StringView out-of-line strings (>12 bytes) may span multiple variadic buffers,
352- // so offset-based optimize_non_fixed_size_binary is NOT safe here.
350+ // Arrow StringView converter specialization.
351+ //
352+ // StringView (Arrow Utf8View / BinaryView) stores strings in a split layout:
353+ // - Inline: strings <= 12 bytes live directly in the 16-byte view struct
354+ // - Out-of-line: strings > 12 bytes reference data in one of potentially many
355+ // variadic data buffers (not a single contiguous offset buffer)
356+ //
357+ // This means the offset-based bulk-memcpy path (optimize_non_fixed_size_binary) used by
358+ // Binary/String types is NOT safe for StringView — it assumes a single contiguous data
359+ // buffer, which would silently corrupt out-of-line strings. Instead, we iterate per-element
360+ // using GetView(i), which correctly resolves both inline and out-of-line storage.
361+ //
362+ // An alternative would be to cast StringView -> String via arrow::compute::Cast() first,
363+ // producing a contiguous offset-based array, then reuse the existing Binary/String converter.
364+ // We avoid that because it copies the string data twice: once into an intermediate Arrow
365+ // String array, then again into BinaryColumn. The direct approach here copies once.
353366template <ArrowTypeId AT, LogicalType LT, bool is_nullable, bool is_strict>
354367struct ArrowConverter <AT, LT, is_nullable, is_strict, StringViewATGuard<AT>, StringOrBinaryGuard<LT>> {
355368 using ArrowArrayType = ArrowTypeIdToArrayType<AT>;
@@ -370,7 +383,18 @@ struct ArrowConverter<AT, LT, is_nullable, is_strict, StringViewATGuard<AT>, Str
370383 }
371384
372385 concrete_column->reserve (concrete_column->size () + num_elements);
373- bool reported = false ;
386+ bool repeated = false ;
387+ // Unlike offset-based Binary/String arrays where null slots still have valid offsets,
388+ // StringView null slots may contain an undefined 16-byte view struct per the Arrow spec.
389+ // GetView() would dereference a garbage buffer_index for out-of-line views. When
390+ // is_nullable=true, null_data already guards this. For is_nullable=false, we check once
391+ // whether the source array has nulls and guard per-element only when needed.
392+ // The framework's fill_filter() typically handles this upstream, but we guard here
393+ // defensively since GetView() on a garbage view is a segfault, not a wrong result.
394+ const bool needs_null_guard = !is_nullable && concrete_array->null_count () > 0 ;
395+ // Per-element iteration: GetView(i) returns a std::string_view that transparently
396+ // handles both inline views (data in the struct) and out-of-line views (pointer
397+ // into a variadic buffer). This is O(n) per element but necessary for correctness.
374398 for (size_t i = 0 ; i < num_elements; ++i) {
375399 size_t array_idx = array_start_idx + i;
376400 if constexpr (is_nullable) {
@@ -379,16 +403,26 @@ struct ArrowConverter<AT, LT, is_nullable, is_strict, StringViewATGuard<AT>, Str
379403 continue ;
380404 }
381405 }
406+ if (needs_null_guard && concrete_array->IsNull (array_idx)) {
407+ concrete_column->append_default ();
408+ // Regardless of fill_filter called or not, we are setting this idx to be filtered out.
409+ filter_data[i] = 0 ;
410+ continue ;
411+ }
382412 auto sv = concrete_array->GetView (array_idx);
383413 Slice s{sv.data (), sv.size ()};
384414 if (s.size > max_length) {
415+ // String exceeds column's max length (VARCHAR limit or CHAR(N) width).
416+ // How we handle it depends on nullable/strict mode:
417+ // - nullable + non-strict: convert to NULL (preserves row, loses value)
418+ // - strict or non-nullable: filter out the row and report the error once
385419 concrete_column->append_default ();
386420 if constexpr (is_nullable && !is_strict) {
387421 null_data[i] = DATUM_NULL;
388422 } else {
389423 filter_data[i] = 0 ;
390- if (ctx != nullptr && !reported ) {
391- reported = true ;
424+ if (ctx != nullptr && !repeated ) {
425+ repeated = true ;
392426 std::string reason = strings::Substitute (
393427 " string length $0 exceeds max length $1" , sv.size (), max_length);
394428 ctx->report_error_message (reason, std::string (sv));
0 commit comments