Skip to content

Commit e1ab33e

Browse files
committed
docs: add detailed comments for StringView converter and tests
1 parent 173a02f commit e1ab33e

3 files changed

Lines changed: 49 additions & 6 deletions

File tree

be/src/exec/arrow_to_starrocks_converter.cpp

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -347,9 +347,22 @@ struct ArrowConverter<AT, LT, is_nullable, is_strict, BinaryATGuard<AT>, StringO
347347
}
348348
};
349349

350-
// StringView converter: uses per-element GetView(i) instead of bulk offset copy.
351-
// StringView out-of-line strings (>12 bytes) may span multiple variadic buffers,
352-
// so offset-based optimize_non_fixed_size_binary is NOT safe here.
350+
// Arrow StringView converter specialization.
351+
//
352+
// StringView (Arrow Utf8View / BinaryView) stores strings in a split layout:
353+
// - Inline: strings <= 12 bytes live directly in the 16-byte view struct
354+
// - Out-of-line: strings > 12 bytes reference data in one of potentially many
355+
// variadic data buffers (not a single contiguous offset buffer)
356+
//
357+
// This means the offset-based bulk-memcpy path (optimize_non_fixed_size_binary) used by
358+
// Binary/String types is NOT safe for StringView — it assumes a single contiguous data
359+
// buffer, which would silently corrupt out-of-line strings. Instead, we iterate per-element
360+
// using GetView(i), which correctly resolves both inline and out-of-line storage.
361+
//
362+
// An alternative would be to cast StringView -> String via arrow::compute::Cast() first,
363+
// producing a contiguous offset-based array, then reuse the existing Binary/String converter.
364+
// We avoid that because it copies the string data twice: once into an intermediate Arrow
365+
// String array, then again into BinaryColumn. The direct approach here copies once.
353366
template <ArrowTypeId AT, LogicalType LT, bool is_nullable, bool is_strict>
354367
struct ArrowConverter<AT, LT, is_nullable, is_strict, StringViewATGuard<AT>, StringOrBinaryGuard<LT>> {
355368
using ArrowArrayType = ArrowTypeIdToArrayType<AT>;
@@ -370,7 +383,18 @@ struct ArrowConverter<AT, LT, is_nullable, is_strict, StringViewATGuard<AT>, Str
370383
}
371384

372385
concrete_column->reserve(concrete_column->size() + num_elements);
373-
bool reported = false;
386+
bool repeated = false;
387+
// Unlike offset-based Binary/String arrays where null slots still have valid offsets,
388+
// StringView null slots may contain an undefined 16-byte view struct per the Arrow spec.
389+
// GetView() would dereference a garbage buffer_index for out-of-line views. When
390+
// is_nullable=true, null_data already guards this. For is_nullable=false, we check once
391+
// whether the source array has nulls and guard per-element only when needed.
392+
// The framework's fill_filter() typically handles this upstream, but we guard here
393+
// defensively since GetView() on a garbage view is a segfault, not a wrong result.
394+
const bool needs_null_guard = !is_nullable && concrete_array->null_count() > 0;
395+
// Per-element iteration: GetView(i) returns a std::string_view that transparently
396+
// handles both inline views (data in the struct) and out-of-line views (pointer
397+
// into a variadic buffer). This is O(n) per element but necessary for correctness.
374398
for (size_t i = 0; i < num_elements; ++i) {
375399
size_t array_idx = array_start_idx + i;
376400
if constexpr (is_nullable) {
@@ -379,16 +403,26 @@ struct ArrowConverter<AT, LT, is_nullable, is_strict, StringViewATGuard<AT>, Str
379403
continue;
380404
}
381405
}
406+
if (needs_null_guard && concrete_array->IsNull(array_idx)) {
407+
concrete_column->append_default();
408+
// Regardless of fill_filter called or not, we are setting this idx to be filtered out.
409+
filter_data[i] = 0;
410+
continue;
411+
}
382412
auto sv = concrete_array->GetView(array_idx);
383413
Slice s{sv.data(), sv.size()};
384414
if (s.size > max_length) {
415+
// String exceeds column's max length (VARCHAR limit or CHAR(N) width).
416+
// How we handle it depends on nullable/strict mode:
417+
// - nullable + non-strict: convert to NULL (preserves row, loses value)
418+
// - strict or non-nullable: filter out the row and report the error once
385419
concrete_column->append_default();
386420
if constexpr (is_nullable && !is_strict) {
387421
null_data[i] = DATUM_NULL;
388422
} else {
389423
filter_data[i] = 0;
390-
if (ctx != nullptr && !reported) {
391-
reported = true;
424+
if (ctx != nullptr && !repeated) {
425+
repeated = true;
392426
std::string reason = strings::Substitute(
393427
"string length $0 exceeds max length $1", sv.size(), max_length);
394428
ctx->report_error_message(reason, std::string(sv));

be/src/exec/arrow_type_traits.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,19 @@ struct ArrowTypeIdToCppTypeStruct<ArrowTypeId::BOOL, guard::Guard> {
8080
VALUE_GUARD(ArrowTypeId, BinaryATGuard, at_is_binary, ArrowTypeId::BINARY, ArrowTypeId::STRING,
8181
ArrowTypeId::FIXED_SIZE_BINARY, ArrowTypeId::LARGE_BINARY, ArrowTypeId::LARGE_STRING)
8282

83+
// StringView uses a fundamentally different memory layout (inline + variadic buffers) from
84+
// the contiguous offset-based layout of Binary/String types. It gets its own guard so that
85+
// template specializations can dispatch to a safe per-element iteration path rather than
86+
// the bulk-memcpy fast path used by BinaryATGuard types.
8387
VALUE_GUARD(ArrowTypeId, StringViewATGuard, at_is_string_view, ArrowTypeId::STRING_VIEW)
8488

8589
template <ArrowTypeId AT>
8690
struct ArrowTypeIdToCppTypeStruct<AT, BinaryATGuard<AT>> {
8791
using type = const uint8_t*;
8892
};
93+
94+
// StringView values are accessed via GetView() which returns std::string_view, unlike
95+
// Binary/String types that use raw uint8_t* offsets into a contiguous data buffer.
8996
template <>
9097
struct ArrowTypeIdToCppTypeStruct<ArrowTypeId::STRING_VIEW, guard::Guard> {
9198
using type = std::string_view;

be/test/exec/arrow_converter_test.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ static_assert(std::is_same_v<ArrowTypeIdToCppType<ArrowTypeId::STRING_VIEW>, std
5353
static_assert(at_is_string_view<ArrowTypeId::STRING_VIEW>,
5454
"at_is_string_view must be true for STRING_VIEW");
5555

56+
// STRING_VIEW must NOT match BinaryATGuard — if it did, the Binary/String converter's
57+
// bulk offset-based memcpy path would be selected, silently corrupting out-of-line strings.
5658
static_assert(!at_is_binary<ArrowTypeId::STRING_VIEW>,
5759
"at_is_binary must be false for STRING_VIEW (guard isolation)");
5860

0 commit comments

Comments
 (0)