Skip to content

Commit d42c7dd

Browse files
robert3005 and rok authored
GH-47279: [C++] Implement GetByteRangesArray for view types (#47418)
### Rationale for this change
- Fix #47279

### What changes are included in this PR?
Add the missing visit methods to GetByteRangesArray. The returned results are overestimates, since there is no cheap way to estimate the offsets of the child arrays/buffers. We could compute the exact value if we were willing to check every offset/view.

### Are these changes tested?
Added tests.

### Are there any user-facing changes?
No.

* GitHub Issue: #47279

Lead-authored-by: Robert Kruszewski <github@robertk.io>
Co-authored-by: Rok Mihevc <rok@mihevc.org>
Signed-off-by: Rok Mihevc <rok@mihevc.org>
1 parent c61df29 commit d42c7dd

2 files changed

Lines changed: 139 additions & 0 deletions

File tree

cpp/src/arrow/util/byte_size.cc

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,35 @@ struct GetByteRangesArray {
192192

193193
// LargeBinary arrays are handled by delegating to VisitBaseBinary.
Status Visit(const LargeBinaryType& type) const {
  return VisitBaseBinary(type);
}
194194

195+
template <typename BaseViewType>
196+
Status VisitBaseViewType(const BaseViewType& type) const {
197+
using c_type = typename BaseViewType::c_type;
198+
RETURN_NOT_OK(VisitBitmap(input.buffers[0]));
199+
const Buffer& views = *input.buffers[1];
200+
RETURN_NOT_OK(range_starts->Append(reinterpret_cast<uint64_t>(views.data())));
201+
RETURN_NOT_OK(range_offsets->Append(sizeof(c_type) * offset));
202+
RETURN_NOT_OK(range_lengths->Append(sizeof(c_type) * length));
203+
204+
// The following calculation is an over estimate of the size since views buffer
205+
// might
206+
// 1. Not reference all the values in data buffers (the array was filtered without gc)
207+
// 2. Reference a value multiple times without repeating it in the data buffer
208+
//
209+
// Producing exact byte size would require linear scan of all values in view buffer
210+
for (size_t i = 2; i < input.buffers.size(); i++) {
211+
const Buffer& buf = *input.buffers[i];
212+
RETURN_NOT_OK(range_starts->Append(reinterpret_cast<uint64_t>(buf.data())));
213+
RETURN_NOT_OK(range_offsets->Append(0));
214+
RETURN_NOT_OK(range_lengths->Append(static_cast<uint64_t>(buf.size())));
215+
}
216+
217+
return Status::OK();
218+
}
219+
220+
// String-view arrays are handled by delegating to VisitBaseViewType.
Status Visit(const StringViewType& type) const {
  return VisitBaseViewType(type);
}
221+
222+
// Binary-view arrays are handled by delegating to VisitBaseViewType.
Status Visit(const BinaryViewType& type) const {
  return VisitBaseViewType(type);
}
223+
195224
template <typename BaseListType>
196225
Status VisitBaseList(const BaseListType& type) const {
197226
using offset_type = typename BaseListType::offset_type;
@@ -215,6 +244,39 @@ struct GetByteRangesArray {
215244

216245
// LargeList arrays are handled by delegating to VisitBaseList.
Status Visit(const LargeListType& type) const {
  return VisitBaseList(type);
}
217246

247+
template <typename BaseListViewType>
248+
Status VisitBaseListView(const BaseListViewType& type) const {
249+
using offset_type = typename BaseListViewType::offset_type;
250+
RETURN_NOT_OK(VisitBitmap(input.buffers[0]));
251+
252+
const Buffer& offsets_buffer = *input.buffers[1];
253+
RETURN_NOT_OK(
254+
range_starts->Append(reinterpret_cast<uint64_t>(offsets_buffer.data())));
255+
RETURN_NOT_OK(range_offsets->Append(sizeof(offset_type) * offset));
256+
RETURN_NOT_OK(range_lengths->Append(sizeof(offset_type) * length));
257+
258+
const Buffer& lengths_buffer = *input.buffers[2];
259+
RETURN_NOT_OK(
260+
range_starts->Append(reinterpret_cast<uint64_t>(lengths_buffer.data())));
261+
RETURN_NOT_OK(range_offsets->Append(sizeof(offset_type) * offset));
262+
RETURN_NOT_OK(range_lengths->Append(sizeof(offset_type) * length));
263+
264+
// The following calculation is an over estimate of the byte size since views
265+
// buffer might
266+
// 1. Not reference all the values in data buffers (the array was filtered without gc)
267+
// 2. Reference a value multiple times without repeating it in the data buffer
268+
//
269+
// Producing exact byte size would require linear scan of all values in view buffer
270+
GetByteRangesArray child{
271+
*input.child_data[0], 0, input.child_data[0]->length, range_starts, range_offsets,
272+
range_lengths};
273+
return VisitTypeInline(*type.value_type(), &child);
274+
}
275+
276+
// List-view arrays are handled by delegating to VisitBaseListView.
Status Visit(const ListViewType& type) const {
  return VisitBaseListView(type);
}
277+
278+
// Large list-view arrays are handled by delegating to VisitBaseListView.
Status Visit(const LargeListViewType& type) const {
  return VisitBaseListView(type);
}
279+
218280
Status Visit(const FixedSizeListType& type) const {
219281
RETURN_NOT_OK(VisitBitmap(input.buffers[0]));
220282
GetByteRangesArray child{*input.child_data[0],

cpp/src/arrow/util/byte_size_test.cc

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,83 @@ TEST(ByteRanges, FixedSizeList) {
316316
CheckBufferRanges(list_arr_with_nulls->Slice(4, 1), {{0, 0, 1}, {1, 1, 1}, {2, 8, 2}});
317317
}
318318

319+
// Checks the ranges reported for utf8_view arrays, with and without nulls, both
// unsliced and sliced.
TEST(ByteRanges, StringViewType) {
  // One inline ("short") value and one value long enough to need a data buffer.
  // There is no validity buffer here, so the ranges are: views (16 bytes per view),
  // then the data buffer holding the long string.
  const std::shared_ptr<Array> no_nulls = ArrayFromJSON(
      utf8_view(), R"(["short", "a longer string that requires a buffer"])");
  CheckBufferRanges(no_nulls, {{0, 0, 32}, {1, 0, 38}});

  // With a null present a validity range comes first.
  const std::shared_ptr<Array> with_nulls = ArrayFromJSON(
      utf8_view(), R"(["short", null, "another long string that requires a buffer"])");
  CheckBufferRanges(with_nulls, {{0, 0, 1}, {1, 0, 48}, {2, 0, 42}});

  // Slicing narrows the views window only; the data buffer is still reported whole.
  const std::shared_ptr<Array> no_nulls_second = no_nulls->Slice(1, 1);
  CheckBufferRanges(no_nulls_second, {{0, 16, 16}, {1, 0, 38}});

  const std::shared_ptr<Array> no_nulls_first = no_nulls->Slice(0, 1);
  CheckBufferRanges(no_nulls_first, {{0, 0, 16}, {1, 0, 38}});

  const std::shared_ptr<Array> with_nulls_last = with_nulls->Slice(2, 1);
  CheckBufferRanges(with_nulls_last, {{0, 0, 1}, {1, 32, 16}, {2, 0, 42}});
}
335+
336+
// Checks the ranges reported for binary_view arrays, with and without nulls, both
// unsliced and sliced.
TEST(ByteRanges, BinaryViewType) {
  // Same layout as string view: the views buffer (16 bytes per view) followed by
  // the data buffers.
  const std::shared_ptr<Array> no_nulls =
      ArrayFromJSON(binary_view(), R"(["ABCD", "EFGHIJKLMNOPQRSTUVWXYZ"])");
  CheckBufferRanges(no_nulls, {{0, 0, 32}, {1, 0, 22}});

  // With a null present a validity range comes first.
  const std::shared_ptr<Array> with_nulls =
      ArrayFromJSON(binary_view(), R"(["AB", null, "CDEFGHIJKLMNOPQRSTUVWXYZ"])");
  CheckBufferRanges(with_nulls, {{0, 0, 1}, {1, 0, 48}, {2, 0, 24}});

  // Slicing narrows the views window only; the data buffer is still reported whole.
  const std::shared_ptr<Array> no_nulls_second = no_nulls->Slice(1, 1);
  CheckBufferRanges(no_nulls_second, {{0, 16, 16}, {1, 0, 22}});

  const std::shared_ptr<Array> with_nulls_last = with_nulls->Slice(2, 1);
  CheckBufferRanges(with_nulls_last, {{0, 0, 1}, {1, 32, 16}, {2, 0, 24}});
}
350+
351+
using ListViewArrowTypes = ::testing::Types<ListViewType, LargeListViewType>;
352+
template <typename Type>
353+
class ByteRangesListView : public ::testing::Test {};
354+
TYPED_TEST_SUITE(ByteRangesListView, ListViewArrowTypes);
355+
356+
// Checks the ranges reported for a list-view of int32, with and without nulls,
// both unsliced and sliced.
TYPED_TEST(ByteRangesListView, Basic) {
  // Width in bytes of one entry in the offsets / sizes buffers.
  constexpr auto kEntryWidth = sizeof(typename TypeParam::offset_type);

  const std::shared_ptr<DataType> list_view_type = std::make_shared<TypeParam>(int32());

  // Without nulls the ranges are: offsets buffer, sizes buffer, then child data.
  const std::shared_ptr<Array> no_nulls =
      ArrayFromJSON(list_view_type, "[[1, 2], [3], [0]]");
  CheckBufferRanges(no_nulls,
                    {{0, 0, 3 * kEntryWidth}, {1, 0, 3 * kEntryWidth}, {2, 0, 16}});

  // With a null present a validity range comes first.
  const std::shared_ptr<Array> with_nulls =
      ArrayFromJSON(list_view_type, "[[1, 2], null, [3, 4, 5]]");
  CheckBufferRanges(with_nulls, {{0, 0, 1},
                                 {1, 0, 3 * kEntryWidth},
                                 {2, 0, 3 * kEntryWidth},
                                 {3, 0, 20}});

  // Slicing narrows the offsets and sizes windows; the child data range is
  // unchanged.
  const std::shared_ptr<Array> no_nulls_last = no_nulls->Slice(2, 1);
  CheckBufferRanges(no_nulls_last, {{0, 2 * kEntryWidth, kEntryWidth},
                                    {1, 2 * kEntryWidth, kEntryWidth},
                                    {2, 0, 16}});

  const std::shared_ptr<Array> with_nulls_last = with_nulls->Slice(2, 1);
  CheckBufferRanges(with_nulls_last, {{0, 0, 1},
                                      {1, 2 * kEntryWidth, kEntryWidth},
                                      {2, 2 * kEntryWidth, kEntryWidth},
                                      {3, 0, 20}});
}
381+
382+
// Checks the ranges reported for a list-view whose values are themselves
// list-views of int32.
TYPED_TEST(ByteRangesListView, NestedListView) {
  // Width in bytes of one entry in the offsets / sizes buffers.
  constexpr auto kEntryWidth = sizeof(typename TypeParam::offset_type);

  const std::shared_ptr<DataType> inner_type = std::make_shared<TypeParam>(int32());
  const std::shared_ptr<DataType> outer_type = std::make_shared<TypeParam>(inner_type);

  const std::shared_ptr<Array> nested =
      ArrayFromJSON(outer_type, "[[[1], [2, 3]], [[4, 5, 6]], [[7]]]");

  // Ranges, in order: parent offsets, parent sizes, child offsets, child sizes,
  // grandchild data.
  CheckBufferRanges(nested, {{0, 0, 3 * kEntryWidth},
                             {1, 0, 3 * kEntryWidth},
                             {2, 0, 4 * kEntryWidth},
                             {3, 0, 4 * kEntryWidth},
                             {4, 0, 28}});
}
395+
319396
TEST(ByteRanges, Map) {
320397
std::shared_ptr<Array> map_arr = ArrayFromJSON(
321398
map(utf8(), uint16()), R"([[["x", 1], ["y", 2]], [["x", 3], ["y", 4]]])");

0 commit comments

Comments
 (0)