Skip to content

Commit f8eb699

Browse files
authored
feat(arrow/array): add Validate/ValidateFull to binary and string arrays (#747)
## Summary

Fixes #691

Adds `Validate()` and `ValidateFull()` methods to `Binary`, `LargeBinary`, `String`, and `LargeString` array types, plus top-level dispatch functions and record-level convenience helpers.

## Problem

The existing `setData` validation only checks the **last** offset against the data buffer length. Subtly corrupted data — e.g. non-monotonic or negative intermediate offsets — passes construction but causes a runtime `panic: slice bounds out of range` when `Value(i)` is called later, **after** the IPC reader's `recover()` scope has already returned. Users receiving data from untrusted sources (e.g. Flight SQL from Doris DB) have no way to detect this without crashing.

## Solution

- `Validate()` — O(1): checks offset buffer size and that the last offset is within the data buffer (mirrors existing `setData` checks, but returns an error instead of panicking)
- `ValidateFull()` — O(n): additionally verifies all offsets are non-negative and monotonically non-decreasing, catching the subtle corruption case
- `Validate(arr arrow.Array) error` / `ValidateFull(arr arrow.Array) error` — top-level dispatch via the new `Validator` interface
- `ValidateRecord(rec arrow.RecordBatch) error` / `ValidateRecordFull(...)` — convenience wrappers that validate all columns, with error messages including column index and name

## Usage

```go
rec, err := reader.Read()
if err != nil { ... }
if err := array.ValidateRecordFull(rec); err != nil {
	log.Printf("skipping corrupted batch: %v", err)
	rec.Release()
	continue
}
```

## Test plan

- [ ] `TestBinaryValidate` — valid arrays, sliced arrays, non-monotonic offsets, negative first offset
- [ ] `TestLargeBinaryValidate` — same for large binary
- [ ] `TestStringValidate` — same for string
- [ ] `TestLargeStringValidate` — same for large string
- [ ] `TestTopLevelValidate` — dispatch to `Validator`, passthrough for non-`Validator` types, `ValidateRecord` with mixed valid/corrupt columns
1 parent 073bb06 commit f8eb699

4 files changed

Lines changed: 518 additions & 0 deletions

File tree

arrow/array/binary.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,60 @@ func (a *Binary) MarshalJSON() ([]byte, error) {
169169
return json.Marshal(vals)
170170
}
171171

172+
// Validate performs a basic, O(1) consistency check on the array data.
173+
// It returns an error if:
174+
// - The offset buffer is too small for the array length and offset
175+
// - The last offset exceeds the data buffer length
176+
//
177+
// This is useful for detecting corrupted data from untrusted sources (e.g.
178+
// Arrow Flight / Flight SQL servers) before accessing values, which may
179+
// otherwise cause a runtime panic.
180+
func (a *Binary) Validate() error {
181+
if a.data.length == 0 {
182+
return nil
183+
}
184+
if a.data.buffers[1] == nil {
185+
return fmt.Errorf("arrow/array: non-empty binary array has no offsets buffer")
186+
}
187+
expNumOffsets := a.data.offset + a.data.length + 1
188+
if len(a.valueOffsets) < expNumOffsets {
189+
return fmt.Errorf("arrow/array: binary offset buffer must have at least %d values, got %d", expNumOffsets, len(a.valueOffsets))
190+
}
191+
firstOffset := int(a.valueOffsets[a.data.offset])
192+
if firstOffset > len(a.valueBytes) {
193+
return fmt.Errorf("arrow/array: binary offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.valueBytes))
194+
}
195+
196+
lastOffset := int(a.valueOffsets[expNumOffsets-1])
197+
if lastOffset > len(a.valueBytes) {
198+
return fmt.Errorf("arrow/array: binary offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.valueBytes))
199+
}
200+
return nil
201+
}
202+
203+
// ValidateFull performs a full O(n) consistency check on the array data.
204+
// In addition to the checks performed by Validate, it also verifies that
205+
// all offsets are non-negative and monotonically non-decreasing.
206+
func (a *Binary) ValidateFull() error {
207+
if err := a.Validate(); err != nil {
208+
return err
209+
}
210+
if a.data.length == 0 {
211+
return nil
212+
}
213+
offsets := a.valueOffsets[a.data.offset : a.data.offset+a.data.length+1]
214+
if offsets[0] < 0 {
215+
return fmt.Errorf("arrow/array: binary offset at index %d is negative: %d", a.data.offset, offsets[0])
216+
}
217+
for i := 1; i < len(offsets); i++ {
218+
if offsets[i] < offsets[i-1] {
219+
return fmt.Errorf("arrow/array: binary offsets are not monotonically non-decreasing at index %d: %d < %d",
220+
a.data.offset+i, offsets[i], offsets[i-1])
221+
}
222+
}
223+
return nil
224+
}
225+
172226
func arrayEqualBinary(left, right *Binary) bool {
173227
for i := 0; i < left.Len(); i++ {
174228
if left.IsNull(i) {
@@ -309,6 +363,60 @@ func (a *LargeBinary) MarshalJSON() ([]byte, error) {
309363
return json.Marshal(vals)
310364
}
311365

366+
// Validate performs a basic, O(1) consistency check on the array data.
367+
// It returns an error if:
368+
// - The offset buffer is too small for the array length and offset
369+
// - The last offset exceeds the data buffer length
370+
//
371+
// This is useful for detecting corrupted data from untrusted sources (e.g.
372+
// Arrow Flight / Flight SQL servers) before accessing values, which may
373+
// otherwise cause a runtime panic.
374+
func (a *LargeBinary) Validate() error {
375+
if a.data.length == 0 {
376+
return nil
377+
}
378+
if a.data.buffers[1] == nil {
379+
return fmt.Errorf("arrow/array: non-empty large binary array has no offsets buffer")
380+
}
381+
expNumOffsets := a.data.offset + a.data.length + 1
382+
if len(a.valueOffsets) < expNumOffsets {
383+
return fmt.Errorf("arrow/array: large binary offset buffer must have at least %d values, got %d", expNumOffsets, len(a.valueOffsets))
384+
}
385+
firstOffset := int(a.valueOffsets[a.data.offset])
386+
if firstOffset > len(a.valueBytes) {
387+
return fmt.Errorf("arrow/array: large binary offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.valueBytes))
388+
}
389+
390+
lastOffset := int(a.valueOffsets[expNumOffsets-1])
391+
if lastOffset > len(a.valueBytes) {
392+
return fmt.Errorf("arrow/array: large binary offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.valueBytes))
393+
}
394+
return nil
395+
}
396+
397+
// ValidateFull performs a full O(n) consistency check on the array data.
398+
// In addition to the checks performed by Validate, it also verifies that
399+
// all offsets are non-negative and monotonically non-decreasing.
400+
func (a *LargeBinary) ValidateFull() error {
401+
if err := a.Validate(); err != nil {
402+
return err
403+
}
404+
if a.data.length == 0 {
405+
return nil
406+
}
407+
offsets := a.valueOffsets[a.data.offset : a.data.offset+a.data.length+1]
408+
if offsets[0] < 0 {
409+
return fmt.Errorf("arrow/array: large binary offset at index %d is negative: %d", a.data.offset, offsets[0])
410+
}
411+
for i := 1; i < len(offsets); i++ {
412+
if offsets[i] < offsets[i-1] {
413+
return fmt.Errorf("arrow/array: large binary offsets are not monotonically non-decreasing at index %d: %d < %d",
414+
a.data.offset+i, offsets[i], offsets[i-1])
415+
}
416+
}
417+
return nil
418+
}
419+
312420
func arrayEqualLargeBinary(left, right *LargeBinary) bool {
313421
for i := 0; i < left.Len(); i++ {
314422
if left.IsNull(i) {

arrow/array/string.go

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"reflect"
2323
"strings"
24+
"unicode/utf8"
2425
"unsafe"
2526

2627
"github.com/apache/arrow-go/v18/arrow"
@@ -169,6 +170,64 @@ func (a *String) MarshalJSON() ([]byte, error) {
169170
return json.Marshal(vals)
170171
}
171172

173+
// Validate performs a basic, O(1) consistency check on the array data.
174+
// It returns an error if:
175+
// - The offset buffer is too small for the array length and offset
176+
// - The last offset exceeds the data buffer length
177+
//
178+
// This is useful for detecting corrupted data from untrusted sources (e.g.
179+
// Arrow Flight / Flight SQL servers) before accessing values, which may
180+
// otherwise cause a runtime panic.
181+
func (a *String) Validate() error {
182+
if a.data.length == 0 {
183+
return nil
184+
}
185+
if a.data.buffers[1] == nil {
186+
return fmt.Errorf("arrow/array: non-empty string array has no offsets buffer")
187+
}
188+
expNumOffsets := a.data.offset + a.data.length + 1
189+
if len(a.offsets) < expNumOffsets {
190+
return fmt.Errorf("arrow/array: string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
191+
}
192+
firstOffset := int(a.offsets[a.data.offset])
193+
if firstOffset > len(a.values) {
194+
return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.values))
195+
}
196+
lastOffset := int(a.offsets[expNumOffsets-1])
197+
if lastOffset > len(a.values) {
198+
return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
199+
}
200+
return nil
201+
}
202+
203+
// ValidateFull performs a full O(n) consistency check on the array data.
204+
// In addition to the checks performed by Validate, it also verifies that
205+
// all offsets are non-negative and monotonically non-decreasing.
206+
func (a *String) ValidateFull() error {
207+
if err := a.Validate(); err != nil {
208+
return err
209+
}
210+
if a.data.length == 0 {
211+
return nil
212+
}
213+
offsets := a.offsets[a.data.offset : a.data.offset+a.data.length+1]
214+
if offsets[0] < 0 {
215+
return fmt.Errorf("arrow/array: string offset at index %d is negative: %d", a.data.offset, offsets[0])
216+
}
217+
for i := 1; i < len(offsets); i++ {
218+
if offsets[i] < offsets[i-1] {
219+
return fmt.Errorf("arrow/array: string offsets are not monotonically non-decreasing at index %d: %d < %d",
220+
a.data.offset+i, offsets[i], offsets[i-1])
221+
}
222+
value := a.values[offsets[i-1]:offsets[i]]
223+
if !utf8.ValidString(value) {
224+
return fmt.Errorf("arrow/array: string at index %d is not valid utf8: %s", a.data.offset+i-1, value)
225+
}
226+
}
227+
228+
return nil
229+
}
230+
172231
func arrayEqualString(left, right *String) bool {
173232
for i := 0; i < left.Len(); i++ {
174233
if left.IsNull(i) {
@@ -312,6 +371,64 @@ func (a *LargeString) MarshalJSON() ([]byte, error) {
312371
return json.Marshal(vals)
313372
}
314373

374+
// Validate performs a basic, O(1) consistency check on the array data.
375+
// It returns an error if:
376+
// - The offset buffer is too small for the array length and offset
377+
// - The last offset exceeds the data buffer length
378+
//
379+
// This is useful for detecting corrupted data from untrusted sources (e.g.
380+
// Arrow Flight / Flight SQL servers) before accessing values, which may
381+
// otherwise cause a runtime panic.
382+
func (a *LargeString) Validate() error {
383+
if a.data.length == 0 {
384+
return nil
385+
}
386+
if a.data.buffers[1] == nil {
387+
return fmt.Errorf("arrow/array: non-empty large string array has no offsets buffer")
388+
}
389+
expNumOffsets := a.data.offset + a.data.length + 1
390+
if len(a.offsets) < expNumOffsets {
391+
return fmt.Errorf("arrow/array: large string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
392+
}
393+
firstOffset := int(a.offsets[a.data.offset])
394+
if firstOffset > len(a.values) {
395+
return fmt.Errorf("arrow/array: large string offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.values))
396+
}
397+
398+
lastOffset := int(a.offsets[expNumOffsets-1])
399+
if lastOffset > len(a.values) {
400+
return fmt.Errorf("arrow/array: large string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
401+
}
402+
return nil
403+
}
404+
405+
// ValidateFull performs a full O(n) consistency check on the array data.
406+
// In addition to the checks performed by Validate, it also verifies that
407+
// all offsets are non-negative and monotonically non-decreasing.
408+
func (a *LargeString) ValidateFull() error {
409+
if err := a.Validate(); err != nil {
410+
return err
411+
}
412+
if a.data.length == 0 {
413+
return nil
414+
}
415+
offsets := a.offsets[a.data.offset : a.data.offset+a.data.length+1]
416+
if offsets[0] < 0 {
417+
return fmt.Errorf("arrow/array: large string offset at index %d is negative: %d", a.data.offset, offsets[0])
418+
}
419+
for i := 1; i < len(offsets); i++ {
420+
if offsets[i] < offsets[i-1] {
421+
return fmt.Errorf("arrow/array: large string offsets are not monotonically non-decreasing at index %d: %d < %d",
422+
a.data.offset+i, offsets[i], offsets[i-1])
423+
}
424+
value := a.values[offsets[i-1]:offsets[i]]
425+
if !utf8.ValidString(value) {
426+
return fmt.Errorf("arrow/array: string at index %d is not valid utf8: %s", a.data.offset+i-1, value)
427+
}
428+
}
429+
return nil
430+
}
431+
315432
func arrayEqualLargeString(left, right *LargeString) bool {
316433
for i := 0; i < left.Len(); i++ {
317434
if left.IsNull(i) {

arrow/array/validate.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
17+
package array
18+
19+
import (
20+
"fmt"
21+
22+
"github.com/apache/arrow-go/v18/arrow"
23+
)
24+
25+
// Validator is implemented by array types that can validate their internal
26+
// consistency. See Validate and ValidateFull for top-level dispatch.
27+
type Validator interface {
28+
arrow.Array
29+
// Validate performs a basic O(1) consistency check.
30+
Validate() error
31+
// ValidateFull performs a thorough O(n) consistency check.
32+
ValidateFull() error
33+
}
34+
35+
// Validate performs a basic O(1) consistency check on arr, returning an error
36+
// if the array's internal buffers are inconsistent. For array types that do not
37+
// implement Validator, nil is returned.
38+
//
39+
// Use this to detect corrupted data from untrusted sources such as Arrow Flight
40+
// or Flight SQL servers before accessing values, which may otherwise panic.
41+
func Validate(arr arrow.Array) error {
42+
if v, ok := arr.(Validator); ok {
43+
return v.Validate()
44+
}
45+
return nil
46+
}
47+
48+
// ValidateFull performs a thorough O(n) consistency check on arr, returning an
49+
// error if the array's internal buffers are inconsistent. For array types that
50+
// do not implement Validator, nil is returned.
51+
//
52+
// Unlike Validate, this checks every element and is therefore O(n). Use this
53+
// when receiving data from untrusted sources where subtle corruption (e.g.
54+
// non-monotonic offsets) may not be detected by Validate alone.
55+
func ValidateFull(arr arrow.Array) error {
56+
if v, ok := arr.(Validator); ok {
57+
return v.ValidateFull()
58+
}
59+
return nil
60+
}
61+
62+
// ValidateRecord validates each column in rec using Validate, returning the
63+
// first error encountered. The error includes the column index and field name.
64+
func ValidateRecord(rec arrow.RecordBatch) error {
65+
for i := int64(0); i < rec.NumCols(); i++ {
66+
if err := Validate(rec.Column(int(i))); err != nil {
67+
return fmt.Errorf("column %d (%s): %w", i, rec.Schema().Field(int(i)).Name, err)
68+
}
69+
}
70+
return nil
71+
}
72+
73+
// ValidateRecordFull validates each column in rec using ValidateFull, returning
74+
// the first error encountered. The error includes the column index and field name.
75+
func ValidateRecordFull(rec arrow.RecordBatch) error {
76+
for i := int64(0); i < rec.NumCols(); i++ {
77+
if err := ValidateFull(rec.Column(int(i))); err != nil {
78+
return fmt.Errorf("column %d (%s): %w", i, rec.Schema().Field(int(i)).Name, err)
79+
}
80+
}
81+
return nil
82+
}

0 commit comments

Comments
 (0)