Skip to content

Commit e0df4f9

Browse files
UnsafeAppendBoolToBitmap for dictionary and REE builders (#758)
### Rationale for this change For more context: https://github.com/apache/arrow-go/pull/558/changes#r2758798540 The Builders for REE and dict-encoded arrays were inheriting the `UnsafeAppendBoolToBitmap()` and `UnsafeAppend()` from the `Builder` interface defined in `arrow/array/builder.go`. The default implementation does not bump the length of the inner `idxBuilder` or `runEndsBuilder`, which causes it to be in an inconsistent state where the parent has greater length than the index/run-ends builder. This causes a panic when instancing a record batch. ### What changes are included in this PR? This PR makes `UnsafeAppendBoolToBitmap()` for `RunEndEncodedBuilder` panic as there is not a good semantic meaning for this method in case of REE. It also implements `UnsafeAppendBoolToBitmap()` and `UnsafeAppend()` for the Dictionary builders. This allows users to use `bldr.Reserve(n)` followed by `n` calls to `UnsafeAppend()` or `UnsafeAppendBoolToBitmap()` without the need to check if the array needs to grow everytime. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. The API was already there (inherited from `Builder`), but the behavior was incorrect. --------- Co-authored-by: Matt Topol <zotthewizard@gmail.com>
1 parent f8eb699 commit e0df4f9

3 files changed

Lines changed: 137 additions & 1 deletion

File tree

arrow/array/dictionary.go

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,8 @@ func arrayApproxEqualDict(l, r *Dictionary, opt equalOption) bool {
314314
// helper for building the properly typed indices of the dictionary builder
315315
type IndexBuilder struct {
316316
Builder
317-
Append func(int)
317+
Append func(int)
318+
UnsafeAppend func(int)
318319
}
319320

320321
func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret IndexBuilder, err error) {
@@ -324,34 +325,58 @@ func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret
324325
ret.Append = func(idx int) {
325326
ret.Builder.(*Int8Builder).Append(int8(idx))
326327
}
328+
ret.UnsafeAppend = func(idx int) {
329+
ret.Builder.(*Int8Builder).UnsafeAppend(int8(idx))
330+
}
327331
case arrow.UINT8:
328332
ret.Append = func(idx int) {
329333
ret.Builder.(*Uint8Builder).Append(uint8(idx))
330334
}
335+
ret.UnsafeAppend = func(idx int) {
336+
ret.Builder.(*Uint8Builder).UnsafeAppend(uint8(idx))
337+
}
331338
case arrow.INT16:
332339
ret.Append = func(idx int) {
333340
ret.Builder.(*Int16Builder).Append(int16(idx))
334341
}
342+
ret.UnsafeAppend = func(idx int) {
343+
ret.Builder.(*Int16Builder).UnsafeAppend(int16(idx))
344+
}
335345
case arrow.UINT16:
336346
ret.Append = func(idx int) {
337347
ret.Builder.(*Uint16Builder).Append(uint16(idx))
338348
}
349+
ret.UnsafeAppend = func(idx int) {
350+
ret.Builder.(*Uint16Builder).UnsafeAppend(uint16(idx))
351+
}
339352
case arrow.INT32:
340353
ret.Append = func(idx int) {
341354
ret.Builder.(*Int32Builder).Append(int32(idx))
342355
}
356+
ret.UnsafeAppend = func(idx int) {
357+
ret.Builder.(*Int32Builder).UnsafeAppend(int32(idx))
358+
}
343359
case arrow.UINT32:
344360
ret.Append = func(idx int) {
345361
ret.Builder.(*Uint32Builder).Append(uint32(idx))
346362
}
363+
ret.UnsafeAppend = func(idx int) {
364+
ret.Builder.(*Uint32Builder).UnsafeAppend(uint32(idx))
365+
}
347366
case arrow.INT64:
348367
ret.Append = func(idx int) {
349368
ret.Builder.(*Int64Builder).Append(int64(idx))
350369
}
370+
ret.UnsafeAppend = func(idx int) {
371+
ret.Builder.(*Int64Builder).UnsafeAppend(int64(idx))
372+
}
351373
case arrow.UINT64:
352374
ret.Append = func(idx int) {
353375
ret.Builder.(*Uint64Builder).Append(uint64(idx))
354376
}
377+
ret.UnsafeAppend = func(idx int) {
378+
ret.Builder.(*Uint64Builder).UnsafeAppend(uint64(idx))
379+
}
355380
default:
356381
debug.Assert(false, "dictionary index type must be integral")
357382
err = fmt.Errorf("dictionary index type must be integral, not %s", dt)
@@ -646,6 +671,14 @@ func (b *dictionaryBuilder) AppendEmptyValues(n int) {
646671
}
647672
}
648673

674+
func (b *dictionaryBuilder) UnsafeAppendBoolToBitmap(v bool) {
675+
if !v {
676+
b.nulls += 1
677+
}
678+
b.length += 1
679+
b.idxBuilder.UnsafeAppendBoolToBitmap(v)
680+
}
681+
649682
func (b *dictionaryBuilder) Reserve(n int) {
650683
b.idxBuilder.Reserve(n)
651684
}
@@ -781,6 +814,13 @@ func (b *dictionaryBuilder) insertDictBytes(val []byte) error {
781814
return err
782815
}
783816

817+
func (b *dictionaryBuilder) unsafeAppendValue(val interface{}) error {
818+
idx, _, err := b.memoTable.GetOrInsert(val)
819+
b.idxBuilder.UnsafeAppend(idx)
820+
b.length += 1
821+
return err
822+
}
823+
784824
func (b *dictionaryBuilder) appendValue(val interface{}) error {
785825
idx, _, err := b.memoTable.GetOrInsert(val)
786826
b.idxBuilder.Append(idx)
@@ -990,7 +1030,39 @@ type dictBuilder[T arrow.ValueType] struct {
9901030
dictionaryBuilder
9911031
}
9921032

1033+
func (b *dictBuilder[T]) UnsafeAppend(v T) {
1034+
// SAFETY: it is safe to ignore the value returned by the calls to `unsafeAppendValue()`
1035+
// here since `UnsafeAppend()` is statically typed and the only case in which that method
1036+
// errors is when trying to insert an invalid `interface{}` into the `memoTable`.
1037+
var err error
1038+
switch val := any(v).(type) {
1039+
case arrow.Duration:
1040+
err = b.unsafeAppendValue(int64(val))
1041+
case arrow.Timestamp:
1042+
err = b.unsafeAppendValue(int64(val))
1043+
case arrow.Time32:
1044+
err = b.unsafeAppendValue(int32(val))
1045+
case arrow.Time64:
1046+
err = b.unsafeAppendValue(int64(val))
1047+
case arrow.Date32:
1048+
err = b.unsafeAppendValue(int32(val))
1049+
case arrow.Date64:
1050+
err = b.unsafeAppendValue(int64(val))
1051+
case arrow.MonthInterval:
1052+
err = b.unsafeAppendValue(int32(val))
1053+
default:
1054+
err = b.unsafeAppendValue(v)
1055+
}
1056+
debug.Assert(err == nil, "Trying to insert wrong type into memoTable even though this method is statically typed. This is an implementation bug.")
1057+
}
1058+
9931059
func (b *dictBuilder[T]) Append(v T) error {
1060+
// TODO: it is safe to ignore the value returned by the calls to `appendValue()`
1061+
// here since `Append()` is statically typed and the only case in which that
1062+
// method errors is when trying to insert an invalid `interface{}` into the `memoTable`.
1063+
//
1064+
// This would be a breaking change to the public API of `dictBuilder`, so it needs
1065+
// to happen over a major release.
9941066
switch val := any(v).(type) {
9951067
case arrow.Duration:
9961068
return b.appendValue(int64(val))
@@ -1058,6 +1130,12 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error {
10581130
return nil
10591131
}
10601132

1133+
// TODO: it is safe to ignore the value returned by the call to `appendBytes()`
1134+
// here since `Append()` is statically typed and the only case in which that
1135+
// method errors is when trying to insert an invalid `interface{}` into the `memoTable`.
1136+
//
1137+
// This would be a breaking change to the public API of `BinaryDictionaryBuilder`,
1138+
// so it needs to happen over a major release.
10611139
return b.appendBytes(v)
10621140
}
10631141

@@ -1134,6 +1212,13 @@ type fixedSizeDictionaryBuilder[T fsbType] struct {
11341212
}
11351213

11361214
func (b *fixedSizeDictionaryBuilder[T]) Append(v T) error {
1215+
// TODO: it is safe to ignore the value returned by the calls to `appendValue()`
1216+
// and `appendBytes()` here since `Append()` is statically typed and the only
1217+
// case in which these method error is when trying to insert an invalid
1218+
// `interface{}` into the `memoTable`.
1219+
//
1220+
// This would be a breaking change to the public API of `fixedSizeDictionaryBuilder`,
1221+
// so it needs to happen over a major release.
11371222
if v, ok := any(v).([]byte); ok {
11381223
return b.appendBytes(v[:b.byteWidth])
11391224
}
@@ -1164,6 +1249,12 @@ type FixedSizeBinaryDictionaryBuilder struct {
11641249
}
11651250

11661251
func (b *FixedSizeBinaryDictionaryBuilder) Append(v []byte) error {
1252+
// TODO: it is safe to ignore the value returned by the calls to `appendValue()`
1253+
// here since `Append()` is statically typed and the only case in which that
1254+
// method errors is when trying to insert an invalid `interface{}` into the `memoTable`.
1255+
//
1256+
// This would be a breaking change to the public API of `FixedSizeBinaryDictionaryBuilder`,
1257+
// so it needs to happen over a major release.
11671258
return b.appendValue(v[:b.byteWidth])
11681259
}
11691260

arrow/array/dictionary_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,47 @@ func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderInit() {
145145
p.True(array.Equal(expected, arr))
146146
}
147147

148+
func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderReserveAndAppend() {
149+
expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{}, ValueType: p.typ}
150+
bldr := array.NewDictionaryBuilder(p.mem, expectedType)
151+
defer bldr.Release()
152+
153+
builder := reflect.ValueOf(bldr)
154+
appendFn := builder.MethodByName("UnsafeAppend")
155+
validFn := builder.MethodByName("UnsafeAppendBoolToBitmap")
156+
157+
bldr.Reserve(7)
158+
validFn.Call([]reflect.Value{reflect.ValueOf(true)})
159+
validFn.Call([]reflect.Value{reflect.ValueOf(false)})
160+
appendFn.Call([]reflect.Value{reflect.ValueOf(0).Convert(p.reftyp)})
161+
appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)})
162+
validFn.Call([]reflect.Value{reflect.ValueOf(false)})
163+
appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)})
164+
appendFn.Call([]reflect.Value{reflect.ValueOf(2).Convert(p.reftyp)})
165+
166+
p.EqualValues(7, bldr.Len())
167+
p.EqualValues(2, bldr.NullN())
168+
169+
p.EqualValues(3, bldr.DictionarySize())
170+
171+
arr := bldr.NewArray().(*array.Dictionary)
172+
defer arr.Release()
173+
174+
p.True(arrow.TypeEqual(expectedType, arr.DataType()))
175+
expectedDict, _, err := array.FromJSON(p.mem, expectedType.ValueType, strings.NewReader("[0, 1, 2]"))
176+
p.NoError(err)
177+
defer expectedDict.Release()
178+
179+
expectedIndices, _, err := array.FromJSON(p.mem, expectedType.IndexType, strings.NewReader("[0, null, 0, 1, null, 1, 2]"))
180+
p.NoError(err)
181+
defer expectedIndices.Release()
182+
183+
expected := array.NewDictionaryArray(expectedType, expectedIndices, expectedDict)
184+
defer expected.Release()
185+
186+
p.True(array.Equal(expected, arr))
187+
}
188+
148189
func (p *PrimitiveDictionaryTestSuite) TestDictionaryNewBuilder() {
149190
valueType := p.typ
150191
dictArr, _, err := array.FromJSON(p.mem, valueType, strings.NewReader("[1, 2]"))

arrow/array/encoded.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,10 @@ func (b *RunEndEncodedBuilder) AppendNulls(n int) {
398398
}
399399
}
400400

401+
func (b *RunEndEncodedBuilder) UnsafeAppendBoolToBitmap(v bool) {
402+
panic("Calling UnsafeAppendBoolToBitmap on a run-end encoded array is semantically undefined.")
403+
}
404+
401405
func (b *RunEndEncodedBuilder) NullN() int {
402406
return UnknownNullCount
403407
}

0 commit comments

Comments
 (0)