From 90e0c86b70898580a996b891bfc1f99288290eb6 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Fri, 10 Apr 2026 15:37:26 -0700 Subject: [PATCH 1/4] Panic `UnsafeAppendBoolToBitmap` for REE builder For more context: https://github.com/apache/arrow-go/pull/558/changes#r2758798540 This commit makes `UnsafeAppendBoolToBitmap()` and `RunEndEncodedBuilder` panic. The current implementation pulled from `arrow/array/builder.go` would leave the builder in an invalid state that would panic when trying to finish a record batch. --- arrow/array/dictionary.go | 4 ++++ arrow/array/encoded.go | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 109d2a97e..cccb06da9 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -646,6 +646,10 @@ func (b *dictionaryBuilder) AppendEmptyValues(n int) { } } +func (b *dictionaryBuilder) UnsafeAppendBoolToBitmap(v bool) { + panic("Calling UnsafeAppendBoolToBitmap on dictionaryBuilder would leave it in inconsistent state. Use AppendIndices instead.") +} + func (b *dictionaryBuilder) Reserve(n int) { b.idxBuilder.Reserve(n) } diff --git a/arrow/array/encoded.go b/arrow/array/encoded.go index 08800d4be..85432a13c 100644 --- a/arrow/array/encoded.go +++ b/arrow/array/encoded.go @@ -398,6 +398,10 @@ func (b *RunEndEncodedBuilder) AppendNulls(n int) { } } +func (b *RunEndEncodedBuilder) UnsafeAppendBoolToBitmap(v bool) { + panic("Calling UnsafeAppendBoolToBitmap on a run-end encoded array is semantically undefined.") +} + func (b *RunEndEncodedBuilder) NullN() int { return UnknownNullCount } From 26e209658f70d32a4ba2f7a5f7df6457d43f4e96 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Sat, 11 Apr 2026 21:12:16 -0700 Subject: [PATCH 2/4] Add `UnsafeAppend` to dict builder This commit adds a proper implementation of `UnsafeAppend` and `UnsafeAppendBoolToBitmap` to the dictionary-encoded array builder. This allows using `Reserve(n)` followed by `n` calls to `UnsafeAppend` or `UnsafeAppendBoolToBitmap`. I also added a test to it. --- arrow/array/dictionary.go | 60 ++++++++++++++++++++++++++++++++-- arrow/array/dictionary_test.go | 41 +++++++++++++++++++++++ 2 files changed, 99 insertions(+), 2 deletions(-) diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index cccb06da9..7d03b7ea3 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -314,7 +314,8 @@ func arrayApproxEqualDict(l, r *Dictionary, opt equalOption) bool { // helper for building the properly typed indices of the dictionary builder type IndexBuilder struct { Builder - Append func(int) + Append func(int) + UnsafeAppend func(int) } func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret IndexBuilder, err error) { @@ -324,34 +325,58 @@ func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType) (ret ret.Append = func(idx int) { ret.Builder.(*Int8Builder).Append(int8(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int8Builder).UnsafeAppend(int8(idx)) + } case arrow.UINT8: ret.Append = func(idx int) { ret.Builder.(*Uint8Builder).Append(uint8(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint8Builder).UnsafeAppend(uint8(idx)) + } case arrow.INT16: ret.Append = func(idx int) { ret.Builder.(*Int16Builder).Append(int16(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int16Builder).UnsafeAppend(int16(idx)) + } case arrow.UINT16: ret.Append = func(idx int) { ret.Builder.(*Uint16Builder).Append(uint16(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint16Builder).UnsafeAppend(uint16(idx)) + } case arrow.INT32: ret.Append = func(idx int) { ret.Builder.(*Int32Builder).Append(int32(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int32Builder).UnsafeAppend(int32(idx)) + } case arrow.UINT32: ret.Append = func(idx int) { ret.Builder.(*Uint32Builder).Append(uint32(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint32Builder).UnsafeAppend(uint32(idx)) + } case arrow.INT64: ret.Append = func(idx int) { ret.Builder.(*Int64Builder).Append(int64(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Int64Builder).UnsafeAppend(int64(idx)) + } case arrow.UINT64: ret.Append = func(idx int) { ret.Builder.(*Uint64Builder).Append(uint64(idx)) } + ret.UnsafeAppend = func(idx int) { + ret.Builder.(*Uint64Builder).UnsafeAppend(uint64(idx)) + } default: debug.Assert(false, "dictionary index type must be integral") err = fmt.Errorf("dictionary index type must be integral, not %s", dt) @@ -647,7 +672,11 @@ func (b *dictionaryBuilder) AppendEmptyValues(n int) { } func (b *dictionaryBuilder) UnsafeAppendBoolToBitmap(v bool) { - panic("Calling UnsafeAppendBoolToBitmap on dictionaryBuilder would leave it in inconsistent state. Use AppendIndices instead.") + if !v { + b.nulls += 1 + } + b.length += 1 + b.idxBuilder.UnsafeAppendBoolToBitmap(v) } func (b *dictionaryBuilder) Reserve(n int) { @@ -785,6 +814,13 @@ func (b *dictionaryBuilder) insertDictBytes(val []byte) error { return err } +func (b *dictionaryBuilder) unsafeAppendValue(val interface{}) error { + idx, _, err := b.memoTable.GetOrInsert(val) + b.idxBuilder.UnsafeAppend(idx) + b.length += 1 + return err +} + func (b *dictionaryBuilder) appendValue(val interface{}) error { idx, _, err := b.memoTable.GetOrInsert(val) b.idxBuilder.Append(idx) @@ -994,6 +1030,26 @@ type dictBuilder[T arrow.ValueType] struct { dictionaryBuilder } +func (b *dictBuilder[T]) UnsafeAppend(v T) error { + switch val := any(v).(type) { + case arrow.Duration: + return b.unsafeAppendValue(int64(val)) + case arrow.Timestamp: + return b.unsafeAppendValue(int64(val)) + case arrow.Time32: + return b.unsafeAppendValue(int32(val)) + case arrow.Time64: + return b.unsafeAppendValue(int64(val)) + case arrow.Date32: + return b.unsafeAppendValue(int32(val)) + case arrow.Date64: + return b.unsafeAppendValue(int64(val)) + case arrow.MonthInterval: + return b.unsafeAppendValue(int32(val)) + } + return b.unsafeAppendValue(v) +} + func (b *dictBuilder[T]) Append(v T) error { switch val := any(v).(type) { case arrow.Duration: diff --git a/arrow/array/dictionary_test.go b/arrow/array/dictionary_test.go index 9b9d3b1f3..24aab674a 100644 --- a/arrow/array/dictionary_test.go +++ b/arrow/array/dictionary_test.go @@ -145,6 +145,47 @@ func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderInit() { p.True(array.Equal(expected, arr)) } +func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderReserveAndAppend() { + expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{}, ValueType: p.typ} + bldr := array.NewDictionaryBuilder(p.mem, expectedType) + defer bldr.Release() + + builder := reflect.ValueOf(bldr) + appendFn := builder.MethodByName("UnsafeAppend") + validFn := builder.MethodByName("UnsafeAppendBoolToBitmap") + + bldr.Reserve(7) + validFn.Call([]reflect.Value{reflect.ValueOf(true)}) + validFn.Call([]reflect.Value{reflect.ValueOf(false)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(0).Convert(p.reftyp)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)}) + validFn.Call([]reflect.Value{reflect.ValueOf(false)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)}) + appendFn.Call([]reflect.Value{reflect.ValueOf(2).Convert(p.reftyp)}) + + p.EqualValues(7, bldr.Len()) + p.EqualValues(2, bldr.NullN()) + + p.EqualValues(3, bldr.DictionarySize()) + + arr := bldr.NewArray().(*array.Dictionary) + defer arr.Release() + + p.True(arrow.TypeEqual(expectedType, arr.DataType())) + expectedDict, _, err := array.FromJSON(p.mem, expectedType.ValueType, strings.NewReader("[0, 1, 2]")) + p.NoError(err) + defer expectedDict.Release() + + expectedIndices, _, err := array.FromJSON(p.mem, expectedType.IndexType, strings.NewReader("[0, null, 0, 1, null, 1, 2]")) + p.NoError(err) + defer expectedIndices.Release() + + expected := array.NewDictionaryArray(expectedType, expectedIndices, expectedDict) + defer expected.Release() + + p.True(array.Equal(expected, arr)) +} + func (p *PrimitiveDictionaryTestSuite) TestDictionaryNewBuilder() { valueType := p.typ dictArr, _, err := array.FromJSON(p.mem, valueType, strings.NewReader("[1, 2]")) From 2cb62c120de8e8485ca8c4ce4b2e0bc9d4184334 Mon Sep 17 00:00:00 2001 From: serramatutu Date: Tue, 14 Apr 2026 12:13:58 +0200 Subject: [PATCH 3/4] Remove `error` from `UnsafeAppend()` API --- arrow/array/dictionary.go | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 7d03b7ea3..37a5db859 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -1030,27 +1030,38 @@ type dictBuilder[T arrow.ValueType] struct { dictionaryBuilder } -func (b *dictBuilder[T]) UnsafeAppend(v T) error { +func (b *dictBuilder[T]) UnsafeAppend(v T) { + // SAFETY: it is safe to ignore the value returned by the calls to `unsafeAppendValue()` + // here since `UnsafeAppend()` is statically typed and the only case in which that method + // errors is when trying to insert an invalid `interface{}` into the `memoTable`. + var err error switch val := any(v).(type) { case arrow.Duration: - return b.unsafeAppendValue(int64(val)) + err = b.unsafeAppendValue(int64(val)) case arrow.Timestamp: - return b.unsafeAppendValue(int64(val)) + err = b.unsafeAppendValue(int64(val)) case arrow.Time32: - return b.unsafeAppendValue(int32(val)) + err = b.unsafeAppendValue(int32(val)) case arrow.Time64: - return b.unsafeAppendValue(int64(val)) + err = b.unsafeAppendValue(int64(val)) case arrow.Date32: - return b.unsafeAppendValue(int32(val)) + err = b.unsafeAppendValue(int32(val)) case arrow.Date64: - return b.unsafeAppendValue(int64(val)) + err = b.unsafeAppendValue(int64(val)) case arrow.MonthInterval: - return b.unsafeAppendValue(int32(val)) + err = b.unsafeAppendValue(int32(val)) } - return b.unsafeAppendValue(v) + err = b.unsafeAppendValue(v) + debug.Assert(err == nil, "Trying to insert wrong type into memoTable even though this method is statically typed. This is an implementation bug.") } func (b *dictBuilder[T]) Append(v T) error { + // TODO: it is safe to ignore the value returned by the calls to `appendValue()` + // here since `Append()` is statically typed and the only case in which that + // method errors is when trying to insert an invalid `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `dictBuilder`, so it needs + // to happen over a major release. switch val := any(v).(type) { case arrow.Duration: return b.appendValue(int64(val)) @@ -1118,6 +1129,12 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error { return nil } + // TODO: it is safe to ignore the value returned by the call to `appendBytes()` + // here since `Append()` is statically typed and the only case in which that + // method errors is when trying to insert an invalid `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `BinaryDictionaryBuilder`, + // so it needs to happen over a major release. return b.appendBytes(v) } @@ -1194,6 +1211,13 @@ type fixedSizeDictionaryBuilder[T fsbType] struct { } func (b *fixedSizeDictionaryBuilder[T]) Append(v T) error { + // TODO: it is safe to ignore the value returned by the calls to `appendValue()` + // and `appendBytes()` here since `Append()` is statically typed and the only + // case in which these method error is when trying to insert an invalid + // `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `fixedSizeDictionaryBuilder`, + // so it needs to happen over a major release. if v, ok := any(v).([]byte); ok { return b.appendBytes(v[:b.byteWidth]) } @@ -1224,6 +1248,12 @@ type FixedSizeBinaryDictionaryBuilder struct { } func (b *FixedSizeBinaryDictionaryBuilder) Append(v []byte) error { + // TODO: it is safe to ignore the value returned by the calls to `appendValue()` + // here since `Append()` is statically typed and the only case in which that + // method errors is when trying to insert an invalid `interface{}` into the `memoTable`. + // + // This would be a breaking change to the public API of `FixedSizeBinaryDictionaryBuilder`, + // so it needs to happen over a major release. return b.appendValue(v[:b.byteWidth]) } From be24d6c3506268b80dde394ba0f40544fddd9dfc Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 14 Apr 2026 13:30:45 -0400 Subject: [PATCH 4/4] fix lint --- arrow/array/dictionary.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go index 37a5db859..8f9fe2451 100644 --- a/arrow/array/dictionary.go +++ b/arrow/array/dictionary.go @@ -1050,8 +1050,9 @@ func (b *dictBuilder[T]) UnsafeAppend(v T) { err = b.unsafeAppendValue(int64(val)) case arrow.MonthInterval: err = b.unsafeAppendValue(int32(val)) + default: + err = b.unsafeAppendValue(v) } - err = b.unsafeAppendValue(v) debug.Assert(err == nil, "Trying to insert wrong type into memoTable even though this method is statically typed. This is an implementation bug.") } @@ -1212,8 +1213,8 @@ type fixedSizeDictionaryBuilder[T fsbType] struct { func (b *fixedSizeDictionaryBuilder[T]) Append(v T) error { // TODO: it is safe to ignore the value returned by the calls to `appendValue()` - // and `appendBytes()` here since `Append()` is statically typed and the only - // case in which these method error is when trying to insert an invalid + // and `appendBytes()` here since `Append()` is statically typed and the only + // case in which these method error is when trying to insert an invalid // `interface{}` into the `memoTable`. // // This would be a breaking change to the public API of `fixedSizeDictionaryBuilder`,