Skip to content

Commit af5b849

Browse files
rustyconover and claude committed
fix(utils): support union types in empty_batch
pyarrow's ``pa.array([], type=union)`` raises ArrowNotImplementedError, so ``empty_batch`` failed for any schema containing a union field. Build empty union arrays via ``Array.from_buffers`` from empty children + an empty type-codes buffer (plus an empty offsets buffer for dense unions). Adds tests covering both sparse and dense union schemas. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 761e3e2 commit af5b849

2 files changed

Lines changed: 40 additions & 1 deletion

File tree

tests/test_utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,22 @@ def test_empty_batch_column_types(self) -> None:
671671
assert batch.column("flag").type == pa.bool_()
672672
assert batch.column("data").type == pa.binary()
673673

674+
def test_empty_batch_sparse_union(self) -> None:
    """empty_batch handles a sparse-union column, which pa.array([], type=union) cannot build."""
    members = [pa.field("i", pa.int32()), pa.field("s", pa.string())]
    sparse = pa.sparse_union(members, type_codes=[0, 1])
    expected = pa.schema([pa.field("u", sparse), pa.field("x", pa.int64())])

    result = empty_batch(expected)

    assert result.num_rows == 0
    assert result.schema.equals(expected)
681+
682+
def test_empty_batch_dense_union(self) -> None:
    """empty_batch handles a dense-union column (same restriction as sparse, plus an offsets buffer)."""
    members = [pa.field("i", pa.int32()), pa.field("s", pa.string())]
    dense = pa.dense_union(members, type_codes=[0, 1])
    expected = pa.schema([pa.field("u", dense)])

    result = empty_batch(expected)

    assert result.num_rows == 0
    assert result.schema.equals(expected)
689+
674690

675691
# ---------------------------------------------------------------------------
676692
# TestSerializeDeserializeRecordBatch

vgi_rpc/utils.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,11 +167,34 @@ def __exit__(
167167
def empty_batch(schema: pa.Schema) -> pa.RecordBatch:
    """Return a zero-row record batch whose columns conform to ``schema``."""
    # One empty column per schema field; _empty_array covers the types
    # (unions) that pa.array([], type=...) cannot construct.
    columns = [_empty_array(column.type) for column in schema]
    return pa.RecordBatch.from_arrays(columns, schema=schema)
173173

174174

175+
def _empty_array(arrow_type: pa.DataType) -> "pa.Array[Any]":
    """Build a zero-length array of ``arrow_type``.

    pyarrow's ``pa.array([], type=...)`` constructor doesn't support
    union types — it raises ``ArrowNotImplementedError``. Union arrays are
    therefore assembled via ``Array.from_buffers`` from empty children, an
    empty type-codes buffer and, for dense unions, an empty offsets buffer.

    Children are built recursively, so a union whose member is itself a
    union is handled as well.

    NOTE(review): unions nested inside other container types (struct,
    list, map, ...) still go through ``pa.array`` and presumably hit the
    same limitation — confirm and extend if such schemas are needed.

    Args:
        arrow_type: The Arrow data type of the array to create.

    Returns:
        An array of ``arrow_type`` with length zero.
    """
    if not pa.types.is_union(arrow_type):
        return pa.array([], type=arrow_type)

    union_type = cast(pa.UnionType, arrow_type)
    # Recurse instead of calling pa.array directly: a child may itself be a
    # union, which pa.array would reject.
    children = [_empty_array(field.type) for field in union_type]
    # buffers()[1] is the data buffer; for an empty primitive array it's
    # always present, but the type stub says Buffer | None. The leading
    # None stands in for the validity bitmap that union arrays don't
    # carry (nulls live in the children).
    type_codes_buf = cast(pa.Buffer, pa.array([], type=pa.int8()).buffers()[1])
    buffers: list[pa.Buffer | None] = [None, type_codes_buf]
    if union_type.mode == "dense":
        # Dense unions additionally carry an int32 offsets buffer.
        buffers.append(cast(pa.Buffer, pa.array([], type=pa.int32()).buffers()[1]))
    return pa.Array.from_buffers(arrow_type, 0, cast(list[pa.Buffer], buffers), children=children)
196+
197+
175198
def serialize_record_batch(
176199
destination: IOBase,
177200
batch: pa.RecordBatch,

0 commit comments

Comments
 (0)