Skip to content

Commit 7bddb83

Browse files
GH-50012: [Python] Fix list_ storage crashes when values exceed int32 offsets (#50016)
### Rationale for this change When data exceeds int32 limits, this change properly wraps each chunk as an ExtensionArray. ### What changes are included in this PR? Modified extension type handling to support both Array and ChunkedArray storage types. ### Are these changes tested? Yes, the changes were manually tested. ### Are there any user-facing changes? No. ### This PR contains a "Critical Fix". This change fixes a crash in list_ storage. When list data exceeds int32 limits, PyArrow automatically creates a ChunkedArray. However, ExtensionArray.from_storage() only accepts Array objects, not ChunkedArray. * GitHub Issue: #50012 Authored-by: Ankit.Ahlawat@ibm.com <Ankit.Ahlawat@ibm.com> Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent 3346132 commit 7bddb83

2 files changed

Lines changed: 73 additions & 1 deletion

File tree

python/pyarrow/array.pxi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
401401
result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
402402

403403
if extension_type is not None:
404-
result = ExtensionArray.from_storage(extension_type, result)
404+
result = extension_type.wrap_array(result)
405405
return result
406406

407407

python/pyarrow/tests/test_extension_type.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2120,3 +2120,75 @@ def test_json(storage_type, pickle_module):
21202120
pa.ArrowInvalid,
21212121
match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
21222122
pa.json_(storage_type)
2123+
2124+
2125+
class ListExtensionType(pa.ExtensionType):
    """Parameter-free extension type used to test int32 offset overflow.

    The storage type is a struct with a single ``data`` field holding a
    list of ``uint8`` values.
    """

    def __init__(self):
        storage_type = pa.struct({"data": pa.list_(pa.uint8())})
        super().__init__(storage_type, "pyarrow.tests.ListExtensionType")

    def __arrow_ext_serialize__(self):
        # The type carries no parameters, so the serialized form is empty.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        # Both arguments are ignored: every instance is identical.
        return cls()
2140+
2141+
2142+
@pytest.mark.slow
@pytest.mark.large_memory
@pytest.mark.numpy
def test_extension_type_list_overflow():
    """
    Test that extension types with list fields handle int32 offset overflow.

    The list values built here exceed what int32 offsets can address, so
    the resulting column is expected to be split into several chunks, each
    of which must be wrapped as an extension array of the extension type.
    """
    with registered_extension_type(ListExtensionType()):
        schema = pa.schema({"col": ListExtensionType()})

        # Create data that exceeds int32 max cumulative values
        # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
        # The same buffer is shared by every row to keep the Python-side
        # allocation at 500 MB.
        arr = np.zeros(500_000_000, dtype=np.uint8)
        rows = [{"col": {"data": arr}} for _ in range(5)]

        result = pa.Table.from_pylist(rows, schema=schema)

        assert result.num_rows == 5
        assert result.num_columns == 1
        assert result.schema[0].type == ListExtensionType()

        col = result.column(0)
        assert isinstance(col, pa.ChunkedArray)
        assert col.type == ListExtensionType()

        assert col.num_chunks > 1, "Expected multiple chunks due to int32 overflow"

        # Iterate the chunks directly instead of indexing by position, and
        # check that each one was wrapped, not just typed like its parent.
        for chunk in col.chunks:
            assert isinstance(chunk, pa.ExtensionArray)
            assert chunk.type == ListExtensionType()
2172+
2173+
2174+
@pytest.mark.numpy
def test_extension_type_no_overflow():
    """Check the small-data path: no overflow, so a single chunk is expected."""
    with registered_extension_type(ListExtensionType()):
        table_schema = pa.schema({"col": ListExtensionType()})

        # Three tiny rows; nowhere near the int32 offset limit.
        values = np.array([1, 2, 3], dtype=np.uint8)
        records = [{"col": {"data": values}} for _ in range(3)]

        table = pa.Table.from_pylist(records, schema=table_schema)

        # Table-level shape and schema.
        assert table.num_rows == 3
        assert table.num_columns == 1
        assert table.schema[0].type == ListExtensionType()

        # The column should be a ChunkedArray with a single chunk
        column = table.column(0)
        assert isinstance(column, pa.ChunkedArray)
        assert column.num_chunks == 1
        assert column.type == ListExtensionType()

0 commit comments

Comments
 (0)