Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 258 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2776,6 +2776,59 @@ cdef class ListArray(BaseListArray):
Concrete class for Arrow arrays of a list data type.
"""

def to_pylist(self, *, maps_as_pydicts=None):
    """
    Convert to a list of native Python objects.

    Parameters
    ----------
    maps_as_pydicts : str, optional, default `None`
        Valid values are `None`, 'lossy', or 'strict'.
        With the default (`None`), Arrow Map arrays are converted to
        Python association lists (list-of-tuples) that keep the order of
        the Arrow Map, as in [(key1, value1), (key2, value2), ...].

        With 'lossy' or 'strict', Arrow Map arrays are converted to
        native Python dicts instead.

        With 'lossy', a warning is printed whenever duplicate keys are
        detected, and the last seen value of a duplicate key ends up in
        the Python dictionary.
        With 'strict', detecting duplicate keys raises an exception.

    Returns
    -------
    lst : list
    """
    cdef CListArray* list_arr = <CListArray*> self.ap
    cdef int64_t row, num_rows, base, lo, hi

    self._assert_cpu()
    num_rows = list_arr.length()
    if num_rows == 0:
        return []

    # Bulk conversion (see GH-28694): turn the whole span of child values
    # referenced by this array into Python objects once, then cut one
    # Python list per row out of it. Compared to the per-row route this
    # skips a Scalar wrapper, a Python Array wrapper and a values-array
    # slice for every row. Note that the span is converted in full, so it
    # also covers child values that sit under null rows.
    base = list_arr.value_offset(0)
    child_values = pyarrow_wrap_array(list_arr.values())
    flat = child_values.slice(
        base, list_arr.value_offset(num_rows) - base
    ).to_pylist(maps_as_pydicts=maps_as_pydicts)

    rows = []
    if list_arr.null_count() == 0:
        # No null rows at all: no validity check needed per row.
        for row in range(num_rows):
            lo = list_arr.value_offset(row) - base
            hi = list_arr.value_offset(row + 1) - base
            rows.append(flat[lo:hi])
        return rows

    # At least one null row: emit None for those, a slice otherwise.
    for row in range(num_rows):
        if list_arr.IsNull(row):
            rows.append(None)
            continue
        lo = list_arr.value_offset(row) - base
        hi = list_arr.value_offset(row + 1) - base
        rows.append(flat[lo:hi])
    return rows

@staticmethod
def from_arrays(offsets, values, DataType type=None, MemoryPool pool=None, mask=None):
"""
Expand Down Expand Up @@ -2961,6 +3014,56 @@ cdef class LargeListArray(BaseListArray):
Identical to ListArray, but 64-bit offsets.
"""

def to_pylist(self, *, maps_as_pydicts=None):
    """
    Convert to a list of native Python objects.

    Parameters
    ----------
    maps_as_pydicts : str, optional, default `None`
        Valid values are `None`, 'lossy', or 'strict'.
        With the default (`None`), Arrow Map arrays are converted to
        Python association lists (list-of-tuples) that keep the order of
        the Arrow Map, as in [(key1, value1), (key2, value2), ...].

        With 'lossy' or 'strict', Arrow Map arrays are converted to
        native Python dicts instead.

        With 'lossy', a warning is printed whenever duplicate keys are
        detected, and the last seen value of a duplicate key ends up in
        the Python dictionary.
        With 'strict', detecting duplicate keys raises an exception.

    Returns
    -------
    lst : list
    """
    cdef CLargeListArray* list_arr = <CLargeListArray*> self.ap
    cdef int64_t row, num_rows, base, lo, hi

    self._assert_cpu()
    num_rows = list_arr.length()
    if num_rows == 0:
        return []

    # Bulk conversion: turn the whole span of child values referenced by
    # this array into Python objects once, then cut one Python list per
    # row out of it, rather than converting row by row.
    base = list_arr.value_offset(0)
    child_values = pyarrow_wrap_array(list_arr.values())
    flat = child_values.slice(
        base, list_arr.value_offset(num_rows) - base
    ).to_pylist(maps_as_pydicts=maps_as_pydicts)

    rows = []
    if list_arr.null_count() == 0:
        # No null rows at all: no validity check needed per row.
        for row in range(num_rows):
            lo = list_arr.value_offset(row) - base
            hi = list_arr.value_offset(row + 1) - base
            rows.append(flat[lo:hi])
        return rows

    # At least one null row: emit None for those, a slice otherwise.
    for row in range(num_rows):
        if list_arr.IsNull(row):
            rows.append(None)
            continue
        lo = list_arr.value_offset(row) - base
        hi = list_arr.value_offset(row + 1) - base
        rows.append(flat[lo:hi])
    return rows

@staticmethod
def from_arrays(offsets, values, DataType type=None, MemoryPool pool=None, mask=None):
"""
Expand Down Expand Up @@ -3551,6 +3654,34 @@ cdef class MapArray(ListArray):
Concrete class for Arrow arrays of a map data type.
"""

def to_pylist(self, *, maps_as_pydicts=None):
    """
    Convert to a list of native Python objects.

    Parameters
    ----------
    maps_as_pydicts : str, optional, default `None`
        Valid values are `None`, 'lossy', or 'strict'.
        With the default (`None`), Arrow Map arrays are converted to
        Python association lists (list-of-tuples) that keep the order of
        the Arrow Map, as in [(key1, value1), (key2, value2), ...].

        With 'lossy' or 'strict', Arrow Map arrays are converted to
        native Python dicts instead.

        With 'lossy', a warning is printed whenever duplicate keys are
        detected, and the last seen value of a duplicate key ends up in
        the Python dictionary.
        With 'strict', detecting duplicate keys raises an exception.

    Returns
    -------
    lst : list
    """
    # Deliberately bypass the bulk path that would otherwise be inherited
    # from ListArray: it does not implement the per-entry key/value
    # handling maps need (association tuples, optional dict conversion
    # with duplicate-key detection). Call the generic scalar-based
    # conversion on Array explicitly instead.
    generic_to_pylist = Array.to_pylist
    return generic_to_pylist(self, maps_as_pydicts=maps_as_pydicts)

@staticmethod
def from_arrays(offsets, keys, items, DataType type=None, MemoryPool pool=None, mask=None):
"""
Expand Down Expand Up @@ -3688,6 +3819,56 @@ cdef class FixedSizeListArray(BaseListArray):
Concrete class for Arrow arrays of a fixed size list data type.
"""

def to_pylist(self, *, maps_as_pydicts=None):
    """
    Convert to a list of native Python objects.

    Parameters
    ----------
    maps_as_pydicts : str, optional, default `None`
        Valid values are `None`, 'lossy', or 'strict'.
        With the default (`None`), Arrow Map arrays are converted to
        Python association lists (list-of-tuples) that keep the order of
        the Arrow Map, as in [(key1, value1), (key2, value2), ...].

        With 'lossy' or 'strict', Arrow Map arrays are converted to
        native Python dicts instead.

        With 'lossy', a warning is printed whenever duplicate keys are
        detected, and the last seen value of a duplicate key ends up in
        the Python dictionary.
        With 'strict', detecting duplicate keys raises an exception.

    Returns
    -------
    lst : list
    """
    cdef CFixedSizeListArray* list_arr = <CFixedSizeListArray*> self.ap
    cdef int64_t row, num_rows, base, lo, hi

    self._assert_cpu()
    num_rows = list_arr.length()
    if num_rows == 0:
        return []

    # Bulk conversion: turn the whole span of child values referenced by
    # this array into Python objects once, then cut one Python list per
    # row out of it, rather than converting row by row.
    base = list_arr.value_offset(0)
    child_values = pyarrow_wrap_array(list_arr.values())
    flat = child_values.slice(
        base, list_arr.value_offset(num_rows) - base
    ).to_pylist(maps_as_pydicts=maps_as_pydicts)

    rows = []
    if list_arr.null_count() == 0:
        # No null rows at all: no validity check needed per row.
        for row in range(num_rows):
            lo = list_arr.value_offset(row) - base
            hi = list_arr.value_offset(row + 1) - base
            rows.append(flat[lo:hi])
        return rows

    # At least one null row: emit None for those, a slice otherwise.
    for row in range(num_rows):
        if list_arr.IsNull(row):
            rows.append(None)
            continue
        lo = list_arr.value_offset(row) - base
        hi = list_arr.value_offset(row + 1) - base
        rows.append(flat[lo:hi])
    return rows

@staticmethod
def from_arrays(values, list_size=None, DataType type=None, mask=None):
"""
Expand Down Expand Up @@ -3974,6 +4155,45 @@ cdef class StringArray(Array):
Concrete class for Arrow arrays of string (or utf8) data type.
"""

def to_pylist(self, *, maps_as_pydicts=None):
    """
    Convert to a list of native Python objects.

    Parameters
    ----------
    maps_as_pydicts : str, optional, default `None`
        Valid values are `None`, 'lossy', or 'strict'.
        This parameter is ignored for non-nested Arrays.

    Returns
    -------
    lst : list
    """
    cdef CStringArray* str_arr = <CStringArray*> self.ap
    cdef int64_t pos, count
    cdef int32_t nbytes
    cdef const uint8_t* raw

    self._assert_cpu()
    count = str_arr.length()
    decoded = []

    # Build each str directly from the bytes in the data buffer; this
    # sidesteps one C++ Scalar plus one Python Scalar wrapper per value
    # (see GH-28694).
    if str_arr.null_count() == 0:
        # No nulls at all: no validity check needed per element.
        for pos in range(count):
            raw = str_arr.GetValue(pos, &nbytes)
            decoded.append(
                cp.PyUnicode_DecodeUTF8(<const char*> raw, nbytes, NULL))
        return decoded

    # At least one null: emit None for those, decode the rest.
    for pos in range(count):
        if str_arr.IsNull(pos):
            decoded.append(None)
            continue
        raw = str_arr.GetValue(pos, &nbytes)
        decoded.append(
            cp.PyUnicode_DecodeUTF8(<const char*> raw, nbytes, NULL))
    return decoded
Comment on lines +4178 to +4195

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

null_count() is a one-time vectorized popcount over the validity bitmap (~n/8 bytes, well under a millisecond for 2M rows), computed and cached per ArrayData. In exchange, the no-null branch skips the per-element IsNull() check entirely. Branching on null_bitmap_data() == NULL instead would save that single scan but degrade the common case of a sliced/combined array that has a bitmap yet contains no nulls in range — that would take the per-element IsNull() path forever. So the current form should be the better trade-off in practice.


@staticmethod
def from_buffers(int length, Buffer value_offsets, Buffer data,
Buffer null_bitmap=None, int null_count=-1,
Expand Down Expand Up @@ -4006,6 +4226,44 @@ cdef class LargeStringArray(Array):
Concrete class for Arrow arrays of large string (or utf8) data type.
"""

def to_pylist(self, *, maps_as_pydicts=None):
    """
    Convert to a list of native Python objects.

    Parameters
    ----------
    maps_as_pydicts : str, optional, default `None`
        Valid values are `None`, 'lossy', or 'strict'.
        This parameter is ignored for non-nested Arrays.

    Returns
    -------
    lst : list
    """
    cdef CLargeStringArray* str_arr = <CLargeStringArray*> self.ap
    cdef int64_t pos, count
    cdef int64_t nbytes
    cdef const uint8_t* raw

    self._assert_cpu()
    count = str_arr.length()
    decoded = []

    # Build each str directly from the bytes in the data buffer rather
    # than going through a Scalar object per value.
    if str_arr.null_count() == 0:
        # No nulls at all: no validity check needed per element.
        for pos in range(count):
            raw = str_arr.GetValue(pos, &nbytes)
            decoded.append(
                cp.PyUnicode_DecodeUTF8(<const char*> raw, nbytes, NULL))
        return decoded

    # At least one null: emit None for those, decode the rest.
    for pos in range(count):
        if str_arr.IsNull(pos):
            decoded.append(None)
            continue
        raw = str_arr.GetValue(pos, &nbytes)
        decoded.append(
            cp.PyUnicode_DecodeUTF8(<const char*> raw, nbytes, NULL))
    return decoded

@staticmethod
def from_buffers(int length, Buffer value_offsets, Buffer data,
Buffer null_bitmap=None, int null_count=-1,
Expand Down
32 changes: 32 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,38 @@ def test_array_getitem_numpy_scalars():
assert arr[np.int32(idx)].as_py() == lst[idx]


def test_to_pylist_bulk_paths():
    """Bulk ``to_pylist`` must agree with element-wise ``as_py``."""
    # GH-50326: list-like and string arrays are converted to Python
    # objects in bulk rather than via one Scalar per element. Whatever the
    # bulk path returns has to be exactly what the per-scalar conversion
    # produces, for the full array as well as for sliced views of it.
    nested_strings = [["a", None], None, [], ["bcd", ""]]
    unicode_strings = ["a", None, "", "\N{GRINNING FACE} \N{SNOWMAN}"]
    cases = [
        # variable-size list of ints, with a null row and an empty row
        pa.array([[1, None, 3], None, [], [4]], type=pa.list_(pa.int32())),
        # list / large_list of strings
        pa.array(nested_strings, type=pa.list_(pa.string())),
        pa.array(nested_strings, type=pa.large_list(pa.large_string())),
        # fixed-size list
        pa.array([[1, None], None, [3, 4]], type=pa.list_(pa.int32(), 2)),
        # list of lists
        pa.array([[[1], [2, None]], None, [None, [3]]],
                 type=pa.list_(pa.list_(pa.int32()))),
        # map
        pa.array([[("k1", 1), ("k2", None)], None, []],
                 type=pa.map_(pa.string(), pa.int32())),
        # plain string / large_string, including non-ASCII text
        pa.array(unicode_strings, type=pa.string()),
        pa.array(unicode_strings, type=pa.large_string()),
        # degenerate inputs: empty array and all-null array
        pa.array([], type=pa.list_(pa.int32())),
        pa.array([None, None], type=pa.list_(pa.string())),
    ]
    for case in cases:
        views = (case, case.slice(1), case.slice(0, 2), case.slice(2))
        for view in views:
            expected = [scalar.as_py() for scalar in view]
            assert view.to_pylist() == expected

    # Values inside numeric lists must stay Python ints/None, never floats
    int_list = pa.array([[1, None, 3]], type=pa.list_(pa.int32()))
    converted = int_list.to_pylist()
    assert converted == [[1, None, 3]]
    assert list(map(type, converted[0])) == [int, type(None), int]


def test_array_slice():
arr = pa.array(range(10))

Expand Down
Loading