Skip to content

Commit 68683d5

Browse files
authored
Convert column children computation from lazy to eager (rapidsai#20953)
This change makes the children match the data and mask with eager computation, which in turn makes it easier to reason about when data accesses can happen and will simplify the rewriting of other internals around pylibcudf and buffers. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: rapidsai#20953
1 parent ed2a5fb commit 68683d5

5 files changed

Lines changed: 102 additions & 23 deletions

File tree

python/cudf/cudf/core/column/categorical.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,11 @@ def _process_values_for_isin(
132132
# Convert values to categorical dtype like self
133133
return self, column.as_column(values, dtype=self.dtype)
134134

135-
@property
136-
def children(self) -> tuple[NumericalColumn]:
135+
def _recompute_children(self) -> None:
137136
if self.offset == 0 and self.size == self.base_size:
138-
return super().children # type: ignore[return-value]
139-
if self._children is None:
137+
# Optimization: for non-sliced columns, children == base_children (just references)
138+
self._children = self.base_children # type: ignore[assignment]
139+
else:
140140
# Pass along size, offset, null_count from __init__
141141
# which doesn't necessarily match the attributes of plc_column
142142
# Use base_children[0]'s plc_column as it has the actual codes data
@@ -148,6 +148,9 @@ def children(self) -> tuple[NumericalColumn]:
148148
exposed=False,
149149
)
150150
self._children = (child,)
151+
152+
@property
153+
def children(self) -> tuple[NumericalColumn]:
151154
return self._children
152155

153156
@property

python/cudf/cudf/core/column/column.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,7 @@ def __init__(
327327
# triggering the destruction of the exposed buffers.
328328
self._exposed_buffers: set[Buffer] = set()
329329
self._recompute_data()
330+
self._recompute_children()
330331

331332
def _get_children_from_pylibcudf_column(
332333
self,
@@ -500,6 +501,7 @@ def set_base_mask(self, value: None | Buffer) -> None:
500501
)
501502

502503
self._clear_cache()
504+
self._recompute_children()
503505

504506
def _clear_cache(self) -> None:
505507
self._distinct_count.clear()
@@ -557,23 +559,8 @@ def base_children(self) -> tuple[ColumnBase, ...]:
557559

558560
@property
559561
def children(self) -> tuple[ColumnBase, ...]:
560-
if self.offset == 0 and self.size == self.base_size:
561-
self._children = self.base_children
562-
if self._children is None:
563-
if not self.base_children:
564-
self._children = ()
565-
else:
566-
# Compute children from the column view (children factoring self.size)
567-
children = ColumnBase.from_pylibcudf(
568-
self.plc_column.copy()
569-
).base_children
570-
dtypes = (
571-
base_child.dtype for base_child in self.base_children
572-
)
573-
self._children = tuple(
574-
child._with_type_metadata(dtype)
575-
for child, dtype in zip(children, dtypes, strict=True)
576-
)
562+
"""Return the offset-aware children columns."""
563+
assert self._children is not None
577564
return self._children
578565

579566
def set_base_children(self, value: tuple[ColumnBase, ...]) -> None:
@@ -584,7 +571,6 @@ def set_base_children(self, value: tuple[ColumnBase, ...]) -> None:
584571
if any(not isinstance(child, ColumnBase) for child in value):
585572
raise TypeError("All children must be Columns.")
586573

587-
self._children = None
588574
self._base_children = value
589575

590576
def _recompute_data(self) -> None:
@@ -600,6 +586,21 @@ def _recompute_data(self) -> None:
600586
end = start + self.size * self.dtype.itemsize
601587
self._data = self.base_data[start:end]
602588

589+
def _recompute_children(self) -> None:
590+
"""Recompute the offset-aware children columns."""
591+
# Check for empty base_children first to avoid accessing properties on empty collections
592+
if not self.base_children:
593+
self._children = ()
594+
elif self.offset == 0 and self.size == self.base_size:
595+
# Optimization: for non-sliced columns, children == base_children (just references)
596+
self._children = self.base_children
597+
else:
598+
# Slice each base child using the parent's offset and size
599+
self._children = tuple(
600+
base_child.slice(self.offset, self.offset + self.size)
601+
for base_child in self.base_children
602+
)
603+
603604
def _mimic_inplace(
604605
self, other_col: Self, inplace: bool = False
605606
) -> None | Self:
@@ -614,8 +615,8 @@ def _mimic_inplace(
614615
self.plc_column = other_col.plc_column
615616
self._base_children = other_col._base_children
616617
self._recompute_data()
618+
self._recompute_children()
617619
self._mask = None
618-
self._children = None
619620
self._clear_cache()
620621
return None
621622
else:

python/cudf/cudf/core/column/lists.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,40 @@ def _get_children_from_pylibcudf_column(
7676
children[1]._with_type_metadata(dtype.element_type),
7777
)
7878

79+
def _recompute_children(self) -> None:
80+
"""Recompute the offset-aware children columns with proper type metadata."""
81+
if not self.base_children:
82+
self._children = ()
83+
elif self.offset == 0 and self.size == self.base_size:
84+
# Optimization: for non-sliced columns, children == base_children
85+
self._children = self.base_children # type: ignore[assignment]
86+
else:
87+
# List columns have special structure:
88+
# - Child 0 (offsets): size = parent.size + 1, slice from [offset, offset+size+1)
89+
# - Child 1 (values): slice based on the offset values
90+
offsets_child = self.base_children[0]
91+
values_child = self.base_children[1]
92+
93+
# Slice offsets: get size+1 offsets starting from self.offset
94+
sliced_offsets = offsets_child.slice(
95+
self.offset, self.offset + self.size + 1
96+
)
97+
98+
# Get the range of values to slice: from first offset to last offset
99+
# The offsets tell us where in the values array each list starts/ends
100+
first_offset = offsets_child.element_indexing(self.offset)
101+
last_offset = offsets_child.element_indexing(
102+
self.offset + self.size
103+
)
104+
105+
# Slice the values child to only include the values we need
106+
sliced_values = values_child.slice(first_offset, last_offset)
107+
108+
# Adjust offsets to be relative to the sliced values (subtract first_offset)
109+
adjusted_offsets = sliced_offsets - first_offset
110+
111+
self._children = (adjusted_offsets, sliced_values) # type: ignore[assignment]
112+
79113
def _prep_pandas_compat_repr(self) -> StringColumn | Self:
80114
"""
81115
Preprocess Column to be compatible with pandas repr, namely handling nulls.

python/cudf/cudf/core/column/string.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,33 @@ def _recompute_data(self) -> None:
211211
assert self.base_data is not None
212212
self._data = self.base_data[self.start_offset : self.end_offset]
213213

214+
def _recompute_children(self) -> None:
215+
if not self.base_children:
216+
self._children = ()
217+
elif (
218+
self.offset == 0
219+
and len(self.base_children) > 0
220+
and self.size == self.base_children[0].size - 1
221+
):
222+
# Optimization: for non-sliced columns, children == base_children
223+
self._children = self.base_children # type: ignore[assignment]
224+
else:
225+
# String columns have one child: the offsets column
226+
# For a string column with size N, the offsets child has size N+1
227+
228+
# Step 1: Slice the offsets column to get the correct size
229+
offsets_child = self.base_children[0]
230+
sliced_offsets = offsets_child.slice(
231+
self.offset, self.offset + self.size + 1
232+
)
233+
234+
# Step 2: Adjust the offsets to be relative to the sliced data
235+
# by subtracting the first offset value (start_offset)
236+
chars_offset = self.start_offset
237+
adjusted_offsets = sliced_offsets - chars_offset
238+
239+
self._children = (adjusted_offsets,) # type: ignore[assignment]
240+
214241
def all(self, skipna: bool = True) -> bool:
215242
if skipna and self.null_count == self.size:
216243
return True

python/cudf/cudf/core/column/struct.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,20 @@ def _get_children_from_pylibcudf_column(
7979
)
8080
)
8181

82+
def _recompute_children(self) -> None:
83+
"""Recompute the offset-aware children columns with proper type metadata."""
84+
if not self.base_children:
85+
self._children = ()
86+
elif self.offset == 0 and self.size == self.base_size:
87+
# Optimization: for non-sliced columns, children == base_children
88+
self._children = self.base_children # type: ignore[assignment]
89+
else:
90+
# Slice each child using the parent's offset and size
91+
self._children = tuple( # type: ignore[assignment]
92+
base_child.slice(self.offset, self.offset + self.size)
93+
for base_child in self.base_children
94+
)
95+
8296
def _prep_pandas_compat_repr(self) -> StringColumn | Self:
8397
"""
8498
Preprocess Column to be compatible with pandas repr, namely handling nulls.

0 commit comments

Comments
 (0)