Skip to content

Commit ab868e5

Browse files
Remove dtype workaround in Column.create (#22396)
## Description

This PR removes the `pa.null` typecasting workaround in `Column.create`.

## Checklist

- [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
1 parent a35b633 commit ab868e5

7 files changed

Lines changed: 57 additions & 45 deletions

File tree

python/cudf/cudf/core/column/column.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,11 @@
8585
dtype_to_pylibcudf_type,
8686
find_common_type,
8787
get_dtype_of_same_kind,
88+
is_arrow_null_dtype,
8889
is_column_like,
8990
is_mixed_with_object_dtype,
9091
is_pandas_nullable_extension_dtype,
9192
is_pandas_nullable_numpy_dtype,
92-
maybe_normalize_arrow_null,
9393
min_signed_type,
9494
np_dtypes_to_pandas_dtypes,
9595
pyarrow_dtype_to_cudf_dtype,
@@ -378,13 +378,9 @@ def _wrap_and_validate(col: plc.Column, dtype: DtypeObj) -> plc.Column:
378378
"Normalize to np.dtype('O') before calling "
379379
"ColumnBase.create."
380380
)
381-
if isinstance(dtype, pd.ArrowDtype) and pa.types.is_null(
382-
dtype.pyarrow_dtype
383-
):
381+
if is_arrow_null_dtype(dtype) and col.null_count() != col.size():
384382
raise ValueError(
385-
f"dtype {dtype} is a pandas nullable string dtype with all nulls. "
386-
"Normalize to an empty string column with the same pandas StringDtype "
387-
"before calling ColumnBase.create."
383+
f"dtype {dtype} can only be used with all-null columns."
388384
)
389385

390386
dtype_kind = dtype.kind
@@ -961,15 +957,11 @@ def create(
961957
like copy-on-write. When validation is disabled, the caller is responsible for
962958
ensuring that col and its children are already normalized and wrapped.
963959
"""
964-
# For pandas nullable null types (ArrowDtype wrapping pa.null()),
965-
# normalize the column data and dtype before construction.
966-
col, dtype, old_dtype = maybe_normalize_arrow_null(col, dtype)
967-
968960
# Dispatch to the appropriate subclass based on dtype
969961
target_cls = ColumnBase._dispatch_subclass_from_dtype(dtype)
970962
self = target_cls.__new__(target_cls)
971963
self.plc_column = _wrap_and_validate(col, dtype) if validate else col
972-
self._dtype = dtype if old_dtype is None else old_dtype
964+
self._dtype = dtype
973965
self._distinct_count = {}
974966
self._has_nulls = {}
975967
# The set of exposed buffers associated with this column. These buffers must be
@@ -1419,6 +1411,8 @@ def dropna(self) -> Self:
14191411
return self.copy()
14201412

14211413
def to_arrow(self) -> pa.Array:
1414+
if is_arrow_null_dtype(self.dtype):
1415+
return pa.nulls(len(self))
14221416
with self.access(mode="read", scope="internal"):
14231417
return _handle_nulls(
14241418
self.plc_column.to_arrow(
@@ -3323,6 +3317,12 @@ def as_column(
33233317
elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
33243318
if isinstance(arbitrary, pa.NullArray) and dtype is None:
33253319
dtype = np.dtype("object")
3320+
elif is_arrow_null_dtype(dtype):
3321+
if arbitrary.null_count != len(arbitrary):
3322+
raise ValueError(
3323+
f"dtype {dtype} can only be used with all-null data."
3324+
)
3325+
arbitrary = pa.nulls(len(arbitrary))
33263326
column = ColumnBase.from_arrow(arbitrary)
33273327
if nan_as_null is not False:
33283328
column = column.nans_to_nulls()
@@ -3543,6 +3543,11 @@ def as_column(
35433543
elif length < 0:
35443544
raise ValueError(f"{length=} must be >=0.")
35453545

3546+
if is_arrow_null_dtype(dtype):
3547+
if is_na_like(arbitrary):
3548+
return column_empty(length, dtype=dtype)
3549+
pa.scalar(arbitrary, type=dtype.pyarrow_dtype)
3550+
35463551
pa_type = None
35473552
if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
35483553
return as_column(
@@ -3775,6 +3780,13 @@ def as_column(
37753780

37763781
from_pandas = nan_as_null is None or nan_as_null
37773782
if dtype is not None:
3783+
if is_arrow_null_dtype(dtype):
3784+
arbitrary = pa.array(
3785+
arbitrary,
3786+
type=dtype.pyarrow_dtype,
3787+
from_pandas=True,
3788+
)
3789+
return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
37783790
try:
37793791
arbitrary = pa.array(
37803792
arbitrary,

python/cudf/cudf/core/dtype/validators.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def is_dtype_obj_string(obj: DtypeObj) -> bool:
3535
and (
3636
pa.types.is_string(obj.pyarrow_dtype)
3737
or pa.types.is_large_string(obj.pyarrow_dtype)
38+
or pa.types.is_null(obj.pyarrow_dtype)
3839
)
3940
)
4041
)

python/cudf/cudf/core/indexed_frame.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6577,12 +6577,16 @@ def convert_dtypes(
65776577
if dtype_backend == "pyarrow":
65786578
cols = []
65796579
for col in self._columns:
6580-
arrow_dtype = pd.ArrowDtype(
6581-
pa.null()
6582-
if col.null_count == len(col)
6583-
else cudf_dtype_to_pa_type(col.dtype)
6584-
)
6585-
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
6580+
if len(col) == 0 and is_dtype_obj_string(col.dtype):
6581+
cols.append(col)
6582+
continue
6583+
if len(col) != 0 and col.null_count == len(col):
6584+
cols.append(as_column(col, dtype=pd.ArrowDtype(pa.null())))
6585+
else:
6586+
arrow_dtype = pd.ArrowDtype(
6587+
cudf_dtype_to_pa_type(col.dtype)
6588+
)
6589+
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
65866590
return self._from_data_like_self(
65876591
self._data._from_columns_like_self(cols, verify=False)
65886592
)

python/cudf/cudf/pandas/scripts/conftest-patch.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1944,7 +1944,6 @@ def pytest_unconfigure(config):
19441944
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_avoid_block_splitting": "TODO: Add a reason for failure",
19451945
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
19461946
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_backend_no_conversion": "TODO: Add a reason for failure",
1947-
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_dtype_empty_object": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='0') are different",
19481947
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_engine_lines_false": "TODO: Add a reason for failure",
19491948
"tests/frame/methods/test_copy.py::TestCopy::test_copy_consolidates": "TODO: Add a reason for failure",
19501949
"tests/frame/methods/test_count.py::TestDataFrameCount::test_count": "TODO: Add a reason for failure",
@@ -6432,7 +6431,6 @@ def pytest_unconfigure(config):
64326431
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data11-maindtype11-Int8-expected_other11]": "AssertionError: Attributes of Series are different",
64336432
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data2-maindtype2-expected_default2-expected_other2]": "AssertionError: Attributes of Series are different",
64346433
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data6-maindtype6-Int64-expected_other6]": "AssertionError: Attributes of Series are different",
6435-
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_null": "AssertionError: Attributes of Series are different",
64366434
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
64376435
"tests/series/methods/test_diff.py::TestSeriesDiff::test_diff_bool": "AssertionError: Attributes of Series are different",
64386436
"tests/series/methods/test_drop.py::test_drop_exception_raised[drop_labels1-0-KeyError-not found in axis]": "Failed: DID NOT RAISE <class 'KeyError'>",

python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
import pandas as pd
44
import pytest
@@ -45,3 +45,13 @@ def test_convert_dtypes():
4545
with pytest.raises(NotImplementedError):
4646
# category and datetime64[ns] are not nullable
4747
gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True)
48+
49+
50+
def test_convert_dtypes_pyarrow_null():
51+
pytest.importorskip("pyarrow")
52+
data = {"a": [None, None]}
53+
54+
expected = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
55+
result = cudf.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
56+
57+
assert_eq(result.to_pandas(), expected)

python/cudf/cudf/tests/series/methods/test_convert_dtypes.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
import pandas as pd
44
import pytest
@@ -43,3 +43,13 @@ def test_convert_integer_false_convert_floating_true():
4343
.to_pandas(nullable=True)
4444
)
4545
assert_eq(result, expected)
46+
47+
48+
def test_convert_dtypes_pyarrow_null():
49+
pytest.importorskip("pyarrow")
50+
data = [None, None]
51+
52+
expected = pd.Series(data).convert_dtypes(dtype_backend="pyarrow")
53+
result = cudf.Series(data).convert_dtypes(dtype_backend="pyarrow")
54+
55+
assert_eq(result.to_pandas(), expected)

python/cudf/cudf/utils/dtypes.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -601,29 +601,6 @@ def is_arrow_null_dtype(dtype: DtypeObj) -> bool:
601601
)
602602

603603

604-
def maybe_normalize_arrow_null(
605-
col: plc.Column, dtype: DtypeObj
606-
) -> tuple[plc.Column, DtypeObj, DtypeObj | None]:
607-
"""Normalize ArrowDtype(pa.null()) columns for internal construction.
608-
609-
For pandas nullable null types (ArrowDtype wrapping pa.null()),
610-
the column data is normalized and the dtype is replaced with
611-
``np.dtype("object")`` for internal dispatch. The original dtype
612-
is returned as ``old_dtype`` so it can be stored on the column.
613-
614-
Returns
615-
-------
616-
tuple of (col, dtype, old_dtype)
617-
``old_dtype`` is the original dtype if normalization occurred,
618-
otherwise ``None``.
619-
"""
620-
from cudf.core.column.column import _normalize_types_column
621-
622-
if is_arrow_null_dtype(dtype):
623-
return _normalize_types_column(col), np.dtype("object"), dtype
624-
return col, dtype, None
625-
626-
627604
SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: dict[np.dtype[Any], plc.types.TypeId] = {
628605
np.dtype("int8"): plc.types.TypeId.INT8,
629606
np.dtype("int16"): plc.types.TypeId.INT16,

0 commit comments

Comments (0)