Skip to content

Commit ab868e5

Browse files
Remove dtype workaround in Column.create (#22396)
## Description

This PR removes the `pa.null` typecasting workaround in `Column.create`.

## Checklist

- [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
1 parent a35b633 commit ab868e5

7 files changed

Lines changed: 57 additions & 45 deletions

File tree

python/cudf/cudf/core/column/column.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,11 @@
8585
dtype_to_pylibcudf_type,
8686
find_common_type,
8787
get_dtype_of_same_kind,
88+
is_arrow_null_dtype,
8889
is_column_like,
8990
is_mixed_with_object_dtype,
9091
is_pandas_nullable_extension_dtype,
9192
is_pandas_nullable_numpy_dtype,
92-
maybe_normalize_arrow_null,
9393
min_signed_type,
9494
np_dtypes_to_pandas_dtypes,
9595
pyarrow_dtype_to_cudf_dtype,
@@ -378,13 +378,9 @@ def _wrap_and_validate(col: plc.Column, dtype: DtypeObj) -> plc.Column:
378378
"Normalize to np.dtype('O') before calling "
379379
"ColumnBase.create."
380380
)
381-
if isinstance(dtype, pd.ArrowDtype) and pa.types.is_null(
382-
dtype.pyarrow_dtype
383-
):
381+
if is_arrow_null_dtype(dtype) and col.null_count() != col.size():
384382
raise ValueError(
385-
f"dtype {dtype} is a pandas nullable string dtype with all nulls. "
386-
"Normalize to an empty string column with the same pandas StringDtype "
387-
"before calling ColumnBase.create."
383+
f"dtype {dtype} can only be used with all-null columns."
388384
)
389385

390386
dtype_kind = dtype.kind
@@ -961,15 +957,11 @@ def create(
961957
like copy-on-write. When validation is disabled, the caller is responsible for
962958
ensuring that col and its children are already normalized and wrapped.
963959
"""
964-
# For pandas nullable null types (ArrowDtype wrapping pa.null()),
965-
# normalize the column data and dtype before construction.
966-
col, dtype, old_dtype = maybe_normalize_arrow_null(col, dtype)
967-
968960
# Dispatch to the appropriate subclass based on dtype
969961
target_cls = ColumnBase._dispatch_subclass_from_dtype(dtype)
970962
self = target_cls.__new__(target_cls)
971963
self.plc_column = _wrap_and_validate(col, dtype) if validate else col
972-
self._dtype = dtype if old_dtype is None else old_dtype
964+
self._dtype = dtype
973965
self._distinct_count = {}
974966
self._has_nulls = {}
975967
# The set of exposed buffers associated with this column. These buffers must be
@@ -1419,6 +1411,8 @@ def dropna(self) -> Self:
14191411
return self.copy()
14201412

14211413
def to_arrow(self) -> pa.Array:
1414+
if is_arrow_null_dtype(self.dtype):
1415+
return pa.nulls(len(self))
14221416
with self.access(mode="read", scope="internal"):
14231417
return _handle_nulls(
14241418
self.plc_column.to_arrow(
@@ -3323,6 +3317,12 @@ def as_column(
33233317
elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
33243318
if isinstance(arbitrary, pa.NullArray) and dtype is None:
33253319
dtype = np.dtype("object")
3320+
elif is_arrow_null_dtype(dtype):
3321+
if arbitrary.null_count != len(arbitrary):
3322+
raise ValueError(
3323+
f"dtype {dtype} can only be used with all-null data."
3324+
)
3325+
arbitrary = pa.nulls(len(arbitrary))
33263326
column = ColumnBase.from_arrow(arbitrary)
33273327
if nan_as_null is not False:
33283328
column = column.nans_to_nulls()
@@ -3543,6 +3543,11 @@ def as_column(
35433543
elif length < 0:
35443544
raise ValueError(f"{length=} must be >=0.")
35453545

3546+
if is_arrow_null_dtype(dtype):
3547+
if is_na_like(arbitrary):
3548+
return column_empty(length, dtype=dtype)
3549+
pa.scalar(arbitrary, type=dtype.pyarrow_dtype)
3550+
35463551
pa_type = None
35473552
if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
35483553
return as_column(
@@ -3775,6 +3780,13 @@ def as_column(
37753780

37763781
from_pandas = nan_as_null is None or nan_as_null
37773782
if dtype is not None:
3783+
if is_arrow_null_dtype(dtype):
3784+
arbitrary = pa.array(
3785+
arbitrary,
3786+
type=dtype.pyarrow_dtype,
3787+
from_pandas=True,
3788+
)
3789+
return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
37783790
try:
37793791
arbitrary = pa.array(
37803792
arbitrary,

python/cudf/cudf/core/dtype/validators.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def is_dtype_obj_string(obj: DtypeObj) -> bool:
3535
and (
3636
pa.types.is_string(obj.pyarrow_dtype)
3737
or pa.types.is_large_string(obj.pyarrow_dtype)
38+
or pa.types.is_null(obj.pyarrow_dtype)
3839
)
3940
)
4041
)

python/cudf/cudf/core/indexed_frame.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6577,12 +6577,16 @@ def convert_dtypes(
65776577
if dtype_backend == "pyarrow":
65786578
cols = []
65796579
for col in self._columns:
6580-
arrow_dtype = pd.ArrowDtype(
6581-
pa.null()
6582-
if col.null_count == len(col)
6583-
else cudf_dtype_to_pa_type(col.dtype)
6584-
)
6585-
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
6580+
if len(col) == 0 and is_dtype_obj_string(col.dtype):
6581+
cols.append(col)
6582+
continue
6583+
if len(col) != 0 and col.null_count == len(col):
6584+
cols.append(as_column(col, dtype=pd.ArrowDtype(pa.null())))
6585+
else:
6586+
arrow_dtype = pd.ArrowDtype(
6587+
cudf_dtype_to_pa_type(col.dtype)
6588+
)
6589+
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
65866590
return self._from_data_like_self(
65876591
self._data._from_columns_like_self(cols, verify=False)
65886592
)

python/cudf/cudf/pandas/scripts/conftest-patch.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1944,7 +1944,6 @@ def pytest_unconfigure(config):
19441944
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_avoid_block_splitting": "TODO: Add a reason for failure",
19451945
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
19461946
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_backend_no_conversion": "TODO: Add a reason for failure",
1947-
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_dtype_empty_object": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='0') are different",
19481947
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_engine_lines_false": "TODO: Add a reason for failure",
19491948
"tests/frame/methods/test_copy.py::TestCopy::test_copy_consolidates": "TODO: Add a reason for failure",
19501949
"tests/frame/methods/test_count.py::TestDataFrameCount::test_count": "TODO: Add a reason for failure",
@@ -6432,7 +6431,6 @@ def pytest_unconfigure(config):
64326431
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data11-maindtype11-Int8-expected_other11]": "AssertionError: Attributes of Series are different",
64336432
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data2-maindtype2-expected_default2-expected_other2]": "AssertionError: Attributes of Series are different",
64346433
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data6-maindtype6-Int64-expected_other6]": "AssertionError: Attributes of Series are different",
6435-
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_null": "AssertionError: Attributes of Series are different",
64366434
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
64376435
"tests/series/methods/test_diff.py::TestSeriesDiff::test_diff_bool": "AssertionError: Attributes of Series are different",
64386436
"tests/series/methods/test_drop.py::test_drop_exception_raised[drop_labels1-0-KeyError-not found in axis]": "Failed: DID NOT RAISE <class 'KeyError'>",

python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
import pandas as pd
44
import pytest
@@ -45,3 +45,13 @@ def test_convert_dtypes():
4545
with pytest.raises(NotImplementedError):
4646
# category and datetime64[ns] are not nullable
4747
gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True)
48+
49+
50+
def test_convert_dtypes_pyarrow_null():
51+
pytest.importorskip("pyarrow")
52+
data = {"a": [None, None]}
53+
54+
expected = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
55+
result = cudf.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
56+
57+
assert_eq(result.to_pandas(), expected)

python/cudf/cudf/tests/series/methods/test_convert_dtypes.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
22
# SPDX-License-Identifier: Apache-2.0
33
import pandas as pd
44
import pytest
@@ -43,3 +43,13 @@ def test_convert_integer_false_convert_floating_true():
4343
.to_pandas(nullable=True)
4444
)
4545
assert_eq(result, expected)
46+
47+
48+
def test_convert_dtypes_pyarrow_null():
49+
pytest.importorskip("pyarrow")
50+
data = [None, None]
51+
52+
expected = pd.Series(data).convert_dtypes(dtype_backend="pyarrow")
53+
result = cudf.Series(data).convert_dtypes(dtype_backend="pyarrow")
54+
55+
assert_eq(result.to_pandas(), expected)

python/cudf/cudf/utils/dtypes.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -601,29 +601,6 @@ def is_arrow_null_dtype(dtype: DtypeObj) -> bool:
601601
)
602602

603603

604-
def maybe_normalize_arrow_null(
605-
col: plc.Column, dtype: DtypeObj
606-
) -> tuple[plc.Column, DtypeObj, DtypeObj | None]:
607-
"""Normalize ArrowDtype(pa.null()) columns for internal construction.
608-
609-
For pandas nullable null types (ArrowDtype wrapping pa.null()),
610-
the column data is normalized and the dtype is replaced with
611-
``np.dtype("object")`` for internal dispatch. The original dtype
612-
is returned as ``old_dtype`` so it can be stored on the column.
613-
614-
Returns
615-
-------
616-
tuple of (col, dtype, old_dtype)
617-
``old_dtype`` is the original dtype if normalization occurred,
618-
otherwise ``None``.
619-
"""
620-
from cudf.core.column.column import _normalize_types_column
621-
622-
if is_arrow_null_dtype(dtype):
623-
return _normalize_types_column(col), np.dtype("object"), dtype
624-
return col, dtype, None
625-
626-
627604
SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: dict[np.dtype[Any], plc.types.TypeId] = {
628605
np.dtype("int8"): plc.types.TypeId.INT8,
629606
np.dtype("int16"): plc.types.TypeId.INT16,

0 commit comments

Comments (0)