|
85 | 85 | dtype_to_pylibcudf_type, |
86 | 86 | find_common_type, |
87 | 87 | get_dtype_of_same_kind, |
| 88 | + is_arrow_null_dtype, |
88 | 89 | is_column_like, |
89 | 90 | is_mixed_with_object_dtype, |
90 | 91 | is_pandas_nullable_extension_dtype, |
91 | 92 | is_pandas_nullable_numpy_dtype, |
92 | | - maybe_normalize_arrow_null, |
93 | 93 | min_signed_type, |
94 | 94 | np_dtypes_to_pandas_dtypes, |
95 | 95 | pyarrow_dtype_to_cudf_dtype, |
@@ -378,13 +378,9 @@ def _wrap_and_validate(col: plc.Column, dtype: DtypeObj) -> plc.Column: |
378 | 378 | "Normalize to np.dtype('O') before calling " |
379 | 379 | "ColumnBase.create." |
380 | 380 | ) |
381 | | - if isinstance(dtype, pd.ArrowDtype) and pa.types.is_null( |
382 | | - dtype.pyarrow_dtype |
383 | | - ): |
| 381 | + if is_arrow_null_dtype(dtype) and col.null_count() != col.size(): |
384 | 382 | raise ValueError( |
385 | | - f"dtype {dtype} is a pandas nullable string dtype with all nulls. " |
386 | | - "Normalize to an empty string column with the same pandas StringDtype " |
387 | | - "before calling ColumnBase.create." |
| 383 | + f"dtype {dtype} can only be used with all-null columns." |
388 | 384 | ) |
389 | 385 |
|
390 | 386 | dtype_kind = dtype.kind |
@@ -961,15 +957,11 @@ def create( |
961 | 957 | like copy-on-write. When validation is disabled, the caller is responsible for |
962 | 958 | ensuring that col and its children are already normalized and wrapped. |
963 | 959 | """ |
964 | | - # For pandas nullable null types (ArrowDtype wrapping pa.null()), |
965 | | - # normalize the column data and dtype before construction. |
966 | | - col, dtype, old_dtype = maybe_normalize_arrow_null(col, dtype) |
967 | | - |
968 | 960 | # Dispatch to the appropriate subclass based on dtype |
969 | 961 | target_cls = ColumnBase._dispatch_subclass_from_dtype(dtype) |
970 | 962 | self = target_cls.__new__(target_cls) |
971 | 963 | self.plc_column = _wrap_and_validate(col, dtype) if validate else col |
972 | | - self._dtype = dtype if old_dtype is None else old_dtype |
| 964 | + self._dtype = dtype |
973 | 965 | self._distinct_count = {} |
974 | 966 | self._has_nulls = {} |
975 | 967 | # The set of exposed buffers associated with this column. These buffers must be |
@@ -1419,6 +1411,8 @@ def dropna(self) -> Self: |
1419 | 1411 | return self.copy() |
1420 | 1412 |
|
1421 | 1413 | def to_arrow(self) -> pa.Array: |
| 1414 | + if is_arrow_null_dtype(self.dtype): |
| 1415 | + return pa.nulls(len(self)) |
1422 | 1416 | with self.access(mode="read", scope="internal"): |
1423 | 1417 | return _handle_nulls( |
1424 | 1418 | self.plc_column.to_arrow( |
@@ -3323,6 +3317,12 @@ def as_column( |
3323 | 3317 | elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): |
3324 | 3318 | if isinstance(arbitrary, pa.NullArray) and dtype is None: |
3325 | 3319 | dtype = np.dtype("object") |
| 3320 | + elif is_arrow_null_dtype(dtype): |
| 3321 | + if arbitrary.null_count != len(arbitrary): |
| 3322 | + raise ValueError( |
| 3323 | + f"dtype {dtype} can only be used with all-null data." |
| 3324 | + ) |
| 3325 | + arbitrary = pa.nulls(len(arbitrary)) |
3326 | 3326 | column = ColumnBase.from_arrow(arbitrary) |
3327 | 3327 | if nan_as_null is not False: |
3328 | 3328 | column = column.nans_to_nulls() |
@@ -3543,6 +3543,11 @@ def as_column( |
3543 | 3543 | elif length < 0: |
3544 | 3544 | raise ValueError(f"{length=} must be >=0.") |
3545 | 3545 |
|
| 3546 | + if is_arrow_null_dtype(dtype): |
| 3547 | + if is_na_like(arbitrary): |
| 3548 | + return column_empty(length, dtype=dtype) |
| 3549 | + pa.scalar(arbitrary, type=dtype.pyarrow_dtype) |
| 3550 | + |
3546 | 3551 | pa_type = None |
3547 | 3552 | if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype): |
3548 | 3553 | return as_column( |
@@ -3775,6 +3780,13 @@ def as_column( |
3775 | 3780 |
|
3776 | 3781 | from_pandas = nan_as_null is None or nan_as_null |
3777 | 3782 | if dtype is not None: |
| 3783 | + if is_arrow_null_dtype(dtype): |
| 3784 | + arbitrary = pa.array( |
| 3785 | + arbitrary, |
| 3786 | + type=dtype.pyarrow_dtype, |
| 3787 | + from_pandas=True, |
| 3788 | + ) |
| 3789 | + return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) |
3778 | 3790 | try: |
3779 | 3791 | arbitrary = pa.array( |
3780 | 3792 | arbitrary, |
|
0 commit comments