Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion python/cudf/cudf/core/accessors/base_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,12 @@ def _return_or_inplace( # type: ignore[misc]
index=self._parent.index, # type: ignore[union-attr]
attrs=self._parent.attrs, # type: ignore[union-attr]
)
if len(table) == 0:
keys = (
tuple(table.keys()) if isinstance(table, dict) else ()
)
if len(table) == 0 or (
keys and keys == tuple(range(len(keys)))
):
df._data.rangeindex = True
return df
elif isinstance(self._parent, cudf.Series):
Expand Down
44 changes: 42 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2126,6 +2126,19 @@ def _concat(
# Reassign index and column names
if objs[0]._data.multiindex:
out._set_columns_like(objs[0]._data)
elif (
all(obj._data.rangeindex for obj in objs)
and all(
obj._num_columns == 0
or (
obj._column_names[0] == 0
and obj._column_names[-1] == obj._num_columns - 1
)
for obj in objs
)
and tuple(names) == tuple(range(len(names)))
):
out.columns = cudf.RangeIndex(len(names))
else:
out.columns = names
if not ignore_index:
Expand Down Expand Up @@ -2419,6 +2432,9 @@ def _fill_same_ca_attributes(
else:
raise ValueError("other must be a DataFrame or Series.")

if isinstance(column_names_list, pd.MultiIndex):
ca_attributes["multiindex"] = True
ca_attributes["level_names"] = tuple(column_names_list.names)
sorted_dict = {key: operands[key] for key in column_names_list}
return sorted_dict, index, ca_attributes
return operands, index, ca_attributes
Expand Down Expand Up @@ -4805,6 +4821,17 @@ def join(
df.index.name = (
None if self.index.name != other.index.name else self.index.name
)

# Preserve a CategoricalIndex columns axis when both inputs share the
# same categorical dtype on their column labels (matches pandas).
self_pd_cols = self._data.to_pandas_index
other_pd_cols = other._data.to_pandas_index
if (
isinstance(self_pd_cols, pd.CategoricalIndex)
and isinstance(other_pd_cols, pd.CategoricalIndex)
and self_pd_cols.dtype == other_pd_cols.dtype
):
df.columns = self_pd_cols.append(other_pd_cols)
return df

@_performance_tracking
Expand Down Expand Up @@ -6369,7 +6396,15 @@ def quantile(
if len(res) == 0:
res = column_empty(row_count=len(qs), dtype=ser.dtype)
result[k] = res
result = DataFrame._from_data(result, attrs=self.attrs)
result_ca = ColumnAccessor(
result,
multiindex=data_df._data.multiindex,
level_names=data_df._data.level_names,
rangeindex=data_df._data.rangeindex,
label_dtype=data_df._data.label_dtype,
verify=False,
)
result = DataFrame._from_data(result_ca, attrs=self.attrs)

if q_is_number and numeric_only:
result = result.fillna(np.nan).iloc[0]
Expand Down Expand Up @@ -7272,7 +7307,12 @@ def cudf_dtype_from_pydata_dtype(dtype):
for label, dtype in self._dtypes
if cudf_dtype_from_pydata_dtype(dtype) in inclusion
]
return self.loc[:, to_select]
result = self.loc[:, to_select]
if not to_select and self._data.rangeindex:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, `loc` itself would preserve `.rangeindex`, but that can be handled in a separate PR.

# Preserve RangeIndex columns through an empty selection so that
# downstream operations match pandas' column metadata.
result._data.rangeindex = True
return result

@ioutils.doc_to_parquet()
def to_parquet(
Expand Down
44 changes: 41 additions & 3 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1093,6 +1093,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
raise NotImplementedError(
"Passing args to func is currently not supported."
)
from cudf.core.dataframe import DataFrame

column_names, columns, normalized_aggs = self._normalize_aggs(
func, **kwargs
Expand Down Expand Up @@ -1172,12 +1173,25 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
if cast_dtype is not None:
result_col = result_col.astype(cast_dtype)
data[key] = result_col
data = ColumnAccessor(data, multiindex=multilevel)
# Preserve the column axis label-dtype/level_names from the source
# DataFrame so that aggregations such as ``nunique`` keep the column
# axis name (matching pandas behavior).
if (
not multilevel
and self.obj.ndim == 2
and self.obj._data.level_names != (None,)
):
data = ColumnAccessor(
data,
multiindex=False,
level_names=self.obj._data.level_names,
label_dtype=self.obj._data.label_dtype,
)
else:
data = ColumnAccessor(data, multiindex=multilevel)
if not multilevel:
data = data.rename_levels({np.nan: None}, level=0)

from cudf.core.dataframe import DataFrame

result = DataFrame._from_data(data, index=result_index)

if self._sort:
Expand Down Expand Up @@ -2824,6 +2838,8 @@ def _scan_fill(
) -> DataFrameOrSeries:
"""Internal implementation for `ffill` and `bfill`"""
values = self.grouping.values
from cudf.core.dataframe import DataFrame

result = self.obj._from_data(
dict(
zip(
Expand All @@ -2833,6 +2849,28 @@ def _scan_fill(
)
)
)
# Pandas' groupby.ffill/bfill builds the result columns via a ``take``
# on the input columns, which converts integer-valued column labels
# to object dtype. Reproduce that here so column metadata matches.
if (
isinstance(result, DataFrame)
and isinstance(self.obj, DataFrame)
and result._num_columns < self.obj._num_columns
):
source_pd_cols = self.obj._data.to_pandas_index
if (
source_pd_cols.dtype.kind in {"i", "u"}
or source_pd_cols.dtype == object
):
indexer = source_pd_cols.get_indexer(result._column_names)
if not (indexer == -1).any():
taken = source_pd_cols.take(indexer)
if (
not isinstance(taken, pd.MultiIndex)
and taken.dtype != object
):
taken = taken.astype(object)
result.columns = taken
return self._mimic_pandas_order(result)

def ffill(self, limit: int | None = None):
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4022,11 +4022,13 @@ def _reindex(

index = index if index is not None else df.index

label_dtype = None
if column_names is None:
names = list(df._column_names)
level_names = self._data.level_names
multiindex = self._data.multiindex
rangeindex = self._data.rangeindex
label_dtype = self._data.label_dtype
elif isinstance(column_names, (pd.Index, cudf.Index)):
if isinstance(column_names, (pd.MultiIndex, cudf.MultiIndex)):
multiindex = True
Expand All @@ -4043,6 +4045,12 @@ def _reindex(
rangeindex = isinstance(
column_names, (pd.RangeIndex, cudf.RangeIndex)
)
if not rangeindex:
label_dtype = (
column_names.dtype
if isinstance(column_names, pd.Index)
else column_names.to_pandas().dtype
)
level_names = tuple(column_names.names)
else:
names = column_names
Expand Down Expand Up @@ -4075,6 +4083,7 @@ def _reindex(
multiindex=multiindex,
level_names=level_names,
rangeindex=rangeindex,
label_dtype=label_dtype,
),
index=index,
attrs=self.attrs,
Expand Down
37 changes: 35 additions & 2 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_list_like, is_scalar
from cudf.api.types import is_list_like, is_scalar, is_string_dtype
from cudf.core.column import (
ColumnBase,
as_column,
Expand All @@ -25,6 +25,7 @@
DEFAULT_STRING_DTYPE,
SIZE_TYPE_DTYPE,
find_common_type,
is_pandas_nullable_extension_dtype,
min_unsigned_type,
)

Expand Down Expand Up @@ -129,7 +130,10 @@ def _normalize_series_and_dataframe(
name = obj.name
if name is None:
if axis == 0:
name = 0
# Preserve "unnamed" semantics so the resulting frame has
# a RangeIndex columns object (matching pandas).
objs[idx] = obj.to_frame()
continue
else:
name = sr_name
sr_name += 1
Expand Down Expand Up @@ -1063,12 +1067,41 @@ def pivot(
index_data = index_data.get_level_values(0)
else:
index_data = cudf.Index(index_data)
# An entirely empty input pivots to an empty result. Pandas uses the
# default ``object`` dtype for the resulting index axis in that case;
# mirror this so index metadata (dtype/inferred_type) matches.
if (
len(data) == 0
and not isinstance(index_data, cudf.MultiIndex)
and is_pandas_nullable_extension_dtype(index_data.dtype)
and is_dtype_obj_string(index_data.dtype)
):
index_data = cudf.Index(
pd.Index([], name=index_data.name, dtype=object)
)

column_data = data.loc[:, columns]
# When `columns` is a scalar but the source DataFrame has a MultiIndex on
# the row axis, ``loc`` may return a 2-D selection in cuDF. Treat the
# selection as 1-D so we end up with a flat Index of column labels.
if is_scalar(columns) and column_data.ndim == 2:
column_data = column_data.iloc[:, 0]
if column_data.ndim == 2:
column_data = cudf.MultiIndex.from_frame(column_data)
else:
column_data = cudf.Index(column_data)
# An entirely empty input pivots to an empty result. Pandas reports the
# default ``object`` dtype for the resulting columns axis in that case;
# mirror this so column metadata (dtype/inferred_type) matches.
if (
len(data) == 0
and not isinstance(column_data, cudf.MultiIndex)
and is_pandas_nullable_extension_dtype(column_data.dtype)
and is_dtype_obj_string(column_data.dtype)
):
column_data = cudf.Index(
pd.Index([], name=column_data.name, dtype=object)
)

# Create a DataFrame composed of columns from both
# columns and index
Expand Down
15 changes: 13 additions & 2 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,22 @@ def tolist(self) -> None:
def _to_frame(self, name: Hashable, index: Index | None) -> DataFrame:
    """Helper function for Series.to_frame, Index.to_frame

    Parameters
    ----------
    name : Hashable
        Label for the single resulting column. When it is the
        ``no_default`` sentinel, ``self.name`` is used instead, falling
        back to ``0`` if ``self.name`` is None.
    index : Index or None
        Passed through unchanged as the ``index`` of the new DataFrame.

    Returns
    -------
    DataFrame
        A one-column frame wrapping ``self._column``.
    """
    # Only the implicit ``0`` label (no name passed and the object itself
    # is unnamed) sets the accessor's ``rangeindex`` flag; an explicitly
    # passed ``name=0`` does not.
    unnamed_default = name is no_default and self.name is None

    col_name: Hashable
    if name is not no_default:
        col_name = name
    elif unnamed_default:
        col_name = 0
    else:
        col_name = self.name

    ca = ColumnAccessor(
        {col_name: self._column},
        # A tuple label is flagged as a multi-level column key.
        multiindex=isinstance(col_name, tuple),
        rangeindex=unnamed_default,
        verify=False,
    )
    # TODO: Avoid accessing DataFrame from the top level namespace
    return cudf.DataFrame._from_data(ca, index=index)

Expand Down
Loading
Loading