Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7176,7 +7176,10 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
from xarray.core.extension_array import PandasExtensionArray

columns_in_order = [k for k in self.variables if k not in self.dims]
# All and only non-index arrays (whether data or coordinates) should
# become columns in the output DataFrame. Excluding indexes rather
# than dims handles the case of a MultiIndex along a single dimension.
columns_in_order = [k for k in self.variables if k not in self.xindexes]
non_extension_array_columns = [
k
for k in columns_in_order
Expand Down
12 changes: 12 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3524,6 +3524,18 @@ def test_to_dataframe_multiindex(self) -> None:
assert_array_equal(index_pd.levels[1], ["a", "b"])
assert_array_equal(index_pd.levels[2], [5, 6, 7])

# test converting a DataArray MultiIndexed along a single dimension to a dataframe
mindex_single = pd.MultiIndex.from_product(
[list(range(6)), list("ab")], names=["A", "B"]
)

arr_multi_single = DataArray(
arr_np.flatten(), [("MI", mindex_single)], dims="MI", name="test"
)
actual_df = arr_multi_single.to_dataframe()
expected_df = arr_multi_single.to_series().to_frame()
assert expected_df.equals(actual_df)

def test_to_dataframe_0length(self) -> None:
# regression test for #3008
arr_np = np.random.randn(4, 0)
Expand Down
36 changes: 35 additions & 1 deletion xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5120,7 +5120,6 @@ def test_to_and_from_dataframe(self) -> None:
# from_dataframe broadcasts "cat" across both ("x", "y") dims because it doesn't know better, so cat must be converted to 2-D to match
ds["cat"] = (("x", "y"), np.stack((ds["cat"].to_numpy(), ds["cat"].to_numpy())))
assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual))

# Check multiindex reordering
new_order = ["x", "y"]
# revert broadcasting fix above for 1d arrays
Expand Down Expand Up @@ -5154,6 +5153,41 @@ def test_to_and_from_dataframe(self) -> None:
):
ds.to_dataframe(dim_order=invalid_order)

# test a case with a MultiIndex along a single dimension
data_dict = dict(
x=[1, 2, 1, 2, 1], y=["a", "a", "b", "b", "b"], z=[5, 10, 15, 20, 25]
)
data_dict_w_dims = {k: ("single_dim", v) for k, v in data_dict.items()}

# Dataset multi-indexed along "single_dim" by "x" and "y"
ds = Dataset(data_dict_w_dims).set_coords(["x", "y"]).set_xindex(["x", "y"])
expected = pd.DataFrame(data_dict).set_index(["x", "y"])
actual = ds.to_dataframe()
assert expected.equals(actual)
# should be possible to reset index, as there should be no duplication
# between index and columns, and dataframes should still be equal
assert expected.reset_index().equals(actual.reset_index())

# MultiIndex deduplication should not affect other coordinates.
mindex_single = pd.MultiIndex.from_product(
[list(range(6)), list("ab")], names=["A", "B"]
)
ds = DataArray(
range(12), [("MI", mindex_single)], dims="MI", name="test"
)._to_dataset_whole()
ds.coords["C"] = "a single value"
ds.coords["D"] = ds.coords["A"] ** 2
expected = pd.DataFrame(
dict(
test=range(12),
C="a single value",
D=[0, 0, 1, 1, 4, 4, 9, 9, 16, 16, 25, 25],
)
).set_index(mindex_single)
actual = ds.to_dataframe()
assert expected.equals(actual)
assert expected.reset_index().equals(actual.reset_index())

# check pathological cases
df = pd.DataFrame([1])
actual_ds = Dataset.from_dataframe(df)
Expand Down
Loading