diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b4313e2da74..a21aad0c21f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7176,7 +7176,10 @@ def to_pandas(self) -> pd.Series | pd.DataFrame: def _to_dataframe(self, ordered_dims: Mapping[Any, int]): from xarray.core.extension_array import PandasExtensionArray - columns_in_order = [k for k in self.variables if k not in self.dims] + # All and only non-index arrays (whether data or coordinates) should + # become columns in the output DataFrame. Excluding indexes rather + # than dims handles the case of a MultiIndex along a single dimension. + columns_in_order = [k for k in self.variables if k not in self.xindexes] non_extension_array_columns = [ k for k in columns_in_order diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index bb5fd0edb58..d580d873a2b 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3524,6 +3524,18 @@ def test_to_dataframe_multiindex(self) -> None: assert_array_equal(index_pd.levels[1], ["a", "b"]) assert_array_equal(index_pd.levels[2], [5, 6, 7]) + # test converting a DataArray MultiIndexed along a single dimension + mindex_single = pd.MultiIndex.from_product( + [list(range(6)), list("ab")], names=["A", "B"] + ) + + arr_multi_single = DataArray( + arr_np.flatten(), [("MI", mindex_single)], dims="MI", name="test" + ) + actual_df = arr_multi_single.to_dataframe() + expected_df = arr_multi_single.to_series().to_frame() + assert expected_df.equals(actual_df) + def test_to_dataframe_0length(self) -> None: # regression test for #3008 arr_np = np.random.randn(4, 0) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 4af250a0fde..a91d3fac3dd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -5120,7 +5120,6 @@ def test_to_and_from_dataframe(self) -> None: # from_dataframe attempts to broadcast across because it doesn't know better, so cat must be 
converted ds["cat"] = (("x", "y"), np.stack((ds["cat"].to_numpy(), ds["cat"].to_numpy()))) assert_identical(ds.assign_coords(x=[0, 1]), Dataset.from_dataframe(actual)) - # Check multiindex reordering new_order = ["x", "y"] # revert broadcasting fix above for 1d arrays @@ -5154,6 +5153,41 @@ def test_to_and_from_dataframe(self) -> None: ): ds.to_dataframe(dim_order=invalid_order) + # test a case with a MultiIndex along a single dimension + data_dict = dict( + x=[1, 2, 1, 2, 1], y=["a", "a", "b", "b", "b"], z=[5, 10, 15, 20, 25] + ) + data_dict_w_dims = {k: ("single_dim", v) for k, v in data_dict.items()} + + # Dataset multi-indexed along "single_dim" by "x" and "y" + ds = Dataset(data_dict_w_dims).set_coords(["x", "y"]).set_xindex(["x", "y"]) + expected = pd.DataFrame(data_dict).set_index(["x", "y"]) + actual = ds.to_dataframe() + assert expected.equals(actual) + # should be possible to reset index, as there should be no duplication + # between index and columns, and dataframes should still be equal + assert expected.reset_index().equals(actual.reset_index()) + + # MultiIndex deduplication should not affect other coordinates. + mindex_single = pd.MultiIndex.from_product( + [list(range(6)), list("ab")], names=["A", "B"] + ) + ds = DataArray( + range(12), [("MI", mindex_single)], dims="MI", name="test" + )._to_dataset_whole() + ds.coords["C"] = "a single value" + ds.coords["D"] = ds.coords["A"] ** 2 + expected = pd.DataFrame( + dict( + test=range(12), + C="a single value", + D=[0, 0, 1, 1, 4, 4, 9, 9, 16, 16, 25, 25], + ) + ).set_index(mindex_single) + actual = ds.to_dataframe() + assert expected.equals(actual) + assert expected.reset_index().equals(actual.reset_index()) + # check pathological cases df = pd.DataFrame([1]) actual_ds = Dataset.from_dataframe(df)