From 762c961faa733e8aca41debf6485dccf80436fe1 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 20 Sep 2020 22:40:19 +0200 Subject: [PATCH 1/8] Accept coordinates with MultiIndex (solve issue #3008) --- xarray/core/coordinates.py | 43 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 846e4044a2c..7bc7fec4695 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -13,6 +13,7 @@ cast, ) +import numpy as np import pandas as pd from . import formatting, indexing @@ -106,9 +107,47 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: (dim,) = ordered_dims return self._data.get_index(dim) # type: ignore else: + from pandas.core.arrays.categorical import factorize_from_iterable + indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore - names = list(ordered_dims) - return pd.MultiIndex.from_product(indexes, names=names) + + # compute the sizes of the repeat and tile for the cartesian product + # (taken from pandas.core.reshape.util) + lenX = np.fromiter((len(index) for index in indexes), dtype=np.intp) + cumprodX = np.cumproduct(lenX) + + if cumprodX[-1] != 0: + # sizes of the repeats + b = cumprodX[-1] / cumprodX + else: + # if any factor is empty, the cartesian product is empty + b = np.zeros_like(cumprodX) + + # sizes of the tiles + a = np.roll(cumprodX, 1) + a[0] = 1 + + # loop over the indexes + # for each MultiIndex or Index compute the cartesian product of the codes + + code_list = [] + level_list = [] + names = [] + + for i, index in enumerate(indexes): + if isinstance(index, pd.MultiIndex): + codes, levels = index.codes, index.levels + else: + code, level = factorize_from_iterable(index) + codes = [code] + levels = [level] + + # compute the cartesian product + code_list += [np.tile(np.repeat(code, b[i]), a[i]) for code in codes] + level_list += levels + names += index.names + + return 
pd.MultiIndex(level_list, code_list, names=names) def update(self, other: Mapping[Hashable, Any]) -> None: other_vars = getattr(other, "variables", other) From 1472c92ae7b238b34927b9b2d721a806aa5f2b15 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 20 Sep 2020 22:43:08 +0200 Subject: [PATCH 2/8] formatting --- xarray/core/coordinates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 7bc7fec4695..c5cf5558f25 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -141,7 +141,7 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: code, level = factorize_from_iterable(index) codes = [code] levels = [level] - + # compute the cartesian product code_list += [np.tile(np.repeat(code, b[i]), a[i]) for code in codes] level_list += levels From 31a698a2b7932c54872ba87fa28ef8d3b8990e2b Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 21 Sep 2020 21:02:23 +0200 Subject: [PATCH 3/8] change to pd.factorize and improve variable names --- xarray/core/coordinates.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index c5cf5558f25..5c27cdb6e7b 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -107,8 +107,6 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: (dim,) = ordered_dims return self._data.get_index(dim) # type: ignore else: - from pandas.core.arrays.categorical import factorize_from_iterable - indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore # compute the sizes of the repeat and tile for the cartesian product @@ -118,14 +116,14 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: if cumprodX[-1] != 0: # sizes of the repeats - b = cumprodX[-1] / cumprodX + repeat_counts = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is 
empty - b = np.zeros_like(cumprodX) + repeat_counts = np.zeros_like(cumprodX) # sizes of the tiles - a = np.roll(cumprodX, 1) - a[0] = 1 + tile_counts = np.roll(cumprodX, 1) + tile_counts[0] = 1 # loop over the indexes # for each MultiIndex or Index compute the cartesian product of the codes @@ -138,12 +136,12 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: if isinstance(index, pd.MultiIndex): codes, levels = index.codes, index.levels else: - code, level = factorize_from_iterable(index) + code, level = pd.factorize(index) codes = [code] levels = [level] # compute the cartesian product - code_list += [np.tile(np.repeat(code, b[i]), a[i]) for code in codes] + code_list += [np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) for code in codes] level_list += levels names += index.names From 4ea0b5337af7bb720b442833896a3b342a38c3aa Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 21 Sep 2020 21:02:45 +0200 Subject: [PATCH 4/8] add a test for multiindex --- xarray/tests/test_dataarray.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5e0fe13ea52..08d7e3ccdd4 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3520,6 +3520,21 @@ def test_to_dataframe(self): with raises_regex(ValueError, "unnamed"): arr.to_dataframe() + def test_to_dataframe_multiindex(self): + # regression test for #3008 + arr_np = np.random.randn(4, 3) + + mindex = pd.MultiIndex.from_product([[1, 2], list('ab')], names=['A', 'B']) + + arr = DataArray(arr_np, [('MI', mindex), ('C', [5, 6, 7])], name="foo") + + actual = arr.to_dataframe() + assert_array_equal(actual['foo'].values, arr_np.flatten()) + assert_array_equal(actual.index.names, list('ABC')) + assert_array_equal(actual.index.levels[0], [1, 2]) + assert_array_equal(actual.index.levels[1], ['a', 'b']) + assert_array_equal(actual.index.levels[2], [5, 6, 7]) + def 
test_to_pandas_name_matches_coordinate(self): # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From 1febc77f734b92ba17e6e30f4a0bab0c24222716 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Mon, 21 Sep 2020 21:11:52 +0200 Subject: [PATCH 5/8] formatting --- xarray/tests/test_dataarray.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 08d7e3ccdd4..e2f8fc1c938 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3523,16 +3523,16 @@ def test_to_dataframe(self): def test_to_dataframe_multiindex(self): # regression test for #3008 arr_np = np.random.randn(4, 3) - - mindex = pd.MultiIndex.from_product([[1, 2], list('ab')], names=['A', 'B']) - arr = DataArray(arr_np, [('MI', mindex), ('C', [5, 6, 7])], name="foo") + mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + + arr = DataArray(arr_np, [("MI", mindex), ("C", [5, 6, 7])], name="foo") actual = arr.to_dataframe() - assert_array_equal(actual['foo'].values, arr_np.flatten()) - assert_array_equal(actual.index.names, list('ABC')) + assert_array_equal(actual["foo"].values, arr_np.flatten()) + assert_array_equal(actual.index.names, list("ABC")) assert_array_equal(actual.index.levels[0], [1, 2]) - assert_array_equal(actual.index.levels[1], ['a', 'b']) + assert_array_equal(actual.index.levels[1], ["a", "b"]) assert_array_equal(actual.index.levels[2], [5, 6, 7]) def test_to_pandas_name_matches_coordinate(self): From aa88f796ac51a2942aabeb36e9c12762f82bda37 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 27 Sep 2020 09:57:59 +0200 Subject: [PATCH 6/8] change variable names and formatting --- xarray/core/coordinates.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 5c27cdb6e7b..08b7629bed9 100644 --- 
a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -111,18 +111,20 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: # compute the sizes of the repeat and tile for the cartesian product # (taken from pandas.core.reshape.util) - lenX = np.fromiter((len(index) for index in indexes), dtype=np.intp) - cumprodX = np.cumproduct(lenX) + index_lengths = np.fromiter( + (len(index) for index in indexes), dtype=np.intp + ) + cumprod_lengths = np.cumproduct(index_lengths) - if cumprodX[-1] != 0: + if cumprod_lengths[-1] != 0: # sizes of the repeats - repeat_counts = cumprodX[-1] / cumprodX + repeat_counts = cumprod_lengths[-1] / cumprod_lengths else: # if any factor is empty, the cartesian product is empty - repeat_counts = np.zeros_like(cumprodX) + repeat_counts = np.zeros_like(cumprod_lengths) # sizes of the tiles - tile_counts = np.roll(cumprodX, 1) + tile_counts = np.roll(cumprod_lengths, 1) tile_counts[0] = 1 # loop over the indexes @@ -141,7 +143,10 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: levels = [level] # compute the cartesian product - code_list += [np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) for code in codes] + code_list += [ + np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) + for code in codes + ] level_list += levels names += index.names From 144c618d6f19d31ca0593e53c3886d312837f983 Mon Sep 17 00:00:00 2001 From: Ghislain Picard Date: Sun, 27 Sep 2020 09:58:17 +0200 Subject: [PATCH 7/8] add test for to_dataframe with 0 length index --- xarray/tests/test_dataarray.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e2f8fc1c938..fcfef8ae217 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3535,6 +3535,18 @@ def test_to_dataframe_multiindex(self): assert_array_equal(actual.index.levels[1], ["a", "b"]) assert_array_equal(actual.index.levels[2], [5, 
6, 7]) + def test_to_dataframe_0length(self): + # regression test for #3008 + arr_np = np.random.randn(4, 0) + + mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + + arr = DataArray(arr_np, [("MI", mindex), ("C", [])], name="foo") + + actual = arr.to_dataframe() + assert len(actual) == 0 + assert_array_equal(actual.index.names, list("ABC")) + def test_to_pandas_name_matches_coordinate(self): # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From eb34d2b09577df2561666ecf0b178ae89fd8780e Mon Sep 17 00:00:00 2001 From: Keewis Date: Sat, 20 Feb 2021 00:01:04 +0100 Subject: [PATCH 8/8] add a whats-new.rst [skip-ci] --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 22902963d9c..f1a8e851b4b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -141,6 +141,9 @@ Bug fixes a float64 array (:issue:`4898`, :pull:`4911`). By `Blair Bonnett <https://github.com/bcbnz>`_. - Fix decoding of vlen strings using h5py versions greater than 3.0.0 with h5netcdf backend (:issue:`4570`, :pull:`4893`). By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. +- Allow converting :py:class:`Dataset` or :py:class:`DataArray` objects with a ``MultiIndex`` + and at least one other dimension to a ``pandas`` object (:issue:`3008`, :pull:`4442`). + By `ghislainp <https://github.com/ghislainp>`_. Documentation ~~~~~~~~~~~~~