Fix multiindex selection (#2621)

fujiisoup · shoyer · commit b5059a538ee2 · 2018-12-24T07:37:26.000-08:00
* Fix multiindex selection

* Support pandas0.19

* a bugfix

* Do remove_unused_levels only once in unstack.

* import algos

* Remove unused import

* Adopt local import
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -83,7 +83,8 @@ Bug fixes
   By `Martin Raspaud <https://github.com/mraspaud>`_.
 - Fix parsing of ``_Unsigned`` attribute set by OPENDAP servers. (:issue:`2583`).
   By `Deepak Cherian <https://github.com/dcherian>`_
-
+- Fix MultiIndex selection to update label and level (:issue:`2619`).
+  By `Keisuke Fujii <https://github.com/fujiisoup>`_.
 
 .. _whats-new.0.11.0:
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -14,7 +14,7 @@
 
 from . import (
     alignment, computation, duck_array_ops, formatting, groupby, indexing, ops,
-    resample, rolling, utils)
+    pdcompat, resample, rolling, utils)
 from .. import conventions
 from ..coding.cftimeindex import _parse_array_of_cftime_strings
 from .alignment import align
@@ -2425,6 +2425,12 @@ def stack(self, dimensions=None, **dimensions_kwargs):
 
     def _unstack_once(self, dim):
         index = self.get_index(dim)
+        # GH2619. For MultiIndex, we need to call remove_unused.
+        if LooseVersion(pd.__version__) >= "0.20":
+            index = index.remove_unused_levels()
+        else:  # for pandas 0.19
+            index = pdcompat.remove_unused_levels(index)
+
         full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
 
         # take a shortcut in case the MultiIndex was not modified.
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -159,6 +159,10 @@ def convert_label_indexer(index, label, index_name='', method=None,
             indexer, new_index = index.get_loc_level(
                 tuple(label.values()), level=tuple(label.keys()))
 
+            # GH2619. Raise a KeyError if nothing is chosen
+            if indexer.dtype.kind == 'b' and indexer.sum() == 0:
+                raise KeyError('{} not found'.format(label))
+
     elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex):
         if _is_nested_tuple(label):
             indexer = index.get_locs(label)
@@ -168,7 +172,6 @@ def convert_label_indexer(index, label, index_name='', method=None,
             indexer, new_index = index.get_loc_level(
                 label, level=list(range(len(label)))
             )
-
     else:
         label = (label if getattr(label, 'ndim', 1) > 1  # vectorized-indexing
                  else _asarray_tuplesafe(label))
diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py
@@ -0,0 +1,119 @@
+# The remove_unused_levels defined here was adapted from the source code
+# defined in pandas/core/indexes/multi.py
+
+# For reference, here is a copy of the pandas copyright notice:
+
+# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
+# All rights reserved.
+
+# Copyright (c) 2008-2011 AQR Capital Management, LLC
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+
+#     * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+
+#     * Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials provided
+#        with the distribution.
+
+#     * Neither the name of the copyright holder nor the names of any
+#        contributors may be used to endorse or promote products derived
+#        from this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import numpy as np
+import pandas as pd
+
+
+# for pandas 0.19
+def remove_unused_levels(self):
+    """
+    Create a new MultiIndex from ``self`` with unused levels removed.
+
+    Backport of ``MultiIndex.remove_unused_levels`` (pandas >= 0.20) for
+    pandas 0.19. Unused levels are level values that no label refers to.
+    The result keeps the same ``.values`` and ordering as ``self`` and is
+    ``.equals()`` to it; if nothing is unused it is just a shallow copy.
+
+    Returns
+    -------
+    MultiIndex
+
+    Examples
+    --------
+    >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
+    >>> i[2:]
+    MultiIndex(levels=[[0, 1], ['a', 'b']],
+               labels=[[1, 1], [0, 1]])
+
+    The 0 from the first level is not represented and can be removed:
+    >>> remove_unused_levels(i[2:])
+    MultiIndex(levels=[[1], ['a', 'b']],
+               labels=[[0, 0], [0, 1]])
+    """
+    import pandas.core.algorithms as algos
+
+    new_levels = []
+    new_labels = []
+
+    changed = False
+    for lev, lab in zip(self.levels, self.labels):
+
+        # Since few levels are typically unused, bincount() is more
+        # efficient than unique() - however it only accepts non-negative
+        # values (hence the +1 shift of the labels) and drops order:
+        uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
+        has_na = int(len(uniques) and (uniques[0] == -1))
+
+        if len(uniques) != len(lev) + has_na:
+            # Some values of this level are unused: rebuild level and labels
+            changed = True
+
+            # Recalculate uniques, now preserving order of first appearance.
+            # Can easily be cythonized by exploiting the already existing
+            # "uniques" and stop parsing "lab" when all items are found:
+            uniques = algos.unique(lab)
+            if has_na:
+                na_idx = np.where(uniques == -1)[0]
+                # Swap so that -1 (the NA label) sits in first position:
+                uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]
+
+            # labels get mapped from uniques to 0:len(uniques);
+            # -1 (if present) indexes the last slot of label_mapping
+            label_mapping = np.zeros(len(lev) + has_na)
+            # ... and is assigned the value -1 (i.e. 0 - has_na):
+            label_mapping[uniques] = np.arange(len(uniques)) - has_na
+            # NOTE(review): np.zeros is float, so lab becomes float - confirm
+            lab = label_mapping[lab]
+
+            # new levels keep only the used values (skipping the -1 entry)
+            lev = lev.take(uniques[has_na:])
+
+        new_levels.append(lev)
+        new_labels.append(lab)
+
+    result = self._shallow_copy()
+
+    if changed:
+        result._reset_identity()
+        result._set_levels(new_levels, validate=False)
+        result._set_labels(new_labels, validate=False)
+
+    return result
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -1027,6 +1027,20 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False,
         assert_identical(mdata.sel(x={'one': 'a', 'two': 1}),
                          mdata.sel(one='a', two=1))
 
+    def test_selection_multiindex(self):
+        # GH2619. isel drops x >= 4: sel(x=5) must fail, unstack must omit it.
+        ds = xr.DataArray(np.arange(40).reshape(8, 5), dims=['x', 'y'],
+                          coords={'x': np.arange(8), 'y': np.arange(5)})
+        ds = ds.stack(xy=['x', 'y'])
+        ds_isel = ds.isel(xy=ds['x'] < 4)
+        with pytest.raises(KeyError):
+            ds_isel.sel(x=5)
+
+        actual = ds_isel.unstack()
+        expected = ds.reset_index('xy').isel(xy=ds['x'] < 4)
+        expected = expected.set_index(xy=['x', 'y']).unstack()
+        assert_identical(expected, actual)
+
     def test_virtual_default_coords(self):
         array = DataArray(np.zeros((5,)), dims='x')
         expected = DataArray(range(5), dims='x', name='x')