From 7f7a12a0fe18c89a22b6f4bf3f9055e5d00b1b6e Mon Sep 17 00:00:00 2001 From: haison Date: Tue, 12 Mar 2019 09:29:27 -0700 Subject: [PATCH] add test case set value with NaN --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/index.pyx | 25 ++++++- pandas/core/indexes/multi.py | 87 ++++++++++++++++++++-- pandas/tests/indexes/multi/test_missing.py | 70 +++++++++++++++++ 4 files changed, 175 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d186fdfe0f322..a9ac687f5e2f5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -199,6 +199,7 @@ Missing - Fixed misleading exception message in :meth:`Series.missing` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`). - Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`) +- Fixed bug in :class:`MultiIndex` where adding new keys to an index containing ``NaN`` copied values from the ``NaN`` row instead of leaving them missing (:issue:`22247`) - MultiIndex diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 8cea529fbb07e..5b42697548dff 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -591,7 +591,7 @@ cdef class BaseMultiIndexCodesEngine: level, then locating (the integer representation of) codes.
""" def __init__(self, object levels, object labels, - ndarray[uint64_t, ndim=1] offsets): + ndarray[uint64_t, ndim=1] offsets, hasnans): """ Parameters ---------- @@ -605,6 +605,7 @@ cdef class BaseMultiIndexCodesEngine: self.levels = levels self.offsets = offsets + self.hasnans = hasnans # Transform labels in a single array, and add 1 so that we are working # with positive integers (-1 for NaN becomes 0): @@ -657,6 +658,14 @@ cdef class BaseMultiIndexCodesEngine: indexer = indexer[order] else: indexer = self._base.get_indexer(self, lab_ints) + # HashTable return same value for 'NaN' and new value + # simple fix by take maximum value from array and plus once + len = indexer.size - 1 + if len + 1 > 1 and self.hasnans: + check_dup = np.any(self._isin(indexer[0:len], + indexer[len:indexer.size])) + if check_dup and indexer[len]==-1: + indexer[len] = np.max(indexer) + 1 return indexer @@ -673,8 +682,18 @@ cdef class BaseMultiIndexCodesEngine: # Transform indices into single integer: lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) - - return self._base.get_loc(self, lab_int) + ret = [] + try: + ret = self._base.get_loc(self, lab_int) + except KeyError: + if self.hasnans: + # as NaN value, we have 0 bit represent for codes + # hacking here by add position of NaN in levels. 
+ lab_int += len(self.levels[len(self.levels)-1]) + ret = self._base.get_loc(self, np.uint64(lab_int)) + else: + raise KeyError(lab_int) + return ret def get_indexer_non_unique(self, object target): # This needs to be overridden just because the default one works on diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 616c17cd16f9a..805b29ab57f53 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -17,7 +17,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, - pandas_dtype) + is_string_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import array_equivalent, isna @@ -74,8 +74,30 @@ def _codes_to_ints(self, codes): # Single key return np.bitwise_or.reduce(codes) + codes = np.bitwise_or.reduce(codes, axis=1) + if codes.size > 1 and self.hasnans: + check_dup = np.any(algos.isin(codes[0:codes.size - 1], + codes[codes.size - 1:codes.size])) + if check_dup: + codes[codes.size - 1] = np.max(codes) + 1 + # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) + return codes + + def _isin(self, comps, values): + """ + Compute the isin boolean array + Note just wraping algorithms.isin function to avoid fail of isort + Parameters + ---------- + comps : array-like + values : array-like + + Returns + ------- + boolean array same length as comps + """ + return algos.isin(comps, values) class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, @@ -116,8 +138,32 @@ def _codes_to_ints(self, codes): # Single key return np.bitwise_or.reduce(codes) + codes = np.bitwise_or.reduce(codes, axis=1) + # Shift return same value for 'NaN' and new value + # simple fix by take maximum value from array and plus once + if codes.size > 1 and self.hasnans: + check_dup = 
np.any(algos.isin(codes[0:codes.size - 1], + codes[codes.size - 1:codes.size])) + if check_dup: + codes[codes.size - 1] = np.max(codes) + 1 + # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) + return codes + + def _isin(self, comps, values): + """ + Compute the isin boolean array + Note just wraping algorithms.isin function to avoid fail of isort + Parameters + ---------- + comps : array-like + values : array-like + + Returns + ------- + boolean array same length as comps + """ + return algos.isin(comps, values) class MultiIndex(Index): @@ -208,6 +254,7 @@ class MultiIndex(Index): _levels = FrozenList() _codes = FrozenList() _comparables = ['names'] + _isna = False rename = Index.set_names # -------------------------------------------------------------------- @@ -702,6 +749,34 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, self._codes = new_codes self._tuples = None self._reset_cache() + self._hasnans() + + def _hasnans(self): + """ + Return if I have any nans + """ + is_not_right_level = False + try: + self._verify_integrity() + except ValueError: + is_not_right_level = True + + if is_not_right_level: + return + + if (self.values.size > 0 and is_string_dtype(self.values)): + flat = [] + # flatten tuple to 1-D array for searching 'NaN' + for row in self.values: + flat.extend(row) + # algorithms.isin can not pass test_has_duplicates_overflow + with warnings.catch_warnings(): + warnings.simplefilter(action='ignore', category=FutureWarning) + try: + self._isna = np.array(np.where( + np.hstack(flat) == 'nan')).size > 0 + except UnicodeDecodeError: + self._isna = False def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): @@ -1161,8 +1236,10 @@ def _engine(self): # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: - return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return 
MultiIndexUIntEngine(self.levels, self.codes, offsets) + return MultiIndexPyIntEngine(self.levels, + self.codes, offsets, self._isna) + return MultiIndexUIntEngine(self.levels, + self.codes, offsets, self._isna) @property def values(self): diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index cd4adfa96ef54..338a8b9b03753 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -127,3 +127,73 @@ def test_nan_stays_float(): assert pd.isna(df0.index.get_level_values(1)).all() # the following failed in 0.14.1 assert pd.isna(dfm.index.get_level_values(1)[:-1]).all() + + +def test_nan_multi_index(): + # GH 22247 + # When using the MultiIndex features of pandas, when an `np.nan` + # is in the index when new values are added to the DF then the + # values are not `np.nan`, but copied from the `np.nan` row. + df = pd.DataFrame( + [ + ['A', np.nan, 1.23, 4.56], + ['A', 'G', 1.23, 4.56], + ['A', 'D', 9.87, 10.54], + ], + columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'], + ) + df.set_index(['pivot_0', 'pivot_1'], inplace=True) + pivot_0 = 'A' + pivot_1_values = ['D', 'E', 'F'] + for value in pivot_1_values: + if value not in df.index.get_level_values('pivot_1').tolist(): + df.at[(pivot_0, value), 'col_2'] = 0.0 + + assert df.loc[('A', 'F')]['col_2'] == 0.0 # Pass + # Fails: value of 1.23 from the first row in the df is copied + # This behavior shows for all versions v0.23.x, however is fine for 0.22.0. + assert pd.isna(df.loc[('A', 'F')]['col_1']) + + +def test_nan_set_value_multi_index(): + # GH 22247 + # When using the MultiIndex features of pandas, when an `np.nan` + # is in the index when new values are added to the DF then the + # values are not `np.nan`, but copied from the `np.nan` row. 
+ df = pd.DataFrame( + [ + ['A', 'G', 1.23, 4.56], + ['A', 'D', 9.87, 10.54], + ], + columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'], + ) + df.set_index(['pivot_0', 'pivot_1'], inplace=True) + df.at[('A', 'E'), 'col_2'] = 0.0 + df.at[('A', 'F'), 'col_2'] = 0.0 + # Fails: raise exception + # This behavior shows for all versions v0.23.x, however is fine for 0.22.0. + df.at[('A', np.nan), 'col_2'] = 0.0 + + assert df.loc[('A', np.nan)]['col_2'] == 0.0 + assert pd.isna(df.loc[('A', np.nan)]['col_1']) + + +def test_nan_sigle_index(): + # GH 22247 + df = pd.DataFrame( + [ + [np.nan, 1.23, 4.56], + ['G', 1.23, 4.56], + ['D', 9.87, 10.54], + ], + columns=['pivot_0', 'col_1', 'col_2'], + ) + df.set_index(['pivot_0'], inplace=True) + + pivot_0_values = ['D', 'E', 'F'] + for value in pivot_0_values: + if value not in df.index.get_level_values('pivot_0').tolist(): + df.at[(value), 'col_2'] = 0.0 + + assert df.loc[('F')]['col_2'] == 0.0 + assert pd.isna(df.loc[('F')]['col_1'])