diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 09b504cac5ed4..d14bc917de356 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -63,6 +63,91 @@ levels ` documentation section. left.merge(right, on=['key1', 'key2']) +.. _whatsnew_0220.enhancements.ran_inf: + +handle ``inf`` values properly when ``NaN`` are present +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, ``inf`` elements were assigned ``NaN`` as their ranks. Now ranks are calculated properly. (:issue:`6945`) + +.. ipython:: python + + In [9]: s = pd.Series([-np.inf, 0, 1, np.nan, np.inf]) + + In [10]: s + Out[10]: + 0 -inf + 1 0.000000 + 2 1.000000 + 3 NaN + 4 inf + dtype: float64 + +Previous Behavior: + +.. code-block:: ipython + + In [11]: s.rank() + Out[11]: + 0 1.0 + 1 2.0 + 2 3.0 + 3 NaN + 4 NaN + dtype: float64 + +Current Behavior: + +.. ipython:: python + + In [4]: s.rank() + Out[4]: + 0 1.0 + 1 2.0 + 2 3.0 + 3 NaN + 4 4.0 + dtype: float64 + +Furthermore, previously, if you ranked ``inf`` or ``-inf`` values together with ``NaN`` values, the calculation did not distinguish ``NaN`` from infinity when using the 'top' or 'bottom' argument. + +.. ipython:: python + + In [14]: s = pd.Series([np.nan, np.nan, -np.inf, -np.inf]) + + In [15]: s + Out[15]: + 0 NaN + 1 NaN + 2 -inf + 3 -inf + dtype: float64 + +Previous Behavior: + +.. code-block:: ipython + + In [15]: s.rank(na_option='top') + Out[15]: + 0 2.5 + 1 2.5 + 2 2.5 + 3 2.5 + dtype: float64 + +Current Behavior: + +.. ipython:: python + + In [4]: s.rank(na_option='top') + Out[4]: + 0 1.5 + 1 1.5 + 2 3.5 + 3 3.5 + dtype: float64 + + .. 
_whatsnew_0220.enhancements.other: Other Enhancements @@ -77,6 +162,7 @@ Other Enhancements - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) + .. _whatsnew_0220.api_breaking: Backwards incompatible API changes @@ -220,7 +306,6 @@ Reshaping ^^^^^^^^^ - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) - - - diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 61d543cd7303a..02dc81fa36212 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -64,22 +64,24 @@ class Infinity(object): """ provide a positive Infinity comparision method for ranking """ __lt__ = lambda self, other: False - __le__ = lambda self, other: self is other - __eq__ = lambda self, other: self is other - __ne__ = lambda self, other: self is not other - __gt__ = lambda self, other: self is not other - __ge__ = lambda self, other: True + __le__ = lambda self, other: isinstance(other, Infinity) + __eq__ = lambda self, other: isinstance(other, Infinity) + __ne__ = lambda self, other: not isinstance(other, Infinity) + __gt__ = lambda self, other: (not isinstance(other, Infinity) and + not missing.checknull(other)) + __ge__ = lambda self, other: not missing.checknull(other) class NegInfinity(object): """ provide a negative Infinity comparision method for ranking """ - __lt__ = lambda self, other: self is not other - __le__ = lambda self, other: True - __eq__ = lambda self, other: self is other - __ne__ = lambda self, other: self is not other + __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and + not missing.checknull(other)) + __le__ = lambda self, other: not missing.checknull(other) + __eq__ = lambda self, other: isinstance(other, NegInfinity) + __ne__ = lambda self, 
other: not isinstance(other, NegInfinity) __gt__ = lambda self, other: False - __ge__ = lambda self, other: self is other + __ge__ = lambda self, other: isinstance(other, NegInfinity) @cython.wraparound(False) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 78a67d2e40be2..0e46530e20d1c 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -27,7 +27,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), {{if dtype == 'object'}} -def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average', +def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, na_option='keep', pct=False): {{else}} @@ -40,7 +40,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 {{if dtype == 'object'}} ndarray sorted_data, values @@ -50,6 +50,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, ndarray[float64_t] ranks ndarray[int64_t] argsorted + ndarray[np.uint8_t, cast=True] sorted_mask {{if dtype == 'uint64'}} {{ctype}} val @@ -60,6 +61,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 + bint isnan float count = 0.0 tiebreak = tiebreakers[ties_method] @@ -76,12 +78,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, keep_na = na_option == 'keep' - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - else: - nan_value = {{neg_nan_value}} - {{if dtype == 'object'}} mask = missing.isnaobj(values) {{elif dtype == 'float64'}} @@ -90,56 +86,69 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, mask = values == iNaT {{endif}} + # double sort first by mask and then by values to ensure nan 
values are + # either at the beginning or the end. mask/(~mask) controls padding at + # tail or the head + {{if dtype != 'uint64'}} + if ascending ^ (na_option == 'top'): + nan_value = {{pos_nan_value}} + order = (values, mask) + else: + nan_value = {{neg_nan_value}} + order = (values, ~mask) np.putmask(values, mask, nan_value) + {{else}} + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) {{endif}} n = len(values) ranks = np.empty(n, dtype='f8') {{if dtype == 'object'}} + try: - _as = values.argsort() + _as = np.lexsort(keys=order) except TypeError: - if not retry: - raise - - valid_locs = (~mask).nonzero()[0] - ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0, - ties_method=ties_method, - ascending=ascending)) - np.putmask(ranks, mask, np.nan) - return ranks + # lexsort on object array will raise TypeError for numpy version + # earlier than 1.11.0. Use argsort with order argument instead. + _dt = [('values', 'O'), ('mask', '?')] + _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt) + _as = np.argsort(_values, kind='mergesort', order=('mask', 'values')) {{else}} if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here - _as = values.argsort(kind='mergesort') + _as = np.lexsort(keys=order) if not ascending: tiebreak = TIEBREAK_FIRST_DESCENDING else: - _as = values.argsort() + _as = np.lexsort(keys=order) {{endif}} if not ascending: _as = _as[::-1] sorted_data = values.take(_as) + sorted_mask = mask.take(_as) + _indices = order[1].take(_as).nonzero()[0] + non_na_idx = _indices[0] if len(_indices) > 0 else -1 argsorted = _as.astype('i8') {{if dtype == 'object'}} for i in range(n): sum_ranks += i + 1 dups += 1 - + isnan = sorted_mask[i] val = util.get_value_at(sorted_data, i) - if (val is nan_value) and keep_na: + if isnan and keep_na: ranks[argsorted[i]] = nan continue - count += 1.0 if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val)): + are_diff(util.get_value_at(sorted_data, i + 1), val) 
or + i == non_na_idx - 1): if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups @@ -164,18 +173,19 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, for i in range(n): sum_ranks += i + 1 dups += 1 - val = sorted_data[i] {{if dtype != 'uint64'}} - if (val == nan_value) and keep_na: + isnan = sorted_mask[i] + if isnan and keep_na: ranks[argsorted[i]] = nan continue {{endif}} count += 1.0 - if i == n - 1 or sorted_data[i + 1] != val: + if (i == n - 1 or sorted_data[i + 1] != val or + i == non_na_idx - 1): if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = sum_ranks / dups diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index e45acdedbd2a9..baa71217e617d 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -14,6 +14,8 @@ from pandas.util.testing import assert_series_equal import pandas.util.testing as tm from pandas.tests.series.common import TestData +from pandas._libs.tslib import iNaT +from pandas._libs.algos import Infinity, NegInfinity class TestSeriesRank(TestData): @@ -195,16 +197,48 @@ def test_rank_signature(self): s.rank(method='average') pytest.raises(ValueError, s.rank, 'average') - def test_rank_inf(self): - pytest.skip('DataFrame.rank does not currently rank ' - 'np.inf and -np.inf properly') - - values = np.array( - [-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, - 2, 40, np.inf], dtype='float64') + @pytest.mark.parametrize('contents,dtype', [ + ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, + 2, 40, np.inf], + 'float64'), + ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10, + 2, 40, np.inf], + 'float32'), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], + 'uint8'), + pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000, + 1e10, np.iinfo(np.int64).max], + 'int64', + 
marks=pytest.mark.xfail(reason='''iNaT is equivalent to + minimum value of dtype + int64 pending issue + #16674'''), + ), + ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()], + 'object') + ]) + def test_rank_inf(self, contents, dtype): + dtype_na_map = { + 'float64': np.nan, + 'float32': np.nan, + 'int64': iNaT, + 'object': None + } + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly. + # This is for testing whether rank calculation is affected + # when values are intertwined with nan values. + values = np.array(contents, dtype=dtype) + exp_order = np.array(range(len(values)), dtype='float64') + 1.0 + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + # shuffle the testing array and expected results in the same way random_order = np.random.permutation(len(values)) iseries = Series(values[random_order]) - exp = Series(random_order + 1.0, dtype='float64') + exp = Series(exp_order[random_order], dtype='float64') iranks = iseries.rank() assert_series_equal(iranks, exp) @@ -225,6 +259,39 @@ def _check(s, expected, method='average'): series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) + def test_rank_tie_methods_on_infs_nans(self): + dtypes = [('object', None, Infinity(), NegInfinity()), + ('float64', np.nan, np.inf, -np.inf)] + chunk = 3 + disabled = set([('object', 'first')]) + + def _check(s, expected, method='average', na_option='keep'): + result = s.rank(method=method, na_option=na_option) + tm.assert_series_equal(result, Series(expected, dtype='float64')) + + exp_ranks = { + 'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]), + 'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]), + 'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]), + 'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]), + 'dense': 
([1, 1, 1], [2, 2, 2], [3, 3, 3]) + } + na_options = ('top', 'bottom', 'keep') + for dtype, na_value, pos_inf, neg_inf in dtypes: + in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk + iseries = Series(in_arr, dtype=dtype) + for method, na_opt in product(exp_ranks.keys(), na_options): + ranks = exp_ranks[method] + if (dtype, method) in disabled: + continue + if na_opt == 'top': + order = ranks[1] + ranks[0] + ranks[2] + elif na_opt == 'bottom': + order = ranks[0] + ranks[2] + ranks[1] + else: + order = ranks[0] + [np.nan] * chunk + ranks[1] + _check(iseries, order, method, na_opt) + def test_rank_methods_series(self): pytest.importorskip('scipy.stats.special') rankdata = pytest.importorskip('scipy.stats.rankdata') diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index bf244deec9ffc..821706894e767 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1269,11 +1269,15 @@ def test_infinity_sort(): assert all(Inf > x or x is Inf for x in ref_nums) assert Inf >= Inf and Inf == Inf assert not Inf < Inf and not Inf > Inf + assert libalgos.Infinity() == libalgos.Infinity() + assert not libalgos.Infinity() != libalgos.Infinity() assert all(NegInf <= x for x in ref_nums) assert all(NegInf < x or x is NegInf for x in ref_nums) assert NegInf <= NegInf and NegInf == NegInf assert not NegInf < NegInf and not NegInf > NegInf + assert libalgos.NegInfinity() == libalgos.NegInfinity() + assert not libalgos.NegInfinity() != libalgos.NegInfinity() for perm in permutations(ref_nums): assert sorted(perm) == ref_nums @@ -1283,6 +1287,25 @@ def test_infinity_sort(): np.array([libalgos.NegInfinity()] * 32).argsort() +def test_infinity_against_nan(): + Inf = libalgos.Infinity() + NegInf = libalgos.NegInfinity() + + assert not Inf > np.nan + assert not Inf >= np.nan + assert not Inf < np.nan + assert not Inf <= np.nan + assert not Inf == np.nan + assert Inf != np.nan + + assert not NegInf > np.nan + assert not NegInf >= np.nan + 
assert not NegInf < np.nan + assert not NegInf <= np.nan + assert not NegInf == np.nan + assert NegInf != np.nan + + def test_ensure_platform_int(): arr = np.arange(100, dtype=np.intp)