pandas-dev · jreback · Dec 6, 2017 · Oct 20, 2017 · Dec 4, 2017 · jreback
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -63,6 +63,91 @@ levels <merging.merge_on_columns_and_levels>` documentation section.
    left.merge(right, on=['key1', 'key2'])
 
 
+.. _whatsnew_0220.enhancements.ran_inf:
+
+handle ``inf`` values properly when ``NaN`` are present
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous version, ``inf`` elements were assigned ``NaN`` as their ranks. Now ranks are calculated properly. (:issue:`6945`)
+
+.. ipython:: python
+
+    In [9]: s = pd.Series([-np.inf, 0, 1, np.nan, np.inf])
+
+    In [10]: s
+    Out[10]:
+    0        -inf
+    1    0.000000
+    2    1.000000
+    3         NaN
+    4         inf
+    dtype: float64
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [11]: s.rank()
+    Out[11]:
+    0    1.0
+    1    2.0
+    2    3.0
+    3    NaN
+    4    NaN
+    dtype: float64
+
+Current Behavior
+
+.. ipython:: python
+
+    In [4]: s.rank()
+    Out[4]:
+    0    1.0
+    1    2.0
+    2    3.0
+    3    NaN
+    4    4.0
+    dtype: float64
+
+Furthermore, previously if you rank ``inf`` or ``-inf`` values together with ``NaN`` values, the calculation won't distinguish ``NaN`` from infinity when using 'top' or 'bottom' argument.
+
+.. ipython:: python
+
+    In [14]: s = pd.Series([np.nan, np.nan, -np.inf, -np.inf])
+
+    In [15]: s
+    Out[15]:
+    0    NaN
+    1    NaN
+    2   -inf
+    3   -inf
+    dtype: float64
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [15]: s.rank(na_option='top')
+    Out[15]:
+    0    2.5
+    1    2.5
+    2    2.5
+    3    2.5
+    dtype: float64
+
+Current Behavior
+
+.. ipython:: python
+
+    In [4]: s.rank(na_option='top')
+    Out[4]:
+    0    1.5
+    1    1.5
+    2    3.5
+    3    3.5
+    dtype: float64
+
+
 .. _whatsnew_0220.enhancements.other:
 
 Other Enhancements
@@ -77,6 +162,7 @@ Other Enhancements
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
 
+
 .. _whatsnew_0220.api_breaking:
 
 Backwards incompatible API changes
@@ -220,7 +306,6 @@ Reshaping
 ^^^^^^^^^
 
 - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
-
 -
 -
 

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -64,22 +64,24 @@ class Infinity(object):
     """ provide a positive Infinity comparision method for ranking """
 
     __lt__ = lambda self, other: False
-    __le__ = lambda self, other: self is other
-    __eq__ = lambda self, other: self is other
-    __ne__ = lambda self, other: self is not other
-    __gt__ = lambda self, other: self is not other
-    __ge__ = lambda self, other: True
+    __le__ = lambda self, other: isinstance(other, Infinity)
+    __eq__ = lambda self, other: isinstance(other, Infinity)
+    __ne__ = lambda self, other: not isinstance(other, Infinity)
+    __gt__ = lambda self, other: (not isinstance(other, Infinity) and
+                                  not missing.checknull(other))
+    __ge__ = lambda self, other: not missing.checknull(other)
 
 
 class NegInfinity(object):
     """ provide a negative Infinity comparision method for ranking """
 
-    __lt__ = lambda self, other: self is not other
-    __le__ = lambda self, other: True
-    __eq__ = lambda self, other: self is other
-    __ne__ = lambda self, other: self is not other
+    __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
+                                  not missing.checknull(other))
+    __le__ = lambda self, other: not missing.checknull(other)
+    __eq__ = lambda self, other: isinstance(other, NegInfinity)
+    __ne__ = lambda self, other: not isinstance(other, NegInfinity)
     __gt__ = lambda self, other: False
-    __ge__ = lambda self, other: self is other
+    __ge__ = lambda self, other: isinstance(other, NegInfinity)
 
 
 @cython.wraparound(False)

diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in
@@ -27,7 +27,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'),
 {{if dtype == 'object'}}
 
 
-def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average',
+def rank_1d_{{dtype}}(object in_arr, ties_method='average',
                       ascending=True, na_option='keep', pct=False):
 {{else}}
 
@@ -40,7 +40,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
     """
 
     cdef:
-        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
+        Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0
 
         {{if dtype == 'object'}}
         ndarray sorted_data, values
@@ -50,6 +50,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
 
         ndarray[float64_t] ranks
         ndarray[int64_t] argsorted
+        ndarray[np.uint8_t, cast=True] sorted_mask
 
         {{if dtype == 'uint64'}}
         {{ctype}} val
@@ -60,6 +61,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
         float64_t sum_ranks = 0
         int tiebreak = 0
         bint keep_na = 0
+        bint isnan
         float count = 0.0
     tiebreak = tiebreakers[ties_method]
 
@@ -76,12 +78,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
 
     keep_na = na_option == 'keep'
 
-    {{if dtype != 'uint64'}}
-    if ascending ^ (na_option == 'top'):
-        nan_value = {{pos_nan_value}}
-    else:
-        nan_value = {{neg_nan_value}}
-
     {{if dtype == 'object'}}
     mask = missing.isnaobj(values)
     {{elif dtype == 'float64'}}
@@ -90,56 +86,69 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
     mask = values == iNaT
     {{endif}}
 
+    # double sort first by mask and then by values to ensure nan values are
+    # either at the beginning or the end. mask/(~mask) controls padding at
+    # tail or the head
+    {{if dtype != 'uint64'}}
+    if ascending ^ (na_option == 'top'):
+        nan_value = {{pos_nan_value}}
+        order = (values, mask)
+    else:
+        nan_value = {{neg_nan_value}}
+        order = (values, ~mask)
     np.putmask(values, mask, nan_value)
+    {{else}}
+    mask = np.zeros(shape=len(values), dtype=bool)
+    order = (values, mask)
     {{endif}}
 
     n = len(values)
     ranks = np.empty(n, dtype='f8')
 
     {{if dtype == 'object'}}
+
     try:
-        _as = values.argsort()
+        _as = np.lexsort(keys=order)
     except TypeError:
-        if not retry:
-            raise
-
-        valid_locs = (~mask).nonzero()[0]
-        ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0,
-                                             ties_method=ties_method,
-                                             ascending=ascending))
-        np.putmask(ranks, mask, np.nan)
-        return ranks
+        # lexsort on object array will raise TypeError for numpy version
+        # earlier than 1.11.0. Use argsort with order argument instead.
+        _dt = [('values', 'O'), ('mask', '?')]
+        _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt)
+        _as = np.argsort(_values, kind='mergesort', order=('mask', 'values'))
     {{else}}
     if tiebreak == TIEBREAK_FIRST:
         # need to use a stable sort here
-        _as = values.argsort(kind='mergesort')
+        _as = np.lexsort(keys=order)
         if not ascending:
             tiebreak = TIEBREAK_FIRST_DESCENDING
     else:
-        _as = values.argsort()
+        _as = np.lexsort(keys=order)
     {{endif}}
 
     if not ascending:
         _as = _as[::-1]
 
     sorted_data = values.take(_as)
+    sorted_mask = mask.take(_as)
+    _indices = order[1].take(_as).nonzero()[0]
+    non_na_idx = _indices[0] if len(_indices) > 0 else -1
     argsorted = _as.astype('i8')
 
     {{if dtype == 'object'}}
     for i in range(n):
         sum_ranks += i + 1
         dups += 1
-
+        isnan = sorted_mask[i]
         val = util.get_value_at(sorted_data, i)
 
-        if (val is nan_value) and keep_na:
+        if isnan and keep_na:
             ranks[argsorted[i]] = nan
             continue
-
         count += 1.0
 
         if (i == n - 1 or
-            are_diff(util.get_value_at(sorted_data, i + 1), val)):
+                are_diff(util.get_value_at(sorted_data, i + 1), val) or
+                i == non_na_idx - 1):
             if tiebreak == TIEBREAK_AVERAGE:
                 for j in range(i - dups + 1, i + 1):
                     ranks[argsorted[j]] = sum_ranks / dups
@@ -164,18 +173,19 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
         for i in range(n):
             sum_ranks += i + 1
             dups += 1
-
             val = sorted_data[i]
 
             {{if dtype != 'uint64'}}
-            if (val == nan_value) and keep_na:
+            isnan = sorted_mask[i]
+            if isnan and keep_na:
                 ranks[argsorted[i]] = nan
                 continue
             {{endif}}
 
             count += 1.0
 
-            if i == n - 1 or sorted_data[i + 1] != val:
+            if (i == n - 1 or sorted_data[i + 1] != val or
+                i == non_na_idx - 1):
                 if tiebreak == TIEBREAK_AVERAGE:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = sum_ranks / dups