Rank Methods (#1733)

0x0L · shoyer · commit a0ef2b7174d5 · 2017-12-18T08:50:59.000-08:00
* initial support for rank

* added pct kwargs

* minor changes

* move to variable

* some polish

* fix docstring

* dataset fix and more tests

* minor changes
diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst
@@ -46,6 +46,7 @@
    Dataset.T
    Dataset.cumsum
    Dataset.cumprod
+   Dataset.rank
 
    DataArray.ndim
    DataArray.shape
@@ -91,6 +92,7 @@
    DataArray.T
    DataArray.cumsum
    DataArray.cumprod
+   DataArray.rank
 
    ufuncs.angle
    ufuncs.arccos
diff --git a/doc/api.rst b/doc/api.rst
@@ -160,6 +160,7 @@ Computation
 :py:attr:`~Dataset.real`
 :py:attr:`~Dataset.cumsum`
 :py:attr:`~Dataset.cumprod`
+:py:attr:`~Dataset.rank`
 
 **Grouped operations**:
 :py:attr:`~core.groupby.DatasetGroupBy.assign`
@@ -312,6 +313,7 @@ Computation
 :py:attr:`~DataArray.T`
 :py:attr:`~DataArray.cumsum`
 :py:attr:`~DataArray.cumprod`
+:py:attr:`~DataArray.rank`
 
 **Grouped operations**:
 :py:attr:`~core.groupby.DataArrayGroupBy.assign_coords`
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -37,6 +37,12 @@ Enhancements
 .. _Zarr: http://zarr.readthedocs.io/
 
 
+**New functions/methods**
+
+- New :py:meth:`~xarray.DataArray.rank` and :py:meth:`~xarray.Dataset.rank`
+  methods for ranking data along a dimension. Requires bottleneck
+  (:issue:`1731`).
+  By `0x0L <https://github.com/0x0L>`_.
+
 Bug fixes
 ~~~~~~~~~
 
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -19,6 +19,7 @@
 from .accessors import DatetimeAccessor
 from .alignment import align, reindex_like_indexers
 from .common import AbstractArray, BaseDataObject
+from .computation import apply_ufunc
 from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource,
                           Indexes, assert_coordinate_consistent,
                           remap_label_indexers)
@@ -1971,6 +1972,45 @@ def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
                                               interpolation=interpolation)
         return self._from_temp_dataset(ds)
 
+    def rank(self, dim, pct=False, keep_attrs=False):
+        """Rank the values of this array along a dimension.
+
+        Tied values share the average of the ranks they would otherwise
+        occupy, and ranks are counted from 1 rather than 0. With ``pct=True``
+        percentage ranks are computed instead.
+
+        NaNs in the input array are returned as NaNs.
+
+        The `bottleneck` library is required.
+
+        Parameters
+        ----------
+        dim : str
+            Dimension over which to compute rank.
+        pct : bool, optional
+            If True, compute percentage ranks, otherwise compute integer ranks.
+        keep_attrs : bool, optional
+            Passed through unchanged to ``Dataset.rank``, which is called on
+            the temporary dataset built from this array. Defaults to False.
+
+        Returns
+        -------
+        ranked : DataArray
+            DataArray with the same coordinates and dtype 'float64'.
+
+        Examples
+        --------
+
+        >>> arr = xr.DataArray([5, 6, 7], dims='x')
+        >>> arr.rank('x')
+        <xarray.DataArray (x: 3)>
+        array([ 1.,   2.,   3.])
+        Dimensions without coordinates: x
+        """
+        # The ranking itself lives on Dataset; round-trip through a temporary
+        # dataset so both entry points share one implementation.
+        temp = self._to_temp_dataset()
+        ranked = temp.rank(dim, pct=pct, keep_attrs=keep_attrs)
+        return self._from_temp_dataset(ranked)
+
 
 # priority most be higher than Variable to properly work with binary ufuncs
 ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -3256,6 +3256,48 @@ def quantile(self, q, dim=None, interpolation='linear',
             new.coords['quantile'] = q
         return new
 
+    def rank(self, dim, pct=False, keep_attrs=False):
+        """Rank the data variables of this dataset along a dimension.
+
+        Tied values share the average of the ranks they would otherwise
+        occupy, and ranks are counted from 1 rather than 0. With ``pct=True``
+        percentage ranks are computed instead.
+
+        NaNs in the input array are returned as NaNs.
+
+        The `bottleneck` library is required.
+
+        Parameters
+        ----------
+        dim : str
+            Dimension over which to compute rank.
+        pct : bool, optional
+            If True, compute percentage ranks, otherwise compute integer ranks.
+        keep_attrs : bool, optional
+            If True, the dataset's attributes (`attrs`) will be copied from
+            the original object to the new one.  If False (default), the new
+            object will be returned without attributes.
+
+        Returns
+        -------
+        ranked : Dataset
+            Variables that do not depend on `dim` are dropped.
+
+        Raises
+        ------
+        ValueError
+            If `dim` is not one of this dataset's dimensions.
+        """
+        if dim not in self.dims:
+            raise ValueError('Dataset does not contain the dimension: %s' % dim)
+
+        new_vars = OrderedDict()
+        for key, variable in iteritems(self.variables):
+            if key not in self.data_vars:
+                # not a data variable: carried over untouched
+                new_vars[key] = variable
+            elif dim in variable.dims:
+                new_vars[key] = variable.rank(dim, pct=pct)
+            # data variables that lack `dim` are left out of the result
+
+        attrs = self.attrs if keep_attrs else None
+        coord_names = set(self.coords)
+        return self._replace_vars_and_dims(new_vars, coord_names, attrs=attrs)
+
     @property
     def real(self):
         return self._unary_op(lambda x: x.real, keep_attrs=True)(self)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -1388,7 +1388,6 @@ def quantile(self, q, dim=None, interpolation='linear'):
         numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
         DataArray.quantile
         """
-
         if isinstance(self.data, dask_array_type):
             raise TypeError("quantile does not work for arrays stored as dask "
                             "arrays. Load the data via .compute() or .load() "
@@ -1419,6 +1418,47 @@ def quantile(self, q, dim=None, interpolation='linear'):
                               interpolation=interpolation)
         return Variable(new_dims, qs)
 
+    def rank(self, dim, pct=False):
+        """Ranks the data.
+
+        Equal values are assigned a rank that is the average of the ranks that
+        would have been otherwise assigned to all of the values within that
+        set. Ranks begin at 1, not 0. If pct is True, computes percentage
+        ranks.
+
+        NaNs in the input array are returned as NaNs.
+
+        The `bottleneck` library is required.
+
+        Parameters
+        ----------
+        dim : str
+            Dimension over which to compute rank.
+        pct : bool, optional
+            If True, compute percentage ranks, otherwise compute integer ranks.
+
+        Returns
+        -------
+        ranked : Variable
+
+        Raises
+        ------
+        TypeError
+            If the data is stored as a dask array.
+
+        See Also
+        --------
+        Dataset.rank, DataArray.rank
+        """
+        import bottleneck as bn
+
+        if isinstance(self.data, dask_array_type):
+            raise TypeError("rank does not work for arrays stored as dask "
+                            "arrays. Load the data via .compute() or .load() "
+                            "prior to calling this method.")
+
+        axis = self.get_axis_num(dim)
+        # Compare by value: ``is`` against a str literal only works through
+        # string interning and is a SyntaxWarning on Python >= 3.8.
+        func = bn.nanrankdata if self.dtype.kind == 'f' else bn.rankdata
+        ranked = func(self.data, axis=axis)
+        if pct:
+            # Count the valid entries on the float ranks rather than on the
+            # raw data: NaN inputs stay NaN in the ranks, and np.isnan would
+            # raise on non-numeric data such as strings.
+            count = np.sum(~np.isnan(ranked), axis=axis, keepdims=True)
+            ranked /= count
+        return Variable(self.dims, ranked)
+
     @property
     def real(self):
         return type(self)(self.dims, self.data.real, self._attrs)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -19,7 +19,7 @@
 from xarray.tests import (
     TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
     assert_identical, assert_equal, assert_allclose, assert_array_equal,
-    raises_regex, requires_scipy)
+    raises_regex, requires_scipy, requires_bottleneck)
 
 
 class TestDataArray(TestCase):
@@ -3104,6 +3104,25 @@ def test_sortby(self):
         actual = da.sortby(['x', 'y'])
         self.assertDataArrayEqual(actual, expected)
 
+    @requires_bottleneck
+    def test_rank(self):
+        # floats
+        ar = DataArray([[3, 4, np.nan, 1]])
+        expect_0 = DataArray([[1, 1, np.nan, 1]])
+        expect_1 = DataArray([[2, 3, np.nan, 1]])
+        self.assertDataArrayEqual(ar.rank('dim_0'), expect_0)
+        self.assertDataArrayEqual(ar.rank('dim_1'), expect_1)
+        # int
+        x = DataArray([3, 2, 1])
+        self.assertDataArrayEqual(x.rank('dim_0'), x)
+        # str
+        y = DataArray(['c', 'b', 'a'])
+        self.assertDataArrayEqual(y.rank('dim_0'), x)
+        # pct: rank the raw values and compare against the expected
+        # fractions. Ranking the expected array itself would also pass here
+        # (its values sort in the same order), hiding a broken pct path.
+        raw = DataArray([3.0, 1.0, np.nan, 2.0, 4.0], dims=('z',))
+        expected_pct = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=('z',))
+        self.assertDataArrayEqual(raw.rank('z', pct=True), expected_pct)
+
 
 @pytest.fixture(params=[1])
 def da(request):
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -31,7 +31,8 @@
                requires_dask, source_ndarray)
 
 from xarray.tests import (assert_equal, assert_allclose,
-                          assert_array_equal, requires_scipy)
+                          assert_array_equal, requires_bottleneck,
+                          requires_scipy)
 
 
 def create_test_data(seed=None):
@@ -3410,6 +3411,23 @@ def test_quantile(self):
             assert 'dim3' in ds_quantile.dims
             assert all(d not in ds_quantile.dims for d in dim)
 
+    @requires_bottleneck
+    def test_rank(self):
+        ds = create_test_data(seed=1234)
+        ranked = ds.rank('dim3')
+        # var3 is the only data variable that depends on dim3
+        self.assertItemsEqual(['var3'], list(ranked.data_vars))
+        # the Dataset and DataArray entry points must agree
+        via_dataset = ranked.var3
+        via_dataarray = ds.var3.rank('dim3')
+        self.assertDataArrayEqual(via_dataset, via_dataarray)
+        # coordinates stick
+        self.assertItemsEqual(list(ranked.coords), list(ds.coords))
+        self.assertItemsEqual(list(via_dataset.coords),
+                              list(via_dataarray.coords))
+        # invalid dim
+        with raises_regex(ValueError, 'does not contain'):
+            via_dataset.rank('invalid_dim')
+
     def test_count(self):
         ds = Dataset({'x': ('a', [np.nan, 1]), 'y': 0, 'z': np.nan})
         expected = Dataset({'x': 1, 'y': 1, 'z': 0})
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
@@ -27,6 +27,8 @@
 from . import (
     TestCase, source_ndarray, requires_dask, raises_regex, assert_identical)
 
+from xarray.tests import requires_bottleneck
+
 
 class VariableSubclassTestCases(object):
     def test_properties(self):
@@ -1381,6 +1383,38 @@ def test_quantile_dask_raises(self):
         with raises_regex(TypeError, 'arrays stored as dask'):
             v.quantile(0.5, dim='x')
 
+    @requires_dask
+    @requires_bottleneck
+    def test_rank_dask_raises(self):
+        # rank is expected to refuse chunked (dask-backed) data
+        values = [3.0, 1.0, np.nan, 2.0, 4.0]
+        chunked = Variable(['x'], values).chunk(2)
+        with raises_regex(TypeError, 'arrays stored as dask'):
+            chunked.rank('x')
+
+    @requires_bottleneck
+    def test_rank(self):
+        import bottleneck as bn
+        # floats: compare against bn.nanrankdata along each axis
+        floats = Variable(['x', 'y'], [[3, 4, np.nan, 1]])
+        for dim, axis in [('x', 0), ('y', 1)]:
+            expected = bn.nanrankdata(floats.data, axis=axis)
+            np.testing.assert_allclose(floats.rank(dim).values, expected)
+        # int, then str: compare against bn.rankdata
+        for values in ([3, 2, 1], ['c', 'b', 'a']):
+            v = Variable(['x'], values)
+            expected = bn.rankdata(v.data, axis=0)
+            np.testing.assert_allclose(v.rank('x').values, expected)
+        # pct
+        v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0])
+        v_expect = Variable(['x'], [0.75, 0.25, np.nan, 0.5, 1.0])
+        self.assertVariableEqual(v.rank('x', pct=True), v_expect)
+        # invalid dim
+        with raises_regex(ValueError, 'not found'):
+            v.rank('y')
+
     def test_big_endian_reduce(self):
         # regression test for GH489
         data = np.ones(5, dtype='>f4')