Skip to content

Commit a0ef2b7

Browse files
0x0L authored and shoyer committed
Rank Methods (#1733)
* initial support for rank
* added pct kwargs
* minor changes
* move to variable
* some polish
* fix docstring
* dataset fix and more tests
* minor changes
1 parent 4924364 commit a0ef2b7

9 files changed

+206
-3
lines changed

doc/api-hidden.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
Dataset.T
4747
Dataset.cumsum
4848
Dataset.cumprod
49+
Dataset.rank
4950

5051
DataArray.ndim
5152
DataArray.shape
@@ -91,6 +92,7 @@
9192
DataArray.T
9293
DataArray.cumsum
9394
DataArray.cumprod
95+
DataArray.rank
9496

9597
ufuncs.angle
9698
ufuncs.arccos

doc/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ Computation
160160
:py:attr:`~Dataset.real`
161161
:py:attr:`~Dataset.cumsum`
162162
:py:attr:`~Dataset.cumprod`
163+
:py:attr:`~Dataset.rank`
163164

164165
**Grouped operations**:
165166
:py:attr:`~core.groupby.DatasetGroupBy.assign`
@@ -312,6 +313,7 @@ Computation
312313
:py:attr:`~DataArray.T`
313314
:py:attr:`~DataArray.cumsum`
314315
:py:attr:`~DataArray.cumprod`
316+
:py:attr:`~DataArray.rank`
315317

316318
**Grouped operations**:
317319
:py:attr:`~core.groupby.DataArrayGroupBy.assign_coords`

doc/whats-new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ Enhancements
3737
.. _Zarr: http://zarr.readthedocs.io/
3838

3939

40+
**New functions/methods**
41+
42+
- New :py:meth:`~xarray.DataArray.rank` on arrays and datasets. Requires
43+
bottleneck (:issue:`1731`).
44+
By `0x0L <https://github.com/0x0L>`_.
45+
4046
Bug fixes
4147
~~~~~~~~~
4248

xarray/core/dataarray.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from .accessors import DatetimeAccessor
2020
from .alignment import align, reindex_like_indexers
2121
from .common import AbstractArray, BaseDataObject
22+
from .computation import apply_ufunc
2223
from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource,
2324
Indexes, assert_coordinate_consistent,
2425
remap_label_indexers)
@@ -1971,6 +1972,45 @@ def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
19711972
interpolation=interpolation)
19721973
return self._from_temp_dataset(ds)
19731974

1975+
def rank(self, dim, pct=False, keep_attrs=False):
1976+
"""Ranks the data.
1977+
1978+
Equal values are assigned a rank that is the average of the ranks that
1979+
would have been otherwise assigned to all of the values within that set.
1980+
Ranks begin at 1, not 0. If pct is True, computes percentage ranks.
1981+
1982+
NaNs in the input array are returned as NaNs.
1983+
1984+
The `bottleneck` library is required.
1985+
1986+
Parameters
1987+
----------
1988+
dim : str
1989+
Dimension over which to compute rank.
1990+
pct : bool, optional
1991+
If True, compute percentage ranks, otherwise compute integer ranks.
1992+
keep_attrs : bool, optional
1993+
If True, the array's attributes (`attrs`) will be copied from
1994+
the original object to the new one. If False (default), the new
1995+
object will be returned without attributes.
1996+
1997+
Returns
1998+
-------
1999+
ranked : DataArray
2000+
DataArray with the same coordinates and dtype 'float64'.
2001+
2002+
Examples
2003+
--------
2004+
2005+
>>> arr = xr.DataArray([5, 6, 7], dims='x')
2006+
>>> arr.rank('x')
2007+
<xarray.DataArray (x: 3)>
2008+
array([ 1., 2., 3.])
2009+
Dimensions without coordinates: x
2010+
"""
2011+
ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs)
2012+
return self._from_temp_dataset(ds)
2013+
19742014

19752015
# priority must be higher than Variable to properly work with binary ufuncs
19762016
ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)

xarray/core/dataset.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3256,6 +3256,48 @@ def quantile(self, q, dim=None, interpolation='linear',
32563256
new.coords['quantile'] = q
32573257
return new
32583258

3259+
def rank(self, dim, pct=False, keep_attrs=False):
3260+
"""Ranks the data.
3261+
3262+
Equal values are assigned a rank that is the average of the ranks that
3263+
would have been otherwise assigned to all of the values within that set.
3264+
Ranks begin at 1, not 0. If pct is True, computes percentage ranks.
3265+
3266+
NaNs in the input array are returned as NaNs.
3267+
3268+
The `bottleneck` library is required.
3269+
3270+
Parameters
3271+
----------
3272+
dim : str
3273+
Dimension over which to compute rank.
3274+
pct : bool, optional
3275+
If True, compute percentage ranks, otherwise compute integer ranks.
3276+
keep_attrs : bool, optional
3277+
If True, the dataset's attributes (`attrs`) will be copied from
3278+
the original object to the new one. If False (default), the new
3279+
object will be returned without attributes.
3280+
3281+
Returns
3282+
-------
3283+
ranked : Dataset
3284+
Variables that do not depend on `dim` are dropped.
3285+
"""
3286+
if dim not in self.dims:
3287+
raise ValueError('Dataset does not contain the dimension: %s' % dim)
3288+
3289+
variables = OrderedDict()
3290+
for name, var in iteritems(self.variables):
3291+
if name in self.data_vars:
3292+
if dim in var.dims:
3293+
variables[name] = var.rank(dim, pct=pct)
3294+
else:
3295+
variables[name] = var
3296+
3297+
coord_names = set(self.coords)
3298+
attrs = self.attrs if keep_attrs else None
3299+
return self._replace_vars_and_dims(variables, coord_names, attrs=attrs)
3300+
32593301
@property
32603302
def real(self):
32613303
return self._unary_op(lambda x: x.real, keep_attrs=True)(self)

xarray/core/variable.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1388,7 +1388,6 @@ def quantile(self, q, dim=None, interpolation='linear'):
13881388
numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
13891389
DataArray.quantile
13901390
"""
1391-
13921391
if isinstance(self.data, dask_array_type):
13931392
raise TypeError("quantile does not work for arrays stored as dask "
13941393
"arrays. Load the data via .compute() or .load() "
@@ -1419,6 +1418,47 @@ def quantile(self, q, dim=None, interpolation='linear'):
14191418
interpolation=interpolation)
14201419
return Variable(new_dims, qs)
14211420

1421+
def rank(self, dim, pct=False):
1422+
"""Ranks the data.
1423+
1424+
Equal values are assigned a rank that is the average of the ranks that
1425+
would have been otherwise assigned to all of the values within that set.
1426+
Ranks begin at 1, not 0. If pct is True, computes percentage ranks.
1427+
1428+
NaNs in the input array are returned as NaNs.
1429+
1430+
The `bottleneck` library is required.
1431+
1432+
Parameters
1433+
----------
1434+
dim : str
1435+
Dimension over which to compute rank.
1436+
pct : bool, optional
1437+
If True, compute percentage ranks, otherwise compute integer ranks.
1438+
1439+
Returns
1440+
-------
1441+
ranked : Variable
1442+
1443+
See Also
1444+
--------
1445+
Dataset.rank, DataArray.rank
1446+
"""
1447+
import bottleneck as bn
1448+
1449+
if isinstance(self.data, dask_array_type):
1450+
raise TypeError("rank does not work for arrays stored as dask "
1451+
"arrays. Load the data via .compute() or .load() "
1452+
"prior to calling this method.")
1453+
1454+
axis = self.get_axis_num(dim)
1455+
func = bn.nanrankdata if self.dtype.kind is 'f' else bn.rankdata
1456+
ranked = func(self.data, axis=axis)
1457+
if pct:
1458+
count = np.sum(~np.isnan(self.data), axis=axis, keepdims=True)
1459+
ranked /= count
1460+
return Variable(self.dims, ranked)
1461+
14221462
@property
14231463
def real(self):
14241464
return type(self)(self.dims, self.data.real, self._attrs)

xarray/tests/test_dataarray.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from xarray.tests import (
2020
TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
2121
assert_identical, assert_equal, assert_allclose, assert_array_equal,
22-
raises_regex, requires_scipy)
22+
raises_regex, requires_scipy, requires_bottleneck)
2323

2424

2525
class TestDataArray(TestCase):
@@ -3104,6 +3104,25 @@ def test_sortby(self):
31043104
actual = da.sortby(['x', 'y'])
31053105
self.assertDataArrayEqual(actual, expected)
31063106

3107+
@requires_bottleneck
3108+
def test_rank(self):
3109+
# floats
3110+
ar = DataArray([[3, 4, np.nan, 1]])
3111+
expect_0 = DataArray([[1, 1, np.nan, 1]])
3112+
expect_1 = DataArray([[2, 3, np.nan, 1]])
3113+
self.assertDataArrayEqual(ar.rank('dim_0'), expect_0)
3114+
self.assertDataArrayEqual(ar.rank('dim_1'), expect_1)
3115+
# int
3116+
x = DataArray([3,2,1])
3117+
self.assertDataArrayEqual(x.rank('dim_0'), x)
3118+
# str
3119+
y = DataArray(['c', 'b', 'a'])
3120+
self.assertDataArrayEqual(y.rank('dim_0'), x)
3121+
3122+
x = DataArray([3.0, 1.0, np.nan, 2.0, 4.0], dims=('z',))
3123+
y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=('z',))
3124+
self.assertDataArrayEqual(y.rank('z', pct=True), y)
3125+
31073126

31083127
@pytest.fixture(params=[1])
31093128
def da(request):

xarray/tests/test_dataset.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
requires_dask, source_ndarray)
3232

3333
from xarray.tests import (assert_equal, assert_allclose,
34-
assert_array_equal, requires_scipy)
34+
assert_array_equal, requires_bottleneck,
35+
requires_scipy)
3536

3637

3738
def create_test_data(seed=None):
@@ -3410,6 +3411,23 @@ def test_quantile(self):
34103411
assert 'dim3' in ds_quantile.dims
34113412
assert all(d not in ds_quantile.dims for d in dim)
34123413

3414+
@requires_bottleneck
3415+
def test_rank(self):
3416+
ds = create_test_data(seed=1234)
3417+
# only ds.var3 depends on dim3
3418+
z = ds.rank('dim3')
3419+
self.assertItemsEqual(['var3'], list(z.data_vars))
3420+
# same as dataarray version
3421+
x = z.var3
3422+
y = ds.var3.rank('dim3')
3423+
self.assertDataArrayEqual(x, y)
3424+
# coordinates stick
3425+
self.assertItemsEqual(list(z.coords), list(ds.coords))
3426+
self.assertItemsEqual(list(x.coords), list(y.coords))
3427+
# invalid dim
3428+
with raises_regex(ValueError, 'does not contain'):
3429+
x.rank('invalid_dim')
3430+
34133431
def test_count(self):
34143432
ds = Dataset({'x': ('a', [np.nan, 1]), 'y': 0, 'z': np.nan})
34153433
expected = Dataset({'x': 1, 'y': 1, 'z': 0})

xarray/tests/test_variable.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
from . import (
2828
TestCase, source_ndarray, requires_dask, raises_regex, assert_identical)
2929

30+
from xarray.tests import requires_bottleneck
31+
3032

3133
class VariableSubclassTestCases(object):
3234
def test_properties(self):
@@ -1381,6 +1383,38 @@ def test_quantile_dask_raises(self):
13811383
with raises_regex(TypeError, 'arrays stored as dask'):
13821384
v.quantile(0.5, dim='x')
13831385

1386+
@requires_dask
1387+
@requires_bottleneck
1388+
def test_rank_dask_raises(self):
1389+
v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0]).chunk(2)
1390+
with raises_regex(TypeError, 'arrays stored as dask'):
1391+
v.rank('x')
1392+
1393+
@requires_bottleneck
1394+
def test_rank(self):
1395+
import bottleneck as bn
1396+
# floats
1397+
v = Variable(['x', 'y'], [[3, 4, np.nan, 1]])
1398+
expect_0 = bn.nanrankdata(v.data, axis=0)
1399+
expect_1 = bn.nanrankdata(v.data, axis=1)
1400+
np.testing.assert_allclose(v.rank('x').values, expect_0)
1401+
np.testing.assert_allclose(v.rank('y').values, expect_1)
1402+
# int
1403+
v = Variable(['x'], [3,2,1])
1404+
expect = bn.rankdata(v.data, axis=0)
1405+
np.testing.assert_allclose(v.rank('x').values, expect)
1406+
# str
1407+
v = Variable(['x'], ['c', 'b', 'a'])
1408+
expect = bn.rankdata(v.data, axis=0)
1409+
np.testing.assert_allclose(v.rank('x').values, expect)
1410+
# pct
1411+
v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0])
1412+
v_expect = Variable(['x'], [0.75, 0.25, np.nan, 0.5, 1.0])
1413+
self.assertVariableEqual(v.rank('x', pct=True), v_expect)
1414+
# invalid dim
1415+
with raises_regex(ValueError, 'not found'):
1416+
v.rank('y')
1417+
13841418
def test_big_endian_reduce(self):
13851419
# regression test for GH489
13861420
data = np.ones(5, dtype='>f4')

0 commit comments

Comments
 (0)