Skip to content

Isin #2031

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Apr 4, 2018
Merged

Isin #2031

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ nosetests.xml
.cache
.ropeproject/
.tags*
.testmondata
.testmon*
.pytest_cache

# asv environments
Expand All @@ -51,10 +51,11 @@ nosetests.xml
.project
.pydevproject

# PyCharm and Vim
# IDEs
.idea
*.swp
.DS_Store
.vscode/

# xarray specific
doc/_build
Expand Down
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ Computation
:py:attr:`~Dataset.cumsum`
:py:attr:`~Dataset.cumprod`
:py:attr:`~Dataset.rank`
:py:attr:`~Dataset.isin`

**Grouped operations**:
:py:attr:`~core.groupby.DatasetGroupBy.assign`
Expand Down Expand Up @@ -339,6 +340,7 @@ Computation
:py:attr:`~DataArray.cumsum`
:py:attr:`~DataArray.cumprod`
:py:attr:`~DataArray.rank`
:py:attr:`~DataArray.isin`

**Grouped operations**:
:py:attr:`~core.groupby.DataArrayGroupBy.assign_coords`
Expand Down
7 changes: 6 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,12 @@ Documentation
Enhancements
~~~~~~~~~~~~

- Some speed improvement to construct :py:class:`~xarray.DataArrayRolling`
- :py:meth:`~xarray.DataArray.isin` and :py:meth:`~xarray.Dataset.isin` methods, which test each value
in the array for whether it is contained in the supplied list, returning a bool array.
Similar to the ``np.isin`` function. Requires NumPy >= 1.13.
By `Maximilian Roos <https://github.com/maxim-lian>`_.

- Some speed improvement to construct :py:class:`~xarray.DataArrayRolling`
object (:issue:`1993`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Handle variables with different values for ``missing_value`` and
Expand Down
30 changes: 30 additions & 0 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import absolute_import, division, print_function

import warnings
from distutils.version import LooseVersion

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -744,6 +745,35 @@ def close(self):
self._file_obj.close()
self._file_obj = None

def isin(self, test_elements):
"""Tests each value in the array for whether it is in the supplied list.

Requires NumPy >= 1.13.

Parameters
----------
test_elements : array_like
The values against which to test each value of this object.
This argument is flattened if an array or array_like.
See numpy notes for behavior with non-array-like parameters.

Returns
-------
isin : same as object, bool
Has the same shape as this object.
"""
if LooseVersion(np.__version__) < LooseVersion('1.13.0'):
raise ImportError('isin requires numpy version 1.13.0 or later')
from .computation import apply_ufunc

return apply_ufunc(
np.isin,
self,
kwargs=dict(test_elements=test_elements),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably a better idea to explicitly unwrap .data from test_elements if it's an xarray object, and explicitly raise for xarray.Dataset. Otherwise numpy will probably give a really strange error message.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, see #2032 for what converting a Dataset to a numpy array does.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I merged your branch - is that better than an additional check?

Why extract .data - won't the standard machinery take care of that? I added a test

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose you're right, the standard machinery will work fine here for now. In the future when dask supports isin (dask/dask#3363) we'll want to use .data so we can keep it as a dask array.

dask='parallelized',
output_dtypes=[np.bool_],
)

def __enter__(self):
    """Enter the ``with`` block; the object itself is the context value."""
    return self

Expand Down
33 changes: 33 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3327,6 +3327,14 @@ def da(request):
[0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7],
dims='time')

if request.param == 'repeating_ints':
return DataArray(
np.tile(np.arange(12), 5).reshape(5, 4, 3),
coords={'x': list('abc'),
'y': list('defg')},
dims=list('zyx')
)


@pytest.fixture
def da_dask(seed=123):
Expand All @@ -3339,6 +3347,31 @@ def da_dask(seed=123):
return da


@pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.13.0'),
                    reason='requires numpy version 1.13.0 or later')
@pytest.mark.parametrize('da', ('repeating_ints', ), indirect=True)
def test_isin(da):
    """Check ``DataArray.isin`` on the ``repeating_ints`` fixture.

    Each case pairs the values passed to ``isin`` with the boolean mask
    expected for the ``y in ('d', 'e')``, ``z = 0`` slice of the result.
    """
    cases = (
        ([3], [[0, 0, 0], [1, 0, 0]]),
        ([2, 3], [[0, 0, 1], [1, 0, 0]]),
    )
    selected_y = list('de')

    for test_elements, mask in cases:
        expected = DataArray(
            np.asarray(mask),
            dims=list('yx'),
            coords={'x': list('abc'),
                    'y': list('de')},
        ).astype('bool')

        result = da.isin(test_elements).sel(y=selected_y, z=0)
        assert_equal(result, expected)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add another test for the dask path, e.g., that calls .chunk() on the input and verifies it gives the same result after .compute()? That will need a skipif for dask.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done in test_dataset, lmk if better to put in both)


@pytest.mark.parametrize('da', (1, 2), indirect=True)
def test_rolling_iter(da):

Expand Down
65 changes: 61 additions & 4 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from . import (
InaccessibleArray, TestCase, UnexpectedDataAccess, assert_allclose,
assert_array_equal, assert_equal, assert_identical, raises_regex,
assert_array_equal, assert_equal, assert_identical, has_dask, raises_regex,
requires_bottleneck, requires_dask, requires_scipy, source_ndarray)

try:
Expand Down Expand Up @@ -4037,9 +4037,66 @@ def test_ipython_key_completion(self):
# Py.test tests


@pytest.fixture()
def data_set(seed=None):
return create_test_data(seed)
@pytest.fixture(params=[None])
def data_set(request):
    """Provide the dataset built by ``create_test_data`` for the fixture param."""
    param = request.param
    return create_test_data(param)


@pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.13.0'),
                    reason='requires numpy version 1.13.0 or later')
@pytest.mark.parametrize('test_elements', (
    [1, 2],
    np.array([1, 2]),
    DataArray([1, 2]),
    # Attach the mark through pytest.param: applying a mark object directly
    # to a parameter value was deprecated and later removed from pytest.
    pytest.param(Dataset({'x': [1, 2]}), marks=pytest.mark.xfail),
))
def test_isin(test_elements):
    """Check ``Dataset.isin`` for several kinds of ``test_elements``.

    The list, ndarray and DataArray inputs must all yield the same boolean
    Dataset; passing a Dataset as ``test_elements`` is an expected failure.
    """
    expected = Dataset(
        data_vars={
            'var1': (('dim1',), [0, 1]),
            'var2': (('dim1',), [1, 1]),
            'var3': (('dim1',), [0, 1]),
        }
    ).astype('bool')

    result = Dataset(
        data_vars={
            'var1': (('dim1',), [0, 1]),
            'var2': (('dim1',), [1, 2]),
            'var3': (('dim1',), [0, 1]),
        }
    ).isin(test_elements)

    assert_equal(result, expected)


@pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.13.0') or  # noqa
                    not has_dask,  # noqa
                    reason='requires dask and numpy version 1.13.0 or later')
@pytest.mark.parametrize('test_elements', (
    [1, 2],
    np.array([1, 2]),
    DataArray([1, 2]),
    # Attach the mark through pytest.param: applying a mark object directly
    # to a parameter value was deprecated and later removed from pytest.
    pytest.param(Dataset({'x': [1, 2]}), marks=pytest.mark.xfail),
))
def test_isin_dask(test_elements):
    """Check ``Dataset.isin`` on a chunked Dataset after ``compute()``.

    Same cases as the eager test: list, ndarray and DataArray inputs must
    match the expected boolean Dataset, while a Dataset passed as
    ``test_elements`` is an expected failure.
    """
    expected = Dataset(
        data_vars={
            'var1': (('dim1',), [0, 1]),
            'var2': (('dim1',), [1, 1]),
            'var3': (('dim1',), [0, 1]),
        }
    ).astype('bool')

    result = Dataset(
        data_vars={
            'var1': (('dim1',), [0, 1]),
            'var2': (('dim1',), [1, 2]),
            'var3': (('dim1',), [0, 1]),
        }
    ).chunk(1).isin(test_elements).compute()

    assert_equal(result, expected)


def test_dir_expected_attrs(data_set):
Expand Down
2 changes: 0 additions & 2 deletions xarray/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1422,8 +1422,6 @@ def test_reduce(self):
with raises_regex(ValueError, 'cannot supply both'):
v.mean(dim='x', axis=0)

@pytest.mark.skipif(LooseVersion(np.__version__) < LooseVersion('1.10.0'),
reason='requires numpy version 1.10.0 or later')
def test_quantile(self):
v = Variable(['x', 'y'], self.d)
for q in [0.25, [0.50], [0.25, 0.75]]:
Expand Down