diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py index de6c34b5af3..54ed9ac9fa2 100644 --- a/asv_bench/benchmarks/dataset_io.py +++ b/asv_bench/benchmarks/dataset_io.py @@ -5,7 +5,7 @@ import xarray as xr -from . import randn, requires_dask +from . import randn, randint, requires_dask try: import dask @@ -71,6 +71,15 @@ def make_ds(self): self.ds.attrs = {'history': 'created for xarray benchmarking'} + self.oinds = {'time': randint(0, self.nt, 120), + 'lon': randint(0, self.nx, 20), + 'lat': randint(0, self.ny, 10)} + self.vinds = {'time': xr.DataArray(randint(0, self.nt, 120), + dims='x'), + 'lon': xr.DataArray(randint(0, self.nx, 120), + dims='x'), + 'lat': slice(3, 20)} + class IOWriteSingleNetCDF3(IOSingleNetCDF): def setup(self): @@ -98,6 +107,14 @@ def setup(self): def time_load_dataset_netcdf4(self): xr.open_dataset(self.filepath, engine='netcdf4').load() + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4') + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4') + ds = ds.isel(**self.vinds).load() + class IOReadSingleNetCDF3(IOReadSingleNetCDF4): def setup(self): @@ -111,6 +128,14 @@ def setup(self): def time_load_dataset_scipy(self): xr.open_dataset(self.filepath, engine='scipy').load() + def time_orthogonal_indexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy') + ds = ds.isel(**self.oinds).load() + + def time_vectorized_indexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy') + ds = ds.isel(**self.vinds).load() + class IOReadSingleNetCDF4Dask(IOSingleNetCDF): def setup(self): @@ -127,6 +152,16 @@ def time_load_dataset_netcdf4_with_block_chunks(self): xr.open_dataset(self.filepath, engine='netcdf4', chunks=self.block_chunks).load() + def time_load_dataset_netcdf4_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4', + chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_netcdf4_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine='netcdf4', + chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self): with dask.set_options(get=dask.multiprocessing.get): xr.open_dataset(self.filepath, engine='netcdf4', @@ -158,6 +193,16 @@ def time_load_dataset_scipy_with_block_chunks(self): xr.open_dataset(self.filepath, engine='scipy', chunks=self.block_chunks).load() + def time_load_dataset_scipy_with_block_chunks_oindexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy', + chunks=self.block_chunks) + ds = ds.isel(**self.oinds).load() + + def time_load_dataset_scipy_with_block_chunks_vindexing(self): + ds = xr.open_dataset(self.filepath, engine='scipy', + chunks=self.block_chunks) + ds = ds.isel(**self.vinds).load() + def time_load_dataset_scipy_with_time_chunks(self): with dask.set_options(get=dask.multiprocessing.get): xr.open_dataset(self.filepath, engine='scipy', diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ab667ceba3f..0fad6882440 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,6 +38,11 @@ Documentation Enhancements ~~~~~~~~~~~~ +- Support lazy vectorized-indexing. After this change, flexible indexing such + as orthogonal/vectorized indexing, becomes possible for all the backend + arrays. Also, lazy ``transpose`` is now also supported. (:issue:`1897`) + By `Keisuke Fujii `_. + - Improve :py:func:`~xarray.DataArray.rolling` logic. :py:func:`~xarray.DataArrayRolling` object now supports :py:func:`~xarray.DataArrayRolling.construct` method that returns a view diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4e70c8858c3..1d166f05eb1 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -16,15 +16,20 @@ class H5NetCDFArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.OUTER_1VECTOR) + # h5py requires using lists for fancy indexing: # https://github.com/h5py/h5py/issues/992 - # OuterIndexer only holds 1D integer ndarrays, so it's safe to convert - # them to lists. - key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in key) + key = tuple(list(k) if isinstance(k, np.ndarray) else k for k in + key.tuple) with self.datastore.ensure_open(autoclose=True): - return self.get_array()[key] + array = self.get_array()[key] + + if len(np_inds.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_inds] + + return array def maybe_decode_bytes(txt): @@ -85,7 +90,7 @@ def __init__(self, filename, mode='r', format=None, group=None, def open_store_variable(self, name, var): with self.ensure_open(autoclose=False): dimensions = var.dimensions - data = indexing.LazilyIndexedArray( + data = indexing.LazilyOuterIndexedArray( H5NetCDFArrayWrapper(name, self)) attrs = _read_attributes(var) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 313539bd6bc..4903e9a98f2 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -48,9 +48,8 @@ def get_array(self): class NetCDF4ArrayWrapper(BaseNetCDF4Array): def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) - + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.OUTER) if self.datastore.is_remote: # pragma: no cover getitem = functools.partial(robust_getitem, catch=RuntimeError) else: @@ -58,7 +57,7 @@ def __getitem__(self, key): with self.datastore.ensure_open(autoclose=True): try: - data = getitem(self.get_array(), key) + array = getitem(self.get_array(), key.tuple) except IndexError: # Catch IndexError in netCDF4 and return a more informative # error message. This is most often called when an unsorted @@ -71,7 +70,10 @@ def __getitem__(self, key): msg += '\n\nOriginal traceback:\n' + traceback.format_exc() raise IndexError(msg) - return data + if len(np_inds.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_inds] + + return array def _encode_nc4_variable(var): @@ -277,7 +279,8 @@ def open(cls, filename, mode='r', format='NETCDF4', group=None, def open_store_variable(self, name, var): with self.ensure_open(autoclose=False): dimensions = var.dimensions - data = indexing.LazilyIndexedArray(NetCDF4ArrayWrapper(name, self)) + data = indexing.LazilyOuterIndexedArray( + NetCDF4ArrayWrapper(name, self)) attributes = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs()) _ensure_fill_value_valid(data, attributes) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index a16b1ddcbc8..4a932e3dad2 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -22,18 +22,22 @@ def dtype(self): return self.array.dtype def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, target=self, allow=indexing.BasicIndexer) + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.BASIC) # pull the data from the array attribute if possible, to avoid # downloading coordinate data twice array = getattr(self.array, 'array', self.array) - result = robust_getitem(array, key, catch=ValueError) + result = robust_getitem(array, key.tuple, catch=ValueError) # pydap doesn't squeeze axes automatically like numpy - axis = tuple(n for n, k in enumerate(key) + axis = tuple(n for n, k in enumerate(key.tuple) if isinstance(k, integer_types)) if len(axis) > 0: result = np.squeeze(result, axis) + + if len(np_inds.tuple) > 0: + result = indexing.NumpyIndexingAdapter(np.asarray(result))[np_inds] + return result @@ -74,7 +78,7 @@ def open(cls, url, session=None): return cls(ds) def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) + data = indexing.LazilyOuterIndexedArray(PydapArrayWrapper(var)) return Variable(var.dimensions, data, _fix_attributes(var.attributes)) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 30969fcd9a0..95226e453b4 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -24,14 +24,19 @@ def get_array(self): return self.datastore.ds.variables[self.variable_name] def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, target=self, allow=indexing.BasicIndexer) + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.BASIC) with self.datastore.ensure_open(autoclose=True): array = self.get_array() - if key == () and self.ndim == 0: + if key.tuple == () and self.ndim == 0: return array.get_value() - return array[key] + + array = array[key.tuple] + if len(np_inds.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_inds] + + return array class NioDataStore(AbstractDataStore, DataStorePickleMixin): @@ -51,7 +56,7 @@ def __init__(self, filename, mode='r', autoclose=False): self._mode = mode def open_store_variable(self, name, var): - data = indexing.LazilyIndexedArray(NioArrayWrapper(name, self)) + data = indexing.LazilyOuterIndexedArray(NioArrayWrapper(name, self)) return Variable(var.dimensions, data, var.attributes) def get_variables(self): diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 8777f6e7053..f856f9f9e13 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -42,48 +42,73 @@ def dtype(self): def shape(self): return self._shape - def __getitem__(self, key): - key = indexing.unwrap_explicit_indexer( - key, self, allow=(indexing.BasicIndexer, indexing.OuterIndexer)) + def _get_indexer(self, key): + """ Get indexer for rasterio array. + + Parameter + --------- + key: ExplicitIndexer + + Returns + ------- + band_key: an indexer for the 1st dimension + window: two tuples. Each consists of (start, stop). + squeeze_axis: axes to be squeezed + np_ind: indexer for loaded numpy array + + See also + -------- + indexing.decompose_indexer + """ + key, np_inds = indexing.decompose_indexer( + key, self.shape, indexing.IndexingSupport.OUTER) # bands cannot be windowed but they can be listed - band_key = key[0] - n_bands = self.shape[0] + band_key = key.tuple[0] + new_shape = [] + np_inds2 = [] + # bands (axis=0) cannot be windowed but they can be listed if isinstance(band_key, slice): - start, stop, step = band_key.indices(n_bands) - if step is not None and step != 1: - raise IndexError(_ERROR_MSG) - band_key = np.arange(start, stop) + start, stop, step = band_key.indices(self.shape[0]) + band_key = np.arange(start, stop, step) # be sure we give out a list band_key = (np.asarray(band_key) + 1).tolist() + if isinstance(band_key, list): # if band_key is not a scalar + new_shape.append(len(band_key)) + np_inds2.append(slice(None)) # but other dims can only be windowed window = [] squeeze_axis = [] - for i, (k, n) in enumerate(zip(key[1:], self.shape[1:])): + for i, (k, n) in enumerate(zip(key.tuple[1:], self.shape[1:])): if isinstance(k, slice): + # step is always positive. see indexing.decompose_indexer start, stop, step = k.indices(n) - if step is not None and step != 1: - raise IndexError(_ERROR_MSG) + np_inds2.append(slice(None, None, step)) + new_shape.append(stop - start) elif is_scalar(k): # windowed operations will always return an array # we will have to squeeze it later - squeeze_axis.append(i + 1) + squeeze_axis.append(- (2 - i)) start = k stop = k + 1 else: - k = np.asarray(k) - start = k[0] - stop = k[-1] + 1 - ids = np.arange(start, stop) - if not ((k.shape == ids.shape) and np.all(k == ids)): - raise IndexError(_ERROR_MSG) + start, stop = np.min(k), np.max(k) + 1 + np_inds2.append(k - start) + new_shape.append(stop - start) window.append((start, stop)) + np_inds = indexing._combine_indexers( + indexing.OuterIndexer(tuple(np_inds2)), new_shape, np_inds) + return band_key, window, tuple(squeeze_axis), np_inds + + def __getitem__(self, key): + band_key, window, squeeze_axis, np_inds = self._get_indexer(key) + out = self.rasterio_ds.read(band_key, window=tuple(window)) if squeeze_axis: out = np.squeeze(out, axis=squeeze_axis) - return out + return indexing.NumpyIndexingAdapter(out)[np_inds] def _parse_envi(meta): @@ -249,7 +274,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, else: attrs[k] = v - data = indexing.LazilyIndexedArray(RasterioArrayWrapper(riods)) + data = indexing.LazilyOuterIndexedArray(RasterioArrayWrapper(riods)) # this lets you write arrays loaded with rasterio data = indexing.CopyOnWriteArray(data) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b0323b51f17..3058e935238 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -41,30 +41,6 @@ def _ensure_valid_fill_value(value, dtype): return _encode_zarr_attr_value(valid) -def _replace_slices_with_arrays(key, shape): - """Replace slice objects in vindex with equivalent ndarray objects.""" - num_slices = sum(1 for k in key if isinstance(k, slice)) - ndims = [k.ndim for k in key if isinstance(k, np.ndarray)] - array_subspace_size = max(ndims) if ndims else 0 - assert len(key) == len(shape) - new_key = [] - slice_count = 0 - for k, size in zip(key, shape): - if isinstance(k, slice): - # the slice subspace always appears after the ndarray subspace - array = np.arange(*k.indices(size)) - sl = [np.newaxis] * len(shape) - sl[array_subspace_size + slice_count] = slice(None) - k = array[tuple(sl)] - slice_count += 1 - else: - assert isinstance(k, np.ndarray) - k = k[(slice(None),) * array_subspace_size + - (np.newaxis,) * num_slices] - new_key.append(k) - return tuple(new_key) - - class ZarrArrayWrapper(BackendArray): def __init__(self, variable_name, datastore): self.datastore = datastore @@ -84,8 +60,8 @@ def __getitem__(self, key): if isinstance(key, indexing.BasicIndexer): return array[key.tuple] elif isinstance(key, indexing.VectorizedIndexer): - return array.vindex[_replace_slices_with_arrays(key.tuple, - self.shape)] + return array.vindex[indexing._arrayize_vectorized_indexer( + key.tuple, self.shape).tuple] else: assert isinstance(key, indexing.OuterIndexer) return array.oindex[key.tuple] @@ -292,7 +268,7 @@ def __init__(self, zarr_group, writer=None): super(ZarrStore, self).__init__(zarr_writer) def open_store_variable(self, name, zarr_array): - data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) + data = indexing.LazilyOuterIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY) attributes = OrderedDict(attributes) diff --git a/xarray/conventions.py b/xarray/conventions.py index 5bcbd83ee90..93fd56ed5d2 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -490,7 +490,7 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, del attributes['dtype'] data = BoolTypeArray(data) - return Variable(dimensions, indexing.LazilyIndexedArray(data), + return Variable(dimensions, indexing.LazilyOuterIndexedArray(data), attributes, encoding=encoding) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 0d55eed894e..bd16618d766 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -254,7 +254,7 @@ def slice_slice(old_slice, applied_slice, size): items = _expand_slice(old_slice, size)[applied_slice] if len(items) > 0: start = items[0] - stop = items[-1] + step + stop = items[-1] + int(np.sign(step)) if stop < 0: stop = None else: @@ -425,23 +425,6 @@ def __array__(self, dtype=None): return np.asarray(self[key], dtype=dtype) -def unwrap_explicit_indexer(key, target, allow): - """Unwrap an explicit key into a tuple.""" - if not isinstance(key, ExplicitIndexer): - raise TypeError('unexpected key type: {}'.format(key)) - if not isinstance(key, allow): - key_type_name = { - BasicIndexer: 'Basic', - OuterIndexer: 'Outer', - VectorizedIndexer: 'Vectorized' - }[type(key)] - raise NotImplementedError( - '{} indexing for {} is not implemented. Load your data first with ' - '.load(), .compute() or .persist(), or disable caching by setting ' - 'cache=False in open_dataset.'.format(key_type_name, type(target))) - return key.tuple - - class ImplicitToExplicitIndexingAdapter(utils.NDArrayMixin): """Wrap an array, converting tuples into the indicated explicit indexer.""" @@ -457,8 +440,8 @@ def __getitem__(self, key): return self.array[self.indexer_cls(key)] -class LazilyIndexedArray(ExplicitlyIndexedNDArrayMixin): - """Wrap an array to make basic and orthogonal indexing lazy. +class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): + """Wrap an array to make basic and outer indexing lazy. """ def __init__(self, array, key=None): @@ -483,13 +466,6 @@ def __init__(self, array, key=None): self.key = key def _updated_key(self, new_key): - # TODO should suport VectorizedIndexer - if isinstance(new_key, VectorizedIndexer): - raise NotImplementedError( - 'Vectorized indexing for {} is not implemented. Load your ' - 'data first with .load() or .compute(), or disable caching by ' - 'setting cache=False in open_dataset.'.format(type(self))) - iter_new_key = iter(expanded_indexer(new_key.tuple, self.ndim)) full_key = [] for size, k in zip(self.array.shape, self.key.tuple): @@ -517,10 +493,21 @@ def __array__(self, dtype=None): array = as_indexable(self.array) return np.asarray(array[self.key], dtype=None) + def transpose(self, order): + return LazilyVectorizedIndexedArray( + self.array, self.key).transpose(order) + def __getitem__(self, indexer): + if isinstance(indexer, VectorizedIndexer): + array = LazilyVectorizedIndexedArray(self.array, self.key) + return array[indexer] return type(self)(self.array, self._updated_key(indexer)) def __setitem__(self, key, value): + if isinstance(key, VectorizedIndexer): + raise NotImplementedError( + 'Lazy item assignment with the vectorized indexer is not yet ' + 'implemented. Load your data first by .load() or compute().') full_key = self._updated_key(key) self.array[full_key] = value @@ -529,6 +516,56 @@ def __repr__(self): (type(self).__name__, self.array, self.key)) +class LazilyVectorizedIndexedArray(ExplicitlyIndexedNDArrayMixin): + """Wrap an array to make vectorized indexing lazy. + """ + + def __init__(self, array, key): + """ + Parameters + ---------- + array : array_like + Array like object to index. + key : VectorizedIndexer + """ + if isinstance(key, (BasicIndexer, OuterIndexer)): + self.key = _outer_to_vectorized_indexer(key, array.shape) + else: + self.key = _arrayize_vectorized_indexer(key, array.shape) + self.array = as_indexable(array) + + @property + def shape(self): + return np.broadcast(*self.key.tuple).shape + + def __array__(self, dtype=None): + return np.asarray(self.array[self.key], dtype=None) + + def _updated_key(self, new_key): + return _combine_indexers(self.key, self.shape, new_key) + + def __getitem__(self, indexer): + # If the indexed array becomes a scalar, return LazilyOuterIndexedArray + if all(isinstance(ind, integer_types) for ind in indexer.tuple): + key = BasicIndexer(tuple(k[indexer.tuple] for k in self.key.tuple)) + return LazilyOuterIndexedArray(self.array, key) + return type(self)(self.array, self._updated_key(indexer)) + + def transpose(self, order): + key = VectorizedIndexer(tuple( + k.transpose(order) for k in self.key.tuple)) + return type(self)(self.array, key) + + def __setitem__(self, key, value): + raise NotImplementedError( + 'Lazy item assignment with the vectorized indexer is not yet ' + 'implemented. Load your data first by .load() or compute().') + + def __repr__(self): + return ('%s(array=%r, key=%r)' % + (type(self).__name__, self.array, self.key)) + + def _wrap_numpy_scalars(array): """Wrap NumPy scalars in 0d arrays.""" if np.isscalar(array): @@ -553,6 +590,9 @@ def __array__(self, dtype=None): def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) + def transpose(self, order): + return self.array.transpose(order) + def __setitem__(self, key, value): self._ensure_copied() self.array[key] = value @@ -573,6 +613,9 @@ def __array__(self, dtype=None): def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) + def transpose(self, order): + return self.array.transpose(order) + def __setitem__(self, key, value): self.array[key] = value @@ -599,24 +642,26 @@ def _outer_to_vectorized_indexer(key, shape): Parameters ---------- - key : tuple - Tuple from an OuterIndexer to convert. + key : Outer/Basic Indexer + An indexer to convert. shape : tuple Shape of the array subject to the indexing. Returns ------- - tuple + VectorizedIndexer Tuple suitable for use to index a NumPy array with vectorized indexing. - Each element is an integer or array: broadcasting them together gives - the shape of the result. + Each element is an array: broadcasting them together gives the shape + of the result. """ + key = key.tuple + n_dim = len([k for k in key if not isinstance(k, integer_types)]) i_dim = 0 new_key = [] for k, size in zip(key, shape): if isinstance(k, integer_types): - new_key.append(k) + new_key.append(np.array(k).reshape((1,) * n_dim)) else: # np.ndarray or slice if isinstance(k, slice): k = np.arange(*k.indices(size)) @@ -625,7 +670,7 @@ def _outer_to_vectorized_indexer(key, shape): (1,) * (n_dim - i_dim - 1)] new_key.append(k.reshape(*shape)) i_dim += 1 - return tuple(new_key) + return VectorizedIndexer(tuple(new_key)) def _outer_to_numpy_indexer(key, shape): @@ -633,8 +678,8 @@ def _outer_to_numpy_indexer(key, shape): Parameters ---------- - key : tuple - Tuple from an OuterIndexer to convert. + key : Basic/OuterIndexer + An indexer to convert. shape : tuple Shape of the array subject to the indexing. @@ -643,13 +688,284 @@ def _outer_to_numpy_indexer(key, shape): tuple Tuple suitable for use to index a NumPy array. """ - if len([k for k in key if not isinstance(k, slice)]) <= 1: + if len([k for k in key.tuple if not isinstance(k, slice)]) <= 1: # If there is only one vector and all others are slice, # it can be safely used in mixed basic/advanced indexing. # Boolean index should already be converted to integer array. - return tuple(key) + return key.tuple else: - return _outer_to_vectorized_indexer(key, shape) + return _outer_to_vectorized_indexer(key, shape).tuple + + +def _combine_indexers(old_key, shape, new_key): + """ Combine two indexers. + + Parameters + ---------- + old_key: ExplicitIndexer + The first indexer for the original array + shape: tuple of ints + Shape of the original array to be indexed by old_key + new_key: + The second indexer for indexing original[old_key] + """ + if not isinstance(old_key, VectorizedIndexer): + old_key = _outer_to_vectorized_indexer(old_key, shape) + if len(old_key.tuple) == 0: + return new_key + + new_shape = np.broadcast(*old_key.tuple).shape + if isinstance(new_key, VectorizedIndexer): + new_key = _arrayize_vectorized_indexer(new_key, new_shape) + else: + new_key = _outer_to_vectorized_indexer(new_key, new_shape) + + return VectorizedIndexer(tuple(o[new_key.tuple] for o in + np.broadcast_arrays(*old_key.tuple))) + + +class IndexingSupport(object): # could inherit from enum.Enum on Python 3 + # for backends that support only basic indexer + BASIC = 'BASIC' + # for backends that support basic / outer indexer + OUTER = 'OUTER' + # for backends that support outer indexer including at most 1 vector. + OUTER_1VECTOR = 'OUTER_1VECTOR' + # for backends that support full vectorized indexer. + VECTORIZED = 'VECTORIZED' + + +def decompose_indexer(indexer, shape, indexing_support): + if isinstance(indexer, VectorizedIndexer): + return _decompose_vectorized_indexer(indexer, shape, indexing_support) + if isinstance(indexer, (BasicIndexer, OuterIndexer)): + return _decompose_outer_indexer(indexer, shape, indexing_support) + raise TypeError('unexpected key type: {}'.format(indexer)) + + +def _decompose_slice(key, size): + """ convert a slice to successive two slices. The first slice always has + a positive step. + """ + start, stop, step = key.indices(size) + if step > 0: + # If key already has a positive step, use it as is in the backend + return key, slice(None) + else: + # determine stop precisely for step > 1 case + # e.g. [98:2:-2] -> [98:3:-2] + stop = start + int((stop - start - 1) / step) * step + 1 + start, stop = stop + 1, start + 1 + return slice(start, stop, -step), slice(None, None, -1) + + +def _decompose_vectorized_indexer(indexer, shape, indexing_support): + """ + Decompose vectorized indexer to the successive two indexers, where the + first indexer will be used to index backend arrays, while the second one + is used to index loaded on-memory np.ndarray. + + Parameters + ---------- + indexer: VectorizedIndexer + indexing_support: one of IndexerSupport entries + + Returns + ------- + backend_indexer: OuterIndexer or BasicIndexer + np_indexers: an ExplicitIndexer (VectorizedIndexer / BasicIndexer) + + Note + ---- + This function is used to realize the vectorized indexing for the backend + arrays that only support basic or outer indexing. + + As an example, let us consider to index a few elements from a backend array + with a vectorized indexer ([0, 3, 1], [2, 3, 2]). + Even if the backend array only supports outer indexing, it is more + efficient to load a subslice of the array than loading the entire array, + + >>> backend_indexer = OuterIndexer([0, 1, 3], [2, 3]) + >>> array = array[backend_indexer] # load subslice of the array + >>> np_indexer = VectorizedIndexer([0, 2, 1], [0, 1, 0]) + >>> array[np_indexer] # vectorized indexing for on-memory np.ndarray. + """ + assert isinstance(indexer, VectorizedIndexer) + + if indexing_support is IndexingSupport.VECTORIZED: + return indexer, BasicIndexer(()) + + backend_indexer = [] + np_indexer = [] + # convert negative indices + indexer = [np.where(k < 0, k + s, k) if isinstance(k, np.ndarray) else k + for k, s in zip(indexer.tuple, shape)] + + for k, s in zip(indexer, shape): + if isinstance(k, slice): + # If it is a slice, then we will slice it as-is + # (but make its step positive) in the backend, + # and then use all of it (slice(None)) for the in-memory portion. + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) + else: + # If it is a (multidimensional) np.ndarray, just pickup the used + # keys without duplication and store them as a 1d-np.ndarray. + oind, vind = np.unique(k, return_inverse=True) + backend_indexer.append(oind) + np_indexer.append(vind.reshape(*k.shape)) + + backend_indexer = OuterIndexer(tuple(backend_indexer)) + np_indexer = VectorizedIndexer(tuple(np_indexer)) + + if indexing_support is IndexingSupport.OUTER: + return backend_indexer, np_indexer + + # If the backend does not support outer indexing, + # backend_indexer (OuterIndexer) is also decomposed. + backend_indexer, np_indexer1 = _decompose_outer_indexer( + backend_indexer, shape, indexing_support) + np_indexer = _combine_indexers(np_indexer1, shape, np_indexer) + return backend_indexer, np_indexer + + +def _decompose_outer_indexer(indexer, shape, indexing_support): + """ + Decompose outer indexer to the successive two indexers, where the + first indexer will be used to index backend arrays, while the second one + is used to index the loaded on-memory np.ndarray. + + Parameters + ---------- + indexer: VectorizedIndexer + indexing_support: One of the entries of IndexingSupport + + Returns + ------- + backend_indexer: OuterIndexer or BasicIndexer + np_indexers: an ExplicitIndexer (OuterIndexer / BasicIndexer) + + Note + ---- + This function is used to realize the vectorized indexing for the backend + arrays that only support basic or outer indexing. + + As an example, let us consider to index a few elements from a backend array + with a orthogonal indexer ([0, 3, 1], [2, 3, 2]). + Even if the backend array only supports basic indexing, it is more + efficient to load a subslice of the array than loading the entire array, + + >>> backend_indexer = BasicIndexer(slice(0, 3), slice(2, 3)) + >>> array = array[backend_indexer] # load subslice of the array + >>> np_indexer = OuterIndexer([0, 2, 1], [0, 1, 0]) + >>> array[np_indexer] # outer indexing for on-memory np.ndarray. + """ + if indexing_support == IndexingSupport.VECTORIZED: + return indexer, BasicIndexer(()) + assert isinstance(indexer, (OuterIndexer, BasicIndexer)) + + backend_indexer = [] + np_indexer = [] + # make indexer positive + pos_indexer = [] + for k, s in zip(indexer.tuple, shape): + if isinstance(k, np.ndarray): + pos_indexer.append(np.where(k < 0, k + s, k)) + elif isinstance(k, integer_types) and k < 0: + pos_indexer.append(k + s) + else: + pos_indexer.append(k) + indexer = pos_indexer + + if indexing_support is IndexingSupport.OUTER_1VECTOR: + # some backends such as h5py supports only 1 vector in indexers + # We choose the most efficient axis + gains = [(np.max(k) - np.min(k) + 1.0) / len(np.unique(k)) + if isinstance(k, np.ndarray) else 0 for k in indexer] + array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None + + for i, (k, s) in enumerate(zip(indexer, shape)): + if isinstance(k, np.ndarray) and i != array_index: + # np.ndarray key is converted to slice that covers the entire + # entries of this key. + backend_indexer.append(slice(np.min(k), np.max(k) + 1)) + np_indexer.append(k - np.min(k)) + elif isinstance(k, np.ndarray): + # Remove duplicates and sort them in the increasing order + pkey, ekey = np.unique(k, return_inverse=True) + backend_indexer.append(pkey) + np_indexer.append(ekey) + elif isinstance(k, integer_types): + backend_indexer.append(k) + else: # slice: convert positive step slice for backend + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) + + return (OuterIndexer(tuple(backend_indexer)), + OuterIndexer(tuple(np_indexer))) + + if indexing_support == IndexingSupport.OUTER: + for k, s in zip(indexer, shape): + if isinstance(k, slice): + # slice: convert positive step slice for backend + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) + elif isinstance(k, integer_types): + backend_indexer.append(k) + elif isinstance(k, np.ndarray) and (np.diff(k) >= 0).all(): + backend_indexer.append(k) + np_indexer.append(slice(None)) + else: + # Remove duplicates and sort them in the increasing order + oind, vind = np.unique(k, return_inverse=True) + backend_indexer.append(oind) + np_indexer.append(vind.reshape(*k.shape)) + + return (OuterIndexer(tuple(backend_indexer)), + OuterIndexer(tuple(np_indexer))) + + # basic indexer + assert indexing_support == IndexingSupport.BASIC + + for k, s in zip(indexer, shape): + if isinstance(k, np.ndarray): + # np.ndarray key is converted to slice that covers the entire + # entries of this key. + backend_indexer.append(slice(np.min(k), np.max(k) + 1)) + np_indexer.append(k - np.min(k)) + elif isinstance(k, integer_types): + backend_indexer.append(k) + else: # slice: convert positive step slice for backend + bk_slice, np_slice = _decompose_slice(k, s) + backend_indexer.append(bk_slice) + np_indexer.append(np_slice) + + return (BasicIndexer(tuple(backend_indexer)), + OuterIndexer(tuple(np_indexer))) + + +def _arrayize_vectorized_indexer(indexer, shape): + """ Return an identical vindex but slices are replaced by arrays """ + slices = [v for v in indexer.tuple if isinstance(v, slice)] + if len(slices) == 0: + return indexer + + arrays = [v for v in indexer.tuple if isinstance(v, np.ndarray)] + n_dim = arrays[0].ndim if len(arrays) > 0 else 0 + i_dim = 0 + new_key = [] + for v, size in zip(indexer.tuple, shape): + if isinstance(v, np.ndarray): + new_key.append(np.reshape(v, v.shape + (1, ) * len(slices))) + else: # slice + shape = ((1,) * (n_dim + i_dim) + (-1,) + + (1,) * (len(slices) - i_dim - 1)) + new_key.append(np.arange(*v.indices(size)).reshape(shape)) + i_dim += 1 + return VectorizedIndexer(tuple(new_key)) def _dask_array_with_chunks_hint(array, chunks): @@ -698,7 +1014,7 @@ def create_mask(indexer, shape, chunks_hint=None): same shape as the indexing result. """ if isinstance(indexer, OuterIndexer): - key = _outer_to_vectorized_indexer(indexer.tuple, shape) + key = _outer_to_vectorized_indexer(indexer, shape).tuple assert not any(isinstance(k, slice) for k in key) mask = _masked_result_drop_slice(key, chunks_hint) @@ -793,7 +1109,7 @@ def _ensure_ndarray(self, value): def _indexing_array_and_key(self, key): if isinstance(key, OuterIndexer): array = self.array - key = _outer_to_numpy_indexer(key.tuple, self.array.shape) + key = _outer_to_numpy_indexer(key, self.array.shape) elif isinstance(key, VectorizedIndexer): array = nputils.NumpyVIndexAdapter(self.array) key = key.tuple @@ -805,6 +1121,9 @@ def _indexing_array_and_key(self, key): return array, key + def transpose(self, order): + return self.array.transpose(order) + def __getitem__(self, key): array, key = self._indexing_array_and_key(key) return self._ensure_ndarray(array[key]) @@ -848,6 +1167,9 @@ def __setitem__(self, key, value): 'into memory explicitly using the .load() ' 'method or accessing its .values attribute.') + def transpose(self, order): + return self.array.transpose(order) + class PandasIndexAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" @@ -922,6 +1244,9 @@ def __getitem__(self, indexer): return result + def transpose(self, order): + return self.array # self.array should be always one-dimensional + def __repr__(self): return ('%s(array=%r, dtype=%r)' % (type(self).__name__, self.array, self.dtype)) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bb4285fba0a..66a4e781161 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -121,7 +121,7 @@ def _maybe_wrap_data(data): Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure they can be indexed properly. - NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should + NumpyArrayAdapter, PandasIndexAdapter and LazilyOuterIndexedArray should all pass through unmodified. """ if isinstance(data, pd.Index): @@ -1053,7 +1053,8 @@ def transpose(self, *dims): axes = self.get_axis_num(dims) if len(dims) < 2: # no need to transpose if only one dimension return self.copy(deep=False) - data = duck_array_ops.transpose(self.data, axes) + + data = as_indexable(self._data).transpose(axes) return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 32c79107a2c..21152829096 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -403,34 +403,86 @@ def test_orthogonal_indexing(self): 'dim3': np.arange(5)} expected = in_memory.isel(**indexers) actual = on_disk.isel(**indexers) + # make sure the array is not yet loaded into memory + assert not actual['var1'].variable._in_memory assert_identical(expected, actual) # do it twice, to make sure we're switched from orthogonal -> numpy # when we cached the values actual = on_disk.isel(**indexers) assert_identical(expected, actual) - def _test_vectorized_indexing(self, vindex_support=True): - # Make sure vectorized_indexing works or at least raises - # NotImplementedError + def test_vectorized_indexing(self): in_memory = create_test_data() with self.roundtrip(in_memory) as on_disk: indexers = {'dim1': DataArray([0, 2, 0], dims='a'), 'dim2': DataArray([0, 2, 3], dims='a')} expected = in_memory.isel(**indexers) - if vindex_support: - actual = on_disk.isel(**indexers) - assert_identical(expected, actual) - # do it twice, to make sure we're switched from - # orthogonal -> numpy when we cached the values - actual = on_disk.isel(**indexers) - assert_identical(expected, actual) - else: - with raises_regex(NotImplementedError, 'Vectorized indexing '): - actual = on_disk.isel(**indexers) + actual = on_disk.isel(**indexers) + # make sure the array is not yet loaded into memory + assert not actual['var1'].variable._in_memory + assert_identical(expected, actual.load()) + # do it twice, to make sure we're switched from + # vectorized -> numpy when we cached the values + actual = on_disk.isel(**indexers) + assert_identical(expected, actual) - def test_vectorized_indexing(self): - # This test should be overwritten if vindex is supported - self._test_vectorized_indexing(vindex_support=False) + def multiple_indexing(indexers): + # make sure a sequence of lazy indexings certainly works. + with self.roundtrip(in_memory) as on_disk: + actual = on_disk['var3'] + expected = in_memory['var3'] + for ind in indexers: + actual = actual.isel(**ind) + expected = expected.isel(**ind) + # make sure the array is not yet loaded into memory + assert not actual.variable._in_memory + assert_identical(expected, actual.load()) + + # two-staged vectorized-indexing + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': DataArray([[0, 4], [1, 3], [2, 2]], dims=['a', 'b'])}, + {'a': DataArray([0, 1], dims=['c']), + 'b': DataArray([0, 1], dims=['c'])} + ] + multiple_indexing(indexers) + + # vectorized-slice mixed + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': slice(None, 10)} + ] + multiple_indexing(indexers) + + # vectorized-integer mixed + indexers = [ + {'dim3': 0}, + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b'])}, + {'a': slice(None, None, 2)} + ] + multiple_indexing(indexers) + + # vectorized-integer mixed + indexers = [ + {'dim3': 0}, + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b'])}, + {'a': 1, 'b': 0} + ] + multiple_indexing(indexers) + + # with negative step slice. + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': slice(-1, 1, -1)}, + ] + multiple_indexing(indexers) + + # with negative step slice. + indexers = [ + {'dim1': DataArray([[0, 7], [2, 6], [3, 5]], dims=['a', 'b']), + 'dim3': slice(-1, 1, -2)}, + ] + multiple_indexing(indexers) def test_isel_dataarray(self): # Make sure isel works lazily. GH:issue:1688 @@ -508,7 +560,6 @@ def test_roundtrip_bytes_with_fill_value(self): encoding = {'_FillValue': b'X', 'dtype': 'S1'} original = Dataset({'x': ('t', values, {}, encoding)}) expected = original.copy(deep=True) - print(original) with self.roundtrip(original) as actual: assert_identical(expected, actual) @@ -756,9 +807,6 @@ def test_append_with_invalid_dim_raises(self): 'Unable to update size for existing dimension'): self.save(data, tmp_file, mode='a') - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=False) - def test_multiindex_not_implemented(self): ds = (Dataset(coords={'y': ('x', [1, 2]), 'z': ('x', ['a', 'b'])}) .set_index(x=['y', 'z'])) @@ -1119,9 +1167,6 @@ def test_dataset_caching(self): # caching behavior differs for dask pass - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=True) - class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True @@ -1238,9 +1283,6 @@ def test_chunk_encoding_with_dask(self): with self.roundtrip(ds_chunk4) as actual: pass - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=True) - def test_hidden_zarr_keys(self): expected = create_test_data() with self.create_store() as store: @@ -1360,27 +1402,6 @@ def create_zarr_target(self): yield tmp -def test_replace_slices_with_arrays(): - (actual,) = xr.backends.zarr._replace_slices_with_arrays( - key=(slice(None),), shape=(5,)) - np.testing.assert_array_equal(actual, np.arange(5)) - - actual = xr.backends.zarr._replace_slices_with_arrays( - key=(np.arange(5),) * 3, shape=(8, 10, 12)) - expected = np.stack([np.arange(5)] * 3) - np.testing.assert_array_equal(np.stack(actual), expected) - - a, b = xr.backends.zarr._replace_slices_with_arrays( - key=(np.arange(5), slice(None)), shape=(8, 10)) - np.testing.assert_array_equal(a, np.arange(5)[:, np.newaxis]) - np.testing.assert_array_equal(b, np.arange(10)[np.newaxis, :]) - - a, b = xr.backends.zarr._replace_slices_with_arrays( - key=(slice(None), np.arange(5)), shape=(8, 10)) - np.testing.assert_array_equal(a, np.arange(8)[np.newaxis, :]) - np.testing.assert_array_equal(b, np.arange(5)[:, np.newaxis]) - - @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, NetCDF3Only, TestCase): engine = 'scipy' @@ -1589,19 +1610,6 @@ def create_store(self): with create_tmp_file() as tmp_file: yield backends.H5NetCDFStore(tmp_file, 'w') - def test_orthogonal_indexing(self): - # simplified version for h5netcdf - in_memory = create_test_data() - with self.roundtrip(in_memory) as on_disk: - indexers = {'dim3': np.arange(5)} - expected = in_memory.isel(**indexers) - actual = on_disk.isel(**indexers) - assert_identical(expected, actual.load()) - - def test_array_type_after_indexing(self): - # h5netcdf does not support multiple list-like indexers - pass - def test_complex(self): expected = Dataset({'x': ('y', np.ones(5) + 1j * np.ones(5))}) with self.roundtrip(expected) as actual: @@ -2049,9 +2057,6 @@ def test_dataarray_compute(self): self.assertTrue(computed._in_memory) assert_allclose(actual, computed, decode_bytes=False) - def test_vectorized_indexing(self): - self._test_vectorized_indexing(vindex_support=True) - class DaskTestAutocloseTrue(DaskTest): autoclose = True @@ -2111,6 +2116,17 @@ def test_cmp_local_file(self): assert_equal(actual.isel(j=slice(1, 2)), expected.isel(j=slice(1, 2))) + with self.create_datasets() as (actual, expected): + indexers = {'i': [1, 0, 0], 'j': [1, 2, 0, 1]} + assert_equal(actual.isel(**indexers), + expected.isel(**indexers)) + + with self.create_datasets() as (actual, expected): + indexers = {'i': DataArray([0, 1, 0], dims='a'), + 'j': DataArray([0, 2, 1], dims='a')} + assert_equal(actual.isel(**indexers), + expected.isel(**indexers)) + def test_compatible_to_netcdf(self): # make sure it can be saved as a netcdf with self.create_datasets() as (actual, expected): @@ -2155,19 +2171,6 @@ def test_write_store(self): # pynio is read-only for now pass - def test_orthogonal_indexing(self): - # pynio also does not support list-like indexing - with raises_regex(NotImplementedError, 'Outer indexing'): - super(TestPyNio, self).test_orthogonal_indexing() - - def test_isel_dataarray(self): - with raises_regex(NotImplementedError, 'Outer indexing'): - super(TestPyNio, self).test_isel_dataarray() - - def test_array_type_after_indexing(self): - # pynio also does not support list-like indexing - pass - @contextlib.contextmanager def open(self, path, **kwargs): with open_dataset(path, engine='pynio', autoclose=self.autoclose, @@ -2346,17 +2349,62 @@ def test_indexing(self): # tests # assert_allclose checks all data + coordinates assert_allclose(actual, expected) - - # Slicing - ex = expected.isel(x=slice(2, 5), y=slice(5, 7)) - ac = actual.isel(x=slice(2, 5), y=slice(5, 7)) - assert_allclose(ac, ex) - - ex = expected.isel(band=slice(1, 2), x=slice(2, 5), - y=slice(5, 7)) - ac = actual.isel(band=slice(1, 2), x=slice(2, 5), - y=slice(5, 7)) - assert_allclose(ac, ex) + assert not actual.variable._in_memory + + # Basic indexer + ind = {'x': slice(2, 5), 'y': slice(5, 7)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': slice(1, 2), 'x': slice(2, 5), 'y': slice(5, 7)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': slice(1, 2), 'x': slice(2, 5), 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # orthogonal indexer + ind = {'band': np.array([2, 1, 0]), + 'x': np.array([1, 0]), 'y': np.array([0, 2])} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': np.array([2, 1, 0]), + 'x': np.array([1, 0]), 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # minus-stepped slice + ind = {'band': np.array([2, 1, 0]), + 'x': slice(-1, None, -1), 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = {'band': np.array([2, 1, 0]), + 'x': 1, 'y': slice(-1, 1, -2)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # None is selected + ind = {'band': np.array([2, 1, 0]), + 'x': 1, 'y': slice(2, 2, 1)} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + # vectorized indexer + ind = {'band': DataArray([2, 1, 0], dims='a'), + 'x': DataArray([1, 0, 0], dims='a'), + 'y': np.array([0, 2])} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory + + ind = { + 'band': DataArray([[2, 1, 0], [1, 0, 2]], dims=['a', 'b']), + 'x': DataArray([[1, 0, 0], [0, 1, 0]], dims=['a', 'b']), + 'y': 0} + assert_allclose(expected.isel(**ind), actual.isel(**ind)) + assert not actual.variable._in_memory # Selecting lists of bands is fine ex = expected.isel(band=[1, 2]) @@ -2366,15 +2414,6 @@ def test_indexing(self): ac = actual.isel(band=[0, 2]) assert_allclose(ac, ex) - # but on x and y only windowed operations are allowed, more - # exotic slicing should raise an error - err_msg = 'not valid on rasterio' - with raises_regex(IndexError, err_msg): - actual.isel(x=[2, 4], y=[1, 3]).values - with raises_regex(IndexError, err_msg): - actual.isel(x=[4, 2]).values - with raises_regex(IndexError, err_msg): - actual.isel(x=slice(5, 2, -1)).values # Integer indexing ex = expected.isel(band=1) ac = actual.isel(band=1) @@ -2412,11 +2451,6 @@ def test_caching(self): # Cache is the default with xr.open_rasterio(tmp_file) as actual: - # Without cache an error is raised - err_msg = 'not valid on rasterio' - with raises_regex(IndexError, err_msg): - actual.isel(x=[2, 4]).values - # This should cache everything assert_allclose(actual, expected) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 1b557479ec1..e3e2934a4fd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -77,7 +77,8 @@ def get_variables(self): def lazy_inaccessible(k, v): if k in self._indexvars: return v - data = indexing.LazilyIndexedArray(InaccessibleArray(v.values)) + data = indexing.LazilyOuterIndexedArray( + InaccessibleArray(v.values)) return Variable(v.dims, data, v.attrs) return dict((k, lazy_inaccessible(k, v)) for k, v in iteritems(self._variables)) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 4884eebe759..0d1045d35c0 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -136,22 +136,24 @@ def test_indexer(data, x, expected_pos, expected_idx=None): class TestLazyArray(TestCase): def test_slice_slice(self): I = ReturnItem() # noqa: E741 # allow ambiguous name - x = np.arange(100) - slices = [I[:3], I[:4], I[2:4], I[:1], I[:-1], I[5:-1], I[-5:-1], - I[::-1], I[5::-1], I[:3:-1], I[:30:-1], I[10:4:], I[::4], - I[4:4:4], I[:4:-4]] - for i in slices: - for j in slices: - expected = x[i][j] - new_slice = indexing.slice_slice(i, j, size=100) - actual = x[new_slice] - assert_array_equal(expected, actual) + for size in [100, 99]: + # We test even/odd size cases + x = np.arange(size) + slices = [I[:3], I[:4], I[2:4], I[:1], I[:-1], I[5:-1], I[-5:-1], + I[::-1], I[5::-1], I[:3:-1], I[:30:-1], I[10:4:], I[::4], + I[4:4:4], I[:4:-4], I[::-2]] + for i in slices: + for j in slices: + expected = x[i][j] + new_slice = indexing.slice_slice(i, j, size=size) + actual = x[new_slice] + assert_array_equal(expected, actual) def test_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) x = indexing.NumpyIndexingAdapter(original) v = Variable(['i', 'j', 'k'], original) - lazy = indexing.LazilyIndexedArray(x) + lazy = indexing.LazilyOuterIndexedArray(x) v_lazy = Variable(['i', 'j', 'k'], lazy) I = ReturnItem() # noqa: E741 # allow ambiguous name # test orthogonally applied indexers @@ -170,7 +172,7 @@ def test_lazily_indexed_array(self): assert expected.shape == actual.shape assert_array_equal(expected, actual) assert isinstance(actual._data, - indexing.LazilyIndexedArray) + indexing.LazilyOuterIndexedArray) # make sure actual.key is appropriate type if all(isinstance(k, native_int_types + (slice, )) @@ -185,14 +187,66 @@ def test_lazily_indexed_array(self): indexers = [(3, 2), (I[:], 0), (I[:2], -1), (I[:4], [0]), ([4, 5], 0), ([0, 1, 2], [0, 1]), ([0, 3, 5], I[:2])] for i, j in indexers: - expected = np.asarray(v[i][j]) + expected = v[i][j] actual = v_lazy[i][j] assert expected.shape == actual.shape assert_array_equal(expected, actual) - assert isinstance(actual._data, indexing.LazilyIndexedArray) + + # test transpose + if actual.ndim > 1: + order = np.random.choice(actual.ndim, actual.ndim) + order = np.array(actual.dims) + transposed = actual.transpose(*order) + assert_array_equal(expected.transpose(*order), transposed) + assert isinstance( + actual._data, (indexing.LazilyVectorizedIndexedArray, + indexing.LazilyOuterIndexedArray)) + + assert isinstance(actual._data, indexing.LazilyOuterIndexedArray) assert isinstance(actual._data.array, indexing.NumpyIndexingAdapter) + def test_vectorized_lazily_indexed_array(self): + original = np.random.rand(10, 20, 30) + x = indexing.NumpyIndexingAdapter(original) + v_eager = Variable(['i', 'j', 'k'], x) + lazy = indexing.LazilyOuterIndexedArray(x) + v_lazy = Variable(['i', 'j', 'k'], lazy) + I = ReturnItem() # noqa: E741 # allow ambiguous name + + def check_indexing(v_eager, v_lazy, indexers): + for indexer in indexers: + actual = v_lazy[indexer] + expected = v_eager[indexer] + assert expected.shape == actual.shape + assert isinstance(actual._data, + (indexing.LazilyVectorizedIndexedArray, + indexing.LazilyOuterIndexedArray)) + assert_array_equal(expected, actual) + v_eager = expected + v_lazy = actual + + # test orthogonal indexing + indexers = [(I[:], 0, 1), (Variable('i', [0, 1]), )] + check_indexing(v_eager, v_lazy, indexers) + + # vectorized indexing + indexers = [ + (Variable('i', [0, 1]), Variable('i', [0, 1]), slice(None)), + (slice(1, 3, 2), 0)] + check_indexing(v_eager, v_lazy, indexers) + + indexers = [ + (slice(None, None, 2), 0, slice(None, 10)), + (Variable('i', [3, 2, 4, 3]), Variable('i', [3, 2, 1, 0])), + (Variable(['i', 'j'], [[0, 1], [1, 2]]), )] + check_indexing(v_eager, v_lazy, indexers) + + indexers = [ + (Variable('i', [3, 2, 4, 3]), Variable('i', [3, 2, 1, 0])), + (Variable(['i', 'j'], [[0, 1], [1, 2]]), )] + check_indexing(v_eager, v_lazy, indexers) + class TestCopyOnWriteArray(TestCase): def test_setitem(self): @@ -220,19 +274,19 @@ def test_index_scalar(self): class TestMemoryCachedArray(TestCase): def test_wrapper(self): - original = indexing.LazilyIndexedArray(np.arange(10)) + original = indexing.LazilyOuterIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) assert_array_equal(wrapped, np.arange(10)) assert isinstance(wrapped.array, indexing.NumpyIndexingAdapter) def test_sub_array(self): - original = indexing.LazilyIndexedArray(np.arange(10)) + original = indexing.LazilyOuterIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) child = wrapped[B[:5]] assert isinstance(child, indexing.MemoryCachedArray) assert_array_equal(child, np.arange(5)) assert isinstance(child.array, indexing.NumpyIndexingAdapter) - assert isinstance(wrapped.array, indexing.LazilyIndexedArray) + assert isinstance(wrapped.array, indexing.LazilyOuterIndexedArray) def test_setitem(self): original = np.arange(10) @@ -331,21 +385,126 @@ def test_vectorized_indexer(): np.arange(5, dtype=np.int64))) -def test_unwrap_explicit_indexer(): - indexer = indexing.BasicIndexer((1, 2)) - target = None - - unwrapped = indexing.unwrap_explicit_indexer( - indexer, target, allow=indexing.BasicIndexer) - assert unwrapped == (1, 2) - - with raises_regex(NotImplementedError, 'Load your data'): - indexing.unwrap_explicit_indexer( - indexer, target, allow=indexing.OuterIndexer) - - with raises_regex(TypeError, 'unexpected key type'): - indexing.unwrap_explicit_indexer( - indexer.tuple, target, allow=indexing.OuterIndexer) +class Test_vectorized_indexer(TestCase): + def setUp(self): + self.data = indexing.NumpyIndexingAdapter(np.random.randn(10, 12, 13)) + self.indexers = [np.array([[0, 3, 2], ]), + np.array([[0, 3, 3], [4, 6, 7]]), + slice(2, -2, 2), slice(2, -2, 3), slice(None)] + + def test_arrayize_vectorized_indexer(self): + for i, j, k in itertools.product(self.indexers, repeat=3): + vindex = indexing.VectorizedIndexer((i, j, k)) + vindex_array = indexing._arrayize_vectorized_indexer( + vindex, self.data.shape) + np.testing.assert_array_equal( + self.data[vindex], self.data[vindex_array],) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((slice(None),)), shape=(5,)) + np.testing.assert_array_equal(actual.tuple, [np.arange(5)]) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((np.arange(5),) * 3), shape=(8, 10, 12)) + expected = np.stack([np.arange(5)] * 3) + np.testing.assert_array_equal(np.stack(actual.tuple), expected) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((np.arange(5), slice(None))), + shape=(8, 10)) + a, b = actual.tuple + np.testing.assert_array_equal(a, np.arange(5)[:, np.newaxis]) + np.testing.assert_array_equal(b, np.arange(10)[np.newaxis, :]) + + actual = indexing._arrayize_vectorized_indexer( + indexing.VectorizedIndexer((slice(None), np.arange(5))), + shape=(8, 10)) + a, b = actual.tuple + np.testing.assert_array_equal(a, np.arange(8)[np.newaxis, :]) + np.testing.assert_array_equal(b, np.arange(5)[:, np.newaxis]) + + +def get_indexers(shape, mode): + if mode == 'vectorized': + indexed_shape = (3, 4) + indexer = tuple(np.random.randint(0, s, size=indexed_shape) + for s in shape) + return indexing.VectorizedIndexer(indexer) + + elif mode == 'outer': + indexer = tuple(np.random.randint(0, s, s + 2) for s in shape) + return indexing.OuterIndexer(indexer) + + elif mode == 'outer_scalar': + indexer = (np.random.randint(0, 3, 4), 0, slice(None, None, 2)) + return indexing.OuterIndexer(indexer[:len(shape)]) + + elif mode == 'outer_scalar2': + indexer = (np.random.randint(0, 3, 4), -2, slice(None, None, 2)) + return indexing.OuterIndexer(indexer[:len(shape)]) + + elif mode == 'outer1vec': + indexer = [slice(2, -3) for s in shape] + indexer[1] = np.random.randint(0, shape[1], shape[1] + 2) + return indexing.OuterIndexer(tuple(indexer)) + + elif mode == 'basic': # basic indexer + indexer = [slice(2, -3) for s in shape] + indexer[0] = 3 + return indexing.BasicIndexer(tuple(indexer)) + + elif mode == 'basic1': # basic indexer + return indexing.BasicIndexer((3, )) + + elif mode == 'basic2': # basic indexer + indexer = [0, 2, 4] + return indexing.BasicIndexer(tuple(indexer[:len(shape)])) + + elif mode == 'basic3': # basic indexer + indexer = [slice(None) for s in shape] + indexer[0] = slice(-2, 2, -2) + indexer[1] = slice(1, -1, 2) + return indexing.BasicIndexer(tuple(indexer[:len(shape)])) + + +@pytest.mark.parametrize('size', [100, 99]) +@pytest.mark.parametrize('sl', [slice(1, -1, 1), slice(None, -1, 2), + slice(-1, 1, -1), slice(-1, 1, -2)]) +def test_decompose_slice(size, sl): + x = np.arange(size) + slice1, slice2 = indexing._decompose_slice(sl, size) + expected = x[sl] + actual = x[slice1][slice2] + assert_array_equal(expected, actual) + + +@pytest.mark.parametrize('shape', [(10, 5, 8), (10, 3)]) +@pytest.mark.parametrize('indexer_mode', + ['vectorized', 'outer', 'outer_scalar', + 'outer_scalar2', 'outer1vec', + 'basic', 'basic1', 'basic2', 'basic3']) +@pytest.mark.parametrize('indexing_support', + [indexing.IndexingSupport.BASIC, + indexing.IndexingSupport.OUTER, + indexing.IndexingSupport.OUTER_1VECTOR, + indexing.IndexingSupport.VECTORIZED]) +def test_decompose_indexers(shape, indexer_mode, indexing_support): + data = np.random.randn(*shape) + indexer = get_indexers(shape, indexer_mode) + + backend_ind, np_ind = indexing.decompose_indexer( + indexer, shape, indexing_support) + + expected = indexing.NumpyIndexingAdapter(data)[indexer] + array = indexing.NumpyIndexingAdapter(data)[backend_ind] + if len(np_ind.tuple) > 0: + array = indexing.NumpyIndexingAdapter(array)[np_ind] + np.testing.assert_array_equal(expected, array) + + if not all(isinstance(k, indexing.integer_types) for k in np_ind.tuple): + combined_ind = indexing._combine_indexers(backend_ind, shape, np_ind) + array = indexing.NumpyIndexingAdapter(data)[combined_ind] + np.testing.assert_array_equal(expected, array) def test_implicit_indexing_adapter(): @@ -382,7 +541,8 @@ def nonzero(x): expected_data = np.moveaxis(expected_data, old_order, new_order) - outer_index = (nonzero(i), nonzero(j), nonzero(k)) + outer_index = indexing.OuterIndexer((nonzero(i), nonzero(j), + nonzero(k))) actual = indexing._outer_to_numpy_indexer(outer_index, v.shape) actual_data = v.data[actual] np.testing.assert_array_equal(actual_data, expected_data) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 1373358c476..c4489f50246 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -16,9 +16,9 @@ from xarray.core import indexing from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( - BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, LazilyIndexedArray, - MemoryCachedArray, NumpyIndexingAdapter, OuterIndexer, PandasIndexAdapter, - VectorizedIndexer) + BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, + LazilyOuterIndexedArray, MemoryCachedArray, NumpyIndexingAdapter, + OuterIndexer, PandasIndexAdapter, VectorizedIndexer) from xarray.core.pycompat import PY3, OrderedDict from xarray.core.utils import NDArrayMixin from xarray.core.variable import as_compatible_data, as_variable @@ -988,9 +988,9 @@ def test_repr(self): assert expected == repr(v) def test_repr_lazy_data(self): - v = Variable('x', LazilyIndexedArray(np.arange(2e5))) + v = Variable('x', LazilyOuterIndexedArray(np.arange(2e5))) assert '200000 values with dtype' in repr(v) - assert isinstance(v._data, LazilyIndexedArray) + assert isinstance(v._data, LazilyOuterIndexedArray) def test_detect_indexer_type(self): """ Tests indexer type was correctly detected. """ @@ -1798,7 +1798,7 @@ def test_rolling_window(self): class TestAsCompatibleData(TestCase): def test_unchanged_types(self): - types = (np.asarray, PandasIndexAdapter, indexing.LazilyIndexedArray) + types = (np.asarray, PandasIndexAdapter, LazilyOuterIndexedArray) for t in types: for data in [np.arange(3), pd.date_range('2000-01-01', periods=3), @@ -1961,18 +1961,19 @@ def test_NumpyIndexingAdapter(self): v = Variable(dims=('x', 'y'), data=NumpyIndexingAdapter( NumpyIndexingAdapter(self.d))) - def test_LazilyIndexedArray(self): - v = Variable(dims=('x', 'y'), data=LazilyIndexedArray(self.d)) + def test_LazilyOuterIndexedArray(self): + v = Variable(dims=('x', 'y'), data=LazilyOuterIndexedArray(self.d)) self.check_orthogonal_indexing(v) - with raises_regex(NotImplementedError, 'Vectorized indexing for '): - self.check_vectorized_indexing(v) + self.check_vectorized_indexing(v) # doubly wrapping - v = Variable(dims=('x', 'y'), - data=LazilyIndexedArray(LazilyIndexedArray(self.d))) + v = Variable( + dims=('x', 'y'), + data=LazilyOuterIndexedArray(LazilyOuterIndexedArray(self.d))) self.check_orthogonal_indexing(v) # hierarchical wrapping - v = Variable(dims=('x', 'y'), - data=LazilyIndexedArray(NumpyIndexingAdapter(self.d))) + v = Variable( + dims=('x', 'y'), + data=LazilyOuterIndexedArray(NumpyIndexingAdapter(self.d))) self.check_orthogonal_indexing(v) def test_CopyOnWriteArray(self): @@ -1980,11 +1981,11 @@ def test_CopyOnWriteArray(self): self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping - v = Variable(dims=('x', 'y'), - data=CopyOnWriteArray(LazilyIndexedArray(self.d))) + v = Variable( + dims=('x', 'y'), + data=CopyOnWriteArray(LazilyOuterIndexedArray(self.d))) self.check_orthogonal_indexing(v) - with raises_regex(NotImplementedError, 'Vectorized indexing for '): - self.check_vectorized_indexing(v) + self.check_vectorized_indexing(v) def test_MemoryCachedArray(self): v = Variable(dims=('x', 'y'), data=MemoryCachedArray(self.d))