diff --git a/sparse/_dok_indexer.py b/sparse/_dok_indexer.py new file mode 100644 index 00000000..9f13ebba --- /dev/null +++ b/sparse/_dok_indexer.py @@ -0,0 +1,311 @@ +# Copyright 2019 Steffen Schroeder +# All the contents of the file have been copied from here +# https://gist.github.com/stschroe/7c4660774835258c06b2be968eb49204#file-sparse_dok_indexer-py + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from itertools import product, repeat +from numbers import Integral + +import numpy as np +from sparse import DOK, SparseArray +from sparse._slicing import normalize_index + + +class DOKVectorIndexer: + """Vector indexing + Allows so called "fancy" or advanced indexing with one dimensional + sequences (here named as vectors) as array indices. As opposed to + numpy legacy indexing, the vector result dimensions are always the + first axes of the result array. This is inspired by NEP 21: + https://www.numpy.org/neps/nep-0021-advanced-indexing.html + It is not fully implemented yet. Especially the setting of array + elements. + """ + + def __init__(self, dok): + self._dok = dok + + def slice_getitem(self, keys): + """Indexing with slices and scalars only (outer indexing) + expects with normalize_index() normalized indices + """ + shape = self._dok.shape + idxs, res_idxs = [], [] + res_shape = [None] * self._dok.ndim + for i, key in enumerate(keys): + if isinstance(key, slice): + ir = range(*key.indices(shape[i])) + idxs.append(ir) + lr = len(ir) + res_idxs.append(range(lr)) + res_shape[i] = ( + lr if lr > 0 else 0 + ) # if 0: dimension gets preserved but set to zero + else: # should be integral or 0d array of integral + idxs.append((int(key),)) + res_idxs.append((0,)) + res_shape[i] = -1 # dimension gets removed in result + if any(rs == 0 for rs in res_shape): + new_res_shape = tuple(rs for rs in res_shape if rs >= 0) + return DOK( + shape=new_res_shape, + dtype=self._dok.dtype, + fill_value=self._dok.fill_value, + ) + data = self._dok.data + res_data = dict() + res_shape_idxs = [i for i, s in enumerate(res_shape) if res_shape[i] > 0] + for res_key, key in zip(product(*res_idxs), product(*idxs)): + if key in data: + res_data[tuple(res_key[i] for i in res_shape_idxs)] = data[key] + new_res_shape = tuple(s for s in res_shape if s > 0) + if new_res_shape == (): + if key in data: + return data[key] + else: + return self._dok.fill_value + else: + return DOK( + shape=new_res_shape, + data=res_data, + dtype=self._dok.dtype, + fill_value=self._dok.fill_value, + ) + + def __getitem__(self, keys): + """Indexing with vectors, slices and scalar integers. + Example:: + @property + def vix(self): + return DOKVectorIndexer(self) + DOK.vix = vix + d = DOK((3, 4, 5), dtype=np.int64) + d0 = d.vix[[1, 2]] + d1 = d.vix[[1, 2], [1, 2], [1, 2]] + d2 = d.vix[:, [0, 1], [0, 1]] + d3 = d.vix[[0, 1], [0, 1], 1] + """ + shape = self._dok.shape + # expands keys to full dimension, sequences to arrays, positive indices + keys = normalize_index(keys, shape) + vecs = [] + idxs, res_idxs = [], [] + res_shape = [None] * self._dok.ndim + for i, key in enumerate(keys): + # normalize_index() may contain arrays of ndim 0 and 1 + if isinstance(key, np.ndarray) and key.ndim == 1: + vecs.append(key) + arr_len = len(key) + res_idxs.append((range(arr_len),)) + res_shape[i] = None # mark vector indices + idxs.append((key,)) + elif isinstance(key, slice): + ir = range(*key.indices(shape[i])) + il = [repeat(i) for i in ir] + idxs.append(il) + res_idxs.append([repeat(i) for i in range(len(il))]) + if len(il) > 0: + res_shape[i] = len(il) + else: + res_shape[i] = 0 # dimension gets preserved but set to zero + else: # should be integral or 0d array of integral + idxs.append((repeat(int(key)),)) + res_idxs.append((repeat(0),)) + res_shape[i] = -1 # dimension gets removed in result + if vecs: + vec_len = len(vecs[0]) + if not all(vec_len == len(v) for v in vecs[1:]): + raise IndexError("Unequal length of index sequences!") + else: + # TODO: replace with DOC.__getitem__, when it supports such combinations + return self.slice_getitem(keys) + if any( + rs == 0 for rs in res_shape + ): # dimension of length zero results in empty array + new_res_shape = ( + arr_len, + *[s for s in res_shape if s is not None and s >= 0], + ) + return DOK( + shape=new_res_shape, + dtype=self._dok.dtype, + fill_value=self._dok.fill_value, + ) + non_vec_idxs = [ + i for i, s in enumerate(res_shape) if s is not None and res_shape[i] > 0 + ] + vec_idxs = [i for i, s in enumerate(res_shape) if s is None] + first_vec_idx = vec_idxs[0] + data = self._dok.data + res_data = dict() + for res_key, key in zip(product(*res_idxs), product(*idxs)): + for rk, k in zip(zip(*res_key), zip(*key)): + if k in data: + reordered_rk = (rk[first_vec_idx], *[rk[i] for i in non_vec_idxs]) + res_data[reordered_rk] = data[k] + new_res_shape = (arr_len, *[s for s in res_shape if s is not None and s > 0]) + return DOK( + shape=new_res_shape, + data=res_data, + dtype=self._dok.dtype, + fill_value=self._dok.fill_value, + ) + + def __setitem__(self, idxs, values): + """Indexing supports only vectors as indices. + Accepted values are vectors and scalars. + Example:: + @property + def vix(self): + return DOKVectorIndexer(self) + DOK.vix = vix + d = DOK((2, 2), dtype=np.int64) + d.vix[[0, 1],[1, 1]] = [1, 2] + d.vix[[0, 1],[0, 0]] = 2 + d.vix[[0, 1],[1, 1]] = d.vix[[0, 1],[0, 0]] + """ + if not isinstance(idxs, tuple): # one dimension, one argument + idxs = (idxs,) + if len(idxs) != self._dok.ndim: + raise NotImplementedError( + f"Index sequences for all {self._dok.ndim} array dimensions needed!" + ) + idxs = tuple(np.asanyarray(idxs) for idxs in idxs) + if not (isinstance(k.dtype, Integral) for k in idxs): + raise IndexError("Indices must be sequences of integer types!") + if not all(idxs[0].shape == k.shape for k in idxs[1:]): + raise IndexError("Unequal length of index sequences!") + if idxs[0].ndim != 1: + raise IndexError("Indices are not 1d sequences!") + values = np.asanyarray(values, self._dok.dtype) + if values.ndim == 0: + values = np.full(idxs[0].size, values, self._dok.dtype) + elif values.ndim > 1: + raise ValueError(f"Dimension of values ({values.ndim}) must be 0 or 1!") + if not idxs[0].shape == values.shape: + raise ValueError( + f"Shape mismatch of indices ({idxs[0].shape}) and values ({values.shape})!" + ) + fill_value = self._dok.fill_value + data = self._dok.data + for idx, value in zip(zip(*idxs), values): + # self._dok._setitem(k, value) # costly equallity checks + if not value == fill_value: + data[idx] = value + elif idx in data: + del data[idx] + + +def add_vector_indexer_to_sparse_dok(): + """Add vector indexer to sparse.DOK array""" + # alternative implementation + # class VDOK(DOK): + # @property + # def vidx(self): + # return VectorIndexer(self) + # + # s2 = VDOK((5, 5), dtype=np.int64) + + @property + def vix(self): + return DOKVectorIndexer(self) + + DOK.vix = vix + + +class DOKArrayIndexer: + """Indexing with vectors only for sparse.DOK + This is a fast version for vectorized or "fancy" indexing. Only + vectors (1d sequences) are accepted for indexing. The getitem + method always returns dense ndarrays in contrast to normal + fancy indexing of sparse arrays which create new copies of sparse + arrays. The setitem method allows only vectors and scalars as + values. This allows fast calculations over ndarray methods + without the expensive creation of temporary sparse arrays between + operations. This is of course only useful for small vectors, which + fit easily in to memory. + Example:: + @property + def aix(self): + return DOKArrayIndexer(self) + DOK.aix = aix + d = DOK((2, 2), dtype=np.int64) + # setitem + d.aix[[0, 1],[1, 1]] = [1, 2] + d.aix[[0, 1],[1, 1]] = 2 + d.aix[[0, 1],[1, 1]] = d.vix[[0, 0],[1, 2]] + """ + + def __init__(self, dok): + self._dok = dok + + def _normalize_idxs(self, idxs): + if not isinstance(idxs, tuple): # one dimension, one argument + idxs = (idxs,) + if len(idxs) != self._dok.ndim: + raise NotImplementedError( + f"Index sequences for all {self._dok.ndim} array dimensions needed!" + ) + idxs = tuple(np.asanyarray(idxs) for idxs in idxs) + if not (isinstance(k.dtype, Integral) for k in idxs): + raise IndexError("Indices must be sequences of integer types!") + if not all(idxs[0].shape == k.shape for k in idxs[1:]): + raise IndexError("Unequal length of index sequences!") + if idxs[0].ndim != 1: + raise IndexError("Indices are not 1d sequences!") + return idxs + + def __getitem__(self, idxs): + idxs = self._normalize_idxs(idxs) + fill_value = self._dok.fill_value + data_keys = self._dok.data.keys() + data = self._dok.data + return np.array( + [data[idx] if idx in data_keys else fill_value for idx in zip(*idxs)], + self._dok.dtype, + ) + + def __setitem__(self, idxs, values): + idxs = self._normalize_idxs(idxs) + values = np.asanyarray(values, self._dok.dtype) + if values.ndim == 0: + values = np.full(idxs[0].size, values, self._dok.dtype) + elif values.ndim > 1: + raise ValueError(f"Dimension of values ({values.ndim}) must be 0 or 1!") + if not idxs[0].shape == values.shape: + raise ValueError( + f"Shape mismatch of indices ({idxs[0].shape}) and values ({values.shape})!" + ) + fill_value = self._dok.fill_value + data = self._dok.data + for idx, value in zip(zip(*idxs), values): + # self._dok._setitem(k, value) + if not value == fill_value: + data[idx] = value + elif idx in data: + del data[idx] + + +def add_array_indexer_to_sparse_dok(): + """Add array indexer to sparse.DOK array""" + + @property + def aix(self): + return DOKArrayIndexer(self) + + DOK.aix = aix + + +add_vector_indexer_to_sparse_dok() +add_array_indexer_to_sparse_dok() diff --git a/sparse/numpy_vindex.py b/sparse/numpy_vindex.py new file mode 100644 index 00000000..69d86c45 --- /dev/null +++ b/sparse/numpy_vindex.py @@ -0,0 +1,66 @@ +# Prototype reference implementation of numpy NEP 21 (https://github.com/numpy/numpy/pull/6256) +# Copied from https://gist.github.com/shoyer/c700193625347eb68fee4d1f0dc8c0c8 + +# Copyright 2017 Google Inc. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +def is_contiguous(positions): + """Given a non-empty list, does it consist of contiguous integers?""" + previous = positions[0] + for current in positions[1:]: + if current != previous + 1: + return False + previous = current + return True + + +def advanced_indexer_subspaces(key): + """Indices of the advanced indexes subspaces for mixed indexing and vindex.""" + if not isinstance(key, tuple): + key = (key,) + advanced_index_positions = [ + i for i, k in enumerate(key) if not isinstance(k, slice) + ] + + if not advanced_index_positions or not is_contiguous(advanced_index_positions): + # nothing to reorder + return (), () + + non_slices = [k for k in key if not isinstance(k, slice)] + ndim = len(np.broadcast(*non_slices).shape) + mixed_positions = advanced_index_positions[0] + np.arange(ndim) + vindex_positions = np.arange(ndim) + return mixed_positions, vindex_positions + + +class VectorizedIndexer(object): + def __init__(self, array): + self._array = array + + def __getitem__(self, key): + mixed_positions, vindex_positions = advanced_indexer_subspaces(key) + return np.moveaxis(self._array[key], mixed_positions, vindex_positions) + + def __setitem__(self, key, value): + mixed_positions, vindex_positions = advanced_indexer_subspaces(key) + self._array[key] = np.moveaxis(value, vindex_positions, mixed_positions) + + +class VindexArray(np.ndarray): + @property + def vindex(self): + return VectorizedIndexer(self) diff --git a/sparse/tests/test_dok_indexer.py b/sparse/tests/test_dok_indexer.py new file mode 100644 index 00000000..0bcda40e --- /dev/null +++ b/sparse/tests/test_dok_indexer.py @@ -0,0 +1,100 @@ +# Copyright 2019 Steffen Schroeder +# All the contents of the file have been copied from here +# https://gist.github.com/stschroe/7c4660774835258c06b2be968eb49204#file-test_sparse_dok_indexer-py + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sparse import DOK + +from sparse.numpy_vindex import VindexArray +import sparse._dok_indexer + + +def test_vix(): + assert_array_equal = np.testing.assert_array_equal + x = np.arange(3 * 4 * 5).reshape((3, 4, 5)).view(VindexArray) + d = DOK.from_numpy(x) + + # # Test getitem + # sequences + assert_array_equal(d.vix[[]].todense(), x[[]]) + assert_array_equal(d.vix[[], []].todense(), x[[], []]) + assert_array_equal(d.vix[[2]].todense(), x[[2]]) + assert_array_equal(d.vix[[1, 2]].todense(), x[[1, 2]]) + assert_array_equal(d.vix[[1, 2], [1, 2]].todense(), x[[1, 2], [1, 2]]) + assert_array_equal( + d.vix[[1, 2], [1, 2], [1, 2]].todense(), x[[1, 2], [1, 2], [1, 2]] + ) + # slices + assert_array_equal(d.vix[[0, 1], [0, 1], :].todense(), x.vindex[[0, 1], [0, 1], :]) + assert_array_equal(d.vix[[0, 1], :, [0, 1]].todense(), x.vindex[[0, 1], :, [0, 1]]) + assert_array_equal(d.vix[:, [0, 1], [0, 1]].todense(), x.vindex[:, [0, 1], [0, 1]]) + assert_array_equal(d.vix[:, :, [0, 1]].todense(), x.vindex[:, :, [0, 1]]) + assert_array_equal(d.vix[:, [0, 1], :].todense(), x.vindex[:, [0, 1], :]) + assert_array_equal(d.vix[[0, 1], :, :].todense(), x.vindex[[0, 1], :, :]) + assert_array_equal(d.vix[[0, 1], [0, 1], 1].todense(), x.vindex[[0, 1], [0, 1], 1]) + assert_array_equal(d.vix[[0, 1], 1, [0, 1]].todense(), x.vindex[[0, 1], 1, [0, 1]]) + assert_array_equal(d.vix[1, [0, 1], [0, 1]].todense(), x.vindex[1, [0, 1], [0, 1]]) + assert_array_equal(d.vix[1, 1, [0, 1]].todense(), x.vindex[1, 1, [0, 1]]) + assert_array_equal(d.vix[1, [0, 1], 1].todense(), x.vindex[1, [0, 1], 1]) + assert_array_equal(d.vix[[0, 1], 1, 1].todense(), x.vindex[[0, 1], 1, 1]) + # different behaviour between scalar indices and slices resulting in one scalar + assert_array_equal( + d.vix[[0, 1], [0, 1], 0:1].todense(), x.vindex[[0, 1], [0, 1], 0:1] + ) + assert_array_equal( + d.vix[[0, 1], 0:1, [0, 1]].todense(), x.vindex[[0, 1], 0:1, [0, 1]] + ) + assert_array_equal( + d.vix[0:1, [0, 1], [0, 1]].todense(), x.vindex[0:1, [0, 1], [0, 1]] + ) + assert_array_equal(d.vix[0:1, 0:1, [0, 1]].todense(), x.vindex[0:1, 0:1, [0, 1]]) + assert_array_equal(d.vix[0:1, [0, 1], 0:1].todense(), x.vindex[0:1, [0, 1], 0:1]) + assert_array_equal(d.vix[[0, 1], 0:1, 0:1].todense(), x.vindex[[0, 1], 0:1, 0:1]) + # test empty slice + assert_array_equal( + d.vix[[0, 1], [0, 1], 1:1].todense(), x.vindex[[0, 1], [0, 1], 1:1] + ) + # test fallback to slice_getitem + assert_array_equal(d.vix[1, 1, 1], x.vindex[1, 1, 1]) + assert_array_equal(d.vix[1, 1, :].todense(), x.vindex[1, 1, :]) + assert_array_equal(d.vix[1, :, :].todense(), x.vindex[1, :, :]) + assert_array_equal(d.vix[:, :, :].todense(), x.vindex[:, :, :]) + + # # test setitem + d.vix[[0, 1], [1, 1], [0, 0]] = [1, 2] + x.vindex[[0, 1], [1, 1], [0, 0]] = [1, 2] + assert_array_equal(d.todense(), x) + d.vix[[0, 1], [0, 0], [0, 0]] = 2 + x[[0, 1], [0, 0], [0, 0]] = 2 + assert_array_equal(d.todense(), x) + d.vix[[0, 1], [1, 1], [0, 0]] = d.vix[[0, 1], [0, 0], [0, 0]].todense() + x.vindex[[0, 1], [1, 1], [0, 0]] = x.vindex[[0, 1], [0, 0], [0, 0]] + assert_array_equal(d.todense(), x) + + +def test_aix(): + assert_array_equal = np.testing.assert_array_equal + x = np.arange(3 * 4 * 5).reshape((3, 4, 5)).view(VindexArray) + d = DOK.from_numpy(x) + # # test getitem and setitem + d.aix[[0, 1], [1, 1], [0, 0]] = [1, 2] + x.vindex[[0, 1], [1, 1], [0, 0]] = [1, 2] + assert_array_equal(d.todense(), x) + d.aix[[0, 1], [0, 0], [0, 0]] = 2 + x[[0, 1], [0, 0], [0, 0]] = 2 + assert_array_equal(d.todense(), x) + d.aix[[0, 1], [1, 1], [0, 0]] = d.aix[[0, 1], [0, 0], [0, 0]] + x.vindex[[0, 1], [1, 1], [0, 0]] = x.vindex[[0, 1], [0, 0], [0, 0]] + assert_array_equal(d.todense(), x)