From 69fde3a39a32aae3baeb32a9623d056c1870b61b Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 26 Oct 2017 00:14:33 +0100 Subject: [PATCH 01/67] ellipsis handling; resolves #93 --- zarr/tests/test_core.py | 71 +++++++++++++++++++++++++++++++++++++++-- zarr/tests/test_util.py | 21 ++++++++---- zarr/util.py | 54 +++++++++++++++++++------------ 3 files changed, 116 insertions(+), 30 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index d7957d162b..a29ad933bf 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -123,6 +123,12 @@ def test_array_1d(self): assert_array_equal(a[:10], z[:10]) assert_array_equal(a[10:20], z[10:20]) assert_array_equal(a[-10:], z[-10:]) + assert_array_equal(a[:10, ...], z[:10, ...]) + assert_array_equal(a[10:20, ...], z[10:20, ...]) + assert_array_equal(a[-10:, ...], z[-10:, ...]) + assert_array_equal(a[..., :10], z[..., :10]) + assert_array_equal(a[..., 10:20], z[..., 10:20]) + assert_array_equal(a[..., -10:], z[..., -10:]) # ...across chunk boundaries... assert_array_equal(a[:110], z[:110]) assert_array_equal(a[190:310], z[190:310]) @@ -135,6 +141,18 @@ def test_array_1d(self): eq(a[42], z[np.int32(42)]) eq(a[42], z[np.uint64(42)]) eq(a[42], z[np.uint32(42)]) + # too many indices + with assert_raises(IndexError): + z[:, :] + with assert_raises(IndexError): + z[0, :] + with assert_raises(IndexError): + z[:, 0] + with assert_raises(IndexError): + z[0, 0] + # only single ellipsis allowed + with assert_raises(IndexError): + z[..., ...] 
# check partial assignment b = np.arange(1e5, 2e5) @@ -194,37 +212,84 @@ def test_array_2d(self): eq(a.nbytes, z.nbytes) eq(50, z.nchunks_initialized) - # check slicing + # check array-like assert_array_equal(a, np.array(z)) + + # check slicing + + # total slice assert_array_equal(a, z[:]) assert_array_equal(a, z[...]) # noinspection PyTypeChecker assert_array_equal(a, z[slice(None)]) + + # slice first dimension assert_array_equal(a[:10], z[:10]) assert_array_equal(a[10:20], z[10:20]) assert_array_equal(a[-10:], z[-10:]) + assert_array_equal(a[:10, :], z[:10, :]) + assert_array_equal(a[10:20, :], z[10:20, :]) + assert_array_equal(a[-10:, :], z[-10:, :]) + assert_array_equal(a[:10, ...], z[:10, ...]) + assert_array_equal(a[10:20, ...], z[10:20, ...]) + assert_array_equal(a[-10:, ...], z[-10:, ...]) + assert_array_equal(a[:10, :, ...], z[:10, :, ...]) + assert_array_equal(a[10:20, :, ...], z[10:20, :, ...]) + assert_array_equal(a[-10:, :, ...], z[-10:, :, ...]) + + # slice second dimension assert_array_equal(a[:, :2], z[:, :2]) assert_array_equal(a[:, 2:4], z[:, 2:4]) assert_array_equal(a[:, -2:], z[:, -2:]) + assert_array_equal(a[..., :2], z[..., :2]) + assert_array_equal(a[..., 2:4], z[..., 2:4]) + assert_array_equal(a[..., -2:], z[..., -2:]) + assert_array_equal(a[:, ..., :2], z[:, ..., :2]) + assert_array_equal(a[:, ..., 2:4], z[:, ..., 2:4]) + assert_array_equal(a[:, ..., -2:], z[:, ..., -2:]) + + # slice both dimensions assert_array_equal(a[:10, :2], z[:10, :2]) assert_array_equal(a[10:20, 2:4], z[10:20, 2:4]) assert_array_equal(a[-10:, -2:], z[-10:, -2:]) - # ...across chunk boundaries... 
+ + # slicing across chunk boundaries assert_array_equal(a[:110], z[:110]) assert_array_equal(a[190:310], z[190:310]) assert_array_equal(a[-110:], z[-110:]) + assert_array_equal(a[:110, :], z[:110, :]) + assert_array_equal(a[190:310, :], z[190:310, :]) + assert_array_equal(a[-110:, :], z[-110:, :]) assert_array_equal(a[:, :3], z[:, :3]) assert_array_equal(a[:, 3:7], z[:, 3:7]) assert_array_equal(a[:, -3:], z[:, -3:]) assert_array_equal(a[:110, :3], z[:110, :3]) assert_array_equal(a[190:310, 3:7], z[190:310, 3:7]) assert_array_equal(a[-110:, -3:], z[-110:, -3:]) - # single item + + # single row/col/item assert_array_equal(a[0], z[0]) assert_array_equal(a[-1], z[-1]) + assert_array_equal(a[:, 0], z[:, 0]) + assert_array_equal(a[:, -1], z[:, -1]) eq(a[0, 0], z[0, 0]) eq(a[-1, -1], z[-1, -1]) + # too many indices + with assert_raises(IndexError): + z[:, :, :] + with assert_raises(IndexError): + z[0, :, :] + with assert_raises(IndexError): + z[:, 0, :] + with assert_raises(IndexError): + z[:, :, 0] + with assert_raises(IndexError): + z[0, 0, 0] + # only single ellipsis allowed + with assert_raises(IndexError): + z[..., ...] 
+ # check partial assignment b = np.arange(10000, 20000).reshape((1000, 10)) z[190:310, 3:7] = b[190:310, 3:7] diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index 5866ab9836..32fbe8e04e 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -111,18 +111,17 @@ def test_normalize_array_selection(): eq((slice(0, 100),), normalize_array_selection(slice(None), (100,))) eq((slice(0, 100),), normalize_array_selection(slice(None, 100), (100,))) eq((slice(0, 100),), normalize_array_selection(slice(0, None), (100,))) + eq((slice(0, 100),), normalize_array_selection((slice(None), Ellipsis), (100,))) + eq((slice(0, 100),), normalize_array_selection((Ellipsis, slice(None)), (100,))) # 2D, single item eq((0, 0), normalize_array_selection((0, 0), (100, 100))) eq((99, 1), normalize_array_selection((-1, 1), (100, 100))) # 2D, single col/row - eq((0, slice(0, 100)), normalize_array_selection((0, slice(None)), - (100, 100))) - eq((0, slice(0, 100)), normalize_array_selection((0,), - (100, 100))) - eq((slice(0, 100), 0), normalize_array_selection((slice(None), 0), - (100, 100))) + eq((0, slice(0, 100)), normalize_array_selection((0, slice(None)), (100, 100))) + eq((0, slice(0, 100)), normalize_array_selection((0,), (100, 100))) + eq((slice(0, 100), 0), normalize_array_selection((slice(None), 0), (100, 100))) # 2D slice eq((slice(0, 100), slice(0, 100)), @@ -131,6 +130,16 @@ def test_normalize_array_selection(): normalize_array_selection(slice(None), (100, 100))) eq((slice(0, 100), slice(0, 100)), normalize_array_selection((slice(None), slice(None)), (100, 100))) + eq((slice(0, 100), slice(0, 100)), + normalize_array_selection((Ellipsis, slice(None)), (100, 100))) + eq((slice(0, 100), slice(0, 100)), + normalize_array_selection((slice(None), Ellipsis), (100, 100))) + eq((slice(0, 100), slice(0, 100)), + normalize_array_selection((slice(None), Ellipsis, slice(None)), (100, 100))) + eq((slice(0, 100), slice(0, 100)), + normalize_array_selection((Ellipsis, 
slice(None), slice(None)), (100, 100))) + eq((slice(0, 100), slice(0, 100)), + normalize_array_selection((slice(None), slice(None), Ellipsis), (100, 100))) with assert_raises(TypeError): normalize_array_selection('foo', (100,)) diff --git a/zarr/util.py b/zarr/util.py index 51b65d06d2..ce41957fd3 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -192,29 +192,41 @@ def normalize_array_selection(item, shape): """Convenience function to normalize a selection within an array with the given `shape`.""" - # normalize item - if isinstance(item, numbers.Integral): - item = (int(item),) - elif isinstance(item, slice): + # ensure tuple + if not isinstance(item, tuple): item = (item,) - elif item == Ellipsis: - item = (slice(None),) - - # handle tuple of indices/slices - if isinstance(item, tuple): - # determine start and stop indices for all axes - selection = tuple(normalize_axis_selection(i, l) - for i, l in zip(item, shape)) - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += tuple(slice(0, l) for l in shape[len(selection):]) - - return selection - - else: - raise TypeError('expected indices or slice, found: %r' % item) + # handle ellipsis + n_ellipsis = sum(1 for i in item if i == Ellipsis) + if n_ellipsis > 1: + raise IndexError("an index can only have a single ellipsis ('...')") + elif n_ellipsis == 1: + idx_ellipsis = item.index(Ellipsis) + n_items_l = idx_ellipsis # items to left of ellipsis + n_items_r = len(item) - (idx_ellipsis + 1) # items to right of ellipsis + n_items = len(item) - 1 # all non-ellipsis items + if n_items >= len(shape): + # ellipsis does nothing, just remove it + item = tuple(i for i in item if i != Ellipsis) + else: + # replace ellipsis with slices + new_item = item[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += item[-n_items_r:] + item = new_item + + # check dimensionality + if len(item) > len(shape): + raise IndexError('too many indices for array') + + # 
determine start and stop indices for all axes + selection = tuple(normalize_axis_selection(i, l) for i, l in zip(item, shape)) + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += tuple(slice(0, l) for l in shape[len(selection):]) + + return selection def get_chunk_range(selection, chunks): From 30fae464d56205e45203a286a08466d1dbbb8b95 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 26 Oct 2017 22:02:25 +0100 Subject: [PATCH 02/67] minor conciseness --- zarr/util.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/zarr/util.py b/zarr/util.py index ce41957fd3..22d88661cd 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -201,15 +201,14 @@ def normalize_array_selection(item, shape): if n_ellipsis > 1: raise IndexError("an index can only have a single ellipsis ('...')") elif n_ellipsis == 1: - idx_ellipsis = item.index(Ellipsis) - n_items_l = idx_ellipsis # items to left of ellipsis - n_items_r = len(item) - (idx_ellipsis + 1) # items to right of ellipsis + n_items_l = item.index(Ellipsis) # items to left of ellipsis + n_items_r = len(item) - (n_items_l + 1) # items to right of ellipsis n_items = len(item) - 1 # all non-ellipsis items if n_items >= len(shape): # ellipsis does nothing, just remove it item = tuple(i for i in item if i != Ellipsis) else: - # replace ellipsis with slices + # replace ellipsis with as many slices are needed for number of dims new_item = item[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) if n_items_r: new_item += item[-n_items_r:] From d47492f91c837ee8b49f0119b9082023a0967040 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 29 Oct 2017 23:40:22 +0000 Subject: [PATCH 03/67] initial pass on orthogonal advanced indexing --- zarr/core.py | 140 ++++++++++++++++++++++++++++++++-------- zarr/tests/test_core.py | 14 ++++ zarr/tests/test_util.py | 3 +- zarr/util.py | 83 ++++++++++++++++++++---- 4 files changed, 200 insertions(+), 40 deletions(-) diff 
--git a/zarr/core.py b/zarr/core.py index f7e55b1f23..15cb21ebc9 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from zarr.util import is_total_slice, normalize_array_selection, get_chunk_range, \ +from zarr.util import is_total_slice, normalize_array_selection, get_chunk_ranges, \ human_readable_size, normalize_resize_args, normalize_storage_path, normalize_shape, \ normalize_chunks, InfoReporter from zarr.storage import array_meta_key, attrs_key, listdir, getsize @@ -489,49 +489,137 @@ def _getitem_zd(self, item): def _getitem_nd(self, item): # implementation of __getitem__ for array with at least one dimension + # N.B., this is the crux of zarr. We iterate over all chunks which overlap the selection + # and thus contain data that needs to be extracted. Each chunk is processed in turn, + # extracting the necessary data and storing into the correct location in the output array. + + # N.B., it is an important optimisation that we only visit chunks which overlap the + # selection. This minimises the nuimber of iterations in the main for loop. + # normalize selection selection = normalize_array_selection(item, self._shape) - # determine output array shape - out_shape = tuple(s.stop - s.start for s in selection - if isinstance(s, slice)) + # determine indices of chunks overlapping the selection + chunk_ranges, out_shape = get_chunk_ranges(selection, self._chunks) # setup output array out = np.empty(out_shape, dtype=self._dtype, order=self._order) - # determine indices of chunks overlapping the selection - chunk_range = get_chunk_range(selection, self._chunks) + # iterate over chunks in range, i.e., chunks overlapping the selection + for chunk_indices in itertools.product(*chunk_ranges): - # iterate over chunks in range - for cidx in itertools.product(*chunk_range): + # chunk_indices holds the index along each dimension for the current chunk within the + # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
- # determine chunk offset - offset = [i * c for i, c in zip(cidx, self._chunks)] + chunk_selection = [] + out_selection = [] - # determine region within output array - out_selection = tuple( - slice(max(0, o - s.start), - min(o + c - s.start, s.stop - s.start)) - for s, o, c, in zip(selection, offset, self._chunks) - if isinstance(s, slice) - ) + # iterate over dimensions (axes) of the array + for dim, (dim_sel, dim_chunk_index, dim_chunk_length) \ + in enumerate(zip(selection, chunk_indices, self._chunks)): - # determine region within chunk - chunk_selection = tuple( - slice(max(0, s.start - o), min(c, s.stop - o)) - if isinstance(s, slice) - else s - o - for s, o, c in zip(selection, offset, self._chunks) - ) + # dim_sel: selection for current dimension + # dim_chunk_index: chunk index along current dimension + # dim_chunk_length: chunk length along current dimension + + # selection for current chunk along current dimension + dim_chunk_sel = None + + # selection into output array to store data from current chunk + dim_out_sel = None + + # calculate offset for current chunk along current dimension - this is used to + # determine the values to be extracted from the current chunk + dim_chunk_offset = dim_chunk_index * dim_chunk_length + + # handle integer selection, i.e., single item + if isinstance(dim_sel, int): + + dim_chunk_sel = dim_sel - dim_chunk_offset + + # N.B., leave dim_out_sel as None, as this dimension has been dropped in the + # output array because of single value index + + # handle slice selection, i.e., contiguous range of items + elif isinstance(dim_sel, slice): + + if dim_sel.start <= dim_chunk_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_chunk_offset - dim_sel.start + + else: + # selection starts within current chunk + dim_chunk_sel_start = dim_sel.start - dim_chunk_offset + dim_out_offset = 0 + + if dim_sel.stop > dim_chunk_offset + dim_chunk_length: + # selection ends after current chunk + 
dim_chunk_sel_stop = dim_chunk_length + + else: + # selection ends within current chunk + dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + + elif hasattr(dim_sel, 'dtype'): + + # handle boolean array selection + if dim_sel.dtype == bool: + + # pull out a slice of the boolean indexing array for the current chunk + dim_chunk_sel = \ + dim_sel[dim_chunk_offset:dim_chunk_offset + dim_chunk_length] + + # figure out where to put these items in the output array + # TODO possible bottleneck here with many calls to count_nonzero + dim_out_offset = np.count_nonzero(dim_sel[:dim_chunk_offset]) + dim_chunk_nitems = np.count_nonzero(dim_chunk_sel) + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + else: + raise RuntimeError('unexpected selection type') + + else: + raise RuntimeError('unexpected selection type') + + # add to chunk selection + chunk_selection.append(dim_chunk_sel) + + # add to output selection + if dim_out_sel is not None: + out_selection.append(dim_out_sel) + + # normalise for indexing into numpy arrays + chunk_selection = tuple(chunk_selection) + # handle indexing arrays orthogonally + if any([hasattr(dim_chunk_sel, 'dtype') for dim_chunk_sel in chunk_selection]): + squeeze_axes = [i for i, dim_chunk_sel in enumerate(chunk_selection) + if isinstance(dim_chunk_sel, int)] + chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) + if isinstance(dim_chunk_sel, slice) + else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) + else dim_chunk_sel + for dim_chunk_sel in chunk_selection] + chunk_selection = np.ix_(*chunk_selection) + else: + squeeze_axes = None + out_selection = tuple(out_selection) # obtain the destination array as a view of the output array if out_selection: dest = out[out_selection] + if squeeze_axes: + dest 
= dest.squeeze(axis=squeeze_axes) else: dest = out # load chunk selection into output array - self._chunk_getitem(cidx, chunk_selection, dest) + self._chunk_getitem(chunk_indices, chunk_selection, dest) if out.shape: return out @@ -651,7 +739,7 @@ def _setitem_nd(self, item, value): str(value.shape))) # determine indices of chunks overlapping the selection - chunk_range = get_chunk_range(selection, self._chunks) + chunk_range, _ = get_chunk_ranges(selection, self._chunks) # iterate over chunks in range for cidx in itertools.product(*chunk_range): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index a29ad933bf..1dac539eca 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -710,6 +710,20 @@ def test_nchunks_initialized(self): z[:] = 42 eq(10, z.nchunks_initialized) + def test_advanced_indexing_1d_bool(self): + + # setup + a = np.arange(1050, dtype=int) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + for p in 0.9, 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + expect = a[ix] + actual = z[ix] + assert_array_equal(expect, actual) + class TestArrayWithPath(TestArray): diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index 32fbe8e04e..dcb02af969 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -85,14 +85,13 @@ def test_normalize_axis_selection(): eq(slice(0, 100), normalize_axis_selection(slice(0, 1000), 100)) eq(slice(99, 100), normalize_axis_selection(slice(-1, None), 100)) eq(slice(98, 99), normalize_axis_selection(slice(-2, -1), 100)) + eq(slice(10, 10), normalize_axis_selection(slice(10, 0), 100)) with assert_raises(IndexError): normalize_axis_selection(slice(100, None), 100) with assert_raises(IndexError): normalize_axis_selection(slice(1000, 2000), 100) with assert_raises(IndexError): normalize_axis_selection(slice(-1000, 0), 100) - with assert_raises(IndexError): - normalize_axis_selection(slice(10, 0), 100) 
with assert_raises(TypeError): normalize_axis_selection('foo', 100) diff --git a/zarr/util.py b/zarr/util.py index 22d88661cd..b485555812 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -138,6 +138,10 @@ def normalize_axis_selection(item, length): """Convenience function to normalize a selection within a single axis of size `l`.""" + # normalize list to array + if isinstance(item, list): + item = np.asarray(item) + if isinstance(item, numbers.Integral): item = int(item) @@ -172,19 +176,39 @@ def normalize_axis_selection(item, length): return slice(0, 0) # handle out of bounds - if start < 0 or stop < 0: - raise IndexError('index out of bounds: %s, %s' % (start, stop)) + if start < 0: + raise IndexError('start index out of bounds: %s' % item.start) + if stop < 0: + raise IndexError('stop index out of bounds: %s' % item.stop) if start >= length: - raise IndexError('index out of bounds: %s, %s' % (start, stop)) + raise IndexError('start index out of bounds: %ss' % item.start) if stop > length: stop = length if stop < start: - raise IndexError('index out of bounds: %s, %s' % (start, stop)) + stop = start return slice(start, stop) + elif hasattr(item, 'dtype') and hasattr(item, 'shape'): + + # check number of dimensions, only support indexing with 1d array + if len(item.shape) > 1: + raise IndexError('can only index with 1-dimensional array') + + if item.dtype == bool: + + # check shape + if item.shape[0] != length: + raise IndexError('Boolean array has wrong length; expected %s, found %s' % + (length, item.shape[0])) + + return item + + else: + raise IndexError('TODO') + else: - raise TypeError('expected integer or slice, found: %r' % item) + raise TypeError('unsupported index item type: %r' % item) # noinspection PyTypeChecker @@ -197,7 +221,7 @@ def normalize_array_selection(item, shape): item = (item,) # handle ellipsis - n_ellipsis = sum(1 for i in item if i == Ellipsis) + n_ellipsis = sum(1 for i in item if i is Ellipsis) if n_ellipsis > 1: raise IndexError("an 
index can only have a single ellipsis ('...')") elif n_ellipsis == 1: @@ -228,14 +252,49 @@ def normalize_array_selection(item, shape): return selection -def get_chunk_range(selection, chunks): +def get_chunk_ranges(selection, chunks): """Convenience function to get a range over all chunk indices, for iterating over chunks.""" - chunk_range = [range(s.start//l, int(np.ceil(s.stop/l))) - if isinstance(s, slice) - else range(s//l, (s//l)+1) - for s, l in zip(selection, chunks)] - return chunk_range + + chunk_ranges = [] + out_shape = [] + + for item, chunk_length in zip(selection, chunks): + chunk_range = None + out_length = None + + if isinstance(item, int): + chunk_range = [item//chunk_length] + + elif isinstance(item, slice): + chunk_from = item.start//chunk_length + chunk_to = int(np.ceil(item.stop/chunk_length)) + chunk_range = range(chunk_from, chunk_to) + out_length = item.stop - item.start + + elif hasattr(item, 'dtype'): + if item.dtype == bool: + + # convert to indices to find chunks with nonzero values and skip chunks with no + # requested values + + # TODO profile this, try alternative strategies + indices = np.nonzero(item)[0] + chunk_range = np.unique(indices // chunk_length) + out_length = len(indices) + + elif item.dtype.kind in 'ui': + raise NotImplementedError('TODO') + + if chunk_range is None: + # should not happen + raise RuntimeError('could not determine chunk range') + + chunk_ranges.append(chunk_range) + if out_length is not None: + out_shape.append(out_length) + + return chunk_ranges, tuple(out_shape) def normalize_resize_args(old_shape, *args): From f45dc6a9e803910a347ae52cb891fed46ab19c92 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 29 Oct 2017 23:43:48 +0000 Subject: [PATCH 04/67] add 2d test --- zarr/tests/test_core.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 1dac539eca..c329140c99 100644 --- a/zarr/tests/test_core.py +++ 
b/zarr/tests/test_core.py @@ -724,6 +724,21 @@ def test_advanced_indexing_1d_bool(self): actual = z[ix] assert_array_equal(expect, actual) + def test_advanced_indexing_2d_bool(self): + + # setup + a = np.arange(10000, dtype=int).reshape(100, 100) + z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + for p in 0.9, 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + expect = a[np.ix_(ix0, ix1)] + actual = z[ix0, ix1] + assert_array_equal(expect, actual) + class TestArrayWithPath(TestArray): From 4ef758c67519075cf8d2140a33afeaaea1b0db20 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 29 Oct 2017 23:50:58 +0000 Subject: [PATCH 05/67] renaming --- zarr/tests/test_core.py | 6 ++++++ zarr/util.py | 40 ++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index c329140c99..de01c307e5 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -735,10 +735,16 @@ def test_advanced_indexing_2d_bool(self): for p in 0.9, 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + + # index both axes with bool array expect = a[np.ix_(ix0, ix1)] actual = z[ix0, ix1] assert_array_equal(expect, actual) + # TODO mixed indexing with bool array / slice + + # TODO mixed indexing with bool array / single index + class TestArrayWithPath(TestArray): diff --git a/zarr/util.py b/zarr/util.py index b485555812..cbbb951594 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -259,40 +259,40 @@ def get_chunk_ranges(selection, chunks): chunk_ranges = [] out_shape = [] - for item, chunk_length in zip(selection, chunks): - chunk_range = None - out_length = None + for dim_sel, dim_chunk_len in zip(selection, chunks): + dim_chunk_range = None + dim_out_len = None - 
if isinstance(item, int): - chunk_range = [item//chunk_length] + if isinstance(dim_sel, int): + dim_chunk_range = [dim_sel//dim_chunk_len] - elif isinstance(item, slice): - chunk_from = item.start//chunk_length - chunk_to = int(np.ceil(item.stop/chunk_length)) - chunk_range = range(chunk_from, chunk_to) - out_length = item.stop - item.start + elif isinstance(dim_sel, slice): + chunk_from = dim_sel.start//dim_chunk_len + chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) + dim_chunk_range = range(chunk_from, chunk_to) + dim_out_len = dim_sel.stop - dim_sel.start - elif hasattr(item, 'dtype'): - if item.dtype == bool: + elif hasattr(dim_sel, 'dtype'): + if dim_sel.dtype == bool: # convert to indices to find chunks with nonzero values and skip chunks with no # requested values # TODO profile this, try alternative strategies - indices = np.nonzero(item)[0] - chunk_range = np.unique(indices // chunk_length) - out_length = len(indices) + indices = np.nonzero(dim_sel)[0] + dim_chunk_range = np.unique(indices // dim_chunk_len) + dim_out_len = len(indices) - elif item.dtype.kind in 'ui': + elif dim_sel.dtype.kind in 'ui': raise NotImplementedError('TODO') - if chunk_range is None: + if dim_chunk_range is None: # should not happen raise RuntimeError('could not determine chunk range') - chunk_ranges.append(chunk_range) - if out_length is not None: - out_shape.append(out_length) + chunk_ranges.append(dim_chunk_range) + if dim_out_len is not None: + out_shape.append(dim_out_len) return chunk_ranges, tuple(out_shape) From 2b0aaaa8620c878c500b49a78c98cba8bae83f11 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 11:54:23 +0000 Subject: [PATCH 06/67] WIP boolean indexing --- zarr/core.py | 147 +++++++++++++++--------------- zarr/tests/test_core.py | 92 ++++++++++++++++++- zarr/tests/test_util.py | 78 ++++++++-------- zarr/util.py | 196 ++++++++++++++++++++++++---------------- 4 files changed, 323 insertions(+), 190 deletions(-) diff --git a/zarr/core.py 
b/zarr/core.py index 15cb21ebc9..78e150c469 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,9 +8,9 @@ import numpy as np -from zarr.util import is_total_slice, normalize_array_selection, get_chunk_ranges, \ +from zarr.util import is_total_slice, normalize_array_selection, get_chunks_for_selection, \ human_readable_size, normalize_resize_args, normalize_storage_path, normalize_shape, \ - normalize_chunks, InfoReporter + normalize_chunks, InfoReporter, BooleanSelection from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes @@ -497,30 +497,40 @@ def _getitem_nd(self, item): # selection. This minimises the nuimber of iterations in the main for loop. # normalize selection - selection = normalize_array_selection(item, self._shape) + selection = normalize_array_selection(item, self._shape, self._chunks) + + # figure out if we're doing advanced indexing + is_advanced_selection = any([not isinstance(dim_sel, (int, slice)) + for dim_sel in selection]) + + # axes that need to get squeezed out if doing advanced selection + squeeze_axes = None + if is_advanced_selection: + squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) + if isinstance(dim_sel, int)]) # determine indices of chunks overlapping the selection - chunk_ranges, out_shape = get_chunk_ranges(selection, self._chunks) + chunk_ranges, sel_shape = get_chunks_for_selection(selection, self._chunks) # setup output array - out = np.empty(out_shape, dtype=self._dtype, order=self._order) + out = np.empty(sel_shape, dtype=self._dtype, order=self._order) # iterate over chunks in range, i.e., chunks overlapping the selection - for chunk_indices in itertools.product(*chunk_ranges): + for chunk_coords in itertools.product(*chunk_ranges): - # chunk_indices holds the index along each dimension for the current chunk within the + # chunk_coords: holds the index along each dimension for the current chunk 
within the # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. chunk_selection = [] out_selection = [] # iterate over dimensions (axes) of the array - for dim, (dim_sel, dim_chunk_index, dim_chunk_length) \ - in enumerate(zip(selection, chunk_indices, self._chunks)): + for dim_sel, dim_chunk_idx, dim_chunk_len \ + in zip(selection, chunk_coords, self._chunks): # dim_sel: selection for current dimension - # dim_chunk_index: chunk index along current dimension - # dim_chunk_length: chunk length along current dimension + # dim_chunk_idx: chunk index along current dimension + # dim_chunk_len: chunk length along current dimension # selection for current chunk along current dimension dim_chunk_sel = None @@ -530,7 +540,7 @@ def _getitem_nd(self, item): # calculate offset for current chunk along current dimension - this is used to # determine the values to be extracted from the current chunk - dim_chunk_offset = dim_chunk_index * dim_chunk_length + dim_chunk_offset = dim_chunk_idx * dim_chunk_len # handle integer selection, i.e., single item if isinstance(dim_sel, int): @@ -553,9 +563,9 @@ def _getitem_nd(self, item): dim_chunk_sel_start = dim_sel.start - dim_chunk_offset dim_out_offset = 0 - if dim_sel.stop > dim_chunk_offset + dim_chunk_length: + if dim_sel.stop > dim_chunk_offset + dim_chunk_len: # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_length + dim_chunk_sel_stop = dim_chunk_len else: # selection ends within current chunk @@ -565,24 +575,15 @@ def _getitem_nd(self, item): dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + elif isinstance(dim_sel, BooleanSelection): - elif hasattr(dim_sel, 'dtype'): - - # handle boolean array selection - if dim_sel.dtype == bool: - - # pull out a slice of the boolean indexing array for the current chunk - dim_chunk_sel = \ - dim_sel[dim_chunk_offset:dim_chunk_offset + dim_chunk_length] - - # 
figure out where to put these items in the output array - # TODO possible bottleneck here with many calls to count_nonzero - dim_out_offset = np.count_nonzero(dim_sel[:dim_chunk_offset]) - dim_chunk_nitems = np.count_nonzero(dim_chunk_sel) - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + # pull out a slice of the boolean indexing array for the current chunk + dim_chunk_sel = dim_sel[dim_chunk_offset:dim_chunk_offset + dim_chunk_len] - else: - raise RuntimeError('unexpected selection type') + # figure out where to put these items in the output array + dim_out_offset = dim_sel.get_sel_offset(dim_chunk_idx) + dim_chunk_nitems = dim_sel.get_chunk_nitems(dim_chunk_idx) + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) else: raise RuntimeError('unexpected selection type') @@ -596,30 +597,28 @@ def _getitem_nd(self, item): # normalise for indexing into numpy arrays chunk_selection = tuple(chunk_selection) - # handle indexing arrays orthogonally - if any([hasattr(dim_chunk_sel, 'dtype') for dim_chunk_sel in chunk_selection]): - squeeze_axes = [i for i, dim_chunk_sel in enumerate(chunk_selection) - if isinstance(dim_chunk_sel, int)] + out_selection = tuple(out_selection) + + # handle advanced indexing arrays orthogonally + if is_advanced_selection: + # numpy doesn't support orthogonal indexing directly as yet, so need to work + # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices + # or integers, so need to convert slices and integers into ranges. 
chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) if isinstance(dim_chunk_sel, slice) else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) else dim_chunk_sel for dim_chunk_sel in chunk_selection] chunk_selection = np.ix_(*chunk_selection) - else: - squeeze_axes = None - out_selection = tuple(out_selection) # obtain the destination array as a view of the output array if out_selection: dest = out[out_selection] - if squeeze_axes: - dest = dest.squeeze(axis=squeeze_axes) else: dest = out # load chunk selection into output array - self._chunk_getitem(chunk_indices, chunk_selection, dest) + self._chunk_getitem(chunk_coords, chunk_selection, dest, squeeze_axes) if out.shape: return out @@ -724,28 +723,28 @@ def _setitem_nd(self, item, value): # implementation of __setitem__ for array with at least one dimension # normalize selection - selection = normalize_array_selection(item, self._shape) + selection = normalize_array_selection(item, self._shape, self._chunks) + + # determine indices of chunks overlapping the selection + chunk_ranges, sel_shape = get_chunks_for_selection(selection, self._chunks) # check value shape - expected_shape = tuple( - s.stop - s.start for s in selection - if isinstance(s, slice) - ) if np.isscalar(value): pass - elif expected_shape != value.shape: + elif sel_shape != value.shape: + # TODO IndexError? what does numpy do? 
raise ValueError('value has wrong shape; expected %s, found %s' - % (str(expected_shape), + % (str(sel_shape), str(value.shape))) - # determine indices of chunks overlapping the selection - chunk_range, _ = get_chunk_ranges(selection, self._chunks) - # iterate over chunks in range - for cidx in itertools.product(*chunk_range): + for chunk_coords in itertools.product(*chunk_ranges): + # TODO refactor code for computing input and output selection for current chunk - + # shared with __getitem__ + # determine chunk offset - offset = [i * c for i, c in zip(cidx, self._chunks)] + offset = [i * c for i, c in zip(chunk_coords, self._chunks)] # determine required index range within chunk chunk_selection = tuple( @@ -758,7 +757,7 @@ def _setitem_nd(self, item, value): if np.isscalar(value): # put data - self._chunk_setitem(cidx, chunk_selection, value) + self._chunk_setitem(chunk_coords, chunk_selection, value) else: # assume value is array-like @@ -772,26 +771,28 @@ def _setitem_nd(self, item, value): ) # put data - self._chunk_setitem(cidx, chunk_selection, value[value_selection]) + self._chunk_setitem(chunk_coords, chunk_selection, value[value_selection]) - def _chunk_getitem(self, cidx, item, dest): + def _chunk_getitem(self, chunk_coords, chunk_selection, dest, squeeze_axes=None): """Obtain part or whole of a chunk. Parameters ---------- - cidx : tuple of ints + chunk_coords : tuple of ints Indices of the chunk. - item : tuple of slices + chunk_selection : tuple of slices Location of region within the chunk. dest : ndarray Numpy array to store result in. 
+ squeeze_axes : tuple of ints + Axes to squeeze out of the chunk before """ try: # obtain compressed data for chunk - ckey = self._chunk_key(cidx) + ckey = self._chunk_key(chunk_coords) cdata = self.chunk_store[ckey] except KeyError: @@ -802,7 +803,7 @@ def _chunk_getitem(self, cidx, item, dest): else: - if is_total_slice(item, self._chunks) and \ + if is_total_slice(chunk_selection, self._chunks) and \ not self._filters and \ ((self._order == 'C' and dest.flags.c_contiguous) or (self._order == 'F' and dest.flags.f_contiguous)): @@ -825,20 +826,22 @@ def _chunk_getitem(self, cidx, item, dest): # set data in output array # (split into two lines for profiling) - tmp = chunk[item] + tmp = chunk[chunk_selection] + if squeeze_axes: + tmp = np.squeeze(tmp, axis=squeeze_axes) if dest.shape: dest[:] = tmp else: dest[()] = tmp - def _chunk_setitem(self, cidx, item, value): + def _chunk_setitem(self, chunk_coords, chunk_selection, value): """Replace part or whole of a chunk. Parameters ---------- - cidx : tuple of ints + chunk_coords : tuple of ints Indices of the chunk. - item : tuple of slices + chunk_selection : tuple of slices Location of region within the chunk. value : scalar or ndarray Value to set. 
@@ -847,19 +850,19 @@ def _chunk_setitem(self, cidx, item, value): # synchronization if self._synchronizer is None: - self._chunk_setitem_nosync(cidx, item, value) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) else: # synchronize on the chunk - ckey = self._chunk_key(cidx) + ckey = self._chunk_key(chunk_coords) with self._synchronizer[ckey]: - self._chunk_setitem_nosync(cidx, item, value) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) - def _chunk_setitem_nosync(self, cidx, item, value): + def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value): # obtain key for chunk storage - ckey = self._chunk_key(cidx) + ckey = self._chunk_key(chunk_coords) - if is_total_slice(item, self._chunks): + if is_total_slice(chunk_selection, self._chunks): # totally replace chunk # optimization: we are completely replacing the chunk, so no need @@ -912,7 +915,7 @@ def _chunk_setitem_nosync(self, cidx, item, value): chunk = chunk.copy(order='K') # modify - chunk[item] = value + chunk[chunk_selection] = value # encode chunk cdata = self._encode_chunk(chunk) @@ -920,8 +923,8 @@ def _chunk_setitem_nosync(self, cidx, item, value): # store self.chunk_store[ckey] = cdata - def _chunk_key(self, cidx): - return self._key_prefix + '.'.join(map(str, cidx)) + def _chunk_key(self, chunk_coords): + return self._key_prefix + '.'.join(map(str, chunk_coords)) def _decode_chunk(self, cdata): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index de01c307e5..29a4d84176 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -718,6 +718,7 @@ def test_advanced_indexing_1d_bool(self): z[:] = a np.random.seed(42) + # test with different degrees of sparseness for p in 0.9, 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) expect = a[ix] @@ -732,6 +733,7 @@ def test_advanced_indexing_2d_bool(self): z[:] = a np.random.seed(42) + # test with different degrees of sparseness for p in 0.9, 0.5, 0.1, 0.01: 
ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) @@ -741,9 +743,95 @@ def test_advanced_indexing_2d_bool(self): actual = z[ix0, ix1] assert_array_equal(expect, actual) - # TODO mixed indexing with bool array / slice + # mixed indexing with bool array / slice + expect = a[ix0, 15:35] + actual = z[ix0, 15:35] + assert_array_equal(expect, actual) + expect = a[15:35, ix1] + actual = z[15:35, ix1] + assert_array_equal(expect, actual) + + # mixed indexing with bool array / single index + expect = a[ix0, 42] + actual = z[ix0, 42] + assert_array_equal(expect, actual) + expect = a[42, ix1] + actual = z[42, ix1] + assert_array_equal(expect, actual) + + def test_advanced_indexing_3d_bool(self): + + # setup + a = np.arange(1000000, dtype=int).reshape(100, 100, 100) + z = self.create_array(shape=a.shape, chunks=(10, 10, 10), dtype=a.dtype) + z[:] = a - # TODO mixed indexing with bool array / single index + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, p, size=a.shape[2]).astype(bool) + + # index all axes with bool array + expect = a[np.ix_(ix0, ix1, ix2)] + actual = z[ix0, ix1, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single bool array / slices + expect = a[ix0, 15:35, 25:45] + actual = z[ix0, 15:35, 25:45] + assert_array_equal(expect, actual) + expect = a[15:35, ix1, 25:45] + actual = z[15:35, ix1, 25:45] + assert_array_equal(expect, actual) + expect = a[15:35, 25:45, ix2] + actual = z[15:35, 25:45, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single bool array / single index + expect = a[ix0, 42, 84] + actual = z[ix0, 42, 84] + assert_array_equal(expect, actual) + expect = a[42, ix1, 84] + actual = z[42, ix1, 84] + assert_array_equal(expect, actual) + 
expect = a[42, 84, ix2] + actual = z[42, 84, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single bool array / slice / single index + expect = a[ix0, 15:35, 42] + actual = z[ix0, 15:35, 42] + assert_array_equal(expect, actual) + expect = a[42, ix1, 25:45] + actual = z[42, ix1, 25:45] + assert_array_equal(expect, actual) + expect = a[15:35, 42, ix2] + actual = z[15:35, 42, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with two bool array / slice + expect = a[np.ix_(ix0, ix1, range(25, 45))] + actual = z[ix0, ix1, 25:45] + assert_array_equal(expect, actual) + expect = a[np.ix_(range(15, 35), ix1, ix2)] + actual = z[15:35, ix1, ix2] + assert_array_equal(expect, actual) + expect = a[np.ix_(ix0, range(25, 45), ix2)] + actual = z[ix0, 25:45, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with two bool array / integer + expect = a[np.ix_(ix0, ix1, [42])].squeeze(axis=2) + actual = z[ix0, ix1, 42] + assert_array_equal(expect, actual) + expect = a[np.ix_([42], ix1, ix2)].squeeze(axis=0) + actual = z[42, ix1, ix2] + assert_array_equal(expect, actual) + expect = a[np.ix_(ix0, [42], ix2)].squeeze(axis=1) + actual = z[ix0, 42, ix2] + assert_array_equal(expect, actual) class TestArrayWithPath(TestArray): diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index dcb02af969..aa9950caef 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -5,7 +5,7 @@ from nose.tools import eq_ as eq, assert_raises, assert_true, assert_false, \ assert_is_instance -from zarr.util import normalize_shape, normalize_chunks, is_total_slice, normalize_axis_selection, \ +from zarr.util import normalize_shape, normalize_chunks, is_total_slice, normalize_dim_selection, \ normalize_array_selection, normalize_resize_args, human_readable_size, normalize_order, \ guess_chunks, info_html_report, info_text_report @@ -69,79 +69,79 @@ def test_is_total_slice(): def test_normalize_axis_selection(): # single item - eq(1, 
normalize_axis_selection(1, 100)) - eq(99, normalize_axis_selection(-1, 100)) + eq(1, normalize_dim_selection(1, 100, 10)) + eq(99, normalize_dim_selection(-1, 100, 10)) with assert_raises(IndexError): - normalize_axis_selection(100, 100) + normalize_dim_selection(100, 100, 10) with assert_raises(IndexError): - normalize_axis_selection(1000, 100) + normalize_dim_selection(1000, 100, 10) with assert_raises(IndexError): - normalize_axis_selection(-1000, 100) + normalize_dim_selection(-1000, 100, 10) # slice - eq(slice(0, 100), normalize_axis_selection(slice(None), 100)) - eq(slice(0, 100), normalize_axis_selection(slice(None, 100), 100)) - eq(slice(0, 100), normalize_axis_selection(slice(0, None), 100)) - eq(slice(0, 100), normalize_axis_selection(slice(0, 1000), 100)) - eq(slice(99, 100), normalize_axis_selection(slice(-1, None), 100)) - eq(slice(98, 99), normalize_axis_selection(slice(-2, -1), 100)) - eq(slice(10, 10), normalize_axis_selection(slice(10, 0), 100)) + eq(slice(0, 100), normalize_dim_selection(slice(None), 100, 10)) + eq(slice(0, 100), normalize_dim_selection(slice(None, 100), 100, 10)) + eq(slice(0, 100), normalize_dim_selection(slice(0, None), 100, 10)) + eq(slice(0, 100), normalize_dim_selection(slice(0, 1000), 100, 10)) + eq(slice(99, 100), normalize_dim_selection(slice(-1, None), 100, 10)) + eq(slice(98, 99), normalize_dim_selection(slice(-2, -1), 100, 10)) + eq(slice(10, 10), normalize_dim_selection(slice(10, 0), 100, 10)) with assert_raises(IndexError): - normalize_axis_selection(slice(100, None), 100) + normalize_dim_selection(slice(100, None), 100, 10) with assert_raises(IndexError): - normalize_axis_selection(slice(1000, 2000), 100) + normalize_dim_selection(slice(1000, 2000), 100, 10) with assert_raises(IndexError): - normalize_axis_selection(slice(-1000, 0), 100) + normalize_dim_selection(slice(-1000, 0), 100, 10) with assert_raises(TypeError): - normalize_axis_selection('foo', 100) + normalize_dim_selection('foo', 100, 10) with 
assert_raises(NotImplementedError): - normalize_axis_selection(slice(0, 100, 2), 100) + normalize_dim_selection(slice(0, 100, 2), 100, 10) def test_normalize_array_selection(): # 1D, single item - eq((0,), normalize_array_selection(0, (100,))) + eq((0,), normalize_array_selection(0, (100,), (10,))) # 1D, slice - eq((slice(0, 100),), normalize_array_selection(Ellipsis, (100,))) - eq((slice(0, 100),), normalize_array_selection(slice(None), (100,))) - eq((slice(0, 100),), normalize_array_selection(slice(None, 100), (100,))) - eq((slice(0, 100),), normalize_array_selection(slice(0, None), (100,))) - eq((slice(0, 100),), normalize_array_selection((slice(None), Ellipsis), (100,))) - eq((slice(0, 100),), normalize_array_selection((Ellipsis, slice(None)), (100,))) + eq((slice(0, 100),), normalize_array_selection(Ellipsis, (100,), (10,))) + eq((slice(0, 100),), normalize_array_selection(slice(None), (100,), (10,))) + eq((slice(0, 100),), normalize_array_selection(slice(None, 100), (100,), (10,))) + eq((slice(0, 100),), normalize_array_selection(slice(0, None), (100,), (10,))) + eq((slice(0, 100),), normalize_array_selection((slice(None), Ellipsis), (100,), (10,))) + eq((slice(0, 100),), normalize_array_selection((Ellipsis, slice(None)), (100,), (10,))) # 2D, single item - eq((0, 0), normalize_array_selection((0, 0), (100, 100))) - eq((99, 1), normalize_array_selection((-1, 1), (100, 100))) + eq((0, 0), normalize_array_selection((0, 0), (100, 100), (10, 10))) + eq((99, 1), normalize_array_selection((-1, 1), (100, 100), (10, 10))) # 2D, single col/row - eq((0, slice(0, 100)), normalize_array_selection((0, slice(None)), (100, 100))) - eq((0, slice(0, 100)), normalize_array_selection((0,), (100, 100))) - eq((slice(0, 100), 0), normalize_array_selection((slice(None), 0), (100, 100))) + eq((0, slice(0, 100)), normalize_array_selection((0, slice(None)), (100, 100), (10, 10))) + eq((0, slice(0, 100)), normalize_array_selection((0,), (100, 100), (10, 10))) + eq((slice(0, 100), 0), 
normalize_array_selection((slice(None), 0), (100, 100), (10, 10))) # 2D slice eq((slice(0, 100), slice(0, 100)), - normalize_array_selection(Ellipsis, (100, 100))) + normalize_array_selection(Ellipsis, (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection(slice(None), (100, 100))) + normalize_array_selection(slice(None), (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), slice(None)), (100, 100))) + normalize_array_selection((slice(None), slice(None)), (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((Ellipsis, slice(None)), (100, 100))) + normalize_array_selection((Ellipsis, slice(None)), (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), Ellipsis), (100, 100))) + normalize_array_selection((slice(None), Ellipsis), (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), Ellipsis, slice(None)), (100, 100))) + normalize_array_selection((slice(None), Ellipsis, slice(None)), (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((Ellipsis, slice(None), slice(None)), (100, 100))) + normalize_array_selection((Ellipsis, slice(None), slice(None)), (100, 100), (10, 10))) eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), slice(None), Ellipsis), (100, 100))) + normalize_array_selection((slice(None), slice(None), Ellipsis), (100, 100), (10, 10))) with assert_raises(TypeError): - normalize_array_selection('foo', (100,)) + normalize_array_selection('foo', (100,), (10,)) def test_normalize_resize_args(): diff --git a/zarr/util.py b/zarr/util.py index cbbb951594..25f9e1f97a 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -3,6 +3,7 @@ import operator from textwrap import TextWrapper import numbers +import functools import numpy as np @@ -134,116 +135,153 @@ def is_total_slice(item, shape): raise TypeError('expected 
slice or tuple of slices, found %r' % item) -def normalize_axis_selection(item, length): +class BooleanSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # check number of dimensions, only support indexing with 1d array + if len(dim_sel.shape) > 1: + raise IndexError('can only index with 1-dimensional Boolean array') + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError('Boolean array has wrong length; expected %s, found %s' % + (dim_len, dim_sel.shape[0])) + + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + + def __getitem__(self, item): + return self.dim_sel[item] + + @functools.lru_cache(maxsize=None) + def get_chunk_nitems(self, dim_chunk_idx): + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + return np.count_nonzero(dim_chunk_sel) + + @functools.lru_cache(maxsize=None) + def get_nitems(self): + return sum(self.get_chunk_nitems(i) for i in range(self.nchunks)) + + @functools.lru_cache(maxsize=None) + def get_sel_offset(self, dim_chunk_idx): + if dim_chunk_idx == 0: + return 0 + else: + return self.get_sel_offset(dim_chunk_idx - 1) + self.get_chunk_nitems(dim_chunk_idx - 1) + + def get_chunk_ranges(self): + for dim_chunk_idx in range(self.nchunks): + nitems = self.get_chunk_nitems(dim_chunk_idx) + if nitems: + yield dim_chunk_idx + + +def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): """Convenience function to normalize a selection within a single axis of size `l`.""" # normalize list to array - if isinstance(item, list): - item = np.asarray(item) + if isinstance(dim_sel, list): + dim_sel = np.asarray(dim_sel) - if isinstance(item, numbers.Integral): - item = int(item) + if isinstance(dim_sel, numbers.Integral): + + # normalize type to int + dim_sel = int(dim_sel) # handle wraparound - if item < 0: - item = 
length + item + if dim_sel < 0: + dim_sel = dim_len + dim_sel # handle out of bounds - if item >= length or item < 0: - raise IndexError('index out of bounds: %s' % item) + if dim_sel >= dim_len or dim_sel < 0: + raise IndexError('index out of bounds: %s' % dim_sel) - return item + return dim_sel - elif isinstance(item, slice): + elif isinstance(dim_sel, slice): # handle slice with step - if item.step is not None and item.step != 1: + if dim_sel.step is not None and dim_sel.step != 1: raise NotImplementedError('slice with step not implemented') # handle slice with None bound - start = 0 if item.start is None else item.start - stop = length if item.stop is None else item.stop + start = 0 if dim_sel.start is None else dim_sel.start + stop = dim_len if dim_sel.stop is None else dim_sel.stop # handle wraparound if start < 0: - start = length + start + start = dim_len + start if stop < 0: - stop = length + stop + stop = dim_len + stop # handle zero-length axis - if start == stop == length == 0: + if start == stop == dim_len == 0: return slice(0, 0) # handle out of bounds if start < 0: - raise IndexError('start index out of bounds: %s' % item.start) + raise IndexError('start index out of bounds: %s' % dim_sel.start) if stop < 0: - raise IndexError('stop index out of bounds: %s' % item.stop) - if start >= length: - raise IndexError('start index out of bounds: %ss' % item.start) - if stop > length: - stop = length + raise IndexError('stop index out of bounds: %s' % dim_sel.stop) + if start >= dim_len: + raise IndexError('start index out of bounds: %ss' % dim_sel.start) + if stop > dim_len: + stop = dim_len if stop < start: stop = start return slice(start, stop) - elif hasattr(item, 'dtype') and hasattr(item, 'shape'): - - # check number of dimensions, only support indexing with 1d array - if len(item.shape) > 1: - raise IndexError('can only index with 1-dimensional array') - - if item.dtype == bool: - - # check shape - if item.shape[0] != length: - raise 
IndexError('Boolean array has wrong length; expected %s, found %s' % - (length, item.shape[0])) + elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): - return item + if dim_sel.dtype == bool: + return BooleanSelection(dim_sel, dim_len, dim_chunk_len) else: raise IndexError('TODO') else: - raise TypeError('unsupported index item type: %r' % item) + raise TypeError('unsupported index item type: %r' % dim_sel) # noinspection PyTypeChecker -def normalize_array_selection(item, shape): +def normalize_array_selection(selection, shape, chunks): """Convenience function to normalize a selection within an array with the given `shape`.""" # ensure tuple - if not isinstance(item, tuple): - item = (item,) + if not isinstance(selection, tuple): + selection = (selection,) # handle ellipsis - n_ellipsis = sum(1 for i in item if i is Ellipsis) + n_ellipsis = sum(1 for i in selection if i is Ellipsis) if n_ellipsis > 1: raise IndexError("an index can only have a single ellipsis ('...')") elif n_ellipsis == 1: - n_items_l = item.index(Ellipsis) # items to left of ellipsis - n_items_r = len(item) - (n_items_l + 1) # items to right of ellipsis - n_items = len(item) - 1 # all non-ellipsis items + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items if n_items >= len(shape): # ellipsis does nothing, just remove it - item = tuple(i for i in item if i != Ellipsis) + selection = tuple(i for i in selection if i != Ellipsis) else: # replace ellipsis with as many slices are needed for number of dims - new_item = item[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) if n_items_r: - new_item += item[-n_items_r:] - item = new_item + new_item += selection[-n_items_r:] + selection = new_item # check dimensionality - if len(item) > len(shape): + if len(selection) 
> len(shape): raise IndexError('too many indices for array') # determine start and stop indices for all axes - selection = tuple(normalize_axis_selection(i, l) for i, l in zip(item, shape)) + selection = tuple(normalize_dim_selection(i, l, c) for i, l, c in zip(selection, shape, chunks)) # fill out selection if not completely specified if len(selection) < len(shape): @@ -252,49 +290,53 @@ def normalize_array_selection(item, shape): return selection -def get_chunk_ranges(selection, chunks): - """Convenience function to get a range over all chunk indices, - for iterating over chunks.""" +def get_chunks_for_selection(selection, chunks): + """Convenience function to find chunks overlapping an array selection. N.B., + assumes selection has already been normalized.""" + # indices of chunks overlapping the selection chunk_ranges = [] - out_shape = [] + # shape of the selection + sel_shape = [] + + # iterate over dimensions of the array for dim_sel, dim_chunk_len in zip(selection, chunks): - dim_chunk_range = None - dim_out_len = None + + # dim_sel: selection for current dimension + # dim_chunk_len: length of chunk along current dimension + + dim_sel_len = None if isinstance(dim_sel, int): + + # dim selection is an integer, i.e., single item, so only need single chunk index for + # this dimension dim_chunk_range = [dim_sel//dim_chunk_len] elif isinstance(dim_sel, slice): - chunk_from = dim_sel.start//dim_chunk_len - chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) - dim_chunk_range = range(chunk_from, chunk_to) - dim_out_len = dim_sel.stop - dim_sel.start - elif hasattr(dim_sel, 'dtype'): - if dim_sel.dtype == bool: + # dim selection is a slice, need range of chunk indices including start and stop of + # selection + dim_chunk_from = dim_sel.start//dim_chunk_len + dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) + dim_chunk_range = range(dim_chunk_from, dim_chunk_to) + dim_sel_len = dim_sel.stop - dim_sel.start - # convert to indices to find chunks with nonzero 
values and skip chunks with no - # requested values + elif isinstance(dim_sel, BooleanSelection): - # TODO profile this, try alternative strategies - indices = np.nonzero(dim_sel)[0] - dim_chunk_range = np.unique(indices // dim_chunk_len) - dim_out_len = len(indices) + # dim selection is a boolean array, delegate this to the BooleanSelection class + dim_chunk_range = dim_sel.get_chunk_ranges() + dim_sel_len = dim_sel.get_nitems() - elif dim_sel.dtype.kind in 'ui': - raise NotImplementedError('TODO') - - if dim_chunk_range is None: - # should not happen - raise RuntimeError('could not determine chunk range') + else: + raise RuntimeError('unexpected selection type') chunk_ranges.append(dim_chunk_range) - if dim_out_len is not None: - out_shape.append(dim_out_len) + if dim_sel_len is not None: + sel_shape.append(dim_sel_len) - return chunk_ranges, tuple(out_shape) + return chunk_ranges, tuple(sel_shape) def normalize_resize_args(old_shape, *args): From b9cda15c54f0fb6fe21d34ead77c7df0434467e1 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 14:47:49 +0000 Subject: [PATCH 07/67] some refactoring --- zarr/core.py | 8 +++----- zarr/util.py | 25 +++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 78e150c469..0dbcafe8f4 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -578,12 +578,10 @@ def _getitem_nd(self, item): elif isinstance(dim_sel, BooleanSelection): # pull out a slice of the boolean indexing array for the current chunk - dim_chunk_sel = dim_sel[dim_chunk_offset:dim_chunk_offset + dim_chunk_len] + dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) # figure out where to put these items in the output array - dim_out_offset = dim_sel.get_sel_offset(dim_chunk_idx) - dim_chunk_nitems = dim_sel.get_chunk_nitems(dim_chunk_idx) - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) else: raise 
RuntimeError('unexpected selection type') @@ -742,7 +740,7 @@ def _setitem_nd(self, item, value): # TODO refactor code for computing input and output selection for current chunk - # shared with __getitem__ - + # determine chunk offset offset = [i * c for i, c in zip(chunk_coords, self._chunks)] diff --git a/zarr/util.py b/zarr/util.py index 25f9e1f97a..ff8188a389 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -156,10 +156,18 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): def __getitem__(self, item): return self.dim_sel[item] + def get_chunk_sel(self, dim_chunk_idx): + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + return self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + + def get_out_sel(self, dim_chunk_idx): + dim_out_offset = self.get_sel_offset(dim_chunk_idx) + dim_chunk_nitems = self.get_chunk_nitems(dim_chunk_idx) + return slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + @functools.lru_cache(maxsize=None) def get_chunk_nitems(self, dim_chunk_idx): - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + dim_chunk_sel = self.get_chunk_sel(dim_chunk_idx) return np.count_nonzero(dim_chunk_sel) @functools.lru_cache(maxsize=None) @@ -180,6 +188,19 @@ def get_chunk_ranges(self): yield dim_chunk_idx +class IntegerSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # TODO validate dim_sel + + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + + + def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): """Convenience function to normalize a selection within a single axis of size `l`.""" From 5819a84783d355dc569fc1df44e88e74c4dc3ff4 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 15:01:59 +0000 Subject: [PATCH 08/67] WIP integer selection --- zarr/util.py | 39 
++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/zarr/util.py b/zarr/util.py index ff8188a389..f64ff9ae29 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -192,13 +192,50 @@ class IntegerSelection(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): - # TODO validate dim_sel + # has to be a numpy array so we can do bincount + dim_sel = np.asanyarray(dim_sel) + + # TODO handle wraparound + + # TODO validate dim_sel - out of bounds; monotonically increasing self.dim_sel = dim_sel self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + # precompute some useful stuff + self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + def get_chunk_sel(self, dim_chunk_idx): + # need to slice out relevant indices from the total selection, then subtract the chunk + # offset + + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + + if dim_chunk_idx == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] + stop = start + self.chunk_nitems[dim_chunk_idx] + dim_chunk_sel = self.dim_sel[start:stop] - dim_chunk_offset + + return dim_chunk_sel + + def get_out_sel(self, dim_chunk_idx): + if dim_chunk_idx == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] + stop = start + self.chunk_nitems[dim_chunk_idx] + return slice(start, stop) + + def get_chunk_ranges(self): + return np.nonzero(self.chunk_nitems)[0] + + +# TODO support slice with step via integer selection (convert to np.arange) def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): From 9d1ea3917d4f80d41676e8cc5ef710c0fbd930e5 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 15:08:05 +0000 Subject: [PATCH 09/67] refactoring --- zarr/util.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git 
a/zarr/util.py b/zarr/util.py index f64ff9ae29..773b04898e 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -197,7 +197,9 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # TODO handle wraparound - # TODO validate dim_sel - out of bounds; monotonically increasing + # TODO validate dim_sel + # TODO check out of bounds + # TODO validate monotonically increasing self.dim_sel = dim_sel self.dim_len = dim_len @@ -213,13 +215,8 @@ def get_chunk_sel(self, dim_chunk_idx): # offset dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - - if dim_chunk_idx == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] - stop = start + self.chunk_nitems[dim_chunk_idx] - dim_chunk_sel = self.dim_sel[start:stop] - dim_chunk_offset + dim_out_sel = self.get_out_sel(dim_chunk_idx) + dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_chunk_offset return dim_chunk_sel @@ -228,7 +225,7 @@ def get_out_sel(self, dim_chunk_idx): start = 0 else: start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] - stop = start + self.chunk_nitems[dim_chunk_idx] + stop = self.chunk_nitems_cumsum[dim_chunk_idx] return slice(start, stop) def get_chunk_ranges(self): From d0abae07c55ae7be6f302cb6b93150c6a4ce0d15 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 15:10:05 +0000 Subject: [PATCH 10/67] doco --- zarr/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/util.py b/zarr/util.py index 773b04898e..f0a6bcb434 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -237,7 +237,7 @@ def get_chunk_ranges(self): def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): """Convenience function to normalize a selection within a single axis - of size `l`.""" + of size `dim_len` for an array with chunk length `dim_chunk_len`.""" # normalize list to array if isinstance(dim_sel, list): From 4d0e7aee02f800c1f885ed1860ed5c55b414b921 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 17:08:17 +0000 Subject: [PATCH 11/67] initial 
implementation of integer indexing --- zarr/core.py | 6 +- zarr/tests/test_core.py | 139 ++++++++++++++++++++++++++++++++++++++++ zarr/util.py | 16 ++++- 3 files changed, 157 insertions(+), 4 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 0dbcafe8f4..986548426a 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -10,7 +10,7 @@ from zarr.util import is_total_slice, normalize_array_selection, get_chunks_for_selection, \ human_readable_size, normalize_resize_args, normalize_storage_path, normalize_shape, \ - normalize_chunks, InfoReporter, BooleanSelection + normalize_chunks, InfoReporter, BooleanSelection, IntegerSelection from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes @@ -575,9 +575,9 @@ def _getitem_nd(self, item): dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - elif isinstance(dim_sel, BooleanSelection): + elif isinstance(dim_sel, (BooleanSelection, IntegerSelection)): - # pull out a slice of the boolean indexing array for the current chunk + # get selection to extract data for the current chunk dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) # figure out where to put these items in the output array diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 29a4d84176..16b762d270 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -725,6 +725,26 @@ def test_advanced_indexing_1d_bool(self): actual = z[ix] assert_array_equal(expect, actual) + # TODO test errors + + def test_advanced_indexing_1d_int(self): + + # setup + a = np.arange(1050, dtype=int) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix = np.nonzero(ix)[0] + expect = 
a[ix] + actual = z[ix] + assert_array_equal(expect, actual) + + # TODO test errors + def test_advanced_indexing_2d_bool(self): # setup @@ -759,6 +779,46 @@ def test_advanced_indexing_2d_bool(self): actual = z[42, ix1] assert_array_equal(expect, actual) + # TODO test errors + + def test_advanced_indexing_2d_int(self): + + # setup + a = np.arange(10000, dtype=int).reshape(100, 100) + z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix0 = np.nonzero(ix0)[0] + ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + ix1 = np.nonzero(ix1)[0] + + # index both axes with int array + expect = a[np.ix_(ix0, ix1)] + actual = z[ix0, ix1] + assert_array_equal(expect, actual) + + # mixed indexing with int array / slice + expect = a[ix0, 15:35] + actual = z[ix0, 15:35] + assert_array_equal(expect, actual) + expect = a[15:35, ix1] + actual = z[15:35, ix1] + assert_array_equal(expect, actual) + + # mixed indexing with int array / single index + expect = a[ix0, 42] + actual = z[ix0, 42] + assert_array_equal(expect, actual) + expect = a[42, ix1] + actual = z[42, ix1] + assert_array_equal(expect, actual) + + # TODO test errors + def test_advanced_indexing_3d_bool(self): # setup @@ -833,6 +893,85 @@ def test_advanced_indexing_3d_bool(self): actual = z[ix0, 42, ix2] assert_array_equal(expect, actual) + def test_advanced_indexing_3d_int(self): + + # setup + a = np.arange(1000000, dtype=int).reshape(100, 100, 100) + z = self.create_array(shape=a.shape, chunks=(10, 10, 10), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix0 = np.nonzero(ix0)[0] + ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + ix1 = np.nonzero(ix1)[0] + ix2 = 
np.random.binomial(1, p, size=a.shape[2]).astype(bool) + ix2 = np.nonzero(ix2)[0] + + # index all axes with int array + expect = a[np.ix_(ix0, ix1, ix2)] + actual = z[ix0, ix1, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single int array / slices + expect = a[ix0, 15:35, 25:45] + actual = z[ix0, 15:35, 25:45] + assert_array_equal(expect, actual) + expect = a[15:35, ix1, 25:45] + actual = z[15:35, ix1, 25:45] + assert_array_equal(expect, actual) + expect = a[15:35, 25:45, ix2] + actual = z[15:35, 25:45, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single int array / single index + expect = a[ix0, 42, 84] + actual = z[ix0, 42, 84] + assert_array_equal(expect, actual) + expect = a[42, ix1, 84] + actual = z[42, ix1, 84] + assert_array_equal(expect, actual) + expect = a[42, 84, ix2] + actual = z[42, 84, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single int array / slice / single index + expect = a[ix0, 15:35, 42] + actual = z[ix0, 15:35, 42] + assert_array_equal(expect, actual) + expect = a[42, ix1, 25:45] + actual = z[42, ix1, 25:45] + assert_array_equal(expect, actual) + expect = a[15:35, 42, ix2] + actual = z[15:35, 42, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with two int array / slice + expect = a[np.ix_(ix0, ix1, range(25, 45))] + actual = z[ix0, ix1, 25:45] + assert_array_equal(expect, actual) + expect = a[np.ix_(range(15, 35), ix1, ix2)] + actual = z[15:35, ix1, ix2] + assert_array_equal(expect, actual) + expect = a[np.ix_(ix0, range(25, 45), ix2)] + actual = z[ix0, 25:45, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with two int array / integer + expect = a[np.ix_(ix0, ix1, [42])].squeeze(axis=2) + actual = z[ix0, ix1, 42] + assert_array_equal(expect, actual) + expect = a[np.ix_([42], ix1, ix2)].squeeze(axis=0) + actual = z[42, ix1, ix2] + assert_array_equal(expect, actual) + expect = a[np.ix_(ix0, [42], ix2)].squeeze(axis=1) + actual = z[ix0, 42, 
ix2] + assert_array_equal(expect, actual) + + # TODO test advanced indexing with __setitem__ + class TestArrayWithPath(TestArray): diff --git a/zarr/util.py b/zarr/util.py index f0a6bcb434..b08abf32bb 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -231,6 +231,9 @@ def get_out_sel(self, dim_chunk_idx): def get_chunk_ranges(self): return np.nonzero(self.chunk_nitems)[0] + def get_nitems(self): + return len(self.dim_sel) + # TODO support slice with step via integer selection (convert to np.arange) @@ -297,10 +300,15 @@ def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): if dim_sel.dtype == bool: return BooleanSelection(dim_sel, dim_len, dim_chunk_len) + elif dim_sel.dtype.kind in 'ui': + return IntegerSelection(dim_sel, dim_len, dim_chunk_len) + else: - raise IndexError('TODO') + # TODO IndexError? + raise TypeError('unsupported index item type: %r' % dim_sel) else: + # TODO IndexError? raise TypeError('unsupported index item type: %r' % dim_sel) @@ -384,6 +392,12 @@ def get_chunks_for_selection(selection, chunks): dim_chunk_range = dim_sel.get_chunk_ranges() dim_sel_len = dim_sel.get_nitems() + elif isinstance(dim_sel, IntegerSelection): + + # dim selection is an integer array, delegate this to the integerSelection class + dim_chunk_range = dim_sel.get_chunk_ranges() + dim_sel_len = dim_sel.get_nitems() + else: raise RuntimeError('unexpected selection type') From 6c10c6e89c221baace34c9c6c352050a6af1793e Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 17:48:57 +0000 Subject: [PATCH 12/67] refactor for simplicity --- zarr/util.py | 49 +++++++++++++++++++------------------------------ 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/zarr/util.py b/zarr/util.py index b08abf32bb..7a9fb59736 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -153,39 +153,30 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.dim_chunk_len = dim_chunk_len self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - def 
__getitem__(self, item): - return self.dim_sel[item] + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') + for dim_chunk_idx in range(self.nchunks): + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( + self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] def get_chunk_sel(self, dim_chunk_idx): dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len return self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] def get_out_sel(self, dim_chunk_idx): - dim_out_offset = self.get_sel_offset(dim_chunk_idx) - dim_chunk_nitems = self.get_chunk_nitems(dim_chunk_idx) - return slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - @functools.lru_cache(maxsize=None) - def get_chunk_nitems(self, dim_chunk_idx): - dim_chunk_sel = self.get_chunk_sel(dim_chunk_idx) - return np.count_nonzero(dim_chunk_sel) - - @functools.lru_cache(maxsize=None) - def get_nitems(self): - return sum(self.get_chunk_nitems(i) for i in range(self.nchunks)) - - @functools.lru_cache(maxsize=None) - def get_sel_offset(self, dim_chunk_idx): if dim_chunk_idx == 0: - return 0 + start = 0 else: - return self.get_sel_offset(dim_chunk_idx - 1) + self.get_chunk_nitems(dim_chunk_idx - 1) + start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_idx] + return slice(start, stop) def get_chunk_ranges(self): - for dim_chunk_idx in range(self.nchunks): - nitems = self.get_chunk_nitems(dim_chunk_idx) - if nitems: - yield dim_chunk_idx + return np.nonzero(self.chunk_nitems)[0] class IntegerSelection(object): @@ -206,16 +197,17 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.dim_chunk_len = dim_chunk_len self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - # precompute some useful stuff + # precompute 
number of selected items for each chunk self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = len(dim_sel) def get_chunk_sel(self, dim_chunk_idx): # need to slice out relevant indices from the total selection, then subtract the chunk # offset - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len dim_out_sel = self.get_out_sel(dim_chunk_idx) + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_chunk_offset return dim_chunk_sel @@ -231,9 +223,6 @@ def get_out_sel(self, dim_chunk_idx): def get_chunk_ranges(self): return np.nonzero(self.chunk_nitems)[0] - def get_nitems(self): - return len(self.dim_sel) - # TODO support slice with step via integer selection (convert to np.arange) @@ -390,13 +379,13 @@ def get_chunks_for_selection(selection, chunks): # dim selection is a boolean array, delegate this to the BooleanSelection class dim_chunk_range = dim_sel.get_chunk_ranges() - dim_sel_len = dim_sel.get_nitems() + dim_sel_len = dim_sel.nitems elif isinstance(dim_sel, IntegerSelection): # dim selection is an integer array, delegate this to the integerSelection class dim_chunk_range = dim_sel.get_chunk_ranges() - dim_sel_len = dim_sel.get_nitems() + dim_sel_len = dim_sel.nitems else: raise RuntimeError('unexpected selection type') From 7c12181f0d3d1b761b8f099fa58990494b98bba9 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 17:57:20 +0000 Subject: [PATCH 13/67] only do np.ix_ if absolutely necessary --- zarr/core.py | 11 ++++++----- zarr/util.py | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 986548426a..3f6c2b0aa3 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -499,13 +499,14 @@ def _getitem_nd(self, item): # normalize selection selection = normalize_array_selection(item, self._shape, self._chunks) - # figure out if we're doing 
advanced indexing - is_advanced_selection = any([not isinstance(dim_sel, (int, slice)) - for dim_sel in selection]) + # figure out if we're doing advanced indexing, count number of advanced selections - if + # more than one need special handling + n_advanced_selection = sum(1 for dim_sel in selection + if not isinstance(dim_sel, (int, slice))) # axes that need to get squeezed out if doing advanced selection squeeze_axes = None - if is_advanced_selection: + if n_advanced_selection > 1: squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) if isinstance(dim_sel, int)]) @@ -598,7 +599,7 @@ def _getitem_nd(self, item): out_selection = tuple(out_selection) # handle advanced indexing arrays orthogonally - if is_advanced_selection: + if n_advanced_selection > 1: # numpy doesn't support orthogonal indexing directly as yet, so need to work # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices # or integers, so need to convert slices and integers into ranges. diff --git a/zarr/util.py b/zarr/util.py index 7a9fb59736..0f0e96fae8 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -165,7 +165,13 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): def get_chunk_sel(self, dim_chunk_idx): dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - return self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + return dim_chunk_sel def get_out_sel(self, dim_chunk_idx): if dim_chunk_idx == 0: From 3edf96c1886da7cd0c3cb064b1cfc9de939b0489 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 18:08:08 +0000 Subject: [PATCH 14/67] refactor getitem setitem together --- zarr/core.py | 141 ++++++++++----------------------------------------- 
zarr/util.py | 95 ++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 115 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 3f6c2b0aa3..6a45aaace8 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -10,7 +10,7 @@ from zarr.util import is_total_slice, normalize_array_selection, get_chunks_for_selection, \ human_readable_size, normalize_resize_args, normalize_storage_path, normalize_shape, \ - normalize_chunks, InfoReporter, BooleanSelection, IntegerSelection + normalize_chunks, InfoReporter, get_chunk_selections from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes @@ -500,15 +500,17 @@ def _getitem_nd(self, item): selection = normalize_array_selection(item, self._shape, self._chunks) # figure out if we're doing advanced indexing, count number of advanced selections - if - # more than one need special handling + # more than one need special handling, because we are doing orthogonal indexing here, + # which is different from fancy indexing if there is more than one array selection n_advanced_selection = sum(1 for dim_sel in selection if not isinstance(dim_sel, (int, slice))) # axes that need to get squeezed out if doing advanced selection - squeeze_axes = None if n_advanced_selection > 1: squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) if isinstance(dim_sel, int)]) + else: + squeeze_axes = None # determine indices of chunks overlapping the selection chunk_ranges, sel_shape = get_chunks_for_selection(selection, self._chunks) @@ -519,96 +521,9 @@ def _getitem_nd(self, item): # iterate over chunks in range, i.e., chunks overlapping the selection for chunk_coords in itertools.product(*chunk_ranges): - # chunk_coords: holds the index along each dimension for the current chunk within the - # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
- - chunk_selection = [] - out_selection = [] - - # iterate over dimensions (axes) of the array - for dim_sel, dim_chunk_idx, dim_chunk_len \ - in zip(selection, chunk_coords, self._chunks): - - # dim_sel: selection for current dimension - # dim_chunk_idx: chunk index along current dimension - # dim_chunk_len: chunk length along current dimension - - # selection for current chunk along current dimension - dim_chunk_sel = None - - # selection into output array to store data from current chunk - dim_out_sel = None - - # calculate offset for current chunk along current dimension - this is used to - # determine the values to be extracted from the current chunk - dim_chunk_offset = dim_chunk_idx * dim_chunk_len - - # handle integer selection, i.e., single item - if isinstance(dim_sel, int): - - dim_chunk_sel = dim_sel - dim_chunk_offset - - # N.B., leave dim_out_sel as None, as this dimension has been dropped in the - # output array because of single value index - - # handle slice selection, i.e., contiguous range of items - elif isinstance(dim_sel, slice): - - if dim_sel.start <= dim_chunk_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - dim_out_offset = dim_chunk_offset - dim_sel.start - - else: - # selection starts within current chunk - dim_chunk_sel_start = dim_sel.start - dim_chunk_offset - dim_out_offset = 0 - - if dim_sel.stop > dim_chunk_offset + dim_chunk_len: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - elif isinstance(dim_sel, (BooleanSelection, IntegerSelection)): - - # get selection to extract data for the current chunk - dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) - - # figure out 
where to put these items in the output array - dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) - - else: - raise RuntimeError('unexpected selection type') - - # add to chunk selection - chunk_selection.append(dim_chunk_sel) - - # add to output selection - if dim_out_sel is not None: - out_selection.append(dim_out_sel) - - # normalise for indexing into numpy arrays - chunk_selection = tuple(chunk_selection) - out_selection = tuple(out_selection) - - # handle advanced indexing arrays orthogonally - if n_advanced_selection > 1: - # numpy doesn't support orthogonal indexing directly as yet, so need to work - # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices - # or integers, so need to convert slices and integers into ranges. - chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) - if isinstance(dim_chunk_sel, slice) - else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) - else dim_chunk_sel - for dim_chunk_sel in chunk_selection] - chunk_selection = np.ix_(*chunk_selection) + # obtain selections for chunk and output arrays + chunk_selection, out_selection = \ + get_chunk_selections(selection, chunk_coords, self._chunks, n_advanced_selection) # obtain the destination array as a view of the output array if out_selection: @@ -724,6 +639,19 @@ def _setitem_nd(self, item, value): # normalize selection selection = normalize_array_selection(item, self._shape, self._chunks) + # figure out if we're doing advanced indexing, count number of advanced selections - if + # more than one need special handling, because we are doing orthogonal indexing here, + # which is different from fancy indexing if there is more than one array selection + n_advanced_selection = sum(1 for dim_sel in selection + if not isinstance(dim_sel, (int, slice))) + + # axes that need to get squeezed out if doing advanced selection + if n_advanced_selection > 1: + squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) + if isinstance(dim_sel, int)]) + else: 
+ squeeze_axes = None + # determine indices of chunks overlapping the selection chunk_ranges, sel_shape = get_chunks_for_selection(selection, self._chunks) @@ -739,19 +667,9 @@ def _setitem_nd(self, item, value): # iterate over chunks in range for chunk_coords in itertools.product(*chunk_ranges): - # TODO refactor code for computing input and output selection for current chunk - - # shared with __getitem__ - - # determine chunk offset - offset = [i * c for i, c in zip(chunk_coords, self._chunks)] - - # determine required index range within chunk - chunk_selection = tuple( - slice(max(0, s.start - o), min(c, s.stop - o)) - if isinstance(s, slice) - else s - o - for s, o, c in zip(selection, offset, self._chunks) - ) + # obtain selections for chunk and destination arrays + chunk_selection, out_selection = \ + get_chunk_selections(selection, chunk_coords, self._chunks, n_advanced_selection) if np.isscalar(value): @@ -761,16 +679,9 @@ def _setitem_nd(self, item, value): else: # assume value is array-like - # determine index within value - value_selection = tuple( - slice(max(0, o - s.start), - min(o + c - s.start, s.stop - s.start)) - for s, o, c in zip(selection, offset, self._chunks) - if isinstance(s, slice) - ) - # put data - self._chunk_setitem(chunk_coords, chunk_selection, value[value_selection]) + dest = value[out_selection] + self._chunk_setitem(chunk_coords, chunk_selection, dest) def _chunk_getitem(self, chunk_coords, chunk_selection, dest, squeeze_axes=None): """Obtain part or whole of a chunk. diff --git a/zarr/util.py b/zarr/util.py index 0f0e96fae8..cf2c7df80c 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -403,6 +403,101 @@ def get_chunks_for_selection(selection, chunks): return chunk_ranges, tuple(sel_shape) +def get_chunk_selections(selection, chunk_coords, chunks, n_advanced_selection): + + # chunk_coords: holds the index along each dimension for the current chunk within the + # chunk grid. 
E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. + + chunk_selection = [] + out_selection = [] + + # iterate over dimensions (axes) of the array + for dim_sel, dim_chunk_idx, dim_chunk_len in zip(selection, chunk_coords, chunks): + + # dim_sel: selection for current dimension + # dim_chunk_idx: chunk index along current dimension + # dim_chunk_len: chunk length along current dimension + + # selection for current chunk along current dimension + dim_chunk_sel = None + + # selection into output array to store data from current chunk + dim_out_sel = None + + # calculate offset for current chunk along current dimension - this is used to + # determine the values to be extracted from the current chunk + dim_chunk_offset = dim_chunk_idx * dim_chunk_len + + # handle integer selection, i.e., single item + if isinstance(dim_sel, int): + + dim_chunk_sel = dim_sel - dim_chunk_offset + + # N.B., leave dim_out_sel as None, as this dimension has been dropped in the + # output array because of single value index + + # handle slice selection, i.e., contiguous range of items + elif isinstance(dim_sel, slice): + + if dim_sel.start <= dim_chunk_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_chunk_offset - dim_sel.start + + else: + # selection starts within current chunk + dim_chunk_sel_start = dim_sel.start - dim_chunk_offset + dim_out_offset = 0 + + if dim_sel.stop > dim_chunk_offset + dim_chunk_len: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + elif isinstance(dim_sel, (BooleanSelection, IntegerSelection)): + + # get selection to extract data for the current chunk + dim_chunk_sel 
= dim_sel.get_chunk_sel(dim_chunk_idx) + + # figure out where to put these items in the output array + dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) + + else: + raise RuntimeError('unexpected selection type') + + # add to chunk selection + chunk_selection.append(dim_chunk_sel) + + # add to output selection + if dim_out_sel is not None: + out_selection.append(dim_out_sel) + + # normalise for indexing into numpy arrays + chunk_selection = tuple(chunk_selection) + out_selection = tuple(out_selection) + + # handle advanced indexing arrays orthogonally + if n_advanced_selection > 1: + # numpy doesn't support orthogonal indexing directly as yet, so need to work + # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices + # or integers, so need to convert slices and integers into ranges. + chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) + if isinstance(dim_chunk_sel, slice) + else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) + else dim_chunk_sel + for dim_chunk_sel in chunk_selection] + chunk_selection = np.ix_(*chunk_selection) + + return chunk_selection, out_selection + + def normalize_resize_args(old_shape, *args): # normalize new shape argument From 56999832172507c31cd86c260bb60afce7b7f2dd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 22:10:50 +0000 Subject: [PATCH 15/67] tests for advanced setting --- zarr/tests/test_core.py | 139 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 13 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 16b762d270..79af57d15c 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -737,8 +737,8 @@ def test_advanced_indexing_1d_int(self): np.random.seed(42) # test with different degrees of sparseness for p in 0.9, 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix = np.nonzero(ix)[0] + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + 
ix.sort() expect = a[ix] actual = z[ix] assert_array_equal(expect, actual) @@ -779,6 +779,16 @@ def test_advanced_indexing_2d_bool(self): actual = z[42, ix1] assert_array_equal(expect, actual) + # mixed int array / bool array + selections = ( + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ) + for selection in selections: + expect = a[np.ix_(ix0, ix1)] + actual = z[ix0, ix1] + assert_array_equal(expect, actual) + # TODO test errors def test_advanced_indexing_2d_int(self): @@ -791,10 +801,10 @@ def test_advanced_indexing_2d_int(self): np.random.seed(42) # test with different degrees of sparseness for p in 0.9, 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix0 = np.nonzero(ix0)[0] - ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) - ix1 = np.nonzero(ix1)[0] + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix0.sort() + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) + ix1.sort() # index both axes with int array expect = a[np.ix_(ix0, ix1)] @@ -903,12 +913,12 @@ def test_advanced_indexing_3d_int(self): np.random.seed(42) # test with different degrees of sparseness for p in 0.9, 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix0 = np.nonzero(ix0)[0] - ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) - ix1 = np.nonzero(ix1)[0] - ix2 = np.random.binomial(1, p, size=a.shape[2]).astype(bool) - ix2 = np.nonzero(ix2)[0] + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix0.sort() + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) + ix1.sort() + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * p), replace=True) + ix2.sort() # index all axes with int array expect = a[np.ix_(ix0, ix1, ix2)] @@ -970,7 +980,110 @@ def test_advanced_indexing_3d_int(self): actual = z[ix0, 42, ix2] assert_array_equal(expect, actual) - # TODO test advanced indexing with 
__setitem__ + # TODO test mixed int and bool arrays + + def test_advanced_indexing_1d_bool_set(self): + + # setup + a = np.empty(1050, dtype=int) + v = np.arange(1050, dtype=int) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + a[:] = 0 + z[:] = 0 + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + a[ix] = v[ix] + z[ix] = v[ix] + assert_array_equal(a, z[:]) + + def test_advanced_indexing_1d_int_set(self): + + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + a[:] = 0 + z[:] = 0 + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix.sort() + a[ix] = v[ix] + z[ix] = v[ix] + assert_array_equal(a, z[:]) + + def test_advanced_indexing_2d_bool_set(self): + + # setup + v = np.arange(10000, dtype=int).reshape(100, 100) + a = np.empty_like(v) + z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + a[:] = 0 + z[:] = 0 + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + + # index both axes with bool array + selection = ix0, ix1 + a[np.ix_(*selection)] = v[np.ix_(*selection)] + z[selection] = v[np.ix_(*selection)] + assert_array_equal(a, z[:]) + + # mixed indexing with bool array / slice + selections = ( + (ix0, slice(15, 35)), + (slice(15, 35), ix1), + (ix0, 42), + (42, ix1), + ) + for selection in selections: + a[selection] = v[selection] + z[selection] = v[selection] + assert_array_equal(a, z[:]) + + def test_advanced_indexing_2d_int_set(self): + + # setup + v = np.arange(10000, dtype=int).reshape(100, 100) + a = np.empty_like(v) + z 
= self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.9, 0.5, 0.1, 0.01: + a[:] = 0 + z[:] = 0 + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix0.sort() + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) + ix1.sort() + + # index both axes with int array + selection = ix0, ix1 + a[np.ix_(*selection)] = v[np.ix_(*selection)] + z[selection] = v[np.ix_(*selection)] + assert_array_equal(a, z[:]) + + # mixed indexing with int array / slice + selections = ( + (ix0, slice(15, 35)), + (slice(15, 35), ix1), + (ix0, 42), + (42, ix1), + ) + for selection in selections: + a[selection] = v[selection] + z[selection] = v[selection] + assert_array_equal(a, z[:]) class TestArrayWithPath(TestArray): From 53efd48253640dfca319d1f8d1a250e7843e6d0a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 22:32:41 +0000 Subject: [PATCH 16/67] test and fix error handling --- zarr/core.py | 6 ++---- zarr/tests/test_core.py | 34 ++++++++++++++++++++++++++-------- zarr/tests/test_util.py | 4 ++-- zarr/util.py | 41 ++++++++++++++++++++++++++--------------- 4 files changed, 56 insertions(+), 29 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 6a45aaace8..9d3736b98d 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -659,10 +659,8 @@ def _setitem_nd(self, item, value): if np.isscalar(value): pass elif sel_shape != value.shape: - # TODO IndexError? what does numpy do? 
- raise ValueError('value has wrong shape; expected %s, found %s' - % (str(sel_shape), - str(value.shape))) + raise ValueError('value shape does not match selection shape; expected %s, found %s' + % (str(sel_shape), str(value.shape))) # iterate over chunks in range for chunk_coords in itertools.product(*chunk_ranges): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 79af57d15c..654dfa9459 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -725,7 +725,13 @@ def test_advanced_indexing_1d_bool(self): actual = z[ix] assert_array_equal(expect, actual) - # TODO test errors + # test errors + with assert_raises(IndexError): + z[ix[:50]] # too short + with assert_raises(IndexError): + z[np.concatenate([ix[:50]] * 2)] # too long + with assert_raises(IndexError): + z[[[True, False], [False, True]]] # too many dimensions def test_advanced_indexing_1d_int(self): @@ -743,7 +749,25 @@ def test_advanced_indexing_1d_int(self): actual = z[ix] assert_array_equal(expect, actual) - # TODO test errors + # test wraparound + ix = [0, 3, 10, -23, -12, -1] + expect = a[ix] + actual = z[ix] + assert_array_equal(expect, actual) + + # test errors + with assert_raises(IndexError): + ix = [a.shape[0] + 1] # out of bounds + z[ix] + with assert_raises(IndexError): + ix = [-(a.shape[0] + 1)] # out of bounds + z[ix] + with assert_raises(IndexError): + ix = [[2, 4], [6, 8]] # too many dimensions + z[ix] + with assert_raises(NotImplementedError): + ix = [3, 105, 23, 127] # not monotonically increasing + z[ix] def test_advanced_indexing_2d_bool(self): @@ -788,8 +812,6 @@ def test_advanced_indexing_2d_bool(self): expect = a[np.ix_(ix0, ix1)] actual = z[ix0, ix1] assert_array_equal(expect, actual) - - # TODO test errors def test_advanced_indexing_2d_int(self): @@ -827,8 +849,6 @@ def test_advanced_indexing_2d_int(self): actual = z[42, ix1] assert_array_equal(expect, actual) - # TODO test errors - def test_advanced_indexing_3d_bool(self): # setup @@ -980,8 +1000,6 @@ 
def test_advanced_indexing_3d_int(self): actual = z[ix0, 42, ix2] assert_array_equal(expect, actual) - # TODO test mixed int and bool arrays - def test_advanced_indexing_1d_bool_set(self): # setup diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index aa9950caef..0cdbca77f3 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -93,7 +93,7 @@ def test_normalize_axis_selection(): with assert_raises(IndexError): normalize_dim_selection(slice(-1000, 0), 100, 10) - with assert_raises(TypeError): + with assert_raises(IndexError): normalize_dim_selection('foo', 100, 10) with assert_raises(NotImplementedError): @@ -140,7 +140,7 @@ def test_normalize_array_selection(): eq((slice(0, 100), slice(0, 100)), normalize_array_selection((slice(None), slice(None), Ellipsis), (100, 100), (10, 10))) - with assert_raises(TypeError): + with assert_raises(IndexError): normalize_array_selection('foo', (100,), (10,)) diff --git a/zarr/util.py b/zarr/util.py index cf2c7df80c..ad666fcbbd 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -135,7 +135,7 @@ def is_total_slice(item, shape): raise TypeError('expected slice or tuple of slices, found %r' % item) -class BooleanSelection(object): +class BoolArraySelection(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -148,6 +148,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): raise IndexError('Boolean array has wrong length; expected %s, found %s' % (dim_len, dim_sel.shape[0])) + # store attributes self.dim_sel = dim_sel self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len @@ -185,19 +186,31 @@ def get_chunk_ranges(self): return np.nonzero(self.chunk_nitems)[0] -class IntegerSelection(object): +class IntArraySelection(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): # has to be a numpy array so we can do bincount dim_sel = np.asanyarray(dim_sel) - # TODO handle wraparound + # check number of dimensions, only support indexing with 1d array + if len(dim_sel.shape) > 1: + raise 
IndexError('can only index with 1-dimensional integer array') + + # handle wraparound + loc_neg = dim_sel < 0 + if np.any(loc_neg): + dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + + # handle out of bounds + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('index out of bounds') - # TODO validate dim_sel - # TODO check out of bounds - # TODO validate monotonically increasing + # validate monotonically increasing + if np.any(np.diff(dim_sel) < 0): + raise NotImplementedError('only monotonically increasing indices are supported') + # store attributes self.dim_sel = dim_sel self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len @@ -293,18 +306,16 @@ def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): if dim_sel.dtype == bool: - return BooleanSelection(dim_sel, dim_len, dim_chunk_len) + return BoolArraySelection(dim_sel, dim_len, dim_chunk_len) elif dim_sel.dtype.kind in 'ui': - return IntegerSelection(dim_sel, dim_len, dim_chunk_len) + return IntArraySelection(dim_sel, dim_len, dim_chunk_len) else: - # TODO IndexError? - raise TypeError('unsupported index item type: %r' % dim_sel) + raise IndexError('unsupported index item type: %r' % dim_sel) else: - # TODO IndexError? 
- raise TypeError('unsupported index item type: %r' % dim_sel) + raise IndexError('unsupported index item type: %r' % dim_sel) # noinspection PyTypeChecker @@ -381,13 +392,13 @@ def get_chunks_for_selection(selection, chunks): dim_chunk_range = range(dim_chunk_from, dim_chunk_to) dim_sel_len = dim_sel.stop - dim_sel.start - elif isinstance(dim_sel, BooleanSelection): + elif isinstance(dim_sel, BoolArraySelection): # dim selection is a boolean array, delegate this to the BooleanSelection class dim_chunk_range = dim_sel.get_chunk_ranges() dim_sel_len = dim_sel.nitems - elif isinstance(dim_sel, IntegerSelection): + elif isinstance(dim_sel, IntArraySelection): # dim selection is an integer array, delegate this to the integerSelection class dim_chunk_range = dim_sel.get_chunk_ranges() @@ -461,7 +472,7 @@ def get_chunk_selections(selection, chunk_coords, chunks, n_advanced_selection): dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - elif isinstance(dim_sel, (BooleanSelection, IntegerSelection)): + elif isinstance(dim_sel, (BoolArraySelection, IntArraySelection)): # get selection to extract data for the current chunk dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) From 95fb592ab7dcd5c1ef3c5a9390debc0443b07664 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 30 Oct 2017 22:55:09 +0000 Subject: [PATCH 17/67] handle 3d set cases --- zarr/core.py | 17 ++++++--- zarr/tests/test_core.py | 83 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 85 insertions(+), 15 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 9d3736b98d..61912ab633 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -679,7 +679,7 @@ def _setitem_nd(self, item, value): # put data dest = value[out_selection] - self._chunk_setitem(chunk_coords, chunk_selection, dest) + self._chunk_setitem(chunk_coords, chunk_selection, dest, squeeze_axes) def _chunk_getitem(self, chunk_coords, chunk_selection, dest, 
squeeze_axes=None): """Obtain part or whole of a chunk. @@ -742,7 +742,7 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, dest, squeeze_axes=None) else: dest[()] = tmp - def _chunk_setitem(self, chunk_coords, chunk_selection, value): + def _chunk_setitem(self, chunk_coords, chunk_selection, value, squeeze_axes=None): """Replace part or whole of a chunk. Parameters @@ -758,14 +758,14 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value): # synchronization if self._synchronizer is None: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, squeeze_axes) else: # synchronize on the chunk ckey = self._chunk_key(chunk_coords) with self._synchronizer[ckey]: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, squeeze_axes) - def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value): + def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, squeeze_axes=None): # obtain key for chunk storage ckey = self._chunk_key(chunk_coords) @@ -822,6 +822,13 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value): if not chunk.flags.writeable: chunk = chunk.copy(order='K') + # handle missing singleton dimensions + if squeeze_axes: + item = [slice(None)] * self.ndim + for a in squeeze_axes: + item[a] = np.newaxis + value = value[item] + # modify chunk[chunk_selection] = value diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 654dfa9459..af07504b24 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -719,7 +719,7 @@ def test_advanced_indexing_1d_bool(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) expect = a[ix] actual = z[ix] @@ -742,7 +742,7 @@ def test_advanced_indexing_1d_int(self): 
np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix.sort() expect = a[ix] @@ -778,7 +778,7 @@ def test_advanced_indexing_2d_bool(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) @@ -822,7 +822,7 @@ def test_advanced_indexing_2d_int(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) @@ -858,7 +858,7 @@ def test_advanced_indexing_3d_bool(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, p, size=a.shape[2]).astype(bool) @@ -932,7 +932,7 @@ def test_advanced_indexing_3d_int(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) @@ -1009,7 +1009,7 @@ def test_advanced_indexing_1d_bool_set(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: a[:] = 0 z[:] = 0 ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) @@ -1026,7 +1026,7 @@ def test_advanced_indexing_1d_int_set(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 
0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: a[:] = 0 z[:] = 0 ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) @@ -1044,7 +1044,7 @@ def test_advanced_indexing_2d_bool_set(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: a[:] = 0 z[:] = 0 ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) @@ -1064,6 +1064,8 @@ def test_advanced_indexing_2d_bool_set(self): (42, ix1), ) for selection in selections: + a[:] = 0 + z[:] = 0 a[selection] = v[selection] z[selection] = v[selection] assert_array_equal(a, z[:]) @@ -1077,7 +1079,7 @@ def test_advanced_indexing_2d_int_set(self): np.random.seed(42) # test with different degrees of sparseness - for p in 0.9, 0.5, 0.1, 0.01: + for p in 0.5, 0.1, 0.01: a[:] = 0 z[:] = 0 ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) @@ -1099,10 +1101,71 @@ def test_advanced_indexing_2d_int_set(self): (42, ix1), ) for selection in selections: + a[:] = 0 + z[:] = 0 a[selection] = v[selection] z[selection] = v[selection] assert_array_equal(a, z[:]) + def test_advanced_indexing_3d_bool_set(self): + + # setup + v = np.arange(1000000, dtype=int).reshape(100, 100, 100) + a = np.empty_like(v) + z = self.create_array(shape=a.shape, chunks=(10, 10, 10), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + a[:] = 0 + z[:] = 0 + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, p, size=a.shape[2]).astype(bool) + + # index all axes with bool array + selection = ix0, ix1, ix2 + a[np.ix_(*selection)] = v[np.ix_(*selection)] + z[selection] = v[np.ix_(*selection)] + assert_array_equal(a, z[:]) + + # mixed indexing with single bool array / slice or int + selections = ( + (ix0, slice(15, 35), slice(25, 45)), + (slice(15, 35), ix1, slice(25, 45)), + (slice(15, 35), 
slice(25, 45), ix2), + (ix0, 42, 84), + (42, ix1, 84), + (42, 84, ix2), + (ix0, slice(15, 35), 42), + (slice(15, 35), ix1, 42), + (slice(15, 35), 42, ix2), + ) + for selection in selections: + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z[selection] = v[selection] + assert_array_equal(a, z[:]) + + # indexing with two arrays / slice + a[:] = 0 + z[:] = 0 + zsel = ix0, ix1, slice(25, 45) + vsel = np.ix_(ix0, ix1, range(25, 45)) + a[vsel] = v[vsel] + z[zsel] = v[vsel] + assert_array_equal(a, z[:]) + + # indexing with two arrays / integer + a[:] = 0 + z[:] = 0 + zsel = ix0, ix1, 42 + vsel = np.ix_(ix0, ix1, [42]) + a[vsel] = v[vsel] + z[zsel] = v[vsel].squeeze(axis=2) + assert_array_equal(a, z[:]) + class TestArrayWithPath(TestArray): From 733ef087a754e5222ce47564e59f464f8e3de397 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 31 Oct 2017 00:07:30 +0000 Subject: [PATCH 18/67] benchmark advanced indexing --- notebooks/advanced_indexing.ipynb | 1520 +++++++++++++++++++++++++++++ zarr/util.py | 1 + 2 files changed, 1521 insertions(+) create mode 100644 notebooks/advanced_indexing.ipynb diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb new file mode 100644 index 0000000000..969cffc7bb --- /dev/null +++ b/notebooks/advanced_indexing.ipynb @@ -0,0 +1,1520 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '..')\n", + "import zarr\n", + "import numpy as np\n", + "np.random.seed(42)\n", + "import cProfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demonstrate advanced indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing with Boolean arrays" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "a = np.arange(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False, True, True, True, False, False, False, True, True, True], dtype=bool)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix = np.random.binomial(1, 0.5, size=a.shape[0]).astype(bool)\n", + "ix" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 3, 7, 8, 9])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 3, 7, 8, 9])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za = zarr.array(a, chunks=2)\n", + "za[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 10, 20, 30, 4, 5, 6, 70, 80, 90])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za[ix] = a[ix] * 10\n", + "za[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing with integer arrays" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 4, 5, 5, 7])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix = np.random.choice(a.shape[0], size=a.shape[0]//2)\n", + "ix.sort() # only monotonically increasing indices are supported\n", + "ix" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 4, 5, 5, 7])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "a[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 4, 5, 5, 7])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za = zarr.array(a, chunks=2)\n", + "za[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 10, 2, 3, 40, 50, 6, 70, 8, 9])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za[ix] = a[ix] * 10\n", + "za[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multidimensional indexing\n", + "\n", + "N.B., orthogonaly indexing is implemented. This is different from numpy fancy indexing if more than one dimension is indexed with an array." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "b = np.arange(100).reshape(10, 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False, True, True, True, False, False, True, False, False, True], dtype=bool)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix0 = np.random.binomial(1, 0.5, size=b.shape[0]).astype(bool)\n", + "ix0" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False, False, True, False, True, False, True, True, False, False], dtype=bool)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix1 = np.random.binomial(1, 0.5, size=b.shape[1]).astype(bool)\n", + "ix1" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ 
+ "array([[12, 14, 16, 17],\n", + " [22, 24, 26, 27],\n", + " [32, 34, 36, 37],\n", + " [62, 64, 66, 67],\n", + " [92, 94, 96, 97]])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b[np.ix_(ix0, ix1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[12, 14, 16, 17],\n", + " [22, 24, 26, 27],\n", + " [32, 34, 36, 37],\n", + " [62, 64, 66, 67],\n", + " [92, 94, 96, 97]])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zb = zarr.array(b, chunks=(2, 2))\n", + "zb[ix0, ix1]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", + " [10, 11, -1, 13, -1, 15, -1, -1, 18, 19],\n", + " [20, 21, -1, 23, -1, 25, -1, -1, 28, 29],\n", + " [30, 31, -1, 33, -1, 35, -1, -1, 38, 39],\n", + " [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],\n", + " [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],\n", + " [60, 61, -1, 63, -1, 65, -1, -1, 68, 69],\n", + " [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],\n", + " [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],\n", + " [90, 91, -1, 93, -1, 95, -1, -1, 98, 99]])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zb[ix0, ix1] = -1\n", + "zb[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 8, 8, 9, 9])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix0 = np.random.choice(b.shape[0], size=b.shape[0]//2)\n", + "ix0.sort() # only monotonically increasing indices are supported\n", + "ix0" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"array([1, 3, 4, 6, 7])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix1 = np.random.choice(b.shape[1], size=b.shape[1]//2)\n", + "ix1.sort() # only monotonically increasing indices are supported\n", + "ix1" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[11, 13, 14, 16, 17],\n", + " [81, 83, 84, 86, 87],\n", + " [81, 83, 84, 86, 87],\n", + " [91, 93, 94, 96, 97],\n", + " [91, 93, 94, 96, 97]])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b[np.ix_(ix0, ix1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[11, 13, 14, 16, 17],\n", + " [81, 83, 84, 86, 87],\n", + " [81, 83, 84, 86, 87],\n", + " [91, 93, 94, 96, 97],\n", + " [91, 93, 94, 96, 97]])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zb = zarr.array(b, chunks=(2, 2))\n", + "zb[ix0, ix1]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", + " [10, -1, 12, -1, -1, 15, -1, -1, 18, 19],\n", + " [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],\n", + " [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],\n", + " [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],\n", + " [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],\n", + " [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],\n", + " [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],\n", + " [80, -1, 82, -1, -1, 85, -1, -1, 88, 89],\n", + " [90, -1, 92, -1, -1, 95, -1, -1, 98, 99]])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zb[ix0, ix1] = -1\n", + "zb[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing with 
zarr bool arrays" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "ix = np.random.binomial(1, 0.5, size=a.shape[0]).astype(bool)\n", + "zix = zarr.array(ix, chunks=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 3, 5, 6, 8, 9])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za = zarr.array(a, chunks=2)\n", + "za[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 3, 5, 6, 8, 9])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# will not load all zix into memory\n", + "za[zix]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmarking" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "800000000" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c = np.arange(100000000)\n", + "c.nbytes" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Typezarr.core.Array
Data typeint64
Shape(100000000,)
Chunk shape(48829,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored11870277 (11.3M)
Storage ratio67.4
Chunks initialized2048/2048
" + ], + "text/plain": [ + "Type : zarr.core.Array\n", + "Data type : int64\n", + "Shape : (100000000,)\n", + "Chunk shape : (48829,)\n", + "Order : C\n", + "Read-only : False\n", + "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", + "Store type : builtins.dict\n", + "No. bytes : 800000000 (762.9M)\n", + "No. bytes stored : 11870277 (11.3M)\n", + "Storage ratio : 67.4\n", + "Chunks initialized : 2048/2048" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zc = zarr.array(c)\n", + "zc.info" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 148 ms, sys: 52 ms, total: 200 ms\n", + "Wall time: 200 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 480 ms, sys: 420 ms, total: 900 ms\n", + "Wall time: 308 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bool dense selection" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "49994863" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# relatively dense selection\n", + "ix_dense_bool = np.random.binomial(1, 0.5, size=c.shape[0]).astype(bool)\n", + 
"np.count_nonzero(ix_dense_bool)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 612 ms, sys: 20 ms, total: 632 ms\n", + "Wall time: 628 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c[ix_dense_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.5 s, sys: 208 ms, total: 1.71 s\n", + "Wall time: 983 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[ix_dense_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 73776 function calls in 1.005 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2048 0.666 0.000 0.967 0.000 core.py:679(_chunk_getitem)\n", + " 2048 0.255 0.000 0.274 0.000 core.py:839(_decode_chunk)\n", + " 2048 0.010 0.000 0.019 0.000 util.py:418(get_chunk_selections)\n", + " 2048 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 2048 0.008 0.000 0.008 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2048 0.008 0.000 0.008 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.006 0.006 1.004 1.004 core.py:484(_getitem_nd)\n", + " 2048 0.005 0.000 0.009 0.000 util.py:114(is_total_slice)\n", + " 2048 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", + " 2048 0.004 0.000 0.016 0.000 {method 
'join' of 'str' objects}\n", + " 2048 0.004 0.000 0.004 0.000 util.py:167(get_chunk_sel)\n", + " 14344 0.003 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.003 0.000 0.003 0.000 util.py:177(get_out_sel)\n", + " 2048 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", + " 2048 0.002 0.000 0.018 0.000 core.py:836(_chunk_key)\n", + " 2048 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", + " 1 0.002 0.002 0.011 0.011 util.py:140(__init__)\n", + " 4096 0.001 0.000 0.002 0.000 util.py:129()\n", + " 2048 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 1 0.001 0.001 1.005 1.005 :1()\n", + " 4098 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", + " 2048 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", + " 2048 0.001 0.000 0.010 0.000 numeric.py:380(count_nonzero)\n", + " 2048 0.001 0.000 0.001 0.000 core.py:200(chunk_store)\n", + " 2048 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 1.005 1.005 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", + " 1 0.000 0.000 0.011 0.011 util.py:323(normalize_array_selection)\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.011 0.011 util.py:250(normalize_dim_selection)\n", + " 1 0.000 0.000 0.000 0.000 
util.py:363(get_chunks_for_selection)\n", + " 1 0.000 0.000 1.004 1.004 core.py:377(__getitem__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 util.py:185(get_chunk_ranges)\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 5 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 2 0.000 0.000 0.011 0.006 util.py:354()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 2 0.000 0.000 0.000 0.000 core.py:500()\n", + " 1 0.000 0.000 0.000 0.000 util.py:332()\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc[ix_dense_bool]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### int dense selection" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "49994863" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix_dense_int = np.nonzero(ix_dense_bool)[0]\n", + "len(ix_dense_int)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 144 ms, sys: 20 ms, total: 164 ms\n", + "Wall time: 160 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c[ix_dense_int]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.3 s, sys: 152 ms, total: 1.45 s\n", + "Wall time: 1.16 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[ix_dense_int]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 71758 function calls in 1.208 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.662 0.662 0.864 0.864 util.py:191(__init__)\n", + " 2048 0.129 0.000 0.139 0.000 core.py:839(_decode_chunk)\n", + " 1 0.119 0.119 0.119 0.119 {built-in method numpy.core.multiarray.bincount}\n", + " 2048 0.116 0.000 0.278 0.000 core.py:679(_chunk_getitem)\n", + " 1 0.063 0.063 0.063 0.063 function_base.py:1848(diff)\n", + " 2048 0.042 0.000 0.045 0.000 util.py:225(get_chunk_sel)\n", + " 4 0.020 0.005 0.020 0.005 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 2048 0.010 0.000 0.059 0.000 util.py:418(get_chunk_selections)\n", + " 1 0.006 0.006 1.207 1.207 core.py:484(_getitem_nd)\n", + " 4096 0.005 0.000 0.005 0.000 util.py:235(get_out_sel)\n", + " 2048 0.005 0.000 0.007 0.000 util.py:114(is_total_slice)\n", + " 2048 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 2048 0.004 0.000 0.009 0.000 arrayprint.py:381(wrapper)\n", + " 2048 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2048 0.003 0.000 0.013 0.000 {method 'join' of 'str' objects}\n", + " 14345 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.003 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", + " 2048 0.002 0.000 0.015 0.000 core.py:836(_chunk_key)\n", + " 2048 0.001 0.000 0.010 0.000 
numeric.py:1905(array_str)\n", + " 4096 0.001 0.000 0.002 0.000 util.py:129()\n", + " 2048 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1 0.001 0.001 1.208 1.208 :1()\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 4098 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", + " 2048 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", + " 2048 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2048 0.000 0.000 0.000 0.000 core.py:200(chunk_store)\n", + " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.864 0.864 util.py:250(normalize_dim_selection)\n", + " 4 0.000 0.000 0.020 0.005 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 1.208 1.208 {built-in method builtins.exec}\n", + " 6 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1 0.000 0.000 0.864 0.864 util.py:323(normalize_array_selection)\n", + " 4 0.000 0.000 0.020 0.005 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.020 0.005 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", + " 1 0.000 0.000 1.207 1.207 core.py:377(__getitem__)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 2 
0.000 0.000 0.864 0.432 util.py:354()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 6 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 util.py:243(get_chunk_ranges)\n", + " 1 0.000 0.000 0.000 0.000 util.py:332()\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 2 0.000 0.000 0.000 0.000 core.py:500()\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc[ix_dense_int]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bool sparse selection" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9950" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# relatively sparse selection\n", + "ix_sparse_bool = np.random.binomial(1, 0.0001, size=c.shape[0]).astype(bool)\n", + "np.count_nonzero(ix_sparse_bool)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", + "Wall time: 17.8 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 472 ms, sys: 88 ms, total: 560 ms\n", + "Wall time: 262 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 12643, 
15188, 16392, ..., 99989960, 99995101, 99999097])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 73436 function calls in 0.289 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2038 0.172 0.000 0.184 0.000 core.py:839(_decode_chunk)\n", + " 2038 0.035 0.000 0.248 0.000 core.py:679(_chunk_getitem)\n", + " 2048 0.013 0.000 0.013 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 2038 0.011 0.000 0.020 0.000 util.py:418(get_chunk_selections)\n", + " 1 0.006 0.006 0.289 0.289 core.py:484(_getitem_nd)\n", + " 2038 0.006 0.000 0.009 0.000 util.py:114(is_total_slice)\n", + " 2038 0.005 0.000 0.011 0.000 arrayprint.py:381(wrapper)\n", + " 2038 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 2038 0.005 0.000 0.017 0.000 {method 'join' of 'str' objects}\n", + " 2038 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2038 0.004 0.000 0.004 0.000 util.py:167(get_chunk_sel)\n", + " 2038 0.003 0.000 0.003 0.000 util.py:177(get_out_sel)\n", + " 14274 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", + " 2038 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", + " 1 0.002 0.002 0.016 0.016 util.py:140(__init__)\n", + " 2038 0.002 0.000 0.019 0.000 core.py:836(_chunk_key)\n", + " 2038 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", + " 4076 0.002 0.000 0.002 0.000 util.py:129()\n", + " 2038 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", + " 4076 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2038 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", + " 4078 0.001 0.000 0.001 0.000 {method 'append' 
of 'list' objects}\n", + " 2038 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2038 0.001 0.000 0.001 0.000 core.py:200(chunk_store)\n", + " 2048 0.001 0.000 0.013 0.000 numeric.py:380(count_nonzero)\n", + " 2038 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 2038 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2038 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2038 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.289 0.289 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.016 0.016 util.py:323(normalize_array_selection)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", + " 1 0.000 0.000 0.289 0.289 core.py:377(__getitem__)\n", + " 1 0.000 0.000 0.016 0.016 util.py:250(normalize_dim_selection)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.289 0.289 :1()\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 2 0.000 0.000 0.016 0.008 util.py:354()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 5 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1 0.000 0.000 0.000 0.000 util.py:185(get_chunk_ranges)\n", + " 2 0.000 0.000 0.000 0.000 core.py:500()\n", + " 1 0.000 0.000 0.000 0.000 
util.py:332()\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc[ix_sparse_bool]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### int sparse selection" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9950" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix_sparse_int = np.nonzero(ix_sparse_bool)[0]\n", + "len(ix_sparse_int)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", + "Wall time: 169 µs\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c[ix_sparse_int]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 504 ms, sys: 68 ms, total: 572 ms\n", + "Wall time: 262 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[ix_sparse_int]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 71408 function calls in 0.241 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2038 0.158 0.000 0.169 0.000 core.py:839(_decode_chunk)\n", + " 2038 0.011 0.000 
0.014 0.000 util.py:225(get_chunk_sel)\n", + " 2038 0.010 0.000 0.028 0.000 util.py:418(get_chunk_selections)\n", + " 2038 0.010 0.000 0.207 0.000 core.py:679(_chunk_getitem)\n", + " 1 0.006 0.006 0.241 0.241 core.py:484(_getitem_nd)\n", + " 2038 0.005 0.000 0.009 0.000 util.py:114(is_total_slice)\n", + " 4076 0.005 0.000 0.005 0.000 util.py:235(get_out_sel)\n", + " 2038 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", + " 2038 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 2038 0.004 0.000 0.016 0.000 {method 'join' of 'str' objects}\n", + " 2038 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2038 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", + " 14275 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", + " 2038 0.002 0.000 0.018 0.000 core.py:836(_chunk_key)\n", + " 2038 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", + " 4076 0.001 0.000 0.002 0.000 util.py:129()\n", + " 2038 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", + " 4076 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2038 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", + " 4078 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", + " 2038 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2038 0.001 0.000 0.001 0.000 core.py:200(chunk_store)\n", + " 2038 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1 0.000 0.000 0.001 0.001 util.py:191(__init__)\n", + " 2038 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2038 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2038 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.000 0.000 0.241 
0.241 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.001 0.001 util.py:323(normalize_array_selection)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 0.001 0.001 util.py:250(normalize_dim_selection)\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", + " 6 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1 0.000 0.000 0.241 0.241 core.py:377(__getitem__)\n", + " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 util.py:243(get_chunk_ranges)\n", + " 1 0.000 0.000 0.241 0.241 :1()\n", + " 2 0.000 0.000 0.001 0.000 util.py:354()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 2 0.000 0.000 0.000 0.000 core.py:500()\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 6 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1 0.000 0.000 0.000 0.000 util.py:332()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc[ix_sparse_int]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### sparse bool selection as 
zarr array" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(195313,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored511163 (499.2K)
Storage ratio195.6
Chunks initialized512/512
" + ], + "text/plain": [ + "Type : zarr.core.Array\n", + "Data type : bool\n", + "Shape : (100000000,)\n", + "Chunk shape : (195313,)\n", + "Order : C\n", + "Read-only : False\n", + "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", + "Store type : builtins.dict\n", + "No. bytes : 100000000 (95.4M)\n", + "No. bytes stored : 511163 (499.2K)\n", + "Storage ratio : 195.6\n", + "Chunks initialized : 512/512" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zix_sparse_bool = zarr.array(ix_sparse_bool)\n", + "zix_sparse_bool.info" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 932 ms, sys: 180 ms, total: 1.11 s\n", + "Wall time: 570 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[zix_sparse_bool]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### h5py comparison\n", + "\n", + "N.B., not really fair because using slower compressor, but for interest..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "import h5py\n", + "import tempfile" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "h5f = h5py.File(tempfile.mktemp(), driver='core', backing_store=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hc = h5f.create_dataset('c', data=c, compression='gzip', compression_opts=1, chunks=zc.chunks, shuffle=True)\n", + "hc" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.14 s, sys: 40 ms, total: 1.18 s\n", + "Wall time: 1.17 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time hc[:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.1 s, sys: 0 ns, total: 1.1 s\n", + "Wall time: 1.1 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time hc[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this is pathological, takes > 1 minute \n", + "%time hc[ix_dense_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/zarr/util.py b/zarr/util.py index ad666fcbbd..df70dfcbe8 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -217,6 +217,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) # precompute number of selected items for each chunk + # note: for dense integer selections, the division operation here is the bottleneck self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) self.nitems = len(dim_sel) From 1420ce28ca45d7918bb8bc66a695d69813e1aaec Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 31 Oct 2017 00:43:49 +0000 Subject: [PATCH 19/67] flake8 --- zarr/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zarr/util.py b/zarr/util.py index df70dfcbe8..ac04649e44 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -3,7 +3,6 @@ import operator from textwrap import TextWrapper import numbers -import functools import numpy as np From db69c0dba5af984e600c4c38ebea299c488a8404 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 31 Oct 2017 11:37:56 +0000 Subject: [PATCH 20/67] refactor and extend tests to handle indexing edge cases --- zarr/core.py | 4 +- zarr/tests/test_core.py | 513 ++++++++++++++++++---------------------- zarr/util.py | 2 +- 3 files changed, 233 insertions(+), 286 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 61912ab633..e4baf78d51 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -506,7 +506,7 @@ def _getitem_nd(self, item): if not isinstance(dim_sel, (int, slice))) # axes that need to get 
squeezed out if doing advanced selection - if n_advanced_selection > 1: + if n_advanced_selection > 0: squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) if isinstance(dim_sel, int)]) else: @@ -646,7 +646,7 @@ def _setitem_nd(self, item, value): if not isinstance(dim_sel, (int, slice))) # axes that need to get squeezed out if doing advanced selection - if n_advanced_selection > 1: + if n_advanced_selection > 0: squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) if isinstance(dim_sel, int)]) else: diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index af07504b24..358a81950b 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -710,6 +710,11 @@ def test_nchunks_initialized(self): z[:] = 42 eq(10, z.nchunks_initialized) + def _test_advanced_indexing_1d_common(self, a, z, ix): + expect = a[ix] + actual = z[ix] + assert_array_equal(expect, actual) + def test_advanced_indexing_1d_bool(self): # setup @@ -721,9 +726,7 @@ def test_advanced_indexing_1d_bool(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - expect = a[ix] - actual = z[ix] - assert_array_equal(expect, actual) + self._test_advanced_indexing_1d_common(a, z, ix) # test errors with assert_raises(IndexError): @@ -745,9 +748,7 @@ def test_advanced_indexing_1d_int(self): for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix.sort() - expect = a[ix] - actual = z[ix] - assert_array_equal(expect, actual) + self._test_advanced_indexing_1d_common(a, z, ix) # test wraparound ix = [0, 3, 10, -23, -12, -1] @@ -769,39 +770,44 @@ def test_advanced_indexing_1d_int(self): ix = [3, 105, 23, 127] # not monotonically increasing z[ix] + def _test_advanced_indexing_2d_common(self, a, z, ix0, ix1): + + # index both axes with array + expect = a[np.ix_(ix0, ix1)] + actual = z[ix0, ix1] + assert_array_equal(expect, actual) + + # mixed indexing with 
array / slice + expect = a[ix0, 1:5] + actual = z[ix0, 1:5] + assert_array_equal(expect, actual) + expect = a[250:350, ix1] + actual = z[250:350, ix1] + assert_array_equal(expect, actual) + + # mixed indexing with array / single index + expect = a[ix0, 4] + actual = z[ix0, 4] + assert_array_equal(expect, actual) + expect = a[42, ix1] + actual = z[42, ix1] + assert_array_equal(expect, actual) + def test_advanced_indexing_2d_bool(self): # setup - a = np.arange(10000, dtype=int).reshape(100, 100) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) z[:] = a np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) - - # index both axes with bool array - expect = a[np.ix_(ix0, ix1)] - actual = z[ix0, ix1] - assert_array_equal(expect, actual) - - # mixed indexing with bool array / slice - expect = a[ix0, 15:35] - actual = z[ix0, 15:35] - assert_array_equal(expect, actual) - expect = a[15:35, ix1] - actual = z[15:35, ix1] - assert_array_equal(expect, actual) - - # mixed indexing with bool array / single index - expect = a[ix0, 42] - actual = z[ix0, 42] - assert_array_equal(expect, actual) - expect = a[42, ix1] - actual = z[42, ix1] - assert_array_equal(expect, actual) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + + # main tests + self._test_advanced_indexing_2d_common(a, z, ix0, ix1) # mixed int array / bool array selections = ( @@ -816,8 +822,8 @@ def test_advanced_indexing_2d_bool(self): def test_advanced_indexing_2d_int(self): # setup - a = np.arange(10000, dtype=int).reshape(100, 100) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = self.create_array(shape=a.shape, chunks=(300, 
3), dtype=a.dtype) z[:] = a np.random.seed(42) @@ -825,109 +831,106 @@ def test_advanced_indexing_2d_int(self): for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix0.sort() - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix1.sort() + self._test_advanced_indexing_2d_common(a, z, ix0, ix1) + + def _test_advanced_indexing_3d_common(self, a, z, ix0, ix1, ix2): + + # index all axes with array + expect = a[np.ix_(ix0, ix1, ix2)] + actual = z[ix0, ix1, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single array / slices + expect = a[ix0, 15:25, 1:5] + actual = z[ix0, 15:25, 1:5] + assert_array_equal(expect, actual) + expect = a[50:70, ix1, 1:5] + actual = z[50:70, ix1, 1:5] + assert_array_equal(expect, actual) + expect = a[50:70, 15:25, ix2] + actual = z[50:70, 15:25, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with single array / single index + expect = a[ix0, 42, 4] + actual = z[ix0, 42, 4] + assert_array_equal(expect, actual) + expect = a[42, ix1, 4] + actual = z[42, ix1, 4] + assert_array_equal(expect, actual) + expect = a[84, 42, ix2] + actual = z[84, 42, ix2] + assert_array_equal(expect, actual) - # index both axes with int array - expect = a[np.ix_(ix0, ix1)] - actual = z[ix0, ix1] - assert_array_equal(expect, actual) - - # mixed indexing with int array / slice - expect = a[ix0, 15:35] - actual = z[ix0, 15:35] - assert_array_equal(expect, actual) - expect = a[15:35, ix1] - actual = z[15:35, ix1] - assert_array_equal(expect, actual) - - # mixed indexing with int array / single index - expect = a[ix0, 42] - actual = z[ix0, 42] - assert_array_equal(expect, actual) - expect = a[42, ix1] - actual = z[42, ix1] - assert_array_equal(expect, actual) + # mixed indexing with single array / slice / single index + expect = a[ix0, 15:25, 4] + actual = z[ix0, 15:25, 4] + 
assert_array_equal(expect, actual) + expect = a[42, ix1, 1:5] + actual = z[42, ix1, 1:5] + assert_array_equal(expect, actual) + expect = a[50:70, 42, ix2] + actual = z[50:70, 42, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with two array / slice + expect = a[np.ix_(ix0, ix1, range(1, 5))] + actual = z[ix0, ix1, 1:5] + assert_array_equal(expect, actual) + expect = a[np.ix_(range(50, 70), ix1, ix2)] + actual = z[50:70, ix1, ix2] + assert_array_equal(expect, actual) + expect = a[np.ix_(ix0, range(15, 25), ix2)] + actual = z[ix0, 15:25, ix2] + assert_array_equal(expect, actual) + + # mixed indexing with two array / integer + expect = a[np.ix_(ix0, ix1, [4])].squeeze(axis=2) + actual = z[ix0, ix1, 4] + assert_array_equal(expect, actual) + expect = a[np.ix_([42], ix1, ix2)].squeeze(axis=0) + actual = z[42, ix1, ix2] + assert_array_equal(expect, actual) + expect = a[np.ix_(ix0, [42], ix2)].squeeze(axis=1) + actual = z[ix0, 42, ix2] + assert_array_equal(expect, actual) def test_advanced_indexing_3d_bool(self): # setup - a = np.arange(1000000, dtype=int).reshape(100, 100, 100) - z = self.create_array(shape=a.shape, chunks=(10, 10, 10), dtype=a.dtype) + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) z[:] = a np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) - ix2 = np.random.binomial(1, p, size=a.shape[2]).astype(bool) - - # index all axes with bool array - expect = a[np.ix_(ix0, ix1, ix2)] - actual = z[ix0, ix1, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single bool array / slices - expect = a[ix0, 15:35, 25:45] - actual = z[ix0, 15:35, 25:45] - assert_array_equal(expect, actual) - expect = a[15:35, ix1, 25:45] - actual = z[15:35, ix1, 25:45] - assert_array_equal(expect, actual) - expect = 
a[15:35, 25:45, ix2] - actual = z[15:35, 25:45, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single bool array / single index - expect = a[ix0, 42, 84] - actual = z[ix0, 42, 84] - assert_array_equal(expect, actual) - expect = a[42, ix1, 84] - actual = z[42, ix1, 84] - assert_array_equal(expect, actual) - expect = a[42, 84, ix2] - actual = z[42, 84, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single bool array / slice / single index - expect = a[ix0, 15:35, 42] - actual = z[ix0, 15:35, 42] - assert_array_equal(expect, actual) - expect = a[42, ix1, 25:45] - actual = z[42, ix1, 25:45] - assert_array_equal(expect, actual) - expect = a[15:35, 42, ix2] - actual = z[15:35, 42, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with two bool array / slice - expect = a[np.ix_(ix0, ix1, range(25, 45))] - actual = z[ix0, ix1, 25:45] - assert_array_equal(expect, actual) - expect = a[np.ix_(range(15, 35), ix1, ix2)] - actual = z[15:35, ix1, ix2] - assert_array_equal(expect, actual) - expect = a[np.ix_(ix0, range(25, 45), ix2)] - actual = z[ix0, 25:45, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with two bool array / integer - expect = a[np.ix_(ix0, ix1, [42])].squeeze(axis=2) - actual = z[ix0, ix1, 42] - assert_array_equal(expect, actual) - expect = a[np.ix_([42], ix1, ix2)].squeeze(axis=0) - actual = z[42, ix1, ix2] - assert_array_equal(expect, actual) - expect = a[np.ix_(ix0, [42], ix2)].squeeze(axis=1) - actual = z[ix0, 42, ix2] - assert_array_equal(expect, actual) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) + self._test_advanced_indexing_3d_common(a, z, ix0, ix1, ix2) + + def test_advanced_indexing_edge_cases(self): + + a = np.arange(6).reshape(1, 2, 3) + z = self.create_array(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) + z[:] = a + + expect = a[np.ix_([0], range(2), [0, 1, 2])].squeeze(axis=0) + actual = z[0, 
:, [0, 1, 2]] + assert_array_equal(expect, actual) + + expect = a[np.ix_([0], range(2), [True, True, True])].squeeze(axis=0) + actual = z[0, :, [True, True, True]] + assert_array_equal(expect, actual) def test_advanced_indexing_3d_int(self): # setup - a = np.arange(1000000, dtype=int).reshape(100, 100, 100) - z = self.create_array(shape=a.shape, chunks=(10, 10, 10), dtype=a.dtype) + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) z[:] = a np.random.seed(42) @@ -935,87 +938,31 @@ def test_advanced_indexing_3d_int(self): for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix0.sort() - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix1.sort() - ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * p), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) ix2.sort() + self._test_advanced_indexing_3d_common(a, z, ix0, ix1, ix2) - # index all axes with int array - expect = a[np.ix_(ix0, ix1, ix2)] - actual = z[ix0, ix1, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single int array / slices - expect = a[ix0, 15:35, 25:45] - actual = z[ix0, 15:35, 25:45] - assert_array_equal(expect, actual) - expect = a[15:35, ix1, 25:45] - actual = z[15:35, ix1, 25:45] - assert_array_equal(expect, actual) - expect = a[15:35, 25:45, ix2] - actual = z[15:35, 25:45, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single int array / single index - expect = a[ix0, 42, 84] - actual = z[ix0, 42, 84] - assert_array_equal(expect, actual) - expect = a[42, ix1, 84] - actual = z[42, ix1, 84] - assert_array_equal(expect, actual) - expect = a[42, 84, ix2] - actual = z[42, 84, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single int array / slice / single index - expect 
= a[ix0, 15:35, 42] - actual = z[ix0, 15:35, 42] - assert_array_equal(expect, actual) - expect = a[42, ix1, 25:45] - actual = z[42, ix1, 25:45] - assert_array_equal(expect, actual) - expect = a[15:35, 42, ix2] - actual = z[15:35, 42, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with two int array / slice - expect = a[np.ix_(ix0, ix1, range(25, 45))] - actual = z[ix0, ix1, 25:45] - assert_array_equal(expect, actual) - expect = a[np.ix_(range(15, 35), ix1, ix2)] - actual = z[15:35, ix1, ix2] - assert_array_equal(expect, actual) - expect = a[np.ix_(ix0, range(25, 45), ix2)] - actual = z[ix0, 25:45, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with two int array / integer - expect = a[np.ix_(ix0, ix1, [42])].squeeze(axis=2) - actual = z[ix0, ix1, 42] - assert_array_equal(expect, actual) - expect = a[np.ix_([42], ix1, ix2)].squeeze(axis=0) - actual = z[42, ix1, ix2] - assert_array_equal(expect, actual) - expect = a[np.ix_(ix0, [42], ix2)].squeeze(axis=1) - actual = z[ix0, 42, ix2] - assert_array_equal(expect, actual) + def _test_advanced_indexing_1d_common_set(self, v, a, z, ix): + a[:] = 0 + z[:] = 0 + a[ix] = v[ix] + z[ix] = v[ix] + assert_array_equal(a, z[:]) def test_advanced_indexing_1d_bool_set(self): # setup - a = np.empty(1050, dtype=int) v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=int) z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - a[:] = 0 - z[:] = 0 ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - a[ix] = v[ix] - z[ix] = v[ix] - assert_array_equal(a, z[:]) + self._test_advanced_indexing_1d_common_set(v, a, z, ix) def test_advanced_indexing_1d_int_set(self): @@ -1027,144 +974,143 @@ def test_advanced_indexing_1d_int_set(self): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - a[:] = 0 - z[:] = 0 ix = np.random.choice(a.shape[0], size=int(a.shape[0] * 
p), replace=True) ix.sort() - a[ix] = v[ix] - z[ix] = v[ix] + self._test_advanced_indexing_1d_common_set(v, a, z, ix) + + def _test_advanced_indexing_2d_common_set(self, v, a, z, ix0, ix1): + + # index both axes with array + a[:] = 0 + z[:] = 0 + selection = ix0, ix1 + a[np.ix_(*selection)] = v[np.ix_(*selection)] + z[selection] = v[np.ix_(*selection)] + assert_array_equal(a, z[:]) + + # mixed indexing with array / slice or int + selections = ( + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + (ix0, 4), + (42, ix1), + ) + for selection in selections: + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z[selection] = v[selection] assert_array_equal(a, z[:]) def test_advanced_indexing_2d_bool_set(self): # setup - v = np.arange(10000, dtype=int).reshape(100, 100) + v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + z = self.create_array(shape=a.shape, chunks=(300, 30), dtype=a.dtype) np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - a[:] = 0 - z[:] = 0 ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) - - # index both axes with bool array - selection = ix0, ix1 - a[np.ix_(*selection)] = v[np.ix_(*selection)] - z[selection] = v[np.ix_(*selection)] - assert_array_equal(a, z[:]) - - # mixed indexing with bool array / slice - selections = ( - (ix0, slice(15, 35)), - (slice(15, 35), ix1), - (ix0, 42), - (42, ix1), - ) - for selection in selections: - a[:] = 0 - z[:] = 0 - a[selection] = v[selection] - z[selection] = v[selection] - assert_array_equal(a, z[:]) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + self._test_advanced_indexing_2d_common_set(v, a, z, ix0, ix1) def test_advanced_indexing_2d_int_set(self): # setup - v = np.arange(10000, dtype=int).reshape(100, 100) + v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) - z = 
self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + z = self.create_array(shape=a.shape, chunks=(300, 30), dtype=a.dtype) np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - a[:] = 0 - z[:] = 0 ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix0.sort() - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix1.sort() + self._test_advanced_indexing_2d_common_set(v, a, z, ix0, ix1) + + def _test_advanced_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): + + # index all axes with bool array + a[:] = 0 + z[:] = 0 + selection = ix0, ix1, ix2 + a[np.ix_(*selection)] = v[np.ix_(*selection)] + z[selection] = v[np.ix_(*selection)] + assert_array_equal(a, z[:]) - # index both axes with int array - selection = ix0, ix1 - a[np.ix_(*selection)] = v[np.ix_(*selection)] - z[selection] = v[np.ix_(*selection)] + # mixed indexing with single bool array / slice or int + selections = ( + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + (ix0, slice(15, 25), 4), + (slice(50, 70), ix1, 4), + (slice(50, 70), 42, ix2), + ) + for selection in selections: + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z[selection] = v[selection] assert_array_equal(a, z[:]) - # mixed indexing with int array / slice - selections = ( - (ix0, slice(15, 35)), - (slice(15, 35), ix1), - (ix0, 42), - (42, ix1), - ) - for selection in selections: - a[:] = 0 - z[:] = 0 - a[selection] = v[selection] - z[selection] = v[selection] - assert_array_equal(a, z[:]) + # indexing with two arrays / slice + a[:] = 0 + z[:] = 0 + zsel = ix0, ix1, slice(1, 5) + vsel = np.ix_(ix0, ix1, range(1, 5)) + a[vsel] = v[vsel] + z[zsel] = v[vsel] + assert_array_equal(a, z[:]) + + # indexing with two arrays / integer + a[:] = 0 + z[:] = 0 + 
zsel = ix0, ix1, 4 + vsel = np.ix_(ix0, ix1, [4]) + a[vsel] = v[vsel] + z[zsel] = v[vsel].squeeze(axis=2) + assert_array_equal(a, z[:]) def test_advanced_indexing_3d_bool_set(self): # setup - v = np.arange(1000000, dtype=int).reshape(100, 100, 100) + v = np.arange(100000, dtype=int).reshape(200, 50, 10) a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(10, 10, 10), dtype=a.dtype) + z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - a[:] = 0 - z[:] = 0 ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, p, size=a.shape[1]).astype(bool) - ix2 = np.random.binomial(1, p, size=a.shape[2]).astype(bool) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) + self._test_advanced_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) - # index all axes with bool array - selection = ix0, ix1, ix2 - a[np.ix_(*selection)] = v[np.ix_(*selection)] - z[selection] = v[np.ix_(*selection)] - assert_array_equal(a, z[:]) + def test_advanced_indexing_3d_int_set(self): - # mixed indexing with single bool array / slice or int - selections = ( - (ix0, slice(15, 35), slice(25, 45)), - (slice(15, 35), ix1, slice(25, 45)), - (slice(15, 35), slice(25, 45), ix2), - (ix0, 42, 84), - (42, ix1, 84), - (42, 84, ix2), - (ix0, slice(15, 35), 42), - (slice(15, 35), ix1, 42), - (slice(15, 35), 42, ix2), - ) - for selection in selections: - a[:] = 0 - z[:] = 0 - a[selection] = v[selection] - z[selection] = v[selection] - assert_array_equal(a, z[:]) - - # indexing with two arrays / slice - a[:] = 0 - z[:] = 0 - zsel = ix0, ix1, slice(25, 45) - vsel = np.ix_(ix0, ix1, range(25, 45)) - a[vsel] = v[vsel] - z[zsel] = v[vsel] - assert_array_equal(a, z[:]) + # setup + v = np.arange(100000, dtype=int).reshape(200, 50, 10) + a = np.empty_like(v) + z = 
self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - # indexing with two arrays / integer - a[:] = 0 - z[:] = 0 - zsel = ix0, ix1, 42 - vsel = np.ix_(ix0, ix1, [42]) - a[vsel] = v[vsel] - z[zsel] = v[vsel].squeeze(axis=2) - assert_array_equal(a, z[:]) + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix0.sort() + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + ix1.sort() + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) + ix2.sort() + self._test_advanced_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) class TestArrayWithPath(TestArray): @@ -1310,6 +1256,7 @@ def create_array(read_only=False, **kwargs): Delta(dtype=dtype), FixedScaleOffset(dtype=dtype, scale=1, offset=0), ] + # print(dtype, filters[1].astype) kwargs.setdefault('filters', filters) compressor = Zlib(1) kwargs.setdefault('compressor', compressor) diff --git a/zarr/util.py b/zarr/util.py index ac04649e44..8af623142e 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -495,7 +495,7 @@ def get_chunk_selections(selection, chunk_coords, chunks, n_advanced_selection): out_selection = tuple(out_selection) # handle advanced indexing arrays orthogonally - if n_advanced_selection > 1: + if n_advanced_selection > 0: # numpy doesn't support orthogonal indexing directly as yet, so need to work # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices # or integers, so need to convert slices and integers into ranges. 
From 0e9acaf9b4267ff74569724206a3dcba5fe3d6ba Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 31 Oct 2017 12:00:17 +0000 Subject: [PATCH 21/67] deal with filter interaction bug --- zarr/core.py | 10 ++++++++-- zarr/tests/test_core.py | 16 ++++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index e4baf78d51..26cf5db770 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -474,9 +474,11 @@ def _getitem_zd(self, item): except KeyError: # chunk not initialized - out = np.empty((), dtype=self._dtype) if self._fill_value is not None: + out = np.empty((), dtype=self._dtype) out.fill(self._fill_value) + else: + out = np.zeros((), dtype=self._dtype) else: out = self._decode_chunk(cdata) @@ -811,9 +813,13 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, squeeze_ax except KeyError: # chunk not initialized - chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) if self._fill_value is not None: + chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) chunk.fill(self._fill_value) + else: + # N.B., use zeros here so any region beyond the array has consistent and + # compressible data + chunk = np.zeros(self._chunks, dtype=self._dtype, order=self._order) else: diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 358a81950b..65499cdbaa 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -299,6 +299,18 @@ def test_array_2d(self): assert_array_equal(a[310:], z[310:]) assert_array_equal(a[:, 7:], z[:, 7:]) + def test_array_2d_edge_case(self): + # this fails with filters - chunks extend beyond edge of array, messes with delta filter + # if no fill value? 
+ shape = 1000, 10 + chunks = 300, 30 + dtype = 'i8' + z = self.create_array(shape=shape, dtype=dtype, chunks=chunks) + z[:] = 0 + expect = np.zeros(shape, dtype=dtype) + actual = z[:] + assert_array_equal(expect, actual) + def test_array_2d_partial(self): z = self.create_array(shape=(1000, 10), chunks=(100, 2), dtype='i4', fill_value=0) @@ -1007,7 +1019,7 @@ def test_advanced_indexing_2d_bool_set(self): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(300, 30), dtype=a.dtype) + z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) np.random.seed(42) # test with different degrees of sparseness @@ -1021,7 +1033,7 @@ def test_advanced_indexing_2d_int_set(self): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(300, 30), dtype=a.dtype) + z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) np.random.seed(42) # test with different degrees of sparseness From c7e07da6dca9009c76180234ac9612c5fed9b87c Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 1 Nov 2017 23:54:17 +0000 Subject: [PATCH 22/67] WIP refactoring to split orthogonal and basic indexing --- zarr/core.py | 78 ++++++-- zarr/indexing.py | 428 ++++++++++++++++++++++++++++++++++++++++ zarr/tests/test_core.py | 81 ++++---- 3 files changed, 530 insertions(+), 57 deletions(-) create mode 100644 zarr/indexing.py diff --git a/zarr/core.py b/zarr/core.py index 26cf5db770..3867154b53 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -17,6 +17,7 @@ from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec +from zarr.indexing import OIndex, OrthogonalSelection class Array(object): @@ -107,6 +108,9 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, # initialize info reporter self._info_reporter = InfoReporter(self) + # 
initialize indexing helpers + self._oindex = OIndex(self) + def _load_metadata(self): """(Re)load metadata from store.""" if self._synchronizer is None: @@ -356,6 +360,11 @@ def is_view(self): """A boolean, True if this array is a view on another array.""" return self._is_view + @property + def oindex(self): + """TODO""" + return self._oindex + def __eq__(self, other): return ( isinstance(other, Array) and @@ -541,6 +550,39 @@ def _getitem_nd(self, item): else: return out[()] + def get_orthogonal_selection(self, selection, out=None): + + # setup selection + selection = OrthogonalSelection(selection, self) + + # determine indices of chunks overlapping the selection + chunk_ranges, sel_shape = selection.get_overlapping_chunks() + + # setup output array + if out is None: + out = np.empty(sel_shape, dtype=self._dtype, order=self._order) + else: + # validate 'out' parameter + if not hasattr(out, 'shape'): + raise TypeError('out must be an array-like object') + if out.shape != sel_shape: + raise ValueError('out has wrong shape for selection') + + # iterate over chunks in range, i.e., chunks overlapping the selection + for chunk_coords in itertools.product(*chunk_ranges): + + # obtain selections for chunk and output arrays + chunk_selection, out_selection = selection.get_chunk_projection(chunk_coords) + + # load chunk selection into output array + self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, + squeeze_axes=selection.squeeze_axes) + + if out.shape: + return out + else: + return out[()] + def __setitem__(self, item, value): """Modify data for some portion of the array. @@ -683,19 +725,21 @@ def _setitem_nd(self, item, value): dest = value[out_selection] self._chunk_setitem(chunk_coords, chunk_selection, dest, squeeze_axes) - def _chunk_getitem(self, chunk_coords, chunk_selection, dest, squeeze_axes=None): + def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, squeeze_axes=None): """Obtain part or whole of a chunk. 
Parameters ---------- chunk_coords : tuple of ints Indices of the chunk. - chunk_selection : tuple of slices - Location of region within the chunk. - dest : ndarray - Numpy array to store result in. + chunk_selection : selection + Location of region within the chunk to extract. + out : ndarray + Array to store result in. + out_selection : selection + Location of region within output array to store results in. squeeze_axes : tuple of ints - Axes to squeeze out of the chunk before + Axes to squeeze out of the chunk. """ @@ -709,25 +753,27 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, dest, squeeze_axes=None) # chunk not initialized if self._fill_value is not None: - dest.fill(self._fill_value) + out[out_selection] = self._fill_value else: - if is_total_slice(chunk_selection, self._chunks) and \ + if isinstance(out, np.ndarray) and \ + is_total_slice(chunk_selection, self._chunks) and \ not self._filters and \ - ((self._order == 'C' and dest.flags.c_contiguous) or - (self._order == 'F' and dest.flags.f_contiguous)): + ((self._order == 'C' and out.flags.c_contiguous) or + (self._order == 'F' and out.flags.f_contiguous)): # optimization: we want the whole chunk, and the destination is # contiguous, so we can decompress directly from the chunk # into the destination array + dest = out[out_selection] if self._compressor: self._compressor.decode(cdata, dest) else: - arr = np.frombuffer(cdata, dtype=self._dtype) - arr = arr.reshape(self._chunks, order=self._order) - np.copyto(dest, arr) + chunk = np.frombuffer(cdata, dtype=self._dtype) + chunk = chunk.reshape(self._chunks, order=self._order) + np.copyto(dest, chunk) else: @@ -735,14 +781,10 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, dest, squeeze_axes=None) chunk = self._decode_chunk(cdata) # set data in output array - # (split into two lines for profiling) tmp = chunk[chunk_selection] if squeeze_axes: tmp = np.squeeze(tmp, axis=squeeze_axes) - if dest.shape: - dest[:] = tmp - else: - dest[()] = 
tmp + out[out_selection] = tmp def _chunk_setitem(self, chunk_coords, chunk_selection, value, squeeze_axes=None): """Replace part or whole of a chunk. diff --git a/zarr/indexing.py b/zarr/indexing.py new file mode 100644 index 0000000000..67a4021a7f --- /dev/null +++ b/zarr/indexing.py @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, division +import numbers + + +import numpy as np + + +def replace_ellipsis(selection, shape): + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += tuple(slice(0, l) for l in shape[len(selection):]) + + return selection + + +class OIndex(object): + + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + + # ensure tuple + if not isinstance(selection, tuple): + selection = (selection,) + + # handle ellipsis + selection = replace_ellipsis(selection, self.array.shape) + + # delegate to method + return self.array.get_orthogonal_selection(selection) + + +class OrthogonalSelection(object): + + def __init__(self, selection, array): + + # ensure tuple + 
if not isinstance(selection, tuple): + selection = (selection,) + + # validation - check dimensionality + if len(selection) > len(array.shape): + raise IndexError('too many indices for array') + if len(selection) < len(array.shape): + raise IndexError('not enough indices for array') + + # normalization + self.selection = self.normalize_selection(selection, array) + self.array = array + + # figure out if we're going to be doing advanced indexing on chunks, if so then + # chunk selections will need special handling + self.is_advanced = any([not isinstance(dim_sel, (int, slice)) + for dim_sel in self.selection]) + + # locate axes that need to get squeezed out later if doing advanced selection + if self.is_advanced: + self.squeeze_axes = tuple([i for i, dim_sel in enumerate(self.selection) + if isinstance(dim_sel, int)]) + else: + self.squeeze_axes = None + + def __iter__(self): + return iter(self.selection) + + def __len__(self): + return len(self.selection) + + def normalize_selection(self, selection, array): + # normalize each dimension + selection = tuple(self.normalize_dim_selection(s, l, c) + for s, l, c in zip(selection, array.shape, array.chunks)) + return selection + + def normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): + + # normalize list to array + if isinstance(dim_sel, list): + dim_sel = np.asarray(dim_sel) + + if isinstance(dim_sel, numbers.Integral): + + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise IndexError('index out of bounds: %s' % dim_sel) + + return dim_sel + + elif isinstance(dim_sel, slice): + + # handle slice with None bound + start = 0 if dim_sel.start is None else dim_sel.start + stop = dim_len if dim_sel.stop is None else dim_sel.stop + + # handle wraparound + if start < 0: + start = dim_len + start + if stop < 0: + stop = dim_len + stop + + # handle zero-length axis + if 
start == stop == dim_len == 0: + return slice(0, 0) + + # handle out of bounds + if start < 0: + raise IndexError('start index out of bounds: %s' % dim_sel.start) + if stop < 0: + raise IndexError('stop index out of bounds: %s' % dim_sel.stop) + if start >= dim_len: + raise IndexError('start index out of bounds: %ss' % dim_sel.start) + if stop > dim_len: + stop = dim_len + if stop < start: + stop = start + + # handle slice with step + if dim_sel.step is not None: + if dim_sel.step > 1: + dim_sel = np.arange(start, stop, dim_sel.step) + return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + elif dim_sel.step < 1: + raise IndexError('only positive step supported') + + return slice(start, stop) + + elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): + + if dim_sel.dtype == bool: + return BoolArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + + elif dim_sel.dtype.kind in 'ui': + return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError('unsupported index item type: %r' % dim_sel) + + else: + raise IndexError('unsupported index item type: %r' % dim_sel) + + def get_overlapping_chunks(self): + """Convenience function to find chunks overlapping an array selection. 
N.B., + assumes selection has already been normalized.""" + + # indices of chunks overlapping the selection + chunk_ranges = [] + + # shape of the selection + sel_shape = [] + + # iterate over dimensions of the array + for dim_sel, dim_chunk_len in zip(self.selection, self.array.chunks): + + # dim_sel: selection for current dimension + # dim_chunk_len: length of chunk along current dimension + + dim_sel_len = None + + if isinstance(dim_sel, int): + + # dim selection is an integer, i.e., single item, so only need single chunk index for + # this dimension + dim_chunk_range = [dim_sel//dim_chunk_len] + + elif isinstance(dim_sel, slice): + + # dim selection is a slice, need range of chunk indices including start and stop of + # selection + dim_chunk_from = dim_sel.start//dim_chunk_len + dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) + dim_chunk_range = range(dim_chunk_from, dim_chunk_to) + dim_sel_len = dim_sel.stop - dim_sel.start + + elif isinstance(dim_sel, BoolArrayOrthogonalSelection): + + # dim selection is a boolean array, delegate this to the BooleanSelection class + dim_chunk_range = dim_sel.get_chunk_ranges() + dim_sel_len = dim_sel.nitems + + elif isinstance(dim_sel, IntArrayOrthogonalSelection): + + # dim selection is an integer array, delegate this to the integerSelection class + dim_chunk_range = dim_sel.get_chunk_ranges() + dim_sel_len = dim_sel.nitems + + else: + raise RuntimeError('unexpected selection type') + + chunk_ranges.append(dim_chunk_range) + if dim_sel_len is not None: + sel_shape.append(dim_sel_len) + + return chunk_ranges, tuple(sel_shape) + + def get_chunk_projection(self, chunk_coords): + + # chunk_coords: holds the index along each dimension for the current chunk within the + # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
+ + chunk_selection = [] + out_selection = [] + + # iterate over dimensions (axes) of the array + for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, + self.array.chunks): + + # dim_sel: selection for current dimension + # dim_chunk_idx: chunk index along current dimension + # dim_chunk_len: chunk length along current dimension + + # selection into output array to store data from current chunk + dim_out_sel = None + + # calculate offset for current chunk along current dimension - this is used to + # determine the values to be extracted from the current chunk + dim_chunk_offset = dim_chunk_idx * dim_chunk_len + + # handle integer selection, i.e., single item + if isinstance(dim_sel, int): + + dim_chunk_sel = dim_sel - dim_chunk_offset + + # N.B., leave dim_out_sel as None, as this dimension has been dropped in the + # output array because of single value index + + # handle slice selection, i.e., contiguous range of items + elif isinstance(dim_sel, slice): + + if dim_sel.start <= dim_chunk_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_chunk_offset - dim_sel.start + + else: + # selection starts within current chunk + dim_chunk_sel_start = dim_sel.start - dim_chunk_offset + dim_out_offset = 0 + + if dim_sel.stop > dim_chunk_offset + dim_chunk_len: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + elif isinstance(dim_sel, (BoolArrayOrthogonalSelection, IntArrayOrthogonalSelection)): + + # get selection to extract data for the current chunk + dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) + + # figure out where to put these items in the output array + 
dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) + + else: + raise RuntimeError('unexpected selection type') + + # add to chunk selection + chunk_selection.append(dim_chunk_sel) + + # add to output selection + if dim_out_sel is not None: + out_selection.append(dim_out_sel) + + # normalise for indexing into numpy arrays + chunk_selection = tuple(chunk_selection) + out_selection = tuple(out_selection) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # numpy doesn't support orthogonal indexing directly as yet, so need to work + # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices + # or integers, so need to convert slices and integers into ranges. + chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) + if isinstance(dim_chunk_sel, slice) + else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) + else dim_chunk_sel + for dim_chunk_sel in chunk_selection] + chunk_selection = np.ix_(*chunk_selection) + + return chunk_selection, out_selection + + +class IntArrayOrthogonalSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # has to be a numpy array so we can do bincount + dim_sel = np.asanyarray(dim_sel) + + # check number of dimensions, only support indexing with 1d array + if len(dim_sel.shape) > 1: + raise IndexError('can only index with 1-dimensional integer array') + + # handle wraparound + loc_neg = dim_sel < 0 + if np.any(loc_neg): + dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + + # handle out of bounds + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('index out of bounds') + + # validate monotonically increasing + if np.any(np.diff(dim_sel) < 0): + raise NotImplementedError('only monotonically increasing indices are supported') + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + + # precompute number of selected items 
for each chunk + # note: for dense integer selections, the division operation here is the bottleneck + self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = len(dim_sel) + + def get_chunk_sel(self, dim_chunk_idx): + # need to slice out relevant indices from the total selection, then subtract the chunk + # offset + + dim_out_sel = self.get_out_sel(dim_chunk_idx) + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_chunk_offset + + return dim_chunk_sel + + def get_out_sel(self, dim_chunk_idx): + if dim_chunk_idx == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_idx] + return slice(start, stop) + + def get_chunk_ranges(self): + return np.nonzero(self.chunk_nitems)[0] + + +class BoolArrayOrthogonalSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # check number of dimensions, only support indexing with 1d array + if len(dim_sel.shape) > 1: + raise IndexError('can only index with 1-dimensional Boolean array') + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError('Boolean array has wrong length; expected %s, found %s' % + (dim_len, dim_sel.shape[0])) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') + for dim_chunk_idx in range(self.nchunks): + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( + self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + + def get_chunk_sel(self, 
dim_chunk_idx): + dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + return dim_chunk_sel + + def get_out_sel(self, dim_chunk_idx): + if dim_chunk_idx == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_idx] + return slice(start, stop) + + def get_chunk_ranges(self): + return np.nonzero(self.chunk_nitems)[0] diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 65499cdbaa..0a05cb8c9a 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -722,12 +722,15 @@ def test_nchunks_initialized(self): z[:] = 42 eq(10, z.nchunks_initialized) - def _test_advanced_indexing_1d_common(self, a, z, ix): + def _test_orthogonal_indexing_1d_common(self, a, z, ix): expect = a[ix] - actual = z[ix] + actual = z.get_orthogonal_selection(ix) + assert_array_equal(expect, actual) + actual = z.oindex[ix] assert_array_equal(expect, actual) - def test_advanced_indexing_1d_bool(self): + # noinspection PyStatementEffect + def test_orthogonal_indexing_1d_bool(self): # setup a = np.arange(1050, dtype=int) @@ -738,17 +741,17 @@ def test_advanced_indexing_1d_bool(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - self._test_advanced_indexing_1d_common(a, z, ix) + self._test_orthogonal_indexing_1d_common(a, z, ix) # test errors with assert_raises(IndexError): - z[ix[:50]] # too short + z.oindex[np.zeros(50, dtype=bool)] # too short with assert_raises(IndexError): - z[np.concatenate([ix[:50]] * 2)] # too long + z.oindex[np.zeros(2000, dtype=bool)] # too long with assert_raises(IndexError): - z[[[True, False], [False, True]]] # too many 
dimensions + z.oindex[[[True, False], [False, True]]] # too many dimensions - def test_advanced_indexing_1d_int(self): + def test_orthogonal_indexing_1d_int(self): # setup a = np.arange(1050, dtype=int) @@ -760,7 +763,7 @@ def test_advanced_indexing_1d_int(self): for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix.sort() - self._test_advanced_indexing_1d_common(a, z, ix) + self._test_orthogonal_indexing_1d_common(a, z, ix) # test wraparound ix = [0, 3, 10, -23, -12, -1] @@ -771,18 +774,18 @@ def test_advanced_indexing_1d_int(self): # test errors with assert_raises(IndexError): ix = [a.shape[0] + 1] # out of bounds - z[ix] + z.oindex[ix] with assert_raises(IndexError): ix = [-(a.shape[0] + 1)] # out of bounds - z[ix] + z.oindex[ix] with assert_raises(IndexError): ix = [[2, 4], [6, 8]] # too many dimensions - z[ix] + z.oindex[ix] with assert_raises(NotImplementedError): ix = [3, 105, 23, 127] # not monotonically increasing - z[ix] + z.oindex[ix] - def _test_advanced_indexing_2d_common(self, a, z, ix0, ix1): + def _test_orthogonal_indexing_2d_common(self, a, z, ix0, ix1): # index both axes with array expect = a[np.ix_(ix0, ix1)] @@ -805,7 +808,7 @@ def _test_advanced_indexing_2d_common(self, a, z, ix0, ix1): actual = z[42, ix1] assert_array_equal(expect, actual) - def test_advanced_indexing_2d_bool(self): + def test_orthogonal_indexing_2d_bool(self): # setup a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -819,7 +822,7 @@ def test_advanced_indexing_2d_bool(self): ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) # main tests - self._test_advanced_indexing_2d_common(a, z, ix0, ix1) + self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) # mixed int array / bool array selections = ( @@ -831,7 +834,7 @@ def test_advanced_indexing_2d_bool(self): actual = z[ix0, ix1] assert_array_equal(expect, actual) - def test_advanced_indexing_2d_int(self): + def test_orthogonal_indexing_2d_int(self): # setup a = 
np.arange(10000, dtype=int).reshape(1000, 10) @@ -845,9 +848,9 @@ def test_advanced_indexing_2d_int(self): ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix1.sort() - self._test_advanced_indexing_2d_common(a, z, ix0, ix1) + self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) - def _test_advanced_indexing_3d_common(self, a, z, ix0, ix1, ix2): + def _test_orthogonal_indexing_3d_common(self, a, z, ix0, ix1, ix2): # index all axes with array expect = a[np.ix_(ix0, ix1, ix2)] @@ -909,7 +912,7 @@ def _test_advanced_indexing_3d_common(self, a, z, ix0, ix1, ix2): actual = z[ix0, 42, ix2] assert_array_equal(expect, actual) - def test_advanced_indexing_3d_bool(self): + def test_orthogonal_indexing_3d_bool(self): # setup a = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -922,9 +925,9 @@ def test_advanced_indexing_3d_bool(self): ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - self._test_advanced_indexing_3d_common(a, z, ix0, ix1, ix2) + self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) - def test_advanced_indexing_edge_cases(self): + def test_orthogonal_indexing_edge_cases(self): a = np.arange(6).reshape(1, 2, 3) z = self.create_array(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) @@ -938,7 +941,7 @@ def test_advanced_indexing_edge_cases(self): actual = z[0, :, [True, True, True]] assert_array_equal(expect, actual) - def test_advanced_indexing_3d_int(self): + def test_orthogonal_indexing_3d_int(self): # setup a = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -954,16 +957,16 @@ def test_advanced_indexing_3d_int(self): ix1.sort() ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) ix2.sort() - self._test_advanced_indexing_3d_common(a, z, ix0, ix1, ix2) + self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) - def 
_test_advanced_indexing_1d_common_set(self, v, a, z, ix): + def _test_orthogonal_indexing_1d_common_set(self, v, a, z, ix): a[:] = 0 z[:] = 0 a[ix] = v[ix] z[ix] = v[ix] assert_array_equal(a, z[:]) - def test_advanced_indexing_1d_bool_set(self): + def test_orthogonal_indexing_1d_bool_set(self): # setup v = np.arange(1050, dtype=int) @@ -974,9 +977,9 @@ def test_advanced_indexing_1d_bool_set(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - self._test_advanced_indexing_1d_common_set(v, a, z, ix) + self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) - def test_advanced_indexing_1d_int_set(self): + def test_orthogonal_indexing_1d_int_set(self): # setup v = np.arange(1050, dtype=int) @@ -988,9 +991,9 @@ def test_advanced_indexing_1d_int_set(self): for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix.sort() - self._test_advanced_indexing_1d_common_set(v, a, z, ix) + self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) - def _test_advanced_indexing_2d_common_set(self, v, a, z, ix0, ix1): + def _test_orthogonal_indexing_2d_common_set(self, v, a, z, ix0, ix1): # index both axes with array a[:] = 0 @@ -1014,7 +1017,7 @@ def _test_advanced_indexing_2d_common_set(self, v, a, z, ix0, ix1): z[selection] = v[selection] assert_array_equal(a, z[:]) - def test_advanced_indexing_2d_bool_set(self): + def test_orthogonal_indexing_2d_bool_set(self): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) @@ -1026,9 +1029,9 @@ def test_advanced_indexing_2d_bool_set(self): for p in 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - self._test_advanced_indexing_2d_common_set(v, a, z, ix0, ix1) + self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) - def test_advanced_indexing_2d_int_set(self): + def test_orthogonal_indexing_2d_int_set(self): 
# setup v = np.arange(10000, dtype=int).reshape(1000, 10) @@ -1042,9 +1045,9 @@ def test_advanced_indexing_2d_int_set(self): ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix1.sort() - self._test_advanced_indexing_2d_common_set(v, a, z, ix0, ix1) + self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) - def _test_advanced_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): + def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): # index all axes with bool array a[:] = 0 @@ -1091,7 +1094,7 @@ def _test_advanced_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): z[zsel] = v[vsel].squeeze(axis=2) assert_array_equal(a, z[:]) - def test_advanced_indexing_3d_bool_set(self): + def test_orthogonal_indexing_3d_bool_set(self): # setup v = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -1104,9 +1107,9 @@ def test_advanced_indexing_3d_bool_set(self): ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - self._test_advanced_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) - def test_advanced_indexing_3d_int_set(self): + def test_orthogonal_indexing_3d_int_set(self): # setup v = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -1122,7 +1125,7 @@ def test_advanced_indexing_3d_int_set(self): ix1.sort() ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) ix2.sort() - self._test_advanced_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) class TestArrayWithPath(TestArray): From 25619458c2df7d1422c27d96369a3ab8a70d084f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 2 Nov 2017 01:55:25 +0000 Subject: [PATCH 23/67] WIP refactor oindex --- zarr/core.py | 151 ++++++++---------- zarr/indexing.py | 335 
++++++++++++++++++++++++++++++++-------- zarr/tests/test_core.py | 162 +++++++++---------- 3 files changed, 404 insertions(+), 244 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 3867154b53..183e9e86b2 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -17,7 +17,7 @@ from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.indexing import OIndex, OrthogonalSelection +from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer class Array(object): @@ -388,7 +388,7 @@ def __len__(self): else: raise TypeError('len() of unsized object') - def __getitem__(self, item): + def __getitem__(self, selection): """Retrieve data for some portion of the array. Most NumPy-style slicing operations are supported. @@ -459,21 +459,27 @@ def __getitem__(self, item): """ + # delegate to method + return self.get_basic_selection(selection) + + def get_basic_selection(self, selection, out=None): + """TODO""" + # refresh metadata if not self._cache_metadata: self._load_metadata() # handle zero-dimensional arrays if self._shape == (): - return self._getitem_zd(item) + return self._get_basic_selection_zd(selection, out=out) else: - return self._getitem_nd(item) + return self._get_basic_selection_nd(selection, out=out) - def _getitem_zd(self, item): - # special case __getitem__ for zero-dimensional array + def _get_basic_selection_zd(self, selection, out=None): + # special case basic selection for zero-dimensional array - # check item is valid - if item not in ((), Ellipsis): + # check selection is valid + if selection not in ((), Ellipsis): raise IndexError('too many indices for array') try: @@ -484,79 +490,52 @@ def _getitem_zd(self, item): except KeyError: # chunk not initialized if self._fill_value is not None: - out = np.empty((), dtype=self._dtype) - out.fill(self._fill_value) + chunk = np.empty((), dtype=self._dtype) + chunk.fill(self._fill_value) else: - out = 
np.zeros((), dtype=self._dtype) + chunk = np.zeros((), dtype=self._dtype) else: - out = self._decode_chunk(cdata) + chunk = self._decode_chunk(cdata) # handle selection of the scalar value via empty tuple - out = out[item] + if out is None: + out = chunk[selection] + else: + out[selection] = chunk[selection] return out - def _getitem_nd(self, item): - # implementation of __getitem__ for array with at least one dimension - - # N.B., this is the crux of zarr. We iterate over all chunks which overlap the selection - # and thus contain data that needs to be extracted. Each chunk is processed in turn, - # extracting the necessary data and storing into the correct location in the output array. + def _get_basic_selection_nd(self, selection, out=None): + # implementation of basic selection for array with at least one dimension - # N.B., it is an important optimisation that we only visit chunks which overlap the - # selection. This minimises the nuimber of iterations in the main for loop. - - # normalize selection - selection = normalize_array_selection(item, self._shape, self._chunks) - - # figure out if we're doing advanced indexing, count number of advanced selections - if - # more than one need special handling, because we are doing orthogonal indexing here, - # which is different from fancy indexing if there is more than one array selection - n_advanced_selection = sum(1 for dim_sel in selection - if not isinstance(dim_sel, (int, slice))) - - # axes that need to get squeezed out if doing advanced selection - if n_advanced_selection > 0: - squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) - if isinstance(dim_sel, int)]) - else: - squeeze_axes = None - - # determine indices of chunks overlapping the selection - chunk_ranges, sel_shape = get_chunks_for_selection(selection, self._chunks) + # setup indexer + indexer = BasicIndexer(selection, self) - # setup output array - out = np.empty(sel_shape, dtype=self._dtype, order=self._order) + return 
self._get_selection(indexer, out=out) - # iterate over chunks in range, i.e., chunks overlapping the selection - for chunk_coords in itertools.product(*chunk_ranges): + def get_orthogonal_selection(self, selection, out=None): - # obtain selections for chunk and output arrays - chunk_selection, out_selection = \ - get_chunk_selections(selection, chunk_coords, self._chunks, n_advanced_selection) + # refresh metadata + if not self._cache_metadata: + self._load_metadata() - # obtain the destination array as a view of the output array - if out_selection: - dest = out[out_selection] - else: - dest = out + # setup indexer + indexer = OrthogonalIndexer(selection, self) - # load chunk selection into output array - self._chunk_getitem(chunk_coords, chunk_selection, dest, squeeze_axes) + return self._get_selection(indexer, out=out) - if out.shape: - return out - else: - return out[()] + def _get_selection(self, indexer, out=None): - def get_orthogonal_selection(self, selection, out=None): + # We iterate over all chunks which overlap the selection and thus contain data that needs + # to be extracted. Each chunk is processed in turn, extracting the necessary data and + # storing into the correct location in the output array. - # setup selection - selection = OrthogonalSelection(selection, self) + # N.B., it is an important optimisation that we only visit chunks which overlap the + # selection. This minimises the nuimber of iterations in the main for loop. 
# determine indices of chunks overlapping the selection - chunk_ranges, sel_shape = selection.get_overlapping_chunks() + chunk_ranges, sel_shape = indexer.get_overlapping_chunks() # setup output array if out is None: @@ -572,11 +551,11 @@ def get_orthogonal_selection(self, selection, out=None): for chunk_coords in itertools.product(*chunk_ranges): # obtain selections for chunk and output arrays - chunk_selection, out_selection = selection.get_chunk_projection(chunk_coords) + chunk_selection, out_selection = indexer.get_chunk_projection(chunk_coords) # load chunk selection into output array self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, - squeeze_axes=selection.squeeze_axes) + squeeze_axes=indexer.squeeze_axes) if out.shape: return out @@ -759,32 +738,34 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, sque if isinstance(out, np.ndarray) and \ is_total_slice(chunk_selection, self._chunks) and \ - not self._filters and \ - ((self._order == 'C' and out.flags.c_contiguous) or - (self._order == 'F' and out.flags.f_contiguous)): - - # optimization: we want the whole chunk, and the destination is - # contiguous, so we can decompress directly from the chunk - # into the destination array + not self._filters: dest = out[out_selection] - if self._compressor: - self._compressor.decode(cdata, dest) - else: - chunk = np.frombuffer(cdata, dtype=self._dtype) - chunk = chunk.reshape(self._chunks, order=self._order) - np.copyto(dest, chunk) + contiguous = ((self._order == 'C' and dest.flags.c_contiguous) or + (self._order == 'F' and dest.flags.f_contiguous)) - else: + if contiguous: - # decode chunk - chunk = self._decode_chunk(cdata) + # optimization: we want the whole chunk, and the destination is + # contiguous, so we can decompress directly from the chunk + # into the destination array - # set data in output array - tmp = chunk[chunk_selection] - if squeeze_axes: - tmp = np.squeeze(tmp, axis=squeeze_axes) - 
out[out_selection] = tmp + if self._compressor: + self._compressor.decode(cdata, dest) + else: + chunk = np.frombuffer(cdata, dtype=self._dtype) + chunk = chunk.reshape(self._chunks, order=self._order) + np.copyto(dest, chunk) + return + + # decode chunk + chunk = self._decode_chunk(cdata) + + # set data in output array + tmp = chunk[chunk_selection] + if squeeze_axes: + tmp = np.squeeze(tmp, axis=squeeze_axes) + out[out_selection] = tmp def _chunk_setitem(self, chunk_coords, chunk_selection, value, squeeze_axes=None): """Replace part or whole of a chunk. diff --git a/zarr/indexing.py b/zarr/indexing.py index 67a4021a7f..fef7c6725c 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -46,18 +46,239 @@ def __init__(self, array): def __getitem__(self, selection): + # delegate to method + return self.array.get_orthogonal_selection(selection) + + +def normalize_integer_selection(dim_sel, dim_len): + + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise IndexError('index out of bounds: %s' % dim_sel) + + return dim_sel + + +def normalize_slice_selection(dim_sel, dim_len): + + # handle slice with None bound + start = 0 if dim_sel.start is None else dim_sel.start + stop = dim_len if dim_sel.stop is None else dim_sel.stop + step = 1 if dim_sel.step is None else dim_sel.step + + # handle wraparound + if start < 0: + start = dim_len + start + if stop < 0: + stop = dim_len + stop + + # handle out of bounds + if start < 0: + raise IndexError('start index out of bounds: %s' % dim_sel.start) + if stop < 0: + raise IndexError('stop index out of bounds: %s' % dim_sel.stop) + if start >= dim_len and dim_len > 0: + raise IndexError('start index out of bounds: %ss' % dim_sel.start) + if stop > dim_len: + stop = dim_len + if stop < start: + stop = start + + return slice(start, stop, step) + + +class IndexerBase(object): + + def 
__init__(self, selection, array): + self.selection = selection + self.array = array + self.squeeze_axes = None + + def __iter__(self): + return iter(self.selection) + + def __len__(self): + return len(self.selection) + + +# noinspection PyProtectedMember +class BasicIndexer(IndexerBase): + + def __init__(self, selection, array): + # ensure tuple if not isinstance(selection, tuple): selection = (selection,) # handle ellipsis - selection = replace_ellipsis(selection, self.array.shape) + selection = replace_ellipsis(selection, array._shape) - # delegate to method - return self.array.get_orthogonal_selection(selection) + # validation - check dimensionality + if len(selection) > len(array._shape): + raise IndexError('too many indices for array') + if len(selection) < len(array._shape): + raise IndexError('not enough indices for array') + + # TODO refactor with OrthogonalIndexer + + # normalization + selection = self.normalize_selection(selection, array) + + # complete initialisation + super(BasicIndexer, self).__init__(selection, array) + + def normalize_selection(self, selection, array): + # normalize each dimension + selection = tuple(self.normalize_dim_selection(s, l) + for s, l in zip(selection, array._shape)) + return selection + + def normalize_dim_selection(self, dim_sel, dim_len): + + if isinstance(dim_sel, numbers.Integral): + + dim_sel = normalize_integer_selection(dim_sel, dim_len) + return dim_sel + + elif isinstance(dim_sel, slice): + + dim_sel = normalize_slice_selection(dim_sel, dim_len) + + # handle slice with step + if dim_sel.step is not None and dim_sel.step != 1: + raise IndexError('slice with step not supported via basic indexing') + + return dim_sel + + else: + raise IndexError('unsupported index item type: %r' % dim_sel) + + def get_overlapping_chunks(self): + """Convenience function to find chunks overlapping an array selection. 
N.B., + assumes selection has already been normalized.""" + + # indices of chunks overlapping the selection + chunk_ranges = [] + + # shape of the selection + sel_shape = [] + + # iterate over dimensions of the array + for dim_sel, dim_chunk_len in zip(self.selection, self.array._chunks): + + # dim_sel: selection for current dimension + # dim_chunk_len: length of chunk along current dimension + + dim_sel_len = None + + if isinstance(dim_sel, int): + + # dim selection is an integer, i.e., single item, so only need single chunk index + # for this dimension + dim_chunk_range = [dim_sel//dim_chunk_len] + + elif isinstance(dim_sel, slice): + + # dim selection is a slice, need range of chunk indices including start and stop of + # selection + dim_chunk_from = dim_sel.start//dim_chunk_len + dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) + dim_chunk_range = range(dim_chunk_from, dim_chunk_to) + dim_sel_len = dim_sel.stop - dim_sel.start + + else: + raise RuntimeError('unexpected selection type') + + chunk_ranges.append(dim_chunk_range) + if dim_sel_len is not None: + sel_shape.append(dim_sel_len) + + return chunk_ranges, tuple(sel_shape) + + def get_chunk_projection(self, chunk_coords): + + # chunk_coords: holds the index along each dimension for the current chunk within the + # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
+ + chunk_selection = [] + out_selection = [] + + # iterate over dimensions (axes) of the array + for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, + self.array._chunks): + + # dim_sel: selection for current dimension + # dim_chunk_idx: chunk index along current dimension + # dim_chunk_len: chunk length along current dimension + + # selection into output array to store data from current chunk + dim_out_sel = None + + # calculate offset for current chunk along current dimension - this is used to + # determine the values to be extracted from the current chunk + dim_chunk_offset = dim_chunk_idx * dim_chunk_len + + # handle integer selection, i.e., single item + if isinstance(dim_sel, int): + + dim_chunk_sel = dim_sel - dim_chunk_offset + + # N.B., leave dim_out_sel as None, as this dimension has been dropped in the + # output array because of single value index + + # handle slice selection, i.e., contiguous range of items + elif isinstance(dim_sel, slice): + + if dim_sel.start <= dim_chunk_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_chunk_offset - dim_sel.start + + else: + # selection starts within current chunk + dim_chunk_sel_start = dim_sel.start - dim_chunk_offset + dim_out_offset = 0 + + if dim_sel.stop > dim_chunk_offset + dim_chunk_len: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + else: + # selection ends within current chunk + dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + # TODO refactor code with OrthogonalIndexer + + else: + raise RuntimeError('unexpected selection type') + + # add to chunk selection + chunk_selection.append(dim_chunk_sel) + + # add to output selection + if dim_out_sel is not None: + out_selection.append(dim_out_sel) 
+ + # normalise for indexing into numpy arrays + chunk_selection = tuple(chunk_selection) + out_selection = tuple(out_selection) + + return chunk_selection, out_selection -class OrthogonalSelection(object): + +# noinspection PyProtectedMember +class OrthogonalIndexer(IndexerBase): def __init__(self, selection, array): @@ -65,38 +286,37 @@ def __init__(self, selection, array): if not isinstance(selection, tuple): selection = (selection,) + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + # validation - check dimensionality - if len(selection) > len(array.shape): + if len(selection) > len(array._shape): raise IndexError('too many indices for array') - if len(selection) < len(array.shape): + if len(selection) < len(array._shape): raise IndexError('not enough indices for array') # normalization - self.selection = self.normalize_selection(selection, array) - self.array = array + selection = self.normalize_selection(selection, array) + + # super initialisation + super(OrthogonalIndexer, self).__init__(selection, array) # figure out if we're going to be doing advanced indexing on chunks, if so then # chunk selections will need special handling self.is_advanced = any([not isinstance(dim_sel, (int, slice)) - for dim_sel in self.selection]) + for dim_sel in selection]) # locate axes that need to get squeezed out later if doing advanced selection if self.is_advanced: - self.squeeze_axes = tuple([i for i, dim_sel in enumerate(self.selection) + self.squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) if isinstance(dim_sel, int)]) else: self.squeeze_axes = None - def __iter__(self): - return iter(self.selection) - - def __len__(self): - return len(self.selection) - def normalize_selection(self, selection, array): # normalize each dimension selection = tuple(self.normalize_dim_selection(s, l, c) - for s, l, c in zip(selection, array.shape, array.chunks)) + for s, l, c in zip(selection, array._shape, array._chunks)) return selection def 
normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): @@ -107,56 +327,21 @@ def normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): if isinstance(dim_sel, numbers.Integral): - # normalize type to int - dim_sel = int(dim_sel) - - # handle wraparound - if dim_sel < 0: - dim_sel = dim_len + dim_sel - - # handle out of bounds - if dim_sel >= dim_len or dim_sel < 0: - raise IndexError('index out of bounds: %s' % dim_sel) - + dim_sel = normalize_integer_selection(dim_sel, dim_len) return dim_sel elif isinstance(dim_sel, slice): - # handle slice with None bound - start = 0 if dim_sel.start is None else dim_sel.start - stop = dim_len if dim_sel.stop is None else dim_sel.stop - - # handle wraparound - if start < 0: - start = dim_len + start - if stop < 0: - stop = dim_len + stop - - # handle zero-length axis - if start == stop == dim_len == 0: - return slice(0, 0) - - # handle out of bounds - if start < 0: - raise IndexError('start index out of bounds: %s' % dim_sel.start) - if stop < 0: - raise IndexError('stop index out of bounds: %s' % dim_sel.stop) - if start >= dim_len: - raise IndexError('start index out of bounds: %ss' % dim_sel.start) - if stop > dim_len: - stop = dim_len - if stop < start: - stop = start + dim_sel = normalize_slice_selection(dim_sel, dim_len) # handle slice with step - if dim_sel.step is not None: - if dim_sel.step > 1: - dim_sel = np.arange(start, stop, dim_sel.step) - return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) - elif dim_sel.step < 1: - raise IndexError('only positive step supported') + if dim_sel.step > 1: + dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) + return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + elif dim_sel.step < 1: + raise IndexError('only positive step supported') - return slice(start, stop) + return dim_sel elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): @@ -183,7 +368,7 @@ def get_overlapping_chunks(self): sel_shape = [] # iterate 
over dimensions of the array - for dim_sel, dim_chunk_len in zip(self.selection, self.array.chunks): + for dim_sel, dim_chunk_len in zip(self.selection, self.array._chunks): # dim_sel: selection for current dimension # dim_chunk_len: length of chunk along current dimension @@ -236,7 +421,7 @@ def get_chunk_projection(self, chunk_coords): # iterate over dimensions (axes) of the array for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, - self.array.chunks): + self.array._chunks): # dim_sel: selection for current dimension # dim_chunk_idx: chunk index along current dimension @@ -309,16 +494,30 @@ def get_chunk_projection(self, chunk_coords): # numpy doesn't support orthogonal indexing directly as yet, so need to work # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices # or integers, so need to convert slices and integers into ranges. - chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) - if isinstance(dim_chunk_sel, slice) - else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) - else dim_chunk_sel - for dim_chunk_sel in chunk_selection] - chunk_selection = np.ix_(*chunk_selection) + chunk_selection = ix_(*chunk_selection) return chunk_selection, out_selection +def slice_to_range(dim_sel): + return range(dim_sel.start, dim_sel.stop, 1 if dim_sel.step is None else dim_sel.step) + + +def ix_(*selection): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, with support for + slices and single ints.""" + + # replace slice and int as these are not supported by numpy ix_() + selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) + else [dim_sel] if isinstance(dim_sel, int) + else dim_sel + for dim_sel in selection] + + selection = np.ix_(*selection) + + return selection + + class IntArrayOrthogonalSelection(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 0a05cb8c9a..e698caf793 100644 
--- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -18,9 +18,20 @@ from zarr.errors import PermissionError from zarr.compat import PY2 from zarr.util import buffer_size +from zarr.indexing import ix_ from numcodecs import Delta, FixedScaleOffset, Zlib, Blosc, BZ2 +def oindex(a, selection): + """Implementation of orthogonal indexing with slices and ints.""" + squeeze_axes = tuple([i for i, s in enumerate(selection) if isinstance(s, int)]) + selection = ix_(*selection) + result = a[selection] + if squeeze_axes: + result = result.squeeze(axis=squeeze_axes) + return result + + class TestArray(unittest.TestCase): def test_array_init(self): @@ -751,6 +762,7 @@ def test_orthogonal_indexing_1d_bool(self): with assert_raises(IndexError): z.oindex[[[True, False], [False, True]]] # too many dimensions + # noinspection PyStatementEffect def test_orthogonal_indexing_1d_int(self): # setup @@ -768,7 +780,7 @@ def test_orthogonal_indexing_1d_int(self): # test wraparound ix = [0, 3, 10, -23, -12, -1] expect = a[ix] - actual = z[ix] + actual = z.oindex[ix] assert_array_equal(expect, actual) # test errors @@ -787,26 +799,23 @@ def test_orthogonal_indexing_1d_int(self): def _test_orthogonal_indexing_2d_common(self, a, z, ix0, ix1): - # index both axes with array - expect = a[np.ix_(ix0, ix1)] - actual = z[ix0, ix1] - assert_array_equal(expect, actual) - - # mixed indexing with array / slice - expect = a[ix0, 1:5] - actual = z[ix0, 1:5] - assert_array_equal(expect, actual) - expect = a[250:350, ix1] - actual = z[250:350, ix1] - assert_array_equal(expect, actual) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] - # mixed indexing with array / single index - expect = a[ix0, 4] - actual = z[ix0, 4] - assert_array_equal(expect, actual) - expect = a[42, ix1] - actual = z[42, ix1] - assert_array_equal(expect, actual) + 
for selection in selections: + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) def test_orthogonal_indexing_2d_bool(self): @@ -830,8 +839,8 @@ def test_orthogonal_indexing_2d_bool(self): (np.nonzero(ix0)[0], ix1), ) for selection in selections: - expect = a[np.ix_(ix0, ix1)] - actual = z[ix0, ix1] + expect = oindex(a, selection) + actual = z.oindex[selection] assert_array_equal(expect, actual) def test_orthogonal_indexing_2d_int(self): @@ -852,65 +861,36 @@ def test_orthogonal_indexing_2d_int(self): def _test_orthogonal_indexing_3d_common(self, a, z, ix0, ix1, ix2): - # index all axes with array - expect = a[np.ix_(ix0, ix1, ix2)] - actual = z[ix0, ix1, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single array / slices - expect = a[ix0, 15:25, 1:5] - actual = z[ix0, 15:25, 1:5] - assert_array_equal(expect, actual) - expect = a[50:70, ix1, 1:5] - actual = z[50:70, ix1, 1:5] - assert_array_equal(expect, actual) - expect = a[50:70, 15:25, ix2] - actual = z[50:70, 15:25, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single array / single index - expect = a[ix0, 42, 4] - actual = z[ix0, 42, 4] - assert_array_equal(expect, actual) - expect = a[42, ix1, 4] - actual = z[42, ix1, 4] - assert_array_equal(expect, actual) - expect = a[84, 42, ix2] - actual = z[84, 42, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with single array / slice / single index - expect = a[ix0, 15:25, 4] - actual = z[ix0, 15:25, 4] - assert_array_equal(expect, actual) - expect = a[42, ix1, 1:5] - actual = z[42, ix1, 1:5] - assert_array_equal(expect, actual) - expect = a[50:70, 42, ix2] - actual = z[50:70, 42, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with two array / slice - expect = a[np.ix_(ix0, ix1, range(1, 5))] - actual = z[ix0, ix1, 1:5] - assert_array_equal(expect, actual) - 
expect = a[np.ix_(range(50, 70), ix1, ix2)] - actual = z[50:70, ix1, ix2] - assert_array_equal(expect, actual) - expect = a[np.ix_(ix0, range(15, 25), ix2)] - actual = z[ix0, 15:25, ix2] - assert_array_equal(expect, actual) - - # mixed indexing with two array / integer - expect = a[np.ix_(ix0, ix1, [4])].squeeze(axis=2) - actual = z[ix0, ix1, 4] - assert_array_equal(expect, actual) - expect = a[np.ix_([42], ix1, ix2)].squeeze(axis=0) - actual = z[42, ix1, ix2] - assert_array_equal(expect, actual) - expect = a[np.ix_(ix0, [42], ix2)].squeeze(axis=1) - actual = z[ix0, 42, ix2] - assert_array_equal(expect, actual) + selections = [ + # index all axes with array + (ix0, ix1, ix2), + # mixed indexing with single array / slices + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + # mixed indexing with single array / ints + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + # mixed indexing with single array / slice / int + (ix0, slice(15, 25), 4), + (42, ix1, slice(1, 5)), + (slice(50, 70), 42, ix2), + # mixed indexing with two array / slice + (ix0, ix1, slice(1, 5)), + (slice(50, 70), ix1, ix2), + (ix0, slice(15, 25), ix2), + # mixed indexing with two array / integer + (ix0, ix1, 4), + (42, ix1, ix2), + (ix0, 42, ix2), + ] + for selection in selections: + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) def test_orthogonal_indexing_3d_bool(self): @@ -933,12 +913,12 @@ def test_orthogonal_indexing_edge_cases(self): z = self.create_array(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) z[:] = a - expect = a[np.ix_([0], range(2), [0, 1, 2])].squeeze(axis=0) - actual = z[0, :, [0, 1, 2]] + expect = a[ix_([0], range(2), [0, 1, 2])].squeeze(axis=0) + actual = z.oindex[0, :, [0, 1, 2]] assert_array_equal(expect, actual) - expect = a[np.ix_([0], range(2), [True, True, 
True])].squeeze(axis=0) - actual = z[0, :, [True, True, True]] + expect = a[ix_([0], range(2), [True, True, True])].squeeze(axis=0) + actual = z.oindex[0, :, [True, True, True]] assert_array_equal(expect, actual) def test_orthogonal_indexing_3d_int(self): @@ -999,8 +979,8 @@ def _test_orthogonal_indexing_2d_common_set(self, v, a, z, ix0, ix1): a[:] = 0 z[:] = 0 selection = ix0, ix1 - a[np.ix_(*selection)] = v[np.ix_(*selection)] - z[selection] = v[np.ix_(*selection)] + a[ix_(*selection)] = v[ix_(*selection)] + z[selection] = v[ix_(*selection)] assert_array_equal(a, z[:]) # mixed indexing with array / slice or int @@ -1053,8 +1033,8 @@ def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): a[:] = 0 z[:] = 0 selection = ix0, ix1, ix2 - a[np.ix_(*selection)] = v[np.ix_(*selection)] - z[selection] = v[np.ix_(*selection)] + a[ix_(*selection)] = v[ix_(*selection)] + z[selection] = v[ix_(*selection)] assert_array_equal(a, z[:]) # mixed indexing with single bool array / slice or int @@ -1080,7 +1060,7 @@ def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): a[:] = 0 z[:] = 0 zsel = ix0, ix1, slice(1, 5) - vsel = np.ix_(ix0, ix1, range(1, 5)) + vsel = ix_(ix0, ix1, range(1, 5)) a[vsel] = v[vsel] z[zsel] = v[vsel] assert_array_equal(a, z[:]) @@ -1089,7 +1069,7 @@ def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): a[:] = 0 z[:] = 0 zsel = ix0, ix1, 4 - vsel = np.ix_(ix0, ix1, [4]) + vsel = ix_(ix0, ix1, [4]) a[vsel] = v[vsel] z[zsel] = v[vsel].squeeze(axis=2) assert_array_equal(a, z[:]) From 8f0895d08245d3d11fd8eb51f23a80496906553b Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 2 Nov 2017 11:41:50 +0000 Subject: [PATCH 24/67] refactored to use oindex --- zarr/core.py | 103 ++++++++++++++++++++--------------- zarr/indexing.py | 8 +-- zarr/tests/test_core.py | 115 +++++++++++++++++++++++++--------------- 3 files changed, 137 insertions(+), 89 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 
183e9e86b2..ba8b6e6670 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -562,7 +562,7 @@ def _get_selection(self, indexer, out=None): else: return out[()] - def __setitem__(self, item, value): + def __setitem__(self, selection, value): """Modify data for some portion of the array. Examples @@ -621,6 +621,11 @@ def __setitem__(self, item, value): """ + self.set_basic_selection(selection, value) + + def set_basic_selection(self, selection, value): + """TODO""" + # guard conditions if self._read_only: err_read_only() @@ -631,15 +636,30 @@ def __setitem__(self, item, value): # handle zero-dimensional arrays if self._shape == (): - return self._setitem_zd(item, value) + return self._set_basic_selection_zd(selection, value) else: - return self._setitem_nd(item, value) + return self._set_basic_selection_nd(selection, value) + + def set_orthogonal_selection(self, selection, value): + + # guard conditions + if self._read_only: + err_read_only() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = OrthogonalIndexer(selection, self) + + self._set_selection(indexer, value) - def _setitem_zd(self, item, value): + def _set_basic_selection_zd(self, selection, value): # special case __setitem__ for zero-dimensional array # check item is valid - if item not in ((), Ellipsis): + if selection not in ((), Ellipsis): raise IndexError('too many indices for array') # setup data to store @@ -656,53 +676,55 @@ def _setitem_zd(self, item, value): cdata = self._encode_chunk(arr) self.chunk_store[ckey] = cdata - def _setitem_nd(self, item, value): + def _set_basic_selection_nd(self, selection, value): # implementation of __setitem__ for array with at least one dimension - # normalize selection - selection = normalize_array_selection(item, self._shape, self._chunks) + # setup indexer + indexer = BasicIndexer(selection, self) - # figure out if we're doing advanced indexing, count number of advanced selections - if - # more than one 
need special handling, because we are doing orthogonal indexing here, - # which is different from fancy indexing if there is more than one array selection - n_advanced_selection = sum(1 for dim_sel in selection - if not isinstance(dim_sel, (int, slice))) + self._set_selection(indexer, value) - # axes that need to get squeezed out if doing advanced selection - if n_advanced_selection > 0: - squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) - if isinstance(dim_sel, int)]) - else: - squeeze_axes = None + def _set_selection(self, indexer, value): + + # We iterate over all chunks which overlap the selection and thus contain data that needs + # to be replaced. Each chunk is processed in turn, extracting the necessary data from the + # value array and storing into the chunk array. + + # N.B., it is an important optimisation that we only visit chunks which overlap the + # selection. This minimises the nuimber of iterations in the main for loop. # determine indices of chunks overlapping the selection - chunk_ranges, sel_shape = get_chunks_for_selection(selection, self._chunks) + chunk_ranges, sel_shape = indexer.get_overlapping_chunks() # check value shape if np.isscalar(value): pass - elif sel_shape != value.shape: - raise ValueError('value shape does not match selection shape; expected %s, found %s' - % (str(sel_shape), str(value.shape))) + else: + if not hasattr(value, 'shape'): + raise TypeError('value must be an array-like object') + if value.shape != sel_shape: + raise ValueError('value has wrong shape for selection') # iterate over chunks in range for chunk_coords in itertools.product(*chunk_ranges): # obtain selections for chunk and destination arrays - chunk_selection, out_selection = \ - get_chunk_selections(selection, chunk_coords, self._chunks, n_advanced_selection) + chunk_selection, value_selection = indexer.get_chunk_projection(chunk_coords) + # extract data to store if np.isscalar(value): - - # put data - self._chunk_setitem(chunk_coords, 
chunk_selection, value) - + chunk_value = value else: - # assume value is array-like + chunk_value = value[value_selection] + # handle missing singleton dimensions + if indexer.squeeze_axes: + item = [slice(None)] * self.ndim + for a in indexer.squeeze_axes: + item[a] = np.newaxis + chunk_value = chunk_value[item] - # put data - dest = value[out_selection] - self._chunk_setitem(chunk_coords, chunk_selection, dest, squeeze_axes) + # put data + self._chunk_setitem(chunk_coords, chunk_selection, chunk_value) def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, squeeze_axes=None): """Obtain part or whole of a chunk. @@ -767,7 +789,7 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, sque tmp = np.squeeze(tmp, axis=squeeze_axes) out[out_selection] = tmp - def _chunk_setitem(self, chunk_coords, chunk_selection, value, squeeze_axes=None): + def _chunk_setitem(self, chunk_coords, chunk_selection, value): """Replace part or whole of a chunk. Parameters @@ -783,14 +805,14 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, squeeze_axes=None # synchronization if self._synchronizer is None: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, squeeze_axes) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) else: # synchronize on the chunk ckey = self._chunk_key(chunk_coords) with self._synchronizer[ckey]: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, squeeze_axes) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) - def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, squeeze_axes=None): + def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value): # obtain key for chunk storage ckey = self._chunk_key(chunk_coords) @@ -851,13 +873,6 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, squeeze_ax if not chunk.flags.writeable: chunk = chunk.copy(order='K') - # handle missing singleton dimensions - if 
squeeze_axes: - item = [slice(None)] * self.ndim - for a in squeeze_axes: - item[a] = np.newaxis - value = value[item] - # modify chunk[chunk_selection] = value diff --git a/zarr/indexing.py b/zarr/indexing.py index fef7c6725c..5294dee8a7 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -45,10 +45,11 @@ def __init__(self, array): self.array = array def __getitem__(self, selection): - - # delegate to method return self.array.get_orthogonal_selection(selection) + def __setitem__(self, selection, value): + return self.array.set_orthogonal_selection(selection, value) + def normalize_integer_selection(dim_sel, dim_len): @@ -153,7 +154,8 @@ def normalize_dim_selection(self, dim_sel, dim_len): # handle slice with step if dim_sel.step is not None and dim_sel.step != 1: - raise IndexError('slice with step not supported via basic indexing') + raise IndexError('slice with step not supported via basic indexing; use ' + 'orthogonal indexing instead') return dim_sel diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index e698caf793..21ee389ae3 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -32,6 +32,7 @@ def oindex(a, selection): return result +# noinspection PyMethodMayBeStatic class TestArray(unittest.TestCase): def test_array_init(self): @@ -939,11 +940,16 @@ def test_orthogonal_indexing_3d_int(self): ix2.sort() self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + # TODO change to use .oindex for setter + def _test_orthogonal_indexing_1d_common_set(self, v, a, z, ix): a[:] = 0 - z[:] = 0 a[ix] = v[ix] - z[ix] = v[ix] + z[:] = 0 + z.oindex[ix] = v[ix] + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_orthogonal_selection(ix, v[ix]) assert_array_equal(a, z[:]) def test_orthogonal_indexing_1d_bool_set(self): @@ -975,16 +981,10 @@ def test_orthogonal_indexing_1d_int_set(self): def _test_orthogonal_indexing_2d_common_set(self, v, a, z, ix0, ix1): - # index both axes with array - a[:] = 0 - z[:] = 0 - selection = ix0, ix1 - 
a[ix_(*selection)] = v[ix_(*selection)] - z[selection] = v[ix_(*selection)] - assert_array_equal(a, z[:]) - - # mixed indexing with array / slice or int selections = ( + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice or int (ix0, slice(1, 5)), (slice(250, 350), ix1), (ix0, 4), @@ -992,9 +992,12 @@ def _test_orthogonal_indexing_2d_common_set(self, v, a, z, ix0, ix1): ) for selection in selections: a[:] = 0 + a[ix_(*selection)] = v[ix_(*selection)] + z[:] = 0 + z.oindex[selection] = oindex(v, selection) + assert_array_equal(a, z[:]) z[:] = 0 - a[selection] = v[selection] - z[selection] = v[selection] + z.set_orthogonal_selection(selection, oindex(v, selection)) assert_array_equal(a, z[:]) def test_orthogonal_indexing_2d_bool_set(self): @@ -1029,16 +1032,10 @@ def test_orthogonal_indexing_2d_int_set(self): def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): - # index all axes with bool array - a[:] = 0 - z[:] = 0 - selection = ix0, ix1, ix2 - a[ix_(*selection)] = v[ix_(*selection)] - z[selection] = v[ix_(*selection)] - assert_array_equal(a, z[:]) - - # mixed indexing with single bool array / slice or int selections = ( + # index all axes with bool array + (ix0, ix1, ix2), + # mixed indexing with single bool array / slice or int (ix0, slice(15, 25), slice(1, 5)), (slice(50, 70), ix1, slice(1, 5)), (slice(50, 70), slice(15, 25), ix2), @@ -1048,31 +1045,20 @@ def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): (ix0, slice(15, 25), 4), (slice(50, 70), ix1, 4), (slice(50, 70), 42, ix2), + # indexing with two arrays / slice + (ix0, ix1, slice(1, 5)), + # indexing with two arrays / integer + (ix0, ix1, 4), ) for selection in selections: a[:] = 0 + a[ix_(*selection)] = v[ix_(*selection)] z[:] = 0 - a[selection] = v[selection] - z[selection] = v[selection] + z.oindex[selection] = oindex(v, selection) + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_orthogonal_selection(selection, oindex(v, 
selection)) assert_array_equal(a, z[:]) - - # indexing with two arrays / slice - a[:] = 0 - z[:] = 0 - zsel = ix0, ix1, slice(1, 5) - vsel = ix_(ix0, ix1, range(1, 5)) - a[vsel] = v[vsel] - z[zsel] = v[vsel] - assert_array_equal(a, z[:]) - - # indexing with two arrays / integer - a[:] = 0 - z[:] = 0 - zsel = ix0, ix1, 4 - vsel = ix_(ix0, ix1, [4]) - a[vsel] = v[vsel] - z[zsel] = v[vsel].squeeze(axis=2) - assert_array_equal(a, z[:]) def test_orthogonal_indexing_3d_bool_set(self): @@ -1107,6 +1093,51 @@ def test_orthogonal_indexing_3d_int_set(self): ix2.sort() self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + def test_get_selection_out(self): + + # basic selections + a = np.arange(1050) + z = self.create_array(shape=1050, chunks=100, dtype=a.dtype) + z[:] = a + selections = [ + slice(50, 150), + slice(0, 1050), + slice(1, 2), + ] + for selection in selections: + expect = a[selection] + out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) + z.get_basic_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + # orthogonal selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + # mixed int array / bool array + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ] + for selection in selections: + expect = oindex(a, selection) + out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) + z.get_orthogonal_selection(selection, out=out) + 
assert_array_equal(expect, out[:]) + class TestArrayWithPath(TestArray): From 04f263d803bf86929c2b2700407045d6b660ee9a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 2 Nov 2017 11:49:01 +0000 Subject: [PATCH 25/67] add tests for slice with step --- zarr/tests/test_core.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 21ee389ae3..54eb256e68 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -798,6 +798,28 @@ def test_orthogonal_indexing_1d_int(self): ix = [3, 105, 23, 127] # not monotonically increasing z.oindex[ix] + def test_orthogonal_indexing_1d_slice_with_step(self): + + # setup + a = np.arange(1050, dtype=int) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + selections = [ + slice(0, 1050), + slice(0, 1050, 1), + slice(0, 1050, 10), + slice(0, 1050, 100), + slice(0, 1050, 1000), + slice(50, 150, 1), + slice(50, 150, 10), + slice(50, 150, 100), + ] + for selection in selections: + expect = a[selection] + actual = z.oindex[selection] + assert_array_equal(expect, actual) + def _test_orthogonal_indexing_2d_common(self, a, z, ix0, ix1): selections = [ @@ -805,7 +827,9 @@ def _test_orthogonal_indexing_2d_common(self, a, z, ix0, ix1): (ix0, ix1), # mixed indexing with array / slice (ix0, slice(1, 5)), + (ix0, slice(1, 5, 2)), (slice(250, 350), ix1), + (slice(250, 350, 10), ix1), # mixed indexing with array / int (ix0, 4), (42, ix1), @@ -869,6 +893,9 @@ def _test_orthogonal_indexing_3d_common(self, a, z, ix0, ix1, ix2): (ix0, slice(15, 25), slice(1, 5)), (slice(50, 70), ix1, slice(1, 5)), (slice(50, 70), slice(15, 25), ix2), + (ix0, slice(15, 25, 5), slice(1, 5, 2)), + (slice(50, 70, 3), ix1, slice(1, 5, 2)), + (slice(50, 70, 3), slice(15, 25, 5), ix2), # mixed indexing with single array / ints (ix0, 42, 4), (84, ix1, 4), @@ -940,8 +967,6 @@ def test_orthogonal_indexing_3d_int(self): ix2.sort() 
self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) - # TODO change to use .oindex for setter - def _test_orthogonal_indexing_1d_common_set(self, v, a, z, ix): a[:] = 0 a[ix] = v[ix] @@ -1134,7 +1159,8 @@ def test_get_selection_out(self): ] for selection in selections: expect = oindex(a, selection) - out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) + out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, + fill_value=0) z.get_orthogonal_selection(selection, out=out) assert_array_equal(expect, out[:]) From 33c2023be4862c17ed7784c07d64d3fbfcef47b1 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 3 Nov 2017 09:16:31 +0000 Subject: [PATCH 26/67] WIP coordinate indexing --- zarr/core.py | 19 ++- zarr/indexing.py | 285 ++++++++++++++++++++++++++++++++++++++++ zarr/tests/test_core.py | 102 ++++++++++++++ 3 files changed, 405 insertions(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index ba8b6e6670..e6437fbed6 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -17,7 +17,7 @@ from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer +from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer class Array(object): @@ -110,6 +110,7 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, # initialize indexing helpers self._oindex = OIndex(self) + self._vindex = VIndex(self) def _load_metadata(self): """(Re)load metadata from store.""" @@ -365,6 +366,11 @@ def oindex(self): """TODO""" return self._oindex + @property + def vindex(self): + """TODO""" + return self._vindex + def __eq__(self, other): return ( isinstance(other, Array) and @@ -525,6 +531,17 @@ def get_orthogonal_selection(self, selection, out=None): return self._get_selection(indexer, out=out) + def 
get_coordinate_selection(self, selection, out=None): + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # setup indexer + indexer = CoordinateIndexer(selection, self) + + return self._get_selection(indexer, out=out) + def _get_selection(self, indexer, out=None): # We iterate over all chunks which overlap the selection and thus contain data that needs diff --git a/zarr/indexing.py b/zarr/indexing.py index 5294dee8a7..390ea17d87 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -51,6 +51,53 @@ def __setitem__(self, selection, value): return self.array.set_orthogonal_selection(selection, value) +def is_coordinate_selection(selection, array): + return ( + (len(selection) == array.ndim) and + all( + [(isinstance(dim_sel, numbers.Integral) or + (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) + for dim_sel in selection] + ) + ) + + +def is_mask_selection(selection, array): + return ( + hasattr(selection, 'dtype') and + selection.dtype == bool and + hasattr(selection, 'shape') and + len(selection.shape) == len(array.shape) + ) + + +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel + for dim_sel in selection + ) + + +class VIndex(object): + + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + if not isinstance(selection, tuple): + selection = tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.get_coordinate_selection(selection) + # elif is_mask_selection(selection, self.array): + # return self.array.get_mask_selection(selection) + else: + raise IndexError('unsupported selection') + + # def __setitem__(self, selection, value): + # return self.array.set_orthogonal_selection(selection, value) + + def normalize_integer_selection(dim_sel, dim_len): # normalize type to int @@ -501,6 +548,244 @@ def get_chunk_projection(self, chunk_coords): return 
chunk_selection, out_selection +# noinspection PyProtectedMember +class CoordinateIndexer(IndexerBase): + + def __init__(self, selection, array): + + # some initial normalization + if not isinstance(selection, tuple): + selection = tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_coordinate_selection(selection, array): + # TODO refactor error messages for consistency + raise IndexError('invalid coordinate selection') + + # more normalization + selection = self.normalize_selection(selection, array) + + # super initialisation + super(CoordinateIndexer, self).__init__(selection, array) + + # TODO compute nitems, chunk_nitems, chunk_nitems_cumsum + # TODO validate monotonically increasing + + def normalize_selection(self, selection, array): + + # attempt to broadcast selection - this will raise error if array dimensions don't match + selection = np.broadcast_arrays(*selection) + + for dim_sel, dim_len in zip(selection, array.shape): + + # check number of dimensions, only support indexing with 1d array + if len(dim_sel.shape) > 1: + raise IndexError('can only index with integer or 1-dimensional integer array') + + # handle wraparound + loc_neg = dim_sel < 0 + if np.any(loc_neg): + # TODO need to take a copy here, or OK to replace? 
+ dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + + # handle out of bounds + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('index out of bounds') + + return selection + + + + + + + + # normalize each dimension + selection = tuple(self.normalize_dim_selection(s, l, c) + for s, l, c in zip(selection, array._shape, array._chunks)) + return selection + + def normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): + + # normalize list to array + if isinstance(dim_sel, list): + dim_sel = np.asarray(dim_sel) + + if isinstance(dim_sel, numbers.Integral): + + dim_sel = normalize_integer_selection(dim_sel, dim_len) + return dim_sel + + elif isinstance(dim_sel, slice): + + dim_sel = normalize_slice_selection(dim_sel, dim_len) + + # handle slice with step + if dim_sel.step > 1: + dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) + return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + elif dim_sel.step < 1: + raise IndexError('only positive step supported') + + return dim_sel + + elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): + + if dim_sel.dtype == bool: + return BoolArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + + elif dim_sel.dtype.kind in 'ui': + return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError('unsupported index item type: %r' % dim_sel) + + else: + raise IndexError('unsupported index item type: %r' % dim_sel) + + def get_overlapping_chunks(self): + """Convenience function to find chunks overlapping an array selection. 
N.B., + assumes selection has already been normalized.""" + + # indices of chunks overlapping the selection + chunk_ranges = [] + + # shape of the selection + sel_shape = [] + + # iterate over dimensions of the array + for dim_sel, dim_chunk_len in zip(self.selection, self.array._chunks): + + # dim_sel: selection for current dimension + # dim_chunk_len: length of chunk along current dimension + + dim_sel_len = None + + if isinstance(dim_sel, int): + + # dim selection is an integer, i.e., single item, so only need single chunk index for + # this dimension + dim_chunk_range = [dim_sel//dim_chunk_len] + + elif isinstance(dim_sel, slice): + + # dim selection is a slice, need range of chunk indices including start and stop of + # selection + dim_chunk_from = dim_sel.start//dim_chunk_len + dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) + dim_chunk_range = range(dim_chunk_from, dim_chunk_to) + dim_sel_len = dim_sel.stop - dim_sel.start + + elif isinstance(dim_sel, BoolArrayOrthogonalSelection): + + # dim selection is a boolean array, delegate this to the BooleanSelection class + dim_chunk_range = dim_sel.get_chunk_ranges() + dim_sel_len = dim_sel.nitems + + elif isinstance(dim_sel, IntArrayOrthogonalSelection): + + # dim selection is an integer array, delegate this to the integerSelection class + dim_chunk_range = dim_sel.get_chunk_ranges() + dim_sel_len = dim_sel.nitems + + else: + raise RuntimeError('unexpected selection type') + + chunk_ranges.append(dim_chunk_range) + if dim_sel_len is not None: + sel_shape.append(dim_sel_len) + + return chunk_ranges, tuple(sel_shape) + + def get_chunk_projection(self, chunk_coords): + + # chunk_coords: holds the index along each dimension for the current chunk within the + # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
+ + chunk_selection = [] + out_selection = [] + + # iterate over dimensions (axes) of the array + for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, + self.array._chunks): + + # dim_sel: selection for current dimension + # dim_chunk_idx: chunk index along current dimension + # dim_chunk_len: chunk length along current dimension + + # selection into output array to store data from current chunk + dim_out_sel = None + + # calculate offset for current chunk along current dimension - this is used to + # determine the values to be extracted from the current chunk + dim_chunk_offset = dim_chunk_idx * dim_chunk_len + + # handle integer selection, i.e., single item + if isinstance(dim_sel, int): + + dim_chunk_sel = dim_sel - dim_chunk_offset + + # N.B., leave dim_out_sel as None, as this dimension has been dropped in the + # output array because of single value index + + # handle slice selection, i.e., contiguous range of items + elif isinstance(dim_sel, slice): + + if dim_sel.start <= dim_chunk_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_chunk_offset - dim_sel.start + + else: + # selection starts within current chunk + dim_chunk_sel_start = dim_sel.start - dim_chunk_offset + dim_out_offset = 0 + + if dim_sel.stop > dim_chunk_offset + dim_chunk_len: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + elif isinstance(dim_sel, (BoolArrayOrthogonalSelection, IntArrayOrthogonalSelection)): + + # get selection to extract data for the current chunk + dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) + + # figure out where to put these items in the output array + 
dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) + + else: + raise RuntimeError('unexpected selection type') + + # add to chunk selection + chunk_selection.append(dim_chunk_sel) + + # add to output selection + if dim_out_sel is not None: + out_selection.append(dim_out_sel) + + # normalise for indexing into numpy arrays + chunk_selection = tuple(chunk_selection) + out_selection = tuple(out_selection) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # numpy doesn't support orthogonal indexing directly as yet, so need to work + # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices + # or integers, so need to convert slices and integers into ranges. + chunk_selection = ix_(*chunk_selection) + + return chunk_selection, out_selection + + def slice_to_range(dim_sel): return range(dim_sel.start, dim_sel.stop, 1 if dim_sel.step is None else dim_sel.step) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 54eb256e68..04ffa6a96a 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1118,6 +1118,102 @@ def test_orthogonal_indexing_3d_int_set(self): ix2.sort() self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + # noinspection PyStatementEffect + def test_coordinate_indexing_1d(self): + + # setup + a = np.arange(1050, dtype=int) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix.sort() + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) + + # test wraparound + ix = [0, 3, 10, -23, -12, -1] + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + + # test errors + with assert_raises(IndexError): + ix = [a.shape[0] + 1] # out of 
bounds + z.get_coordinate_selection(ix) + with assert_raises(IndexError): + ix = [-(a.shape[0] + 1)] # out of bounds + z.get_coordinate_selection(ix) + with assert_raises(IndexError): + ix = [[2, 4], [6, 8]] # too many dimensions + z.get_coordinate_selection(ix) + with assert_raises(NotImplementedError): + ix = [3, 105, 23, 127] # not monotonically increasing + z.get_coordinate_selection(ix) + with assert_raises(NotImplementedError): + ix = slice(5, 15) + z.get_coordinate_selection(ix) + with assert_raises(NotImplementedError): + ix = Ellipsis + z.get_coordinate_selection(ix) + + def test_coordinate_indexing_2d(self): + + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + srt = np.lexsort((ix0, ix1)) + ix0 = ix0[srt] + ix1 = ix1[srt] + + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + + for selection in selections: + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + with assert_raises(NotImplementedError): + selection = slice(5, 15), [1, 2, 3] + z.get_coordinate_selection(selection) + with assert_raises(NotImplementedError): + selection = [1, 2, 3], slice(5, 15) + z.get_coordinate_selection(selection) + with assert_raises(NotImplementedError): + selection = Ellipsis, [1, 2, 3] + z.get_coordinate_selection(selection) + with assert_raises(NotImplementedError): + # not monotonically increasing + ix0 = [3, 3, 4, 2, 5] + ix1 = [1, 3, 5, 7, 9] + z.get_coordinate_selection((ix0, ix1)) + with assert_raises(NotImplementedError): + 
# not monotonically increasing + ix0 = [3, 3, 4, 4, 5] + ix1 = [1, 3, 2, 1, 7] + z.get_coordinate_selection((ix0, ix1)) + def test_get_selection_out(self): # basic selections @@ -1164,6 +1260,12 @@ def test_get_selection_out(self): z.get_orthogonal_selection(selection, out=out) assert_array_equal(expect, out[:]) + # TODO coordinate selection + + # TODO mask selection + + # TODO selection with fields + class TestArrayWithPath(TestArray): From d0d50ffac41446787799dd28df1485b16178bd8b Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 4 Nov 2017 00:42:25 +0000 Subject: [PATCH 27/67] WIP refactor indexing --- zarr/indexing.py | 241 +++++++------------------------- zarr/new_indexing.py | 321 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 369 insertions(+), 193 deletions(-) create mode 100644 zarr/new_indexing.py diff --git a/zarr/indexing.py b/zarr/indexing.py index 390ea17d87..e6d56c46f8 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -384,21 +384,19 @@ def normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): dim_sel = normalize_slice_selection(dim_sel, dim_len) # handle slice with step - if dim_sel.step > 1: + if dim_sel.step != 1: dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) - return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) - elif dim_sel.step < 1: - raise IndexError('only positive step supported') + return IntArrayDimSelection(dim_sel, dim_len, dim_chunk_len) return dim_sel elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): if dim_sel.dtype == bool: - return BoolArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + return BoolArrayDimSelection(dim_sel, dim_len, dim_chunk_len) elif dim_sel.dtype.kind in 'ui': - return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) + return IntArrayDimSelection(dim_sel, dim_len, dim_chunk_len) else: raise IndexError('unsupported index item type: %r' % dim_sel) @@ -439,16 +437,16 @@ def get_overlapping_chunks(self): 
dim_chunk_range = range(dim_chunk_from, dim_chunk_to) dim_sel_len = dim_sel.stop - dim_sel.start - elif isinstance(dim_sel, BoolArrayOrthogonalSelection): + elif isinstance(dim_sel, BoolArrayDimSelection): # dim selection is a boolean array, delegate this to the BooleanSelection class - dim_chunk_range = dim_sel.get_chunk_ranges() + dim_chunk_range = dim_sel.get_overlapping_chunks() dim_sel_len = dim_sel.nitems - elif isinstance(dim_sel, IntArrayOrthogonalSelection): + elif isinstance(dim_sel, IntArrayDimSelection): # dim selection is an integer array, delegate this to the integerSelection class - dim_chunk_range = dim_sel.get_chunk_ranges() + dim_chunk_range = dim_sel.get_overlapping_chunks() dim_sel_len = dim_sel.nitems else: @@ -516,7 +514,7 @@ def get_chunk_projection(self, chunk_coords): dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - elif isinstance(dim_sel, (BoolArrayOrthogonalSelection, IntArrayOrthogonalSelection)): + elif isinstance(dim_sel, (BoolArrayDimSelection, IntArrayDimSelection)): # get selection to extract data for the current chunk dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) @@ -569,8 +567,25 @@ def __init__(self, selection, array): # super initialisation super(CoordinateIndexer, self).__init__(selection, array) - # TODO compute nitems, chunk_nitems, chunk_nitems_cumsum - # TODO validate monotonically increasing + # compute flattened chunk indices for each point selected + chunks_multi_index = tuple( + dim_sel // dim_chunk_len + for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) + ) + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, + dims=array._cdata_shape) + + # validated that indices are monotonically increasing + if np.any(np.diff(chunks_raveled_indices) < 0): + raise NotImplementedError('only monotonically increasing indices are supported') + + # compute various useful things + self.chunk_nitems = 
np.bincount(chunks_raveled_indices) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = len(chunks_raveled_indices) + self.sel_shape = (self.nitems,) + self.chunk_ranges = np.unravel_index(np.unique(chunks_raveled_indices), + dims=array._cdata_shape) def normalize_selection(self, selection, array): @@ -595,193 +610,33 @@ def normalize_selection(self, selection, array): return selection - - - - - - - # normalize each dimension - selection = tuple(self.normalize_dim_selection(s, l, c) - for s, l, c in zip(selection, array._shape, array._chunks)) - return selection - - def normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): - - # normalize list to array - if isinstance(dim_sel, list): - dim_sel = np.asarray(dim_sel) - - if isinstance(dim_sel, numbers.Integral): - - dim_sel = normalize_integer_selection(dim_sel, dim_len) - return dim_sel - - elif isinstance(dim_sel, slice): - - dim_sel = normalize_slice_selection(dim_sel, dim_len) - - # handle slice with step - if dim_sel.step > 1: - dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) - return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) - elif dim_sel.step < 1: - raise IndexError('only positive step supported') - - return dim_sel - - elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): - - if dim_sel.dtype == bool: - return BoolArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) - - elif dim_sel.dtype.kind in 'ui': - return IntArrayOrthogonalSelection(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError('unsupported index item type: %r' % dim_sel) - - else: - raise IndexError('unsupported index item type: %r' % dim_sel) - def get_overlapping_chunks(self): """Convenience function to find chunks overlapping an array selection. 
N.B., assumes selection has already been normalized.""" - # indices of chunks overlapping the selection - chunk_ranges = [] - - # shape of the selection - sel_shape = [] - - # iterate over dimensions of the array - for dim_sel, dim_chunk_len in zip(self.selection, self.array._chunks): - - # dim_sel: selection for current dimension - # dim_chunk_len: length of chunk along current dimension - - dim_sel_len = None - - if isinstance(dim_sel, int): - - # dim selection is an integer, i.e., single item, so only need single chunk index for - # this dimension - dim_chunk_range = [dim_sel//dim_chunk_len] - - elif isinstance(dim_sel, slice): - - # dim selection is a slice, need range of chunk indices including start and stop of - # selection - dim_chunk_from = dim_sel.start//dim_chunk_len - dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) - dim_chunk_range = range(dim_chunk_from, dim_chunk_to) - dim_sel_len = dim_sel.stop - dim_sel.start - - elif isinstance(dim_sel, BoolArrayOrthogonalSelection): - - # dim selection is a boolean array, delegate this to the BooleanSelection class - dim_chunk_range = dim_sel.get_chunk_ranges() - dim_sel_len = dim_sel.nitems - - elif isinstance(dim_sel, IntArrayOrthogonalSelection): - - # dim selection is an integer array, delegate this to the integerSelection class - dim_chunk_range = dim_sel.get_chunk_ranges() - dim_sel_len = dim_sel.nitems - - else: - raise RuntimeError('unexpected selection type') - - chunk_ranges.append(dim_chunk_range) - if dim_sel_len is not None: - sel_shape.append(dim_sel_len) - - return chunk_ranges, tuple(sel_shape) + return self.chunk_ranges, self.sel_shape def get_chunk_projection(self, chunk_coords): # chunk_coords: holds the index along each dimension for the current chunk within the # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
- chunk_selection = [] - out_selection = [] - - # iterate over dimensions (axes) of the array - for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, - self.array._chunks): - - # dim_sel: selection for current dimension - # dim_chunk_idx: chunk index along current dimension - # dim_chunk_len: chunk length along current dimension - - # selection into output array to store data from current chunk - dim_out_sel = None - - # calculate offset for current chunk along current dimension - this is used to - # determine the values to be extracted from the current chunk - dim_chunk_offset = dim_chunk_idx * dim_chunk_len - - # handle integer selection, i.e., single item - if isinstance(dim_sel, int): - - dim_chunk_sel = dim_sel - dim_chunk_offset - - # N.B., leave dim_out_sel as None, as this dimension has been dropped in the - # output array because of single value index - - # handle slice selection, i.e., contiguous range of items - elif isinstance(dim_sel, slice): - - if dim_sel.start <= dim_chunk_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - dim_out_offset = dim_chunk_offset - dim_sel.start - - else: - # selection starts within current chunk - dim_chunk_sel_start = dim_sel.start - dim_chunk_offset - dim_out_offset = 0 - - if dim_sel.stop > dim_chunk_offset + dim_chunk_len: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - elif isinstance(dim_sel, (BoolArrayOrthogonalSelection, IntArrayOrthogonalSelection)): - - # get selection to extract data for the current chunk - dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) - - # figure out where to put these items in the output array - 
dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) - - else: - raise RuntimeError('unexpected selection type') - - # add to chunk selection - chunk_selection.append(dim_chunk_sel) - - # add to output selection - if dim_out_sel is not None: - out_selection.append(dim_out_sel) - - # normalise for indexing into numpy arrays - chunk_selection = tuple(chunk_selection) - out_selection = tuple(out_selection) + chunk_idx = np.ravel_multi_index(*chunk_coords, dims=self.array._cdata_shape) + if chunk_idx == 0: + out_start = 0 + else: + out_start = self.chunk_nitems_cumsum[chunk_idx - 1] + out_stop = self.chunk_nitems_cumsum[chunk_idx] + out_selection = slice(out_start, out_stop) - # handle advanced indexing arrays orthogonally - if self.is_advanced: - # numpy doesn't support orthogonal indexing directly as yet, so need to work - # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices - # or integers, so need to convert slices and integers into ranges. - chunk_selection = ix_(*chunk_selection) + chunk_offsets = tuple( + dim_chunk_idx * dim_chunk_len + for dim_chunk_idx, dim_chunk_len in zip(chunk_coords, self.array._chunks) + ) + chunk_selection = tuple( + dim_sel[out_selection] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + ) return chunk_selection, out_selection @@ -805,7 +660,7 @@ def ix_(*selection): return selection -class IntArrayOrthogonalSelection(object): +class IntArrayDimSelection(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -859,11 +714,11 @@ def get_out_sel(self, dim_chunk_idx): stop = self.chunk_nitems_cumsum[dim_chunk_idx] return slice(start, stop) - def get_chunk_ranges(self): + def get_overlapping_chunks(self): return np.nonzero(self.chunk_nitems)[0] -class BoolArrayOrthogonalSelection(object): +class BoolArrayDimSelection(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -910,5 +765,5 @@ def get_out_sel(self, dim_chunk_idx): stop = 
self.chunk_nitems_cumsum[dim_chunk_idx] return slice(start, stop) - def get_chunk_ranges(self): + def get_overlapping_chunks(self): return np.nonzero(self.chunk_nitems)[0] diff --git a/zarr/new_indexing.py b/zarr/new_indexing.py new file mode 100644 index 0000000000..f556685d6b --- /dev/null +++ b/zarr/new_indexing.py @@ -0,0 +1,321 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, division +import numbers +import itertools + + +import numpy as np + + +def normalize_integer_selection(dim_sel, dim_len): + + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise IndexError('index out of bounds') + + return dim_sel + + +class IntSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # check type + if not isinstance(dim_sel, numbers.Integral): + raise ValueError('selection must be an integer') + + # normalize + dim_sel = normalize_integer_selection(dim_sel, dim_len) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = 1 + + def get_overlapping_chunks(self): + + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + + +def normalize_slice_selection(dim_sel, dim_len): + + # handle slice with None bound + start = 0 if dim_sel.start is None else dim_sel.start + stop = dim_len if dim_sel.stop is None else dim_sel.stop + step = 1 if dim_sel.step is None else dim_sel.step + + # handle wraparound + if start < 0: + start = dim_len + start + if stop < 0: + stop = dim_len + stop + + # handle out of bounds + if start < 0: + raise IndexError('start index out of bounds: %s' % dim_sel.start) + if stop < 0: + raise IndexError('stop index out of bounds: %s' % 
dim_sel.stop) + if start >= dim_len and dim_len > 0: + raise IndexError('start index out of bounds: %ss' % dim_sel.start) + if stop > dim_len: + stop = dim_len + if stop < start: + stop = start + + return slice(start, stop, step) + + +class SliceSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # check type + if not isinstance(dim_sel, slice): + raise ValueError('selection must be a slice') + + # normalize + dim_sel = normalize_slice_selection(dim_sel, dim_len) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = dim_sel.stop - dim_sel.start + + def get_overlapping_chunks(self): + + dim_chunk_from = self.dim_sel.start // self.dim_chunk_len + dim_chunk_to = int(np.ceil(self.dim_sel.stop / self.dim_chunk_len)) + + for dim_chunk_ix in range(dim_chunk_from, dim_chunk_to): + + dim_offset = dim_chunk_ix * self.dim_chunk_len + + if self.dim_sel.start <= dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_offset - self.dim_sel.start + + else: + # selection starts within current chunk + dim_chunk_sel_start = self.dim_sel.start - dim_offset + dim_out_offset = 0 + + if self.dim_sel.stop > (dim_offset + self.dim_chunk_len): + # selection ends after current chunk + dim_chunk_sel_stop = self.dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.dim_sel.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + + +class BoolArrayDimSelection(object): + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # check number of dimensions + if len(dim_sel.shape) > 1: + raise IndexError('selection must be a 1d array') + + # check shape + if dim_sel.shape[0] != dim_len: + raise 
IndexError('selection has the wrong length') + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') + for dim_chunk_idx in range(self.nchunks): + dim_offset = dim_chunk_idx * self.dim_chunk_len + self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( + self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + + def get_overlapping_chunks(self): + + # iterate over chunks with at least one item + for dim_chunk_ix in np.nonzero(self.chunk_nitems)[0]: + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + + +def find_runs(x): + """Find runs of consecutive items in an array.""" + + # ensure array + x = np.asanyarray(x) + if x.ndim != 1: + raise ValueError('only 1D array supported') + n = x.shape[0] + + # handle empty array + if n == 0: + return np.array([]), np.array([]), np.array([]) + + else: + # find run starts + loc_run_start = np.empty(n, dtype=bool) + loc_run_start[0] = True + np.not_equal(x[:-1], x[1:], out=loc_run_start[1:]) + run_starts = np.nonzero(loc_run_start)[0] + + # find run values + run_values = x[loc_run_start] + + # find run lengths + run_lengths = 
np.diff(np.append(run_starts, n)) + + return run_values, run_starts, run_lengths + + +class IntArrayDimSelection(object): + """Integer array selection against a single dimension.""" + + def __init__(self, dim_sel, dim_len, dim_chunk_len): + + # ensure array + dim_sel = np.asanyarray(dim_sel) + + # check number of dimensions + if dim_sel.ndim != 1: + raise IndexError('selection must be a 1d array') + + # check dtype + if dim_sel.dtype.kind not in 'ui': + raise IndexError('selection must be an integer array') + + # handle wraparound + loc_neg = dim_sel < 0 + if np.any(loc_neg): + dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + + # handle out of bounds + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('selection contains index out of bounds') + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + self.nitems = len(dim_sel) + + def get_overlapping_chunks(self): + + # locate required chunk for each index + dim_chunk_sel = self.dim_sel // self.dim_chunk_len + + # find runs of indices in the same chunk + dim_chunk_ixs, run_starts, run_lengths = find_runs(dim_chunk_sel) + + # iterate over chunks + for dim_chunk_ix, s, l in zip(dim_chunk_ixs, run_starts, run_lengths): + + # find region in output array + dim_out_sel = slice(s, s + l) + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_offset + + yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + + +def replace_ellipsis(selection, shape): + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + 
n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += tuple(slice(0, l) for l in shape[len(selection):]) + + return selection + + +class BasicIndexer(object): + + def __init__(self, selection, array): + + # ensure tuple + if not isinstance(selection, tuple): + selection = (selection,) + + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # validation - check dimensionality + if len(selection) > len(array._shape): + raise IndexError('too many indices for array') + if len(selection) < len(array._shape): + raise IndexError('not enough indices for array') + + # TODO normalize and setup selections + + def get_overlapping_chunks(self): + dim_tasks = [s.get_overlapping_chunks() for s in self.selection] + return itertools.product(*dim_tasks) From fe28594220ba2c9e3fef0b38e21d16605afce8dd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 4 Nov 2017 01:33:34 +0000 Subject: [PATCH 28/67] WIP refactor indexing --- zarr/core.py | 37 +++--- zarr/new_indexing.py | 282 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 294 insertions(+), 25 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index e6437fbed6..dd9f1f03da 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -17,7 +17,7 @@ from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, 
VIndex, CoordinateIndexer +from zarr.new_indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer class Array(object): @@ -551,8 +551,8 @@ def _get_selection(self, indexer, out=None): # N.B., it is an important optimisation that we only visit chunks which overlap the # selection. This minimises the nuimber of iterations in the main for loop. - # determine indices of chunks overlapping the selection - chunk_ranges, sel_shape = indexer.get_overlapping_chunks() + # determine output shape + sel_shape = indexer.shape # setup output array if out is None: @@ -564,11 +564,8 @@ def _get_selection(self, indexer, out=None): if out.shape != sel_shape: raise ValueError('out has wrong shape for selection') - # iterate over chunks in range, i.e., chunks overlapping the selection - for chunk_coords in itertools.product(*chunk_ranges): - - # obtain selections for chunk and output arrays - chunk_selection, out_selection = indexer.get_chunk_projection(chunk_coords) + # iterate over chunks + for chunk_coords, chunk_selection, out_selection in indexer.get_overlapping_chunks(): # load chunk selection into output array self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, @@ -672,6 +669,21 @@ def set_orthogonal_selection(self, selection, value): self._set_selection(indexer, value) + def set_coordinate_selection(self, selection, value): + + # guard conditions + if self._read_only: + err_read_only() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = CoordinateIndexer(selection, self) + + self._set_selection(indexer, value) + def _set_basic_selection_zd(self, selection, value): # special case __setitem__ for zero-dimensional array @@ -711,7 +723,7 @@ def _set_selection(self, indexer, value): # selection. This minimises the nuimber of iterations in the main for loop. 
# determine indices of chunks overlapping the selection - chunk_ranges, sel_shape = indexer.get_overlapping_chunks() + sel_shape = indexer.shape # check value shape if np.isscalar(value): @@ -723,16 +735,13 @@ def _set_selection(self, indexer, value): raise ValueError('value has wrong shape for selection') # iterate over chunks in range - for chunk_coords in itertools.product(*chunk_ranges): - - # obtain selections for chunk and destination arrays - chunk_selection, value_selection = indexer.get_chunk_projection(chunk_coords) + for chunk_coords, chunk_selection, out_selection in indexer.get_overlapping_chunks(): # extract data to store if np.isscalar(value): chunk_value = value else: - chunk_value = value[value_selection] + chunk_value = value[out_selection] # handle missing singleton dimensions if indexer.squeeze_axes: item = [slice(None)] * self.ndim diff --git a/zarr/new_indexing.py b/zarr/new_indexing.py index f556685d6b..3524918022 100644 --- a/zarr/new_indexing.py +++ b/zarr/new_indexing.py @@ -23,7 +23,7 @@ def normalize_integer_selection(dim_sel, dim_len): return dim_sel -class IntSelection(object): +class IntIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -77,7 +77,7 @@ def normalize_slice_selection(dim_sel, dim_len): return slice(start, stop, step) -class SliceSelection(object): +class SliceIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -128,7 +128,7 @@ def get_overlapping_chunks(self): yield dim_chunk_ix, dim_chunk_sel, dim_out_sel -class BoolArrayDimSelection(object): +class BoolArrayDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -211,7 +211,7 @@ def find_runs(x): return run_values, run_starts, run_lengths -class IntArrayDimSelection(object): +class IntArrayDimIndexer(object): """Integer array selection against a single dimension.""" def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -243,16 +243,17 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.nchunks = 
int(np.ceil(self.dim_len / self.dim_chunk_len)) self.nitems = len(dim_sel) - def get_overlapping_chunks(self): - # locate required chunk for each index dim_chunk_sel = self.dim_sel // self.dim_chunk_len + self.dim_chunk_sel = dim_chunk_sel # find runs of indices in the same chunk - dim_chunk_ixs, run_starts, run_lengths = find_runs(dim_chunk_sel) + self.dim_chunk_ixs, self.run_starts, self.run_lengths = find_runs(dim_chunk_sel) + + def get_overlapping_chunks(self): # iterate over chunks - for dim_chunk_ix, s, l in zip(dim_chunk_ixs, run_starts, run_lengths): + for dim_chunk_ix, s, l in zip(self.dim_chunk_ixs, self.run_starts, self.run_lengths): # find region in output array dim_out_sel = slice(s, s + l) @@ -297,6 +298,7 @@ def replace_ellipsis(selection, shape): return selection +# noinspection PyProtectedMember class BasicIndexer(object): def __init__(self, selection, array): @@ -314,8 +316,266 @@ def __init__(self, selection, array): if len(selection) < len(array._shape): raise IndexError('not enough indices for array') - # TODO normalize and setup selections + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + + if isinstance(dim_sel, numbers.Integral): + dim_sel = normalize_integer_selection(dim_sel, dim_len) + dim_indexer = IntIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_sel = normalize_slice_selection(dim_sel, dim_len) + dim_indexer = SliceIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError('bad selection type') + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers) + self.squeeze_axes = None def get_overlapping_chunks(self): - dim_tasks = [s.get_overlapping_chunks() for s in self.selection] - return itertools.product(*dim_tasks) + overlaps = [s.get_overlapping_chunks() for s in self.dim_indexers] + for dim_tasks in 
itertools.product(*overlaps): + + chunk_coords = tuple(t[0] for t in dim_tasks) + chunk_selection = tuple(t[1] for t in dim_tasks) + out_selection = tuple(t[2] for t in dim_tasks) + + yield chunk_coords, chunk_selection, out_selection + + +def slice_to_range(s): + return range(s.start, s.stop, 1 if s.step is None else s.step) + + +def ix_(*selection): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, with support for + slices and single ints.""" + + # replace slice and int as these are not supported by numpy ix_() + selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) + else [dim_sel] if isinstance(dim_sel, int) + else dim_sel + for dim_sel in selection] + + selection = np.ix_(*selection) + + return selection + + +class OrthogonalIndexer(object): + + def __init__(self, selection, array): + + # ensure tuple + if not isinstance(selection, tuple): + selection = (selection,) + + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # validation - check dimensionality + if len(selection) > len(array._shape): + raise IndexError('too many indices for array') + if len(selection) < len(array._shape): + raise IndexError('not enough indices for array') + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + + # normalize list to array + if isinstance(dim_sel, list): + dim_sel = np.asarray(dim_sel) + + if isinstance(dim_sel, numbers.Integral): + dim_sel = normalize_integer_selection(dim_sel, dim_len) + dim_indexer = IntIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + + # normalize + dim_sel = normalize_slice_selection(dim_sel, dim_len) + + # handle slice with step + if dim_sel.step != 1: + dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = SliceIndexer(dim_sel, dim_len, dim_chunk_len) + + elif 
hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): + + if dim_sel.dtype == bool: + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif dim_sel.dtype.kind in 'ui': + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError('bad selection type') + + else: + raise IndexError('bad selection type') + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers) + self.is_advanced = any([not isinstance(dim_indexer, (IntIndexer, SliceIndexer)) + for dim_indexer in self.dim_indexers]) + if self.is_advanced: + self.squeeze_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) + if isinstance(dim_indexer, IntIndexer)]) + else: + self.squeeze_axes = None + + def get_overlapping_chunks(self): + overlaps = [s.get_overlapping_chunks() for s in self.dim_indexers] + for dim_tasks in itertools.product(*overlaps): + + chunk_coords = tuple(t[0] for t in dim_tasks) + chunk_selection = tuple(t[1] for t in dim_tasks) + out_selection = tuple(t[2] for t in dim_tasks) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # numpy doesn't support orthogonal indexing directly as yet, so need to work + # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices + # or integers, so need to convert slices and integers into ranges. 
+ chunk_selection = ix_(*chunk_selection) + + yield chunk_coords, chunk_selection, out_selection + + +class OIndex(object): + + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + return self.array.get_orthogonal_selection(selection) + + def __setitem__(self, selection, value): + return self.array.set_orthogonal_selection(selection, value) + + +def is_coordinate_selection(selection, array): + return ( + (len(selection) == array.ndim) and + all( + [(isinstance(dim_sel, numbers.Integral) or + (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) + for dim_sel in selection] + ) + ) + + +def is_mask_selection(selection, array): + return ( + hasattr(selection, 'dtype') and + selection.dtype == bool and + hasattr(selection, 'shape') and + len(selection.shape) == len(array.shape) + ) + + +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel + for dim_sel in selection + ) + + +# noinspection PyProtectedMember +class CoordinateIndexer(object): + + def __init__(self, selection, array): + + # some initial normalization + if not isinstance(selection, tuple): + selection = tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_coordinate_selection(selection, array): + # TODO refactor error messages for consistency + raise IndexError('invalid coordinate selection') + + # attempt to broadcast selection - this will raise error if array dimensions don't match + self.selection = np.broadcast_arrays(*selection) + self.shape = len(selection[0]) + self.squeeze_axes = None + + # normalization + for dim_sel, dim_len in zip(selection, array.shape): + + # check number of dimensions, only support indexing with 1d array + if len(dim_sel.shape) > 1: + raise IndexError('selection must be 1-dimensional integer array') + + # handle wraparound + loc_neg = dim_sel < 0 + if np.any(loc_neg): + # TODO need to take a copy here, or OK to replace? 
+ dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + + # handle out of bounds + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('index out of bounds') + + # compute flattened chunk index for each point selected + chunks_multi_index = tuple( + dim_sel // dim_chunk_len + for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) + ) + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, + dims=array._cdata_shape) + + # find runs of indices in the same chunk + self.chunks_rixs, self.run_starts, self.run_lengths = find_runs(chunks_raveled_indices) + # unravel + self.chunks_ixs = np.unravel_index(self.chunks_rixs, dims=array._cdata_shape) + + def get_overlapping_chunks(self): + + # iterate over chunks + for chunk_coords, s, l in zip(self.chunks_ixs, self.run_starts, self.run_lengths): + + out_selection = slice(s, s+l) + + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) + ) + + chunk_selection = tuple( + dim_sel[out_selection] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + ) + + yield chunk_coords, chunk_selection, out_selection + + +class VIndex(object): + + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + if not isinstance(selection, tuple): + selection = tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.get_coordinate_selection(selection) + # elif is_mask_selection(selection, self.array): + # return self.array.get_mask_selection(selection) + else: + raise IndexError('unsupported selection') + + def __setitem__(self, selection, value): + return self.array.set_orthogonal_selection(selection, value) From d6cf5873f9bf5249c1c33522f5065ac0ee0e6962 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 4 Nov 2017 02:20:12 +0000 Subject: [PATCH 29/67] WIP refactor indexing --- zarr/core.py | 2 + 
zarr/new_indexing.py | 208 +++++++++++++++++++++------------------- zarr/tests/test_core.py | 57 +++++++---- 3 files changed, 149 insertions(+), 118 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index dd9f1f03da..64421a570e 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -770,6 +770,8 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, sque """ + assert len(chunk_coords) == len(self._cdata_shape) + try: # obtain compressed data for chunk diff --git a/zarr/new_indexing.py b/zarr/new_indexing.py index 3524918022..6c1485cd54 100644 --- a/zarr/new_indexing.py +++ b/zarr/new_indexing.py @@ -128,6 +128,95 @@ def get_overlapping_chunks(self): yield dim_chunk_ix, dim_chunk_sel, dim_out_sel +def replace_ellipsis(selection, shape): + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += tuple(slice(0, l) for l in shape[len(selection):]) + + return selection + + +def ensure_tuple(v): + if not isinstance(v, tuple): + v = (v,) + return v + + +# noinspection PyProtectedMember +class BasicIndexer(object): + + def __init__(self, selection, 
array): + + # ensure tuple + selection = ensure_tuple(selection) + + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # validation - check dimensionality + if len(selection) > len(array._shape): + raise IndexError('too many indices for array') + if len(selection) < len(array._shape): + raise IndexError('not enough indices for array') + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + + if isinstance(dim_sel, numbers.Integral): + dim_sel = normalize_integer_selection(dim_sel, dim_len) + dim_indexer = IntIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_sel = normalize_slice_selection(dim_sel, dim_len) + dim_indexer = SliceIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError('bad selection type') + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers + if not isinstance(s, IntIndexer)) + self.squeeze_axes = None + + def get_overlapping_chunks(self): + overlaps = [s.get_overlapping_chunks() for s in self.dim_indexers] + for dim_tasks in itertools.product(*overlaps): + + chunk_coords = tuple(t[0] for t in dim_tasks) + chunk_selection = tuple(t[1] for t in dim_tasks) + out_selection = tuple(t[2] for t in dim_tasks if t[2] is not None) + + yield chunk_coords, chunk_selection, out_selection + + class BoolArrayDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -265,89 +354,6 @@ def get_overlapping_chunks(self): yield dim_chunk_ix, dim_chunk_sel, dim_out_sel -def replace_ellipsis(selection, shape): - - # count number of ellipsis present - n_ellipsis = sum(1 for i in selection if i is Ellipsis) - - if n_ellipsis > 1: - # more than 1 is an error - raise IndexError("an index can only have a single ellipsis ('...')") - - elif n_ellipsis == 1: - # locate the ellipsis, count how many items to left and right - 
n_items_l = selection.index(Ellipsis) # items to left of ellipsis - n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis - n_items = len(selection) - 1 # all non-ellipsis items - - if n_items >= len(shape): - # ellipsis does nothing, just remove it - selection = tuple(i for i in selection if i != Ellipsis) - - else: - # replace ellipsis with as many slices are needed for number of dims - new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) - if n_items_r: - new_item += selection[-n_items_r:] - selection = new_item - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += tuple(slice(0, l) for l in shape[len(selection):]) - - return selection - - -# noinspection PyProtectedMember -class BasicIndexer(object): - - def __init__(self, selection, array): - - # ensure tuple - if not isinstance(selection, tuple): - selection = (selection,) - - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - - # validation - check dimensionality - if len(selection) > len(array._shape): - raise IndexError('too many indices for array') - if len(selection) < len(array._shape): - raise IndexError('not enough indices for array') - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - - if isinstance(dim_sel, numbers.Integral): - dim_sel = normalize_integer_selection(dim_sel, dim_len) - dim_indexer = IntIndexer(dim_sel, dim_len, dim_chunk_len) - - elif isinstance(dim_sel, slice): - dim_sel = normalize_slice_selection(dim_sel, dim_len) - dim_indexer = SliceIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError('bad selection type') - - dim_indexers.append(dim_indexer) - - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers) - self.squeeze_axes = None - - def get_overlapping_chunks(self): - overlaps = [s.get_overlapping_chunks() for s in 
self.dim_indexers] - for dim_tasks in itertools.product(*overlaps): - - chunk_coords = tuple(t[0] for t in dim_tasks) - chunk_selection = tuple(t[1] for t in dim_tasks) - out_selection = tuple(t[2] for t in dim_tasks) - - yield chunk_coords, chunk_selection, out_selection - - def slice_to_range(s): return range(s.start, s.stop, 1 if s.step is None else s.step) @@ -372,8 +378,7 @@ class OrthogonalIndexer(object): def __init__(self, selection, array): # ensure tuple - if not isinstance(selection, tuple): - selection = (selection,) + selection = ensure_tuple(selection) # handle ellipsis selection = replace_ellipsis(selection, array._shape) @@ -425,7 +430,8 @@ def __init__(self, selection, array): dim_indexers.append(dim_indexer) self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers) + self.shape = tuple(s.nitems for s in self.dim_indexers + if not isinstance(s, IntIndexer)) self.is_advanced = any([not isinstance(dim_indexer, (IntIndexer, SliceIndexer)) for dim_indexer in self.dim_indexers]) if self.is_advanced: @@ -440,7 +446,7 @@ def get_overlapping_chunks(self): chunk_coords = tuple(t[0] for t in dim_tasks) chunk_selection = tuple(t[1] for t in dim_tasks) - out_selection = tuple(t[2] for t in dim_tasks) + out_selection = tuple(t[2] for t in dim_tasks if t[2] is not None) # handle advanced indexing arrays orthogonally if self.is_advanced: @@ -466,7 +472,7 @@ def __setitem__(self, selection, value): def is_coordinate_selection(selection, array): return ( - (len(selection) == array.ndim) and + (len(selection) == len(array._shape)) and all( [(isinstance(dim_sel, numbers.Integral) or (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) @@ -497,8 +503,9 @@ class CoordinateIndexer(object): def __init__(self, selection, array): # some initial normalization - if not isinstance(selection, tuple): - selection = tuple(selection) + selection = ensure_tuple(selection) + selection = tuple([i] if isinstance(i, numbers.Integral) else i + 
for i in selection) selection = replace_lists(selection) # validation @@ -507,12 +514,14 @@ def __init__(self, selection, array): raise IndexError('invalid coordinate selection') # attempt to broadcast selection - this will raise error if array dimensions don't match - self.selection = np.broadcast_arrays(*selection) - self.shape = len(selection[0]) + selection = np.broadcast_arrays(*selection) + self.selection = selection + self.shape = len(self.selection[0]) if self.selection[0].shape else 1 self.squeeze_axes = None + self.array = array # normalization - for dim_sel, dim_len in zip(selection, array.shape): + for dim_sel, dim_len in zip(self.selection, array.shape): # check number of dimensions, only support indexing with 1d array if len(dim_sel.shape) > 1: @@ -531,7 +540,7 @@ def __init__(self, selection, array): # compute flattened chunk index for each point selected chunks_multi_index = tuple( dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) + for (dim_sel, dim_chunk_len) in zip(self.selection, array._chunks) ) chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) @@ -539,12 +548,16 @@ def __init__(self, selection, array): # find runs of indices in the same chunk self.chunks_rixs, self.run_starts, self.run_lengths = find_runs(chunks_raveled_indices) # unravel - self.chunks_ixs = np.unravel_index(self.chunks_rixs, dims=array._cdata_shape) + self.chunks_mixs = np.unravel_index(self.chunks_rixs, dims=array._cdata_shape) def get_overlapping_chunks(self): # iterate over chunks - for chunk_coords, s, l in zip(self.chunks_ixs, self.run_starts, self.run_lengths): + for i in range(len(self.chunks_rixs)): + + chunk_coords = tuple(mix[i] for mix in self.chunks_mixs) + s = self.run_starts[i] + l = self.run_lengths[i] out_selection = slice(s, s+l) @@ -567,8 +580,7 @@ def __init__(self, array): self.array = array def __getitem__(self, selection): - if not isinstance(selection, tuple): - selection = 
tuple(selection) + selection = ensure_tuple(selection) selection = replace_lists(selection) if is_coordinate_selection(selection, self.array): return self.array.get_coordinate_selection(selection) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 04ffa6a96a..b7c1813c30 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -990,6 +990,8 @@ def test_orthogonal_indexing_1d_bool_set(self): ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) + # TODO test orthogonal with unsorted ints + def test_orthogonal_indexing_1d_int_set(self): # setup @@ -1001,6 +1003,7 @@ def test_orthogonal_indexing_1d_int_set(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) ix.sort() self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) @@ -1137,12 +1140,24 @@ def test_coordinate_indexing_1d(self): actual = z.vindex[ix] assert_array_equal(expect, actual) + # test single item + ix = 42 + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + # test wraparound ix = [0, 3, 10, -23, -12, -1] expect = a[ix] actual = z.get_coordinate_selection(ix) assert_array_equal(expect, actual) + # test out of order + ix = [3, 105, 23, 127] # not monotonically increasing + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + # test errors with assert_raises(IndexError): ix = [a.shape[0] + 1] # out of bounds @@ -1153,13 +1168,10 @@ def test_coordinate_indexing_1d(self): with assert_raises(IndexError): ix = [[2, 4], [6, 8]] # too many dimensions z.get_coordinate_selection(ix) - with assert_raises(NotImplementedError): - ix = [3, 105, 23, 127] # not monotonically increasing - z.get_coordinate_selection(ix) - with assert_raises(NotImplementedError): + with 
assert_raises(IndexError): ix = slice(5, 15) z.get_coordinate_selection(ix) - with assert_raises(NotImplementedError): + with assert_raises(IndexError): ix = Ellipsis z.get_coordinate_selection(ix) @@ -1173,8 +1185,9 @@ def test_coordinate_indexing_2d(self): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) srt = np.lexsort((ix0, ix1)) ix0 = ix0[srt] ix1 = ix1[srt] @@ -1185,6 +1198,7 @@ def test_coordinate_indexing_2d(self): # mixed indexing with array / int (ix0, 4), (42, ix1), + (42, 4), ] for selection in selections: @@ -1194,25 +1208,28 @@ def test_coordinate_indexing_2d(self): actual = z.vindex[selection] assert_array_equal(expect, actual) - with assert_raises(NotImplementedError): + # not monotonically increasing + ix0 = [3, 3, 4, 2, 5] + ix1 = [1, 3, 5, 7, 9] + expect = a[ix0, ix1] + actual = z.get_coordinate_selection((ix0, ix1)) + assert_array_equal(expect, actual) + # not monotonically increasing + ix0 = [3, 3, 4, 4, 5] + ix1 = [1, 3, 2, 1, 7] + # TODO fix failure here + actual = z.get_coordinate_selection((ix0, ix1)) + assert_array_equal(expect, actual) + + with assert_raises(IndexError): selection = slice(5, 15), [1, 2, 3] z.get_coordinate_selection(selection) - with assert_raises(NotImplementedError): + with assert_raises(IndexError): selection = [1, 2, 3], slice(5, 15) z.get_coordinate_selection(selection) - with assert_raises(NotImplementedError): + with assert_raises(IndexError): selection = Ellipsis, [1, 2, 3] z.get_coordinate_selection(selection) - with assert_raises(NotImplementedError): - # not monotonically increasing - ix0 = [3, 3, 4, 2, 5] - ix1 = [1, 3, 5, 7, 9] - z.get_coordinate_selection((ix0, ix1)) - with 
assert_raises(NotImplementedError): - # not monotonically increasing - ix0 = [3, 3, 4, 4, 5] - ix1 = [1, 3, 2, 1, 7] - z.get_coordinate_selection((ix0, ix1)) def test_get_selection_out(self): From af6f65f4641c62e07690dc6db7095213c3127478 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 4 Nov 2017 22:49:49 +0000 Subject: [PATCH 30/67] tests passing --- zarr/core.py | 18 +++---- zarr/new_indexing.py | 110 ++++++++++++++++++++++++++-------------- zarr/tests/test_core.py | 13 +++-- 3 files changed, 88 insertions(+), 53 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 64421a570e..5ba61d1ef7 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -565,11 +565,11 @@ def _get_selection(self, indexer, out=None): raise ValueError('out has wrong shape for selection') # iterate over chunks - for chunk_coords, chunk_selection, out_selection in indexer.get_overlapping_chunks(): + for chunk_coords, chunk_selection, out_selection in indexer: # load chunk selection into output array self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, - squeeze_axes=indexer.squeeze_axes) + drop_axes=indexer.drop_axes) if out.shape: return out @@ -735,7 +735,7 @@ def _set_selection(self, indexer, value): raise ValueError('value has wrong shape for selection') # iterate over chunks in range - for chunk_coords, chunk_selection, out_selection in indexer.get_overlapping_chunks(): + for chunk_coords, chunk_selection, out_selection in indexer: # extract data to store if np.isscalar(value): @@ -743,16 +743,16 @@ def _set_selection(self, indexer, value): else: chunk_value = value[out_selection] # handle missing singleton dimensions - if indexer.squeeze_axes: + if indexer.drop_axes: item = [slice(None)] * self.ndim - for a in indexer.squeeze_axes: + for a in indexer.drop_axes: item[a] = np.newaxis chunk_value = chunk_value[item] # put data self._chunk_setitem(chunk_coords, chunk_selection, chunk_value) - def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, 
squeeze_axes=None): + def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes=None): """Obtain part or whole of a chunk. Parameters @@ -765,7 +765,7 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, sque Array to store result in. out_selection : selection Location of region within output array to store results in. - squeeze_axes : tuple of ints + drop_axes : tuple of ints Axes to squeeze out of the chunk. """ @@ -813,8 +813,8 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, sque # set data in output array tmp = chunk[chunk_selection] - if squeeze_axes: - tmp = np.squeeze(tmp, axis=squeeze_axes) + if drop_axes: + tmp = np.squeeze(tmp, axis=drop_axes) out[out_selection] = tmp def _chunk_setitem(self, chunk_coords, chunk_selection, value): diff --git a/zarr/new_indexing.py b/zarr/new_indexing.py index 6c1485cd54..b2c756d5cf 100644 --- a/zarr/new_indexing.py +++ b/zarr/new_indexing.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division import numbers import itertools +import collections import numpy as np @@ -23,7 +24,23 @@ def normalize_integer_selection(dim_sel, dim_len): return dim_sel -class IntIndexer(object): +ChunkDimProjection = collections.namedtuple('ChunkDimProjection', + ('dim_chunk_ix', 'dim_chunk_sel', 'dim_out_sel')) +"""A mapping from chunk to output array for a single dimension. + +Parameters +---------- +dim_chunk_ix + Index of chunk. +dim_chunk_sel + Selection of items from chunk array. +dim_out_sel + Selection of items in target (output) array. 
+ +""" + + +class IntDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -40,13 +57,12 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.dim_chunk_len = dim_chunk_len self.nitems = 1 - def get_overlapping_chunks(self): - + def __iter__(self): dim_chunk_ix = self.dim_sel // self.dim_chunk_len dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel - dim_offset dim_out_sel = None - yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) def normalize_slice_selection(dim_sel, dim_len): @@ -77,7 +93,7 @@ def normalize_slice_selection(dim_sel, dim_len): return slice(start, stop, step) -class SliceIndexer(object): +class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -94,7 +110,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.dim_chunk_len = dim_chunk_len self.nitems = dim_sel.stop - dim_sel.start - def get_overlapping_chunks(self): + def __iter__(self): dim_chunk_from = self.dim_sel.start // self.dim_chunk_len dim_chunk_to = int(np.ceil(self.dim_sel.stop / self.dim_chunk_len)) @@ -125,7 +141,7 @@ def get_overlapping_chunks(self): dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) def replace_ellipsis(selection, shape): @@ -167,6 +183,24 @@ def ensure_tuple(v): return v +ChunkProjection = collections.namedtuple('ChunkProjection', + ('chunk_coords', 'chunk_selection', 'out_selection')) +"""A mapping of items from chunk to output array. Can be used to extract items from the chunk +array for loading into an output array. Can also be used to extract items from a value array for +setting/updating in a chunk array. + +Parameters +---------- +chunk_coords + Indices of chunk. 
+chunk_selection + Selection of items from chunk array. +out_selection + Selection of items in target (output) array. + +""" + + # noinspection PyProtectedMember class BasicIndexer(object): @@ -190,11 +224,11 @@ def __init__(self, selection, array): if isinstance(dim_sel, numbers.Integral): dim_sel = normalize_integer_selection(dim_sel, dim_len) - dim_indexer = IntIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) elif isinstance(dim_sel, slice): dim_sel = normalize_slice_selection(dim_sel, dim_len) - dim_indexer = SliceIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) else: raise IndexError('bad selection type') @@ -203,18 +237,17 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers - if not isinstance(s, IntIndexer)) - self.squeeze_axes = None + if not isinstance(s, IntDimIndexer)) + self.drop_axes = None - def get_overlapping_chunks(self): - overlaps = [s.get_overlapping_chunks() for s in self.dim_indexers] - for dim_tasks in itertools.product(*overlaps): + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(t[0] for t in dim_tasks) - chunk_selection = tuple(t[1] for t in dim_tasks) - out_selection = tuple(t[2] for t in dim_tasks if t[2] is not None) + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) - yield chunk_coords, chunk_selection, out_selection + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) class BoolArrayDimIndexer(object): @@ -245,7 +278,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) self.nitems = self.chunk_nitems_cumsum[-1] - def 
get_overlapping_chunks(self): + def __iter__(self): # iterate over chunks with at least one item for dim_chunk_ix in np.nonzero(self.chunk_nitems)[0]: @@ -268,7 +301,7 @@ def get_overlapping_chunks(self): stop = self.chunk_nitems_cumsum[dim_chunk_ix] dim_out_sel = slice(start, stop) - yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) def find_runs(x): @@ -339,7 +372,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # find runs of indices in the same chunk self.dim_chunk_ixs, self.run_starts, self.run_lengths = find_runs(dim_chunk_sel) - def get_overlapping_chunks(self): + def __iter__(self): # iterate over chunks for dim_chunk_ix, s, l in zip(self.dim_chunk_ixs, self.run_starts, self.run_lengths): @@ -351,7 +384,7 @@ def get_overlapping_chunks(self): dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_offset - yield dim_chunk_ix, dim_chunk_sel, dim_out_sel + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) def slice_to_range(s): @@ -399,7 +432,7 @@ def __init__(self, selection, array): if isinstance(dim_sel, numbers.Integral): dim_sel = normalize_integer_selection(dim_sel, dim_len) - dim_indexer = IntIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) elif isinstance(dim_sel, slice): @@ -411,7 +444,7 @@ def __init__(self, selection, array): dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - dim_indexer = SliceIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): @@ -431,22 +464,21 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers - if not isinstance(s, IntIndexer)) - self.is_advanced = any([not 
isinstance(dim_indexer, (IntIndexer, SliceIndexer)) + if not isinstance(s, IntDimIndexer)) + self.is_advanced = any([not isinstance(dim_indexer, (IntDimIndexer, SliceDimIndexer)) for dim_indexer in self.dim_indexers]) if self.is_advanced: - self.squeeze_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) - if isinstance(dim_indexer, IntIndexer)]) + self.drop_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) + if isinstance(dim_indexer, IntDimIndexer)]) else: - self.squeeze_axes = None + self.drop_axes = None - def get_overlapping_chunks(self): - overlaps = [s.get_overlapping_chunks() for s in self.dim_indexers] - for dim_tasks in itertools.product(*overlaps): + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(t[0] for t in dim_tasks) - chunk_selection = tuple(t[1] for t in dim_tasks) - out_selection = tuple(t[2] for t in dim_tasks if t[2] is not None) + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) # handle advanced indexing arrays orthogonally if self.is_advanced: @@ -455,7 +487,7 @@ def get_overlapping_chunks(self): # or integers, so need to convert slices and integers into ranges. 
chunk_selection = ix_(*chunk_selection) - yield chunk_coords, chunk_selection, out_selection + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) class OIndex(object): @@ -517,7 +549,7 @@ def __init__(self, selection, array): selection = np.broadcast_arrays(*selection) self.selection = selection self.shape = len(self.selection[0]) if self.selection[0].shape else 1 - self.squeeze_axes = None + self.drop_axes = None self.array = array # normalization @@ -550,7 +582,7 @@ def __init__(self, selection, array): # unravel self.chunks_mixs = np.unravel_index(self.chunks_rixs, dims=array._cdata_shape) - def get_overlapping_chunks(self): + def __iter__(self): # iterate over chunks for i in range(len(self.chunks_rixs)): @@ -571,7 +603,7 @@ def get_overlapping_chunks(self): for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) ) - yield chunk_coords, chunk_selection, out_selection + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) class VIndex(object): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index b7c1813c30..490bcad8b7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -784,6 +784,12 @@ def test_orthogonal_indexing_1d_int(self): actual = z.oindex[ix] assert_array_equal(expect, actual) + # test not sorted + ix = [3, 105, 23, 127] # not monotonically increasing + expect = a[ix] + actual = z.oindex[ix] + assert_array_equal(expect, actual) + # test errors with assert_raises(IndexError): ix = [a.shape[0] + 1] # out of bounds @@ -794,9 +800,6 @@ def test_orthogonal_indexing_1d_int(self): with assert_raises(IndexError): ix = [[2, 4], [6, 8]] # too many dimensions z.oindex[ix] - with assert_raises(NotImplementedError): - ix = [3, 105, 23, 127] # not monotonically increasing - z.oindex[ix] def test_orthogonal_indexing_1d_slice_with_step(self): @@ -1215,9 +1218,9 @@ def test_coordinate_indexing_2d(self): actual = z.get_coordinate_selection((ix0, ix1)) assert_array_equal(expect, actual) # not 
monotonically increasing - ix0 = [3, 3, 4, 4, 5] + ix0 = [1, 1, 2, 2, 5] ix1 = [1, 3, 2, 1, 7] - # TODO fix failure here + expect = a[ix0, ix1] actual = z.get_coordinate_selection((ix0, ix1)) assert_array_equal(expect, actual) From 3f0a98bf7e770e766eb6b8aa78257fca1700bb54 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 5 Nov 2017 01:43:19 +0000 Subject: [PATCH 31/67] WIP optimise coordinate indexing --- notebooks/advanced_indexing.ipynb | 1601 +++++++++++++++++++++++------ zarr/core.py | 26 +- zarr/new_indexing.py | 152 ++- zarr/tests/test_core.py | 65 +- 4 files changed, 1474 insertions(+), 370 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index 969cffc7bb..e1ea045749 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -135,7 +135,7 @@ { "data": { "text/plain": [ - "array([1, 4, 5, 5, 7])" + "array([5, 4, 1, 7, 5])" ] }, "execution_count": 7, @@ -145,7 +145,6 @@ ], "source": [ "ix = np.random.choice(a.shape[0], size=a.shape[0]//2)\n", - "ix.sort() # only monotonically increasing indices are supported\n", "ix" ] }, @@ -157,7 +156,7 @@ { "data": { "text/plain": [ - "array([1, 4, 5, 5, 7])" + "array([5, 4, 1, 7, 5])" ] }, "execution_count": 8, @@ -177,7 +176,7 @@ { "data": { "text/plain": [ - "array([1, 4, 5, 5, 7])" + "array([5, 4, 1, 7, 5])" ] }, "execution_count": 9, @@ -217,7 +216,7 @@ "source": [ "### Multidimensional indexing\n", "\n", - "N.B., orthogonaly indexing is implemented. This is different from numpy fancy indexing if more than one dimension is indexed with an array." + "N.B., orthogonal indexing is available. This is different from numpy fancy indexing if more than one dimension is indexed with an array." 
] }, { @@ -317,7 +316,7 @@ ], "source": [ "zb = zarr.array(b, chunks=(2, 2))\n", - "zb[ix0, ix1]" + "zb.oindex[ix0, ix1]" ] }, { @@ -346,7 +345,7 @@ } ], "source": [ - "zb[ix0, ix1] = -1\n", + "zb.oindex[ix0, ix1] = -1\n", "zb[:]" ] }, @@ -358,7 +357,7 @@ { "data": { "text/plain": [ - "array([1, 8, 8, 9, 9])" + "array([8, 1, 9, 8, 9])" ] }, "execution_count": 17, @@ -368,7 +367,6 @@ ], "source": [ "ix0 = np.random.choice(b.shape[0], size=b.shape[0]//2)\n", - "ix0.sort() # only monotonically increasing indices are supported\n", "ix0" ] }, @@ -380,7 +378,7 @@ { "data": { "text/plain": [ - "array([1, 3, 4, 6, 7])" + "array([4, 1, 3, 6, 7])" ] }, "execution_count": 18, @@ -390,7 +388,6 @@ ], "source": [ "ix1 = np.random.choice(b.shape[1], size=b.shape[1]//2)\n", - "ix1.sort() # only monotonically increasing indices are supported\n", "ix1" ] }, @@ -402,11 +399,11 @@ { "data": { "text/plain": [ - "array([[11, 13, 14, 16, 17],\n", - " [81, 83, 84, 86, 87],\n", - " [81, 83, 84, 86, 87],\n", - " [91, 93, 94, 96, 97],\n", - " [91, 93, 94, 96, 97]])" + "array([[84, 81, 83, 86, 87],\n", + " [14, 11, 13, 16, 17],\n", + " [94, 91, 93, 96, 97],\n", + " [84, 81, 83, 86, 87],\n", + " [94, 91, 93, 96, 97]])" ] }, "execution_count": 19, @@ -426,11 +423,11 @@ { "data": { "text/plain": [ - "array([[11, 13, 14, 16, 17],\n", - " [81, 83, 84, 86, 87],\n", - " [81, 83, 84, 86, 87],\n", - " [91, 93, 94, 96, 97],\n", - " [91, 93, 94, 96, 97]])" + "array([[84, 81, 83, 86, 87],\n", + " [14, 11, 13, 16, 17],\n", + " [94, 91, 93, 96, 97],\n", + " [84, 81, 83, 86, 87],\n", + " [94, 91, 93, 96, 97]])" ] }, "execution_count": 20, @@ -440,7 +437,7 @@ ], "source": [ "zb = zarr.array(b, chunks=(2, 2))\n", - "zb[ix0, ix1]" + "zb.oindex[ix0, ix1]" ] }, { @@ -469,7 +466,7 @@ } ], "source": [ - "zb[ix0, ix1] = -1\n", + "zb.oindex[ix0, ix1] = -1\n", "zb[:]" ] }, @@ -536,7 +533,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Benchmarking" + "## 1D Benchmarking" ] }, { @@ -565,6 +562,14 @@ 
"execution_count": 26, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 524 ms, sys: 124 ms, total: 648 ms\n", + "Wall time: 226 ms\n" + ] + }, { "data": { "text/html": [ @@ -591,7 +596,7 @@ } ], "source": [ - "zc = zarr.array(c)\n", + "%time zc = zarr.array(c)\n", "zc.info" ] }, @@ -604,8 +609,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 148 ms, sys: 52 ms, total: 200 ms\n", - "Wall time: 200 ms\n" + "CPU times: user 124 ms, sys: 52 ms, total: 176 ms\n", + "Wall time: 173 ms\n" ] }, { @@ -632,8 +637,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 480 ms, sys: 420 ms, total: 900 ms\n", - "Wall time: 308 ms\n" + "CPU times: user 512 ms, sys: 108 ms, total: 620 ms\n", + "Wall time: 312 ms\n" ] }, { @@ -689,8 +694,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 612 ms, sys: 20 ms, total: 632 ms\n", - "Wall time: 628 ms\n" + "CPU times: user 612 ms, sys: 8 ms, total: 620 ms\n", + "Wall time: 621 ms\n" ] }, { @@ -717,8 +722,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.5 s, sys: 208 ms, total: 1.71 s\n", - "Wall time: 983 ms\n" + "CPU times: user 1.47 s, sys: 124 ms, total: 1.6 s\n", + "Wall time: 922 ms\n" ] }, { @@ -745,63 +750,79 @@ "name": "stdout", "output_type": "stream", "text": [ - " 73776 function calls in 1.005 seconds\n", + " 116791 function calls in 0.943 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2048 0.666 0.000 0.967 0.000 core.py:679(_chunk_getitem)\n", - " 2048 0.255 0.000 0.274 0.000 core.py:839(_decode_chunk)\n", - " 2048 0.010 0.000 0.019 0.000 util.py:418(get_chunk_selections)\n", - " 2048 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 2048 0.008 0.000 0.008 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2048 0.008 0.000 
0.008 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.006 0.006 1.004 1.004 core.py:484(_getitem_nd)\n", - " 2048 0.005 0.000 0.009 0.000 util.py:114(is_total_slice)\n", + " 2049 0.445 0.000 0.445 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2048 0.234 0.000 0.250 0.000 core.py:931(_decode_chunk)\n", + " 2048 0.140 0.000 0.425 0.000 core.py:769(_chunk_getitem)\n", + " 2049 0.013 0.000 0.497 0.000 new_indexing.py:547(__iter__)\n", + " 2048 0.011 0.000 0.011 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 2048 0.011 0.000 0.465 0.000 index_tricks.py:26(ix_)\n", + " 4096 0.010 0.000 0.010 0.000 core.py:324()\n", + " 4096 0.009 0.000 0.009 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.006 0.006 0.928 0.928 core.py:549(_get_selection)\n", + " 2048 0.006 0.000 0.006 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", " 2048 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", - " 2048 0.004 0.000 0.016 0.000 {method 'join' of 'str' objects}\n", - " 2048 0.004 0.000 0.004 0.000 util.py:167(get_chunk_sel)\n", - " 14344 0.003 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.003 0.000 0.003 0.000 util.py:177(get_out_sel)\n", + " 2049 0.005 0.000 0.006 0.000 new_indexing.py:281(__iter__)\n", + " 2048 0.004 0.000 0.017 0.000 {method 'join' of 'str' objects}\n", + " 2048 0.004 0.000 0.014 0.000 core.py:319(_cdata_shape)\n", " 2048 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.002 0.000 0.018 0.000 core.py:836(_chunk_key)\n", - " 2048 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", - " 1 0.002 0.002 0.011 0.011 util.py:140(__init__)\n", - " 4096 0.001 0.000 0.002 0.000 util.py:129()\n", + " 12295 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.003 0.000 0.470 0.000 new_indexing.py:466(ix_)\n", + " 1 0.003 0.003 0.014 0.014 new_indexing.py:255(__init__)\n", + " 2048 0.002 0.000 0.004 0.000 
numerictypes.py:728(issubdtype)\n", + " 2048 0.002 0.000 0.019 0.000 core.py:928(_chunk_key)\n", + " 2048 0.002 0.000 0.013 0.000 numeric.py:1905(array_str)\n", + " 2048 0.002 0.000 0.003 0.000 new_indexing.py:471()\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", + " 2048 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", " 2048 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", + " 6153 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2048 0.001 0.000 0.002 0.000 :12(__new__)\n", " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 1 0.001 0.001 1.005 1.005 :1()\n", - " 4098 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", - " 2048 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", - " 2048 0.001 0.000 0.010 0.000 numeric.py:380(count_nonzero)\n", - " 2048 0.001 0.000 0.001 0.000 core.py:200(chunk_store)\n", + " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", + " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 2048 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", + " 1 0.001 0.001 0.943 0.943 :1()\n", + " 2048 0.001 0.000 0.012 0.000 numeric.py:380(count_nonzero)\n", + " 2049 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", " 2048 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' 
objects}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 1.005 1.005 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.015 0.015 new_indexing.py:483(__init__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", + " 1 0.000 0.000 0.943 0.943 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.011 0.011 util.py:323(normalize_array_selection)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.011 0.011 util.py:250(normalize_dim_selection)\n", - " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", - " 1 0.000 0.000 1.004 1.004 core.py:377(__getitem__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.942 0.942 core.py:392(__getitem__)\n", + " 1 0.000 0.000 0.942 0.942 core.py:527(get_orthogonal_selection)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 util.py:185(get_chunk_ranges)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 5 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 2 0.000 0.000 0.011 0.006 util.py:354()\n", + " 1 0.000 0.000 0.000 0.000 
new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 2 0.000 0.000 0.000 0.000 core.py:500()\n", - " 1 0.000 0.000 0.000 0.000 util.py:332()\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -827,7 +848,8 @@ { "data": { "text/plain": [ - "49994863" + "(50000000,\n", + " array([86098038, 51488465, 9242439, ..., 31235734, 20293124, 13824417]))" ] }, "execution_count": 33, @@ -836,8 +858,10 @@ } ], "source": [ - "ix_dense_int = np.nonzero(ix_dense_bool)[0]\n", - "len(ix_dense_int)" + "ix_dense_int = np.random.choice(c.shape[0], size=c.shape[0]//2, replace=True)\n", + "ix_dense_int_sorted = ix_dense_int.copy()\n", + "ix_dense_int_sorted.sort()\n", + "len(ix_dense_int), ix_dense_int" ] }, { @@ -849,14 +873,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 144 ms, sys: 20 ms, total: 164 ms\n", - "Wall time: 160 ms\n" + "CPU times: user 10.6 s, sys: 0 ns, total: 10.6 s\n", + "Wall time: 10.6 s\n" ] }, { "data": { "text/plain": [ - "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + "array([35886154, 6592339, 23747762, ..., 26251840, 48664862, 3479456])" ] }, "execution_count": 34, @@ -864,30 +888,114 @@ "output_type": "execute_result" } ], + "source": [ + "%time np.argsort(ix_dense_int, kind='quicksort')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 152 ms, sys: 0 ns, total: 152 ms\n", + "Wall time: 152 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 
0, 0, 1, ..., 99999994, 99999999, 99999999])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c[ix_dense_int_sorted]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.39 s, sys: 188 ms, total: 1.58 s\n", + "Wall time: 1.29 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 0, 1, ..., 99999994, 99999999, 99999999])" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[ix_dense_int_sorted]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 684 ms, sys: 36 ms, total: 720 ms\n", + "Wall time: 718 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([86098038, 51488465, 9242439, ..., 31235734, 20293124, 13824417])" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "%time c[ix_dense_int]" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.3 s, sys: 152 ms, total: 1.45 s\n", - "Wall time: 1.16 s\n" + "CPU times: user 14.8 s, sys: 716 ms, total: 15.5 s\n", + "Wall time: 14.7 s\n" ] }, { "data": { "text/plain": [ - "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + "array([86098038, 51488465, 9242439, ..., 31235734, 20293124, 13824417])" ] }, - "execution_count": 35, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -898,76 +1006,191 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 71758 function calls in 1.208 seconds\n", + " 110675 
function calls in 1.659 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.662 0.662 0.864 0.864 util.py:191(__init__)\n", - " 2048 0.129 0.000 0.139 0.000 core.py:839(_decode_chunk)\n", - " 1 0.119 0.119 0.119 0.119 {built-in method numpy.core.multiarray.bincount}\n", - " 2048 0.116 0.000 0.278 0.000 core.py:679(_chunk_getitem)\n", - " 1 0.063 0.063 0.063 0.063 function_base.py:1848(diff)\n", - " 2048 0.042 0.000 0.045 0.000 util.py:225(get_chunk_sel)\n", - " 4 0.020 0.005 0.020 0.005 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 2048 0.010 0.000 0.059 0.000 util.py:418(get_chunk_selections)\n", - " 1 0.006 0.006 1.207 1.207 core.py:484(_getitem_nd)\n", - " 4096 0.005 0.000 0.005 0.000 util.py:235(get_out_sel)\n", - " 2048 0.005 0.000 0.007 0.000 util.py:114(is_total_slice)\n", - " 2048 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.781 0.781 1.051 1.051 new_indexing.py:393(__init__)\n", + " 2048 0.209 0.000 0.380 0.000 core.py:769(_chunk_getitem)\n", + " 2049 0.148 0.000 0.150 0.000 new_indexing.py:440(__iter__)\n", + " 1 0.132 0.132 0.132 0.132 function_base.py:1848(diff)\n", + " 2048 0.130 0.000 0.141 0.000 core.py:931(_decode_chunk)\n", + " 1 0.120 0.120 0.120 0.120 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.018 0.018 1.642 1.642 core.py:527(get_orthogonal_selection)\n", + " 1 0.018 0.018 1.659 1.659 :1()\n", + " 4 0.017 0.004 0.017 0.004 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 2049 0.012 0.000 0.188 0.000 new_indexing.py:547(__iter__)\n", + " 4096 0.008 0.000 0.008 0.000 core.py:324()\n", + " 2048 0.008 0.000 0.015 0.000 index_tricks.py:26(ix_)\n", + " 4096 0.007 0.000 0.007 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.004 0.004 0.572 0.572 core.py:549(_get_selection)\n", " 2048 0.004 0.000 0.009 0.000 arrayprint.py:381(wrapper)\n", " 2048 0.004 0.000 0.004 0.000 {built-in method 
numpy.core.multiarray.frombuffer}\n", - " 2048 0.003 0.000 0.013 0.000 {method 'join' of 'str' objects}\n", - " 14345 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.004 0.000 0.014 0.000 {method 'join' of 'str' objects}\n", + " 2048 0.003 0.000 0.011 0.000 core.py:319(_cdata_shape)\n", + " 12295 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", " 2048 0.003 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.002 0.000 0.015 0.000 core.py:836(_chunk_key)\n", - " 2048 0.001 0.000 0.010 0.000 numeric.py:1905(array_str)\n", - " 4096 0.001 0.000 0.002 0.000 util.py:129()\n", + " 2048 0.002 0.000 0.020 0.000 new_indexing.py:466(ix_)\n", + " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", + " 2048 0.002 0.000 0.016 0.000 core.py:928(_chunk_key)\n", + " 2048 0.002 0.000 0.011 0.000 numeric.py:1905(array_str)\n", + " 4096 0.002 0.000 0.002 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", + " 2048 0.001 0.000 0.002 0.000 new_indexing.py:471()\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", " 2048 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1 0.001 0.001 1.208 1.208 :1()\n", + " 2048 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", + " 6153 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", + " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2048 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 4098 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", - " 2048 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", - " 2048 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2048 0.000 0.000 0.000 0.000 core.py:200(chunk_store)\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method 
builtins.issubclass}\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", + " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 2048 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", + " 2049 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1 0.000 0.000 1.052 1.052 new_indexing.py:483(__init__)\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.864 0.864 util.py:250(normalize_dim_selection)\n", - " 4 0.000 0.000 0.020 0.005 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 1.208 1.208 {built-in method builtins.exec}\n", - " 6 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 0.864 0.864 util.py:323(normalize_array_selection)\n", - " 4 0.000 0.000 0.020 0.005 {method 'any' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.017 0.004 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 1.659 1.659 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.020 0.005 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 1.642 1.642 core.py:392(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 4 0.000 0.000 0.017 0.004 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", " 6 0.000 0.000 0.000 0.000 
numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", - " 1 0.000 0.000 1.207 1.207 core.py:377(__getitem__)\n", + " 4 0.000 0.000 0.017 0.004 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 2 0.000 0.000 0.864 0.432 util.py:354()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 6 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc[ix_dense_int_sorted]', sort='time')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 139355 function calls in 16.778 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 13.664 13.664 13.664 13.664 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1 0.787 0.787 15.497 15.497 new_indexing.py:393(__init__)\n", + " 
1 0.776 0.776 0.776 0.776 {method 'take' of 'numpy.ndarray' objects}\n", + " 2048 0.741 0.000 1.006 0.000 core.py:769(_chunk_getitem)\n", + " 2048 0.220 0.000 0.235 0.000 core.py:931(_decode_chunk)\n", + " 2049 0.140 0.000 0.141 0.000 new_indexing.py:440(__iter__)\n", + " 1 0.130 0.130 0.130 0.130 function_base.py:1848(diff)\n", + " 1 0.121 0.121 0.121 0.121 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.040 0.040 16.760 16.760 core.py:392(__getitem__)\n", + " 4 0.019 0.005 0.019 0.005 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.018 0.018 16.778 16.778 :1()\n", + " 1 0.018 0.018 16.720 16.720 core.py:527(get_orthogonal_selection)\n", + " 2049 0.013 0.000 0.193 0.000 new_indexing.py:547(__iter__)\n", + " 4096 0.012 0.000 0.024 0.000 index_tricks.py:26(ix_)\n", + " 6144 0.010 0.000 0.010 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 4096 0.008 0.000 0.008 0.000 core.py:324()\n", + " 1 0.005 0.005 1.205 1.205 core.py:549(_get_selection)\n", + " 2048 0.005 0.000 0.005 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2048 0.004 0.000 0.009 0.000 arrayprint.py:381(wrapper)\n", + " 4096 0.004 0.000 0.032 0.000 new_indexing.py:466(ix_)\n", + " 2048 0.004 0.000 0.015 0.000 {method 'join' of 'str' objects}\n", + " 16391 0.004 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.003 0.000 0.012 0.000 core.py:319(_cdata_shape)\n", + " 4096 0.003 0.000 0.006 0.000 numerictypes.py:728(issubdtype)\n", + " 2048 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", + " 4096 0.002 0.000 0.004 0.000 new_indexing.py:471()\n", + " 2048 0.002 0.000 0.017 0.000 core.py:928(_chunk_key)\n", + " 2048 0.002 0.000 0.011 0.000 numeric.py:1905(array_str)\n", + " 4102 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.array}\n", + " 4096 0.002 0.000 0.002 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", + " 8192 0.001 0.000 
0.001 0.000 {built-in method builtins.issubclass}\n", + " 4096 0.001 0.000 0.003 0.000 numeric.py:463(asarray)\n", + " 8201 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 4096 0.001 0.000 0.002 0.000 numerictypes.py:660(issubclass_)\n", + " 2048 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", + " 2048 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.001 0.000 0.002 0.000 :12(__new__)\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", + " 4096 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", + " 2048 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", + " 4097 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", + " 2049 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1 0.000 0.000 15.498 15.498 new_indexing.py:483(__init__)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 4 0.000 0.000 0.019 0.005 fromnumeric.py:1886(any)\n", + " 4 0.000 0.000 14.441 3.610 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 16.778 16.778 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.019 0.005 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.019 0.005 _methods.py:37(_any)\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.776 0.776 fromnumeric.py:70(take)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", 
+ " 1 0.000 0.000 13.664 13.664 fromnumeric.py:826(argsort)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 util.py:243(get_chunk_ranges)\n", - " 1 0.000 0.000 0.000 0.000 util.py:332()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 2 0.000 0.000 0.000 0.000 core.py:500()\n", "\n", "\n" ] @@ -986,16 +1209,16 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9950" + "9958" ] }, - "execution_count": 37, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -1008,24 +1231,24 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", - "Wall time: 17.8 ms\n" + "CPU times: user 24 ms, sys: 0 ns, total: 24 ms\n", + "Wall time: 21.6 ms\n" ] }, { "data": { "text/plain": [ - "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + "array([ 
1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" ] }, - "execution_count": 38, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1036,24 +1259,24 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 472 ms, sys: 88 ms, total: 560 ms\n", - "Wall time: 262 ms\n" + "CPU times: user 508 ms, sys: 72 ms, total: 580 ms\n", + "Wall time: 288 ms\n" ] }, { "data": { "text/plain": [ - "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" ] }, - "execution_count": 39, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1064,70 +1287,86 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 73436 function calls in 0.289 seconds\n", + " 116461 function calls in 0.300 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2038 0.172 0.000 0.184 0.000 core.py:839(_decode_chunk)\n", - " 2038 0.035 0.000 0.248 0.000 core.py:679(_chunk_getitem)\n", - " 2048 0.013 0.000 0.013 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 2038 0.011 0.000 0.020 0.000 util.py:418(get_chunk_selections)\n", - " 1 0.006 0.006 0.289 0.289 core.py:484(_getitem_nd)\n", - " 2038 0.006 0.000 0.009 0.000 util.py:114(is_total_slice)\n", - " 2038 0.005 0.000 0.011 0.000 arrayprint.py:381(wrapper)\n", - " 2038 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 2038 0.005 0.000 0.017 0.000 {method 'join' of 'str' objects}\n", - " 2038 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2038 0.004 0.000 0.004 0.000 util.py:167(get_chunk_sel)\n", - " 2038 0.003 0.000 0.003 0.000 
util.py:177(get_out_sel)\n", - " 14274 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", - " 2038 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 1 0.002 0.002 0.016 0.016 util.py:140(__init__)\n", - " 2038 0.002 0.000 0.019 0.000 core.py:836(_chunk_key)\n", - " 2038 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", - " 4076 0.002 0.000 0.002 0.000 util.py:129()\n", - " 2038 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", - " 4076 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2038 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", - " 4078 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", - " 2038 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2038 0.001 0.000 0.001 0.000 core.py:200(chunk_store)\n", - " 2048 0.001 0.000 0.013 0.000 numeric.py:380(count_nonzero)\n", - " 2038 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2038 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2038 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2038 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 2042 0.145 0.000 0.158 0.000 core.py:931(_decode_chunk)\n", + " 2043 0.023 0.000 0.023 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2043 0.013 0.000 0.074 0.000 new_indexing.py:547(__iter__)\n", + " 2042 0.013 0.000 0.206 0.000 core.py:769(_chunk_getitem)\n", + " 2048 0.011 0.000 0.011 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 2042 0.011 0.000 0.042 0.000 index_tricks.py:26(ix_)\n", + " 4084 0.010 0.000 0.010 0.000 core.py:324()\n", + " 4084 0.007 0.000 0.007 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.005 0.005 0.286 0.286 core.py:549(_get_selection)\n", + " 2042 0.005 0.000 0.011 0.000 arrayprint.py:381(wrapper)\n", + " 2043 0.005 0.000 0.006 0.000 new_indexing.py:281(__iter__)\n", + " 2042 0.004 0.000 0.018 0.000 {method 'join' of 'str' 
objects}\n", + " 2042 0.004 0.000 0.014 0.000 core.py:319(_cdata_shape)\n", + " 2042 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2042 0.004 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", + " 12259 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", + " 2042 0.003 0.000 0.048 0.000 new_indexing.py:466(ix_)\n", + " 1 0.002 0.002 0.014 0.014 new_indexing.py:255(__init__)\n", + " 2042 0.002 0.000 0.020 0.000 core.py:928(_chunk_key)\n", + " 2042 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", + " 2042 0.002 0.000 0.013 0.000 numeric.py:1905(array_str)\n", + " 2042 0.002 0.000 0.003 0.000 new_indexing.py:471()\n", + " 4084 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", + " 2042 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", + " 4084 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", + " 2042 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", + " 6135 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 4084 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", + " 2042 0.001 0.000 0.002 0.000 :12(__new__)\n", + " 2042 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 4084 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2042 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", + " 4084 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 2042 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 4084 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", + " 2042 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", + " 2043 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", + " 2048 0.001 0.000 0.012 0.000 numeric.py:380(count_nonzero)\n", + " 2042 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2042 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 2042 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2042 
0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2043 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 2042 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.014 0.014 new_indexing.py:483(__init__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", + " 1 0.000 0.000 0.300 0.300 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.289 0.289 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.016 0.016 util.py:323(normalize_array_selection)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", - " 1 0.000 0.000 0.289 0.289 core.py:377(__getitem__)\n", - " 1 0.000 0.000 0.016 0.016 util.py:250(normalize_dim_selection)\n", + " 1 0.000 0.000 0.300 0.300 core.py:392(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.289 0.289 :1()\n", + " 1 0.000 0.000 0.300 0.300 core.py:527(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.016 0.008 util.py:354()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 1 0.000 0.000 0.300 0.300 :1()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 2 0.000 0.000 0.000 0.000 
_weakrefset.py:70(__contains__)\n", - " 5 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1 0.000 0.000 0.000 0.000 util.py:185(get_chunk_ranges)\n", - " 2 0.000 0.000 0.000 0.000 core.py:500()\n", - " 1 0.000 0.000 0.000 0.000 util.py:332()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1147,45 +1386,76 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9950" + "(10000,\n", + " array([94979430, 11935675, 63597355, ..., 91349759, 40936288, 76612910]))" ] }, - "execution_count": 41, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix_sparse_int = np.nonzero(ix_sparse_bool)[0]\n", - "len(ix_sparse_int)" + "ix_sparse_int = np.random.choice(c.shape[0], size=c.shape[0]//10000, replace=True)\n", + "ix_sparse_int_sorted = ix_sparse_int.copy()\n", + "ix_sparse_int_sorted.sort()\n", + "len(ix_sparse_int), ix_sparse_int" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", - "Wall time: 169 µs\n" + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 279 µs\n" ] }, { "data": { "text/plain": [ - "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + "array([ 11962, 27590, 30701, ..., 99968761, 99977334, 99990442])" ] }, - "execution_count": 42, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time 
c[ix_sparse_int_sorted]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 362 µs\n" + ] + }, + { + "data": { + "text/plain": [ + "array([94979430, 11935675, 63597355, ..., 91349759, 40936288, 76612910])" + ] + }, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1196,24 +1466,52 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 504 ms, sys: 68 ms, total: 572 ms\n", - "Wall time: 262 ms\n" + "CPU times: user 472 ms, sys: 52 ms, total: 524 ms\n", + "Wall time: 243 ms\n" ] }, { "data": { "text/plain": [ - "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + "array([ 11962, 27590, 30701, ..., 99968761, 99977334, 99990442])" ] }, - "execution_count": 43, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[ix_sparse_int_sorted]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 448 ms, sys: 104 ms, total: 552 ms\n", + "Wall time: 255 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([94979430, 11935675, 63597355, ..., 91349759, 40936288, 76612910])" + ] + }, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1224,75 +1522,94 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 71408 function calls in 0.241 seconds\n", + " 138743 function calls in 0.280 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2038 0.158 0.000 
0.169 0.000 core.py:839(_decode_chunk)\n", - " 2038 0.011 0.000 0.014 0.000 util.py:225(get_chunk_sel)\n", - " 2038 0.010 0.000 0.028 0.000 util.py:418(get_chunk_selections)\n", - " 2038 0.010 0.000 0.207 0.000 core.py:679(_chunk_getitem)\n", - " 1 0.006 0.006 0.241 0.241 core.py:484(_getitem_nd)\n", - " 2038 0.005 0.000 0.009 0.000 util.py:114(is_total_slice)\n", - " 4076 0.005 0.000 0.005 0.000 util.py:235(get_out_sel)\n", - " 2038 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", - " 2038 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 2038 0.004 0.000 0.016 0.000 {method 'join' of 'str' objects}\n", - " 2038 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2038 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 14275 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", - " 2038 0.002 0.000 0.018 0.000 core.py:836(_chunk_key)\n", - " 2038 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", - " 4076 0.001 0.000 0.002 0.000 util.py:129()\n", - " 2038 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", - " 4076 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2038 0.001 0.000 0.002 0.000 {built-in method builtins.all}\n", - " 4078 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", - " 2038 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2038 0.001 0.000 0.001 0.000 core.py:200(chunk_store)\n", - " 2038 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1 0.000 0.000 0.001 0.001 util.py:191(__init__)\n", - " 2038 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2038 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2038 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 2039 0.156 0.000 0.167 0.000 core.py:931(_decode_chunk)\n", + " 2040 0.013 0.000 0.064 0.000 new_indexing.py:547(__iter__)\n", + " 4078 0.012 0.000 0.025 0.000 
index_tricks.py:26(ix_)\n", + " 2039 0.009 0.000 0.210 0.000 core.py:769(_chunk_getitem)\n", + " 4078 0.009 0.000 0.009 0.000 core.py:324()\n", + " 2040 0.009 0.000 0.010 0.000 new_indexing.py:440(__iter__)\n", + " 6117 0.008 0.000 0.008 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.005 0.005 0.279 0.279 core.py:549(_get_selection)\n", + " 2039 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", + " 4078 0.004 0.000 0.034 0.000 new_indexing.py:466(ix_)\n", + " 2039 0.004 0.000 0.013 0.000 core.py:319(_cdata_shape)\n", + " 2039 0.004 0.000 0.016 0.000 {method 'join' of 'str' objects}\n", + " 16319 0.004 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", + " 2039 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2039 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", + " 4078 0.003 0.000 0.006 0.000 numerictypes.py:728(issubdtype)\n", + " 4078 0.003 0.000 0.004 0.000 new_indexing.py:471()\n", + " 2039 0.002 0.000 0.018 0.000 core.py:928(_chunk_key)\n", + " 2039 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", + " 4078 0.002 0.000 0.002 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", + " 4084 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.array}\n", + " 8156 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 8165 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 4078 0.001 0.000 0.002 0.000 numerictypes.py:660(issubclass_)\n", + " 4078 0.001 0.000 0.003 0.000 numeric.py:463(asarray)\n", + " 2039 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", + " 4078 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", + " 2039 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", + " 4078 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", + " 2039 0.001 0.000 0.002 0.000 :12(__new__)\n", + " 4078 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 4078 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", + " 1 0.001 
0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 2039 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", + " 4079 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", + " 2040 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", + " 2039 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2039 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2039 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 2039 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2039 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.001 0.001 new_indexing.py:393(__init__)\n", " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 1 0.000 0.000 0.280 0.280 core.py:392(__getitem__)\n", " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.000 0.000 0.001 0.001 new_indexing.py:483(__init__)\n", + " 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.000 0.000 0.241 0.241 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.001 0.001 util.py:323(normalize_array_selection)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.280 0.280 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.001 0.001 util.py:250(normalize_dim_selection)\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 util.py:363(get_chunks_for_selection)\n", - " 6 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 0.241 0.241 core.py:377(__getitem__)\n", - " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 1 
0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 util.py:243(get_chunk_ranges)\n", - " 1 0.000 0.000 0.241 0.241 :1()\n", - " 2 0.000 0.000 0.001 0.000 util.py:354()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.280 0.280 core.py:527(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 2 0.000 0.000 0.000 0.000 core.py:500()\n", + " 1 0.000 0.000 0.280 0.280 :1()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 6 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1 0.000 0.000 0.000 0.000 util.py:332()\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", " 1 0.000 
0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1312,13 +1629,13 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(195313,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored511163 (499.2K)
Storage ratio195.6
Chunks initialized512/512
" + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(195313,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored511297 (499.3K)
Storage ratio195.6
Chunks initialized512/512
" ], "text/plain": [ "Type : zarr.core.Array\n", @@ -1330,12 +1647,12 @@ "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", - "No. bytes stored : 511163 (499.2K)\n", + "No. bytes stored : 511297 (499.3K)\n", "Storage ratio : 195.6\n", "Chunks initialized : 512/512" ] }, - "execution_count": 45, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -1347,24 +1664,24 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 932 ms, sys: 180 ms, total: 1.11 s\n", - "Wall time: 570 ms\n" + "CPU times: user 1.01 s, sys: 228 ms, total: 1.24 s\n", + "Wall time: 640 ms\n" ] }, { "data": { "text/plain": [ - "array([ 12643, 15188, 16392, ..., 99989960, 99995101, 99999097])" + "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" ] }, - "execution_count": 46, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1377,115 +1694,805 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### h5py comparison\n", - "\n", - "N.B., not really fair because using slower compressor, but for interest..." 
- ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "import h5py\n", - "import tempfile" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "h5f = h5py.File(tempfile.mktemp(), driver='core', backing_store=False)" + "### slice with step" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 60, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 80 ms, sys: 24 ms, total: 104 ms\n", + "Wall time: 101 ms\n" + ] + }, { "data": { "text/plain": [ - "" + "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 49, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "hc = h5f.create_dataset('c', data=c, compression='gzip', compression_opts=1, chunks=zc.chunks, shuffle=True)\n", - "hc" + "%time np.array(c[::2])" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.14 s, sys: 40 ms, total: 1.18 s\n", - "Wall time: 1.17 s\n" + "CPU times: user 1.45 s, sys: 664 ms, total: 2.12 s\n", + "Wall time: 1.78 s\n" ] }, { "data": { "text/plain": [ - "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" + "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 50, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time hc[:]" + "%time zc[::2]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.1 s, sys: 0 ns, total: 1.1 s\n", - "Wall time: 1.1 s\n" + "CPU times: user 572 ms, sys: 224 ms, total: 796 ms\n", + "Wall time: 513 ms\n" ] }, { "data": { "text/plain": [ - "array([ 12643, 15188, 16392, ..., 
99989960, 99995101, 99999097])" + "array([ 0, 10, 20, ..., 99999970, 99999980, 99999990])" ] }, - "execution_count": 51, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time hc[ix_sparse_bool]" + "%time zc[::10]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 488 ms, sys: 88 ms, total: 576 ms\n", + "Wall time: 273 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 100, 200, ..., 99999700, 99999800, 99999900])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[::100]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 472 ms, sys: 64 ms, total: 536 ms\n", + "Wall time: 225 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc[::1000]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2D Benchmarking" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100000000,)" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100000, 1000)" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = c.reshape(-1, 1000)\n", + "d.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "
Typezarr.core.Array
Data typeint64
Shape(100000, 1000)
Chunk shape(1563, 32)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored39862349 (38.0M)
Storage ratio20.1
Chunks initialized2048/2048
" + ], + "text/plain": [ + "Type : zarr.core.Array\n", + "Data type : int64\n", + "Shape : (100000, 1000)\n", + "Chunk shape : (1563, 32)\n", + "Order : C\n", + "Read-only : False\n", + "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", + "Store type : builtins.dict\n", + "No. bytes : 800000000 (762.9M)\n", + "No. bytes stored : 39862349 (38.0M)\n", + "Storage ratio : 20.1\n", + "Chunks initialized : 2048/2048" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zd = zarr.array(d)\n", + "zd.info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bool orthogonal selection" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "ix0 = np.random.binomial(1, 0.5, size=d.shape[0]).astype(bool)\n", + "ix1 = np.random.binomial(1, 0.5, size=d.shape[1]).astype(bool)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 124 ms, sys: 40 ms, total: 164 ms\n", + "Wall time: 164 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[ 3, 5, 6, ..., 994, 995, 997],\n", + " [ 2003, 2005, 2006, ..., 2994, 2995, 2997],\n", + " [ 3003, 3005, 3006, ..., 3994, 3995, 3997],\n", + " ..., \n", + " [99995003, 99995005, 99995006, ..., 99995994, 99995995, 99995997],\n", + " [99997003, 99997005, 99997006, ..., 99997994, 99997995, 99997997],\n", + " [99999003, 99999005, 99999006, ..., 99999994, 99999995, 99999997]])" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time d[np.ix_(ix0, ix1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 832 ms, sys: 84 ms, total: 916 ms\n", + "Wall time: 533 ms\n" + ] + }, + 
{ + "data": { + "text/plain": [ + "array([[ 3, 5, 6, ..., 994, 995, 997],\n", + " [ 2003, 2005, 2006, ..., 2994, 2995, 2997],\n", + " [ 3003, 3005, 3006, ..., 3994, 3995, 3997],\n", + " ..., \n", + " [99995003, 99995005, 99995006, ..., 99995994, 99995995, 99995997],\n", + " [99997003, 99997005, 99997006, ..., 99997994, 99997995, 99997997],\n", + " [99999003, 99999005, 99999006, ..., 99999994, 99999995, 99999997]])" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zd.oindex[ix0, ix1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### int orthogonal selection" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "ix0 = np.random.choice(d.shape[0], size=int(d.shape[0] * .5), replace=True)\n", + "ix1 = np.random.choice(d.shape[1], size=int(d.shape[1] * .5), replace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 224 ms, sys: 56 ms, total: 280 ms\n", + "Wall time: 277 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[16704459, 16704351, 16704547, ..., 16704405, 16704425, 16704805],\n", + " [10766459, 10766351, 10766547, ..., 10766405, 10766425, 10766805],\n", + " [64625459, 64625351, 64625547, ..., 64625405, 64625425, 64625805],\n", + " ..., \n", + " [12875459, 12875351, 12875547, ..., 12875405, 12875425, 12875805],\n", + " [58689459, 58689351, 58689547, ..., 58689405, 58689425, 58689805],\n", + " [18138459, 18138351, 18138547, ..., 18138405, 18138425, 18138805]])" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time d[np.ix_(ix0, ix1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 
1.06 s, sys: 120 ms, total: 1.18 s\n", + "Wall time: 675 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[16704459, 16704351, 16704547, ..., 16704405, 16704425, 16704805],\n", + " [10766459, 10766351, 10766547, ..., 10766405, 10766425, 10766805],\n", + " [64625459, 64625351, 64625547, ..., 64625405, 64625425, 64625805],\n", + " ..., \n", + " [12875459, 12875351, 12875547, ..., 12875405, 12875425, 12875805],\n", + " [58689459, 58689351, 58689547, ..., 58689405, 58689425, 58689805],\n", + " [18138459, 18138351, 18138547, ..., 18138405, 18138425, 18138805]])" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zd.oindex[ix0, ix1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### coordinate (point) selection" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "500000" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n = int(d.size * .005)\n", + "ix0 = np.random.choice(d.shape[0], size=n, replace=True)\n", + "ix1 = np.random.choice(d.shape[1], size=n, replace=True)\n", + "n" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 228 ms, sys: 0 ns, total: 228 ms\n", + "Wall time: 228 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([235092, 460446, 351446, ..., 66295, 90139, 174162])" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time np.lexsort((ix0, ix1))" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 48 ms, sys: 0 ns, total: 48 ms\n", + "Wall time: 46.8 ms\n" + ] + }, + { + "data": { + 
"text/plain": [ + "array([ 0, 1, 2, ..., 499997, 499998, 499999])" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix0_sorted = np.sort(ix0)\n", + "ix1_sorted = np.sort(ix1)\n", + "%time np.lexsort((ix0_sorted, ix1_sorted))" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 1.29 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([15735056, 50367996, 82690284, ..., 79292255, 83283781, 38856303])" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time d[ix0, ix1]" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 13.5 s, sys: 1.7 s, total: 15.2 s\n", + "Wall time: 6.33 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([15735056, 50367996, 82690284, ..., 79292255, 83283781, 38856303])" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zd.vindex[ix0, ix1]" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2048" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zd.nchunks" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 2698965 function calls in 5.459 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 49978 2.866 0.000 3.116 0.000 core.py:931(_decode_chunk)\n", + " 49978 0.411 0.000 4.602 0.000 core.py:769(_chunk_getitem)\n", 
+ " 49979 0.316 0.000 0.752 0.000 new_indexing.py:660(__iter__)\n", + " 149934 0.296 0.000 0.296 0.000 new_indexing.py:677()\n", + " 149940 0.202 0.000 0.202 0.000 core.py:324()\n", + " 99956 0.162 0.000 0.343 0.000 arrayprint.py:381(wrapper)\n", + " 49978 0.129 0.000 0.533 0.000 {method 'join' of 'str' objects}\n", + " 49978 0.115 0.000 0.115 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 99956 0.105 0.000 0.123 0.000 arrayprint.py:399(array2string)\n", + " 1 0.099 0.099 5.454 5.454 core.py:549(_get_selection)\n", + " 49978 0.097 0.000 0.169 0.000 util.py:113(is_total_slice)\n", + " 49980 0.077 0.000 0.279 0.000 core.py:319(_cdata_shape)\n", + " 49978 0.072 0.000 0.072 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 99956 0.061 0.000 0.404 0.000 numeric.py:1905(array_str)\n", + " 299881 0.060 0.000 0.060 0.000 {built-in method builtins.isinstance}\n", + " 149934 0.053 0.000 0.053 0.000 new_indexing.py:672()\n", + " 49978 0.046 0.000 0.579 0.000 core.py:928(_chunk_key)\n", + " 149934 0.041 0.000 0.041 0.000 new_indexing.py:665()\n", + " 99956 0.034 0.000 0.042 0.000 util.py:128()\n", + " 49978 0.028 0.000 0.037 0.000 threading.py:1230(current_thread)\n", + " 149934 0.025 0.000 0.025 0.000 {built-in method _thread.get_ident}\n", + " 49978 0.023 0.000 0.045 0.000 :12(__new__)\n", + " 49978 0.022 0.000 0.022 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", + " 99956 0.018 0.000 0.018 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 99965 0.016 0.000 0.016 0.000 {built-in method builtins.len}\n", + " 99956 0.014 0.000 0.014 0.000 {method 'add' of 'set' objects}\n", + " 49981 0.014 0.000 0.042 0.000 {built-in method builtins.all}\n", + " 99956 0.014 0.000 0.014 0.000 {method 'discard' of 'set' objects}\n", + " 49978 0.014 0.000 0.014 0.000 core.py:205(chunk_store)\n", + " 99956 0.013 0.000 0.013 0.000 {built-in method builtins.id}\n", + " 49978 0.009 0.000 0.009 0.000 threading.py:1304(main_thread)\n", + " 1 
0.003 0.003 0.003 0.003 {built-in method numpy.core.multiarray.unravel_index}\n", + " 3 0.001 0.000 0.001 0.000 new_indexing.py:649()\n", + " 1 0.001 0.001 0.005 0.005 new_indexing.py:610(__init__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:307(find_runs)\n", + " 6 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 1 0.000 0.000 5.459 5.459 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.concatenate}\n", + " 6 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 12 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1 0.000 0.000 5.459 5.459 core.py:538(get_coordinate_selection)\n", + " 1 0.000 0.000 5.459 5.459 :1()\n", + " 6 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 5.459 5.459 new_indexing.py:689(__getitem__)\n", + " 10 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:584()\n", + " 6 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 function_base.py:5100(append)\n", + " 6 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:580(is_coordinate_selection)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 core.py:364(vindex)\n", + " 1 0.000 
0.000 0.000 0.000 stride_tricks.py:247()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1380(ravel)\n", + " 6 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", + " 3 0.000 0.000 0.000 0.000 new_indexing.py:614()\n", + " 1 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 core.py:213(shape)\n", + " 12 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 3 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.000 0.000 0.000 0.000 core.py:153(_refresh_metadata)\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zd.vindex[ix0, ix1]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## h5py comparison\n", + "\n", + "N.B., not really fair because using slower compressor, but for interest..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "import h5py\n", + "import tempfile" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "h5f = h5py.File(tempfile.mktemp(), driver='core', backing_store=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hc = h5f.create_dataset('c', data=c, compression='gzip', compression_opts=1, chunks=zc.chunks, shuffle=True)\n", + "hc" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.16 s, sys: 172 ms, total: 1.33 s\n", + "Wall time: 1.32 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time hc[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.11 s, sys: 0 ns, total: 1.11 s\n", + "Wall time: 1.11 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time hc[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "# # this is pathological, takes > 1 minute \n", + "# %time hc[ix_dense_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"CPU times: user 38.3 s, sys: 136 ms, total: 38.4 s\n", + "Wall time: 38.1 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# this is pathological, takes > 1 minute \n", - "%time hc[ix_dense_bool]" + "# this is pretty slow\n", + "%time hc[::1000]" ] }, { diff --git a/zarr/core.py b/zarr/core.py index 5ba61d1ef7..2a1d5b6d01 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -465,8 +465,17 @@ def __getitem__(self, selection): """ - # delegate to method - return self.get_basic_selection(selection) + if len(self._shape) == 0: + return self._get_basic_selection_zd(selection) + + elif len(self._shape) == 1: + # safe to do "fancy" indexing, no ambiguity + return self.get_orthogonal_selection(selection) + + else: + # "fancy" indexing can be ambiguous/hard to understand for multidimensional arrays, + # force people to go through explicit methods + return self.get_basic_selection(selection) def get_basic_selection(self, selection, out=None): """TODO""" @@ -635,7 +644,17 @@ def __setitem__(self, selection, value): """ - self.set_basic_selection(selection, value) + if len(self._shape) == 0: + self._set_basic_selection_zd(selection, value) + + elif len(self._shape) == 1: + # safe to do "fancy" indexing, no ambiguity + self.set_orthogonal_selection(selection, value) + + else: + # "fancy" indexing can be ambiguous/hard to understand for multidimensional arrays, + # force people to go through explicit methods + self.set_basic_selection(selection, value) def set_basic_selection(self, selection, value): """TODO""" @@ -787,6 +806,7 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop else: if isinstance(out, np.ndarray) and \ + isinstance(out_selection, slice) and \ is_total_slice(chunk_selection, self._chunks) and \ not self._filters: diff --git a/zarr/new_indexing.py 
b/zarr/new_indexing.py index b2c756d5cf..e083f968c1 100644 --- a/zarr/new_indexing.py +++ b/zarr/new_indexing.py @@ -304,33 +304,33 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def find_runs(x): - """Find runs of consecutive items in an array.""" - - # ensure array - x = np.asanyarray(x) - if x.ndim != 1: - raise ValueError('only 1D array supported') - n = x.shape[0] - - # handle empty array - if n == 0: - return np.array([]), np.array([]), np.array([]) - - else: - # find run starts - loc_run_start = np.empty(n, dtype=bool) - loc_run_start[0] = True - np.not_equal(x[:-1], x[1:], out=loc_run_start[1:]) - run_starts = np.nonzero(loc_run_start)[0] - - # find run values - run_values = x[loc_run_start] - - # find run lengths - run_lengths = np.diff(np.append(run_starts, n)) - - return run_values, run_starts, run_lengths +# def find_runs(x): +# """Find runs of consecutive items in an array.""" +# +# # ensure array +# x = np.asanyarray(x) +# if x.ndim != 1: +# raise ValueError('only 1D array supported') +# n = x.shape[0] +# +# # handle empty array +# if n == 0: +# return np.array([]), np.array([]), np.array([]) +# +# else: +# # find run starts +# loc_run_start = np.empty(n, dtype=bool) +# loc_run_start[0] = True +# np.not_equal(x[:-1], x[1:], out=loc_run_start[1:]) +# run_starts = np.nonzero(loc_run_start)[0] +# +# # find run values +# run_values = x[loc_run_start] +# +# # find run lengths +# run_lengths = np.diff(np.append(run_starts, n)) +# +# return run_values, run_starts, run_lengths class IntArrayDimIndexer(object): @@ -358,31 +358,49 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): raise IndexError('selection contains index out of bounds') + # handle non-monotonic indices + if np.any(np.diff(dim_sel) < 0): + self.is_monotonic = False + # sort indices + self.dim_sort = np.argsort(dim_sel) + self.dim_sel = np.take(dim_sel, self.dim_sort) + + else: + 
self.is_monotonic = True + self.dim_sort = None + self.dim_sel = dim_sel + # store attributes - self.dim_sel = dim_sel self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) self.nitems = len(dim_sel) - # locate required chunk for each index - dim_chunk_sel = self.dim_sel // self.dim_chunk_len - self.dim_chunk_sel = dim_chunk_sel - - # find runs of indices in the same chunk - self.dim_chunk_ixs, self.run_starts, self.run_lengths = find_runs(dim_chunk_sel) + # precompute number of selected items for each chunk + # note: for dense integer selections, the division operation here is the bottleneck + self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, + minlength=self.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] def __iter__(self): - # iterate over chunks - for dim_chunk_ix, s, l in zip(self.dim_chunk_ixs, self.run_starts, self.run_lengths): + for dim_chunk_ix in self.dim_chunk_ixs: - # find region in output array - dim_out_sel = slice(s, s + l) + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.is_monotonic: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_sort[start:stop] # find region in chunk dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_offset + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) @@ -416,6 +434,9 @@ def __init__(self, selection, array): # handle ellipsis selection = replace_ellipsis(selection, array._shape) + # normalize list to array + selection = replace_lists(selection) + # validation - check dimensionality if len(selection) > len(array._shape): raise IndexError('too many indices for array') @@ -426,10 +447,6 @@ def 
__init__(self, selection, array): dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - # normalize list to array - if isinstance(dim_sel, list): - dim_sel = np.asarray(dim_sel) - if isinstance(dim_sel, numbers.Integral): dim_sel = normalize_integer_selection(dim_sel, dim_len) dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -487,6 +504,10 @@ def __iter__(self): # or integers, so need to convert slices and integers into ranges. chunk_selection = ix_(*chunk_selection) + # special case for non-monotonic indices + if any([not isinstance(s, (int, slice)) for s in out_selection]): + out_selection = ix_(*out_selection) + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) @@ -547,13 +568,12 @@ def __init__(self, selection, array): # attempt to broadcast selection - this will raise error if array dimensions don't match selection = np.broadcast_arrays(*selection) - self.selection = selection - self.shape = len(self.selection[0]) if self.selection[0].shape else 1 + self.shape = len(selection[0]) if selection[0].shape else 1 self.drop_axes = None self.array = array # normalization - for dim_sel, dim_len in zip(self.selection, array.shape): + for dim_sel, dim_len in zip(selection, array.shape): # check number of dimensions, only support indexing with 1d array if len(dim_sel.shape) > 1: @@ -569,6 +589,17 @@ def __init__(self, selection, array): if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): raise IndexError('index out of bounds') + # handle monotonicity + lexsort = np.lexsort(selection[::-1]) + if np.any(np.diff(lexsort) != 1): + self.is_monotonic = False + self.lexsort = lexsort + self.selection = tuple(np.take(dim_sel, lexsort) for dim_sel in selection) + else: + self.is_monotonic = True + self.lexsort = None + self.selection = selection + # compute flattened chunk index for each point selected chunks_multi_index = tuple( dim_sel // dim_chunk_len @@ -577,29 +608,38 @@ def __init__(self, 
selection, array): chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) - # find runs of indices in the same chunk - self.chunks_rixs, self.run_starts, self.run_lengths = find_runs(chunks_raveled_indices) + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] + # unravel - self.chunks_mixs = np.unravel_index(self.chunks_rixs, dims=array._cdata_shape) + self.chunk_mixs = np.unravel_index(self.chunk_rixs, dims=array._cdata_shape) def __iter__(self): # iterate over chunks - for i in range(len(self.chunks_rixs)): + for i, chunk_rix in enumerate(self.chunk_rixs): - chunk_coords = tuple(mix[i] for mix in self.chunks_mixs) - s = self.run_starts[i] - l = self.run_lengths[i] + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + if self.is_monotonic: + out_selection = slice(start, stop) + else: + out_selection = self.lexsort[start:stop] - out_selection = slice(s, s+l) + # TODO fix bug somewhere around here chunk_offsets = tuple( dim_chunk_ix * dim_chunk_len for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) ) - chunk_selection = tuple( - dim_sel[out_selection] - dim_chunk_offset + dim_sel[start:stop] - dim_chunk_offset for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) ) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 490bcad8b7..cca98c0227 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -740,6 +740,9 @@ def _test_orthogonal_indexing_1d_common(self, a, z, ix): assert_array_equal(expect, actual) actual = z.oindex[ix] assert_array_equal(expect, actual) + # for 1d arrays, also available via __getitem__ + actual = z[ix] + 
assert_array_equal(expect, actual) # noinspection PyStatementEffect def test_orthogonal_indexing_1d_bool(self): @@ -775,6 +778,7 @@ def test_orthogonal_indexing_1d_int(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + self._test_orthogonal_indexing_1d_common(a, z, ix) ix.sort() self._test_orthogonal_indexing_1d_common(a, z, ix) @@ -820,8 +824,13 @@ def test_orthogonal_indexing_1d_slice_with_step(self): ] for selection in selections: expect = a[selection] + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) actual = z.oindex[selection] assert_array_equal(expect, actual) + # for 1d arrays also available via __getitem__ + actual = z[selection] + assert_array_equal(expect, actual) def _test_orthogonal_indexing_2d_common(self, a, z, ix0, ix1): @@ -882,8 +891,9 @@ def test_orthogonal_indexing_2d_int(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + ix0.sort() ix1.sort() self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) @@ -963,10 +973,11 @@ def test_orthogonal_indexing_3d_int(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - ix1.sort() ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) + self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() ix2.sort() self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) @@ -979,6 +990,10 @@ def _test_orthogonal_indexing_1d_common_set(self, v, a, z, ix): z[:] = 0 z.set_orthogonal_selection(ix, 
v[ix]) assert_array_equal(a, z[:]) + # also available via __getitem__ for 1d arrays + z[:] = 0 + z[ix] = v[ix] + assert_array_equal(a, z[:]) def test_orthogonal_indexing_1d_bool_set(self): @@ -993,8 +1008,6 @@ def test_orthogonal_indexing_1d_bool_set(self): ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) - # TODO test orthogonal with unsorted ints - def test_orthogonal_indexing_1d_int_set(self): # setup @@ -1056,8 +1069,9 @@ def test_orthogonal_indexing_2d_int_set(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + ix0.sort() ix1.sort() self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) @@ -1117,10 +1131,11 @@ def test_orthogonal_indexing_3d_int_set(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix0.sort() ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - ix1.sort() ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) + self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() ix2.sort() self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) @@ -1136,6 +1151,11 @@ def test_coordinate_indexing_1d(self): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) ix.sort() expect = a[ix] actual = z.get_coordinate_selection(ix) @@ -1191,10 +1211,25 @@ def test_coordinate_indexing_2d(self): n = 
int(a.size * p) ix0 = np.random.choice(a.shape[0], size=n, replace=True) ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + (42, 4), + ] + + for selection in selections: + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + srt = np.lexsort((ix0, ix1)) ix0 = ix0[srt] ix1 = ix1[srt] - selections = [ # index both axes with array (ix0, ix1), @@ -1211,15 +1246,16 @@ def test_coordinate_indexing_2d(self): actual = z.vindex[selection] assert_array_equal(expect, actual) - # not monotonically increasing + # not monotonically increasing (first dim) ix0 = [3, 3, 4, 2, 5] ix1 = [1, 3, 5, 7, 9] expect = a[ix0, ix1] actual = z.get_coordinate_selection((ix0, ix1)) assert_array_equal(expect, actual) - # not monotonically increasing + + # not monotonically increasing (second dim) ix0 = [1, 1, 2, 2, 5] - ix1 = [1, 3, 2, 1, 7] + ix1 = [1, 3, 2, 1, 0] expect = a[ix0, ix1] actual = z.get_coordinate_selection((ix0, ix1)) assert_array_equal(expect, actual) @@ -1275,8 +1311,9 @@ def test_get_selection_out(self): ] for selection in selections: expect = oindex(a, selection) - out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, - fill_value=0) + # out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, + # fill_value=0) + out = np.zeros(expect.shape, dtype=expect.dtype) z.get_orthogonal_selection(selection, out=out) assert_array_equal(expect, out[:]) From 050ad79c8e6ec0aaa62028e8f7315fd3f77abf0f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sun, 5 Nov 2017 23:40:56 +0000 Subject: [PATCH 32/67] rework indexing for performance --- notebooks/advanced_indexing.ipynb | 1281 +++++++++++++---------------- zarr/core.py | 5 +- zarr/new_indexing.py | 56 +- zarr/util.py | 750 ++++++++--------- 4 files 
changed, 996 insertions(+), 1096 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index e1ea045749..e608cf1021 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -4,14 +4,26 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'2.1.5.dev83'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import sys\n", "sys.path.insert(0, '..')\n", "import zarr\n", "import numpy as np\n", "np.random.seed(42)\n", - "import cProfile" + "import cProfile\n", + "zarr.__version__" ] }, { @@ -566,28 +578,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 524 ms, sys: 124 ms, total: 648 ms\n", - "Wall time: 226 ms\n" + "CPU times: user 508 ms, sys: 28 ms, total: 536 ms\n", + "Wall time: 162 ms\n" ] }, { "data": { "text/html": [ - "
Typezarr.core.Array
Data typeint64
Shape(100000000,)
Chunk shape(48829,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored11870277 (11.3M)
Storage ratio67.4
Chunks initialized2048/2048
" + "
Typezarr.core.Array
Data typeint64
Shape(100000000,)
Chunk shape(97657,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored11854081 (11.3M)
Storage ratio67.5
Chunks initialized1024/1024
" ], "text/plain": [ "Type : zarr.core.Array\n", "Data type : int64\n", "Shape : (100000000,)\n", - "Chunk shape : (48829,)\n", + "Chunk shape : (97657,)\n", "Order : C\n", "Read-only : False\n", "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 800000000 (762.9M)\n", - "No. bytes stored : 11870277 (11.3M)\n", - "Storage ratio : 67.4\n", - "Chunks initialized : 2048/2048" + "No. bytes stored : 11854081 (11.3M)\n", + "Storage ratio : 67.5\n", + "Chunks initialized : 1024/1024" ] }, "execution_count": 26, @@ -609,8 +621,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 124 ms, sys: 52 ms, total: 176 ms\n", - "Wall time: 173 ms\n" + "CPU times: user 120 ms, sys: 60 ms, total: 180 ms\n", + "Wall time: 178 ms\n" ] }, { @@ -630,15 +642,15 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 512 ms, sys: 108 ms, total: 620 ms\n", - "Wall time: 312 ms\n" + "CPU times: user 520 ms, sys: 32 ms, total: 552 ms\n", + "Wall time: 261 ms\n" ] }, { @@ -647,7 +659,7 @@ "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -665,46 +677,46 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "49994863" + "9998583" ] }, - "execution_count": 29, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# relatively dense selection\n", - "ix_dense_bool = np.random.binomial(1, 0.5, size=c.shape[0]).astype(bool)\n", + "# relatively dense selection - 10%\n", + "ix_dense_bool = np.random.binomial(1, 0.1, size=c.shape[0]).astype(bool)\n", "np.count_nonzero(ix_dense_bool)" ] }, { "cell_type": "code", - "execution_count": 30, + 
"execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 612 ms, sys: 8 ms, total: 620 ms\n", - "Wall time: 621 ms\n" + "CPU times: user 312 ms, sys: 0 ns, total: 312 ms\n", + "Wall time: 311 ms\n" ] }, { "data": { "text/plain": [ - "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + "array([ 23, 24, 39, ..., 99999967, 99999978, 99999995])" ] }, - "execution_count": 30, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -715,24 +727,24 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.47 s, sys: 124 ms, total: 1.6 s\n", - "Wall time: 922 ms\n" + "CPU times: user 888 ms, sys: 52 ms, total: 940 ms\n", + "Wall time: 459 ms\n" ] }, { "data": { "text/plain": [ - "array([ 0, 1, 2, ..., 99999994, 99999995, 99999996])" + "array([ 23, 24, 39, ..., 99999967, 99999978, 99999995])" ] }, - "execution_count": 31, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -743,87 +755,87 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 116791 function calls in 0.943 seconds\n", + " 58423 function calls in 0.514 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2049 0.445 0.000 0.445 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2048 0.234 0.000 0.250 0.000 core.py:931(_decode_chunk)\n", - " 2048 0.140 0.000 0.425 0.000 core.py:769(_chunk_getitem)\n", - " 2049 0.013 0.000 0.497 0.000 new_indexing.py:547(__iter__)\n", - " 2048 0.011 0.000 0.011 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 2048 0.011 0.000 0.465 0.000 index_tricks.py:26(ix_)\n", - " 4096 0.010 0.000 0.010 0.000 core.py:324()\n", - " 
4096 0.009 0.000 0.009 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.006 0.006 0.928 0.928 core.py:549(_get_selection)\n", - " 2048 0.006 0.000 0.006 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2048 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", - " 2049 0.005 0.000 0.006 0.000 new_indexing.py:281(__iter__)\n", - " 2048 0.004 0.000 0.017 0.000 {method 'join' of 'str' objects}\n", - " 2048 0.004 0.000 0.014 0.000 core.py:319(_cdata_shape)\n", - " 2048 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 12295 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.003 0.000 0.470 0.000 new_indexing.py:466(ix_)\n", - " 1 0.003 0.003 0.014 0.014 new_indexing.py:255(__init__)\n", - " 2048 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", - " 2048 0.002 0.000 0.019 0.000 core.py:928(_chunk_key)\n", - " 2048 0.002 0.000 0.013 0.000 numeric.py:1905(array_str)\n", - " 2048 0.002 0.000 0.003 0.000 new_indexing.py:471()\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", - " 2048 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", - " 2048 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", - " 6153 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2048 0.001 0.000 0.002 0.000 :12(__new__)\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", - " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", - " 1 0.001 0.001 0.943 0.943 
:1()\n", - " 2048 0.001 0.000 0.012 0.000 numeric.py:380(count_nonzero)\n", - " 2049 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", - " 2048 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.015 0.015 new_indexing.py:483(__init__)\n", + " 1025 0.205 0.000 0.205 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.169 0.000 0.179 0.000 core.py:930(_decode_chunk)\n", + " 1024 0.062 0.000 0.261 0.000 core.py:768(_chunk_getitem)\n", + " 1024 0.011 0.000 0.011 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1025 0.008 0.000 0.234 0.000 new_indexing.py:494(__iter__)\n", + " 1024 0.006 0.000 0.216 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", + " 2048 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1 0.004 0.004 0.499 0.499 core.py:548(_get_selection)\n", + " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.002 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", + " 1025 0.002 0.000 0.003 0.000 new_indexing.py:282(__iter__)\n", + " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", + " 1 0.002 0.002 0.514 0.514 :1()\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 6151 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.002 0.000 0.219 0.000 new_indexing.py:413(ix_)\n", + " 1 0.001 0.001 0.013 0.013 new_indexing.py:255(__init__)\n", + " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 
1024 0.001 0.000 0.011 0.000 core.py:927(_chunk_key)\n", + " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:418()\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:499()\n", + " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", + " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", + " 1024 0.000 0.000 0.011 0.000 numeric.py:380(count_nonzero)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.013 0.013 new_indexing.py:430(__init__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.943 0.943 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 
0.514 0.514 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.942 0.942 core.py:392(__getitem__)\n", - " 1 0.000 0.000 0.942 0.942 core.py:527(get_orthogonal_selection)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", + " 1 0.000 0.000 0.512 0.512 core.py:391(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.512 0.512 core.py:526(get_orthogonal_selection)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", " 1 0.000 0.000 0.000 
0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", "\n", "\n" ] @@ -842,23 +854,23 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(50000000,\n", - " array([86098038, 51488465, 9242439, ..., 31235734, 20293124, 13824417]))" + "(10000000,\n", + " array([38852033, 29570639, 6153807, ..., 51604068, 33056119, 29899374]))" ] }, - "execution_count": 33, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix_dense_int = np.random.choice(c.shape[0], size=c.shape[0]//2, replace=True)\n", + "ix_dense_int = np.random.choice(c.shape[0], size=c.shape[0]//10, replace=True)\n", "ix_dense_int_sorted = ix_dense_int.copy()\n", "ix_dense_int_sorted.sort()\n", "len(ix_dense_int), ix_dense_int" @@ -866,52 +878,24 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 10.6 s, sys: 0 ns, total: 10.6 s\n", - "Wall time: 10.6 s\n" - ] - }, - { - "data": { - "text/plain": [ - "array([35886154, 6592339, 23747762, ..., 26251840, 48664862, 3479456])" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%time np.argsort(ix_dense_int, kind='quicksort')" - ] - }, - { - "cell_type": "code", - "execution_count": 36, + "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 152 ms, sys: 0 ns, total: 152 ms\n", - "Wall time: 152 ms\n" + "CPU times: user 60 ms, sys: 32 ms, total: 92 ms\n", + "Wall time: 91 ms\n" ] }, { "data": { "text/plain": [ - "array([ 0, 0, 1, ..., 99999994, 99999999, 99999999])" + "array([ 6, 9, 15, ..., 99999956, 99999964, 99999985])" ] }, - "execution_count": 36, + "execution_count": 87, 
"metadata": {}, "output_type": "execute_result" } @@ -922,24 +906,24 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.39 s, sys: 188 ms, total: 1.58 s\n", - "Wall time: 1.29 s\n" + "CPU times: user 576 ms, sys: 104 ms, total: 680 ms\n", + "Wall time: 428 ms\n" ] }, { "data": { "text/plain": [ - "array([ 0, 0, 1, ..., 99999994, 99999999, 99999999])" + "array([ 6, 9, 15, ..., 99999956, 99999964, 99999985])" ] }, - "execution_count": 37, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -950,24 +934,24 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 684 ms, sys: 36 ms, total: 720 ms\n", - "Wall time: 718 ms\n" + "CPU times: user 144 ms, sys: 20 ms, total: 164 ms\n", + "Wall time: 162 ms\n" ] }, { "data": { "text/plain": [ - "array([86098038, 51488465, 9242439, ..., 31235734, 20293124, 13824417])" + "array([38852033, 29570639, 6153807, ..., 51604068, 33056119, 29899374])" ] }, - "execution_count": 38, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } @@ -978,24 +962,24 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 14.8 s, sys: 716 ms, total: 15.5 s\n", - "Wall time: 14.7 s\n" + "CPU times: user 2.34 s, sys: 156 ms, total: 2.49 s\n", + "Wall time: 2.18 s\n" ] }, { "data": { "text/plain": [ - "array([86098038, 51488465, 9242439, ..., 31235734, 20293124, 13824417])" + "array([38852033, 29570639, 6153807, ..., 51604068, 33056119, 29899374])" ] }, - "execution_count": 39, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -1006,90 +990,90 @@ }, { "cell_type": "code", - "execution_count": 40, + 
"execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 110675 function calls in 1.659 seconds\n", + " 55379 function calls in 0.491 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.781 0.781 1.051 1.051 new_indexing.py:393(__init__)\n", - " 2048 0.209 0.000 0.380 0.000 core.py:769(_chunk_getitem)\n", - " 2049 0.148 0.000 0.150 0.000 new_indexing.py:440(__iter__)\n", - " 1 0.132 0.132 0.132 0.132 function_base.py:1848(diff)\n", - " 2048 0.130 0.000 0.141 0.000 core.py:931(_decode_chunk)\n", - " 1 0.120 0.120 0.120 0.120 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.018 0.018 1.642 1.642 core.py:527(get_orthogonal_selection)\n", - " 1 0.018 0.018 1.659 1.659 :1()\n", - " 4 0.017 0.004 0.017 0.004 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 2049 0.012 0.000 0.188 0.000 new_indexing.py:547(__iter__)\n", - " 4096 0.008 0.000 0.008 0.000 core.py:324()\n", - " 2048 0.008 0.000 0.015 0.000 index_tricks.py:26(ix_)\n", - " 4096 0.007 0.000 0.007 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.004 0.004 0.572 0.572 core.py:549(_get_selection)\n", - " 2048 0.004 0.000 0.009 0.000 arrayprint.py:381(wrapper)\n", - " 2048 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2048 0.004 0.000 0.014 0.000 {method 'join' of 'str' objects}\n", - " 2048 0.003 0.000 0.011 0.000 core.py:319(_cdata_shape)\n", - " 12295 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.003 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.002 0.000 0.020 0.000 new_indexing.py:466(ix_)\n", - " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", - " 2048 0.002 0.000 0.016 0.000 core.py:928(_chunk_key)\n", - " 2048 0.002 0.000 0.011 0.000 numeric.py:1905(array_str)\n", - " 4096 0.002 0.000 0.002 0.000 {built-in method __new__ of type object at 
0x5616c71ce480}\n", - " 2048 0.001 0.000 0.002 0.000 new_indexing.py:471()\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", - " 2048 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 2048 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", - " 6153 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", - " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", - " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", - " 2049 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1 0.000 0.000 1.052 1.052 new_indexing.py:483(__init__)\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 4 0.000 0.000 0.017 0.004 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 1.659 1.659 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 1.642 1.642 core.py:392(__getitem__)\n", + " 1 0.183 0.183 0.237 0.237 new_indexing.py:340(__init__)\n", + " 1024 0.099 0.000 0.107 0.000 core.py:930(_decode_chunk)\n", + " 1024 0.065 0.000 0.191 0.000 core.py:768(_chunk_getitem)\n", + " 1 0.026 0.026 0.026 0.026 {built-in method 
numpy.core.multiarray.bincount}\n", + " 1025 0.025 0.000 0.025 0.000 new_indexing.py:387(__iter__)\n", + " 1 0.024 0.024 0.024 0.024 function_base.py:1848(diff)\n", + " 1 0.007 0.007 0.245 0.245 core.py:548(_get_selection)\n", + " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", + " 1025 0.006 0.000 0.046 0.000 new_indexing.py:494(__iter__)\n", + " 2048 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.004 0.000 0.008 0.000 index_tricks.py:26(ix_)\n", + " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1 0.004 0.004 0.241 0.241 new_indexing.py:430(__init__)\n", + " 1 0.004 0.004 0.491 0.491 :1()\n", + " 1024 0.003 0.000 0.009 0.000 {method 'join' of 'str' objects}\n", + " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.003 0.003 0.488 0.488 core.py:526(get_orthogonal_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", + " 6151 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1024 0.001 0.000 0.011 0.000 new_indexing.py:413(ix_)\n", + " 1024 0.001 0.000 0.010 0.000 core.py:927(_chunk_key)\n", + " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:418()\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", + " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:499()\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 1030 0.000 0.000 0.000 0.000 {built-in method 
numpy.core.multiarray.array}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 0.491 0.491 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 4 0.000 0.000 0.017 0.004 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 4 0.000 0.000 0.017 0.004 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.488 0.488 core.py:391(__getitem__)\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 
abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", + " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1102,95 +1086,95 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 139355 function calls in 16.778 seconds\n", + " 69723 function calls in 2.217 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 13.664 13.664 13.664 13.664 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1 0.787 0.787 15.497 15.497 
new_indexing.py:393(__init__)\n", - " 1 0.776 0.776 0.776 0.776 {method 'take' of 'numpy.ndarray' objects}\n", - " 2048 0.741 0.000 1.006 0.000 core.py:769(_chunk_getitem)\n", - " 2048 0.220 0.000 0.235 0.000 core.py:931(_decode_chunk)\n", - " 2049 0.140 0.000 0.141 0.000 new_indexing.py:440(__iter__)\n", - " 1 0.130 0.130 0.130 0.130 function_base.py:1848(diff)\n", - " 1 0.121 0.121 0.121 0.121 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.040 0.040 16.760 16.760 core.py:392(__getitem__)\n", - " 4 0.019 0.005 0.019 0.005 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.018 0.018 16.778 16.778 :1()\n", - " 1 0.018 0.018 16.720 16.720 core.py:527(get_orthogonal_selection)\n", - " 2049 0.013 0.000 0.193 0.000 new_indexing.py:547(__iter__)\n", - " 4096 0.012 0.000 0.024 0.000 index_tricks.py:26(ix_)\n", - " 6144 0.010 0.000 0.010 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 4096 0.008 0.000 0.008 0.000 core.py:324()\n", - " 1 0.005 0.005 1.205 1.205 core.py:549(_get_selection)\n", - " 2048 0.005 0.000 0.005 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2048 0.004 0.000 0.009 0.000 arrayprint.py:381(wrapper)\n", - " 4096 0.004 0.000 0.032 0.000 new_indexing.py:466(ix_)\n", - " 2048 0.004 0.000 0.015 0.000 {method 'join' of 'str' objects}\n", - " 16391 0.004 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.003 0.000 0.012 0.000 core.py:319(_cdata_shape)\n", - " 4096 0.003 0.000 0.006 0.000 numerictypes.py:728(issubdtype)\n", - " 2048 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 4096 0.002 0.000 0.004 0.000 new_indexing.py:471()\n", - " 2048 0.002 0.000 0.017 0.000 core.py:928(_chunk_key)\n", - " 2048 0.002 0.000 0.011 0.000 numeric.py:1905(array_str)\n", - " 4102 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.array}\n", - " 4096 0.002 0.000 0.002 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", - " 4096 0.001 0.000 0.001 0.000 
new_indexing.py:550()\n", - " 8192 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 4096 0.001 0.000 0.003 0.000 numeric.py:463(asarray)\n", - " 8201 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 4096 0.001 0.000 0.002 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", - " 2048 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.001 0.000 0.002 0.000 :12(__new__)\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", - " 4096 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", - " 2048 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", - " 4097 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", - " 2049 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2048 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1 0.000 0.000 15.498 15.498 new_indexing.py:483(__init__)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 4 0.000 0.000 0.019 0.005 fromnumeric.py:1886(any)\n", - " 4 0.000 0.000 14.441 3.610 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 16.778 16.778 {built-in method builtins.exec}\n", - " 4 0.000 0.000 0.019 0.005 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.019 0.005 _methods.py:37(_any)\n", + " 1 1.417 1.417 1.417 1.417 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1 0.198 0.198 1.834 1.834 
new_indexing.py:340(__init__)\n", + " 1 0.169 0.169 0.169 0.169 {method 'take' of 'numpy.ndarray' objects}\n", + " 1024 0.167 0.000 0.306 0.000 core.py:768(_chunk_getitem)\n", + " 1024 0.116 0.000 0.122 0.000 core.py:930(_decode_chunk)\n", + " 1025 0.026 0.000 0.027 0.000 new_indexing.py:387(__iter__)\n", + " 1 0.024 0.024 0.024 0.024 function_base.py:1848(diff)\n", + " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", + " 1025 0.008 0.000 0.056 0.000 new_indexing.py:494(__iter__)\n", + " 1 0.007 0.007 2.213 2.213 core.py:391(__getitem__)\n", + " 2048 0.007 0.000 0.013 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.005 0.000 0.005 0.000 core.py:323()\n", + " 1 0.004 0.004 1.839 1.839 new_indexing.py:430(__init__)\n", + " 1 0.004 0.004 2.217 2.217 :1()\n", + " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.003 0.003 0.365 0.365 core.py:548(_get_selection)\n", + " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.003 0.003 2.206 2.206 core.py:526(get_orthogonal_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 2048 0.002 0.000 0.018 0.000 new_indexing.py:413(ix_)\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.007 0.000 core.py:318(_cdata_shape)\n", + " 8199 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 2048 0.001 0.000 0.002 0.000 new_indexing.py:418()\n", + " 1024 0.001 0.000 0.009 0.000 core.py:927(_chunk_key)\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 
0x55c5e1118480}\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 2048 0.001 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", + " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:499()\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", + " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 2.217 2.217 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", + " 4 0.000 0.000 1.587 0.397 fromnumeric.py:55(_wrapfunc)\n", " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.776 0.776 fromnumeric.py:70(take)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 13.664 13.664 fromnumeric.py:826(argsort)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 
'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", + " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.169 0.169 fromnumeric.py:70(take)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", + " 1 0.000 0.000 1.417 1.417 fromnumeric.py:826(argsort)\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", "\n", "\n" ] @@ -1209,16 +1193,16 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9958" + "10033" ] }, - 
"execution_count": 42, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -1231,24 +1215,24 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 24 ms, sys: 0 ns, total: 24 ms\n", + "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", "Wall time: 21.6 ms\n" ] }, { "data": { "text/plain": [ - "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" + "array([ 35449, 41893, 45592, ..., 99987487, 99990184, 99993538])" ] }, - "execution_count": 43, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -1259,24 +1243,24 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 508 ms, sys: 72 ms, total: 580 ms\n", - "Wall time: 288 ms\n" + "CPU times: user 440 ms, sys: 56 ms, total: 496 ms\n", + "Wall time: 222 ms\n" ] }, { "data": { "text/plain": [ - "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" + "array([ 35449, 41893, 45592, ..., 99987487, 99990184, 99993538])" ] }, - "execution_count": 44, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -1287,86 +1271,86 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 116461 function calls in 0.300 seconds\n", + " 58423 function calls in 0.259 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2042 0.145 0.000 0.158 0.000 core.py:931(_decode_chunk)\n", - " 2043 0.023 0.000 0.023 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2043 0.013 0.000 0.074 0.000 new_indexing.py:547(__iter__)\n", - " 2042 0.013 0.000 0.206 0.000 core.py:769(_chunk_getitem)\n", - " 2048 0.011 
0.000 0.011 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 2042 0.011 0.000 0.042 0.000 index_tricks.py:26(ix_)\n", - " 4084 0.010 0.000 0.010 0.000 core.py:324()\n", - " 4084 0.007 0.000 0.007 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.005 0.005 0.286 0.286 core.py:549(_get_selection)\n", - " 2042 0.005 0.000 0.011 0.000 arrayprint.py:381(wrapper)\n", - " 2043 0.005 0.000 0.006 0.000 new_indexing.py:281(__iter__)\n", - " 2042 0.004 0.000 0.018 0.000 {method 'join' of 'str' objects}\n", - " 2042 0.004 0.000 0.014 0.000 core.py:319(_cdata_shape)\n", - " 2042 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2042 0.004 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 12259 0.003 0.000 0.003 0.000 {built-in method builtins.isinstance}\n", - " 2042 0.003 0.000 0.048 0.000 new_indexing.py:466(ix_)\n", - " 1 0.002 0.002 0.014 0.014 new_indexing.py:255(__init__)\n", - " 2042 0.002 0.000 0.020 0.000 core.py:928(_chunk_key)\n", - " 2042 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", - " 2042 0.002 0.000 0.013 0.000 numeric.py:1905(array_str)\n", - " 2042 0.002 0.000 0.003 0.000 new_indexing.py:471()\n", - " 4084 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", - " 2042 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", - " 4084 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", - " 2042 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", - " 6135 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 4084 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", - " 2042 0.001 0.000 0.002 0.000 :12(__new__)\n", - " 2042 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 4084 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2042 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", - " 4084 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 2042 0.001 0.000 0.001 0.000 
numerictypes.py:660(issubclass_)\n", - " 4084 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", - " 2042 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", - " 2043 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", - " 2048 0.001 0.000 0.012 0.000 numeric.py:380(count_nonzero)\n", - " 2042 0.001 0.000 0.001 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2042 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2042 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2042 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2043 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 2042 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.014 0.014 new_indexing.py:483(__init__)\n", + " 1024 0.137 0.000 0.144 0.000 core.py:930(_decode_chunk)\n", + " 1024 0.026 0.000 0.026 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1025 0.023 0.000 0.023 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.008 0.000 0.172 0.000 core.py:768(_chunk_getitem)\n", + " 1025 0.007 0.000 0.052 0.000 new_indexing.py:494(__iter__)\n", + " 1024 0.006 0.000 0.034 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", + " 1 0.005 0.005 0.032 0.032 new_indexing.py:255(__init__)\n", + " 2048 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.003 0.003 0.227 0.227 core.py:548(_get_selection)\n", + " 1025 0.003 0.000 0.003 0.000 new_indexing.py:282(__iter__)\n", + " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.002 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 6151 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.002 0.000 
0.037 0.000 new_indexing.py:413(ix_)\n", + " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.001 0.000 0.011 0.000 core.py:927(_chunk_key)\n", + " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", + " 1024 0.001 0.000 0.027 0.000 numeric.py:380(count_nonzero)\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:418()\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:499()\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.260 0.260 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.032 0.032 new_indexing.py:430(__init__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method 
numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.300 0.300 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.259 0.259 core.py:526(get_orthogonal_selection)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.300 0.300 core.py:392(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.300 0.300 core.py:527(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.259 0.259 core.py:391(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", - " 1 0.000 0.000 0.300 0.300 :1()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.259 0.259 :1()\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 2 0.000 
0.000 0.000 0.000 new_indexing.py:484()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1386,17 +1370,17 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10000,\n", - " array([94979430, 11935675, 63597355, ..., 91349759, 40936288, 76612910]))" + " array([49021295, 65674535, 71257616, ..., 12130114, 48117886, 98926729]))" ] }, - "execution_count": 52, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } @@ -1410,7 +1394,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 99, "metadata": {}, "outputs": [ { @@ -1418,16 +1402,16 @@ "output_type": "stream", "text": [ "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 279 µs\n" + "Wall time: 245 µs\n" ] }, { "data": { "text/plain": [ - "array([ 11962, 27590, 30701, ..., 99968761, 99977334, 99990442])" + "array([ 14556, 48679, 54538, ..., 99958362, 99994365, 99999645])" ] }, - "execution_count": 53, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -1438,7 +1422,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 100, "metadata": {}, "outputs": [ { @@ -1446,16 +1430,16 @@ "output_type": "stream", "text": [ "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 362 µs\n" + "Wall time: 233 µs\n" ] }, { "data": { "text/plain": [ - "array([94979430, 11935675, 63597355, ..., 91349759, 40936288, 76612910])" + "array([49021295, 65674535, 71257616, ..., 12130114, 48117886, 98926729])" ] }, - "execution_count": 54, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -1466,24 +1450,24 @@ }, { "cell_type": 
"code", - "execution_count": 55, + "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 472 ms, sys: 52 ms, total: 524 ms\n", - "Wall time: 243 ms\n" + "CPU times: user 388 ms, sys: 60 ms, total: 448 ms\n", + "Wall time: 172 ms\n" ] }, { "data": { "text/plain": [ - "array([ 11962, 27590, 30701, ..., 99968761, 99977334, 99990442])" + "array([ 14556, 48679, 54538, ..., 99958362, 99994365, 99999645])" ] }, - "execution_count": 55, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } @@ -1494,24 +1478,24 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 448 ms, sys: 104 ms, total: 552 ms\n", - "Wall time: 255 ms\n" + "CPU times: user 456 ms, sys: 32 ms, total: 488 ms\n", + "Wall time: 182 ms\n" ] }, { "data": { "text/plain": [ - "array([94979430, 11935675, 63597355, ..., 91349759, 40936288, 76612910])" + "array([49021295, 65674535, 71257616, ..., 12130114, 48117886, 98926729])" ] }, - "execution_count": 56, + "execution_count": 102, "metadata": {}, "output_type": "execute_result" } @@ -1522,94 +1506,94 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 103, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 138743 function calls in 0.280 seconds\n", + " 69723 function calls in 0.224 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2039 0.156 0.000 0.167 0.000 core.py:931(_decode_chunk)\n", - " 2040 0.013 0.000 0.064 0.000 new_indexing.py:547(__iter__)\n", - " 4078 0.012 0.000 0.025 0.000 index_tricks.py:26(ix_)\n", - " 2039 0.009 0.000 0.210 0.000 core.py:769(_chunk_getitem)\n", - " 4078 0.009 0.000 0.009 0.000 core.py:324()\n", - " 2040 0.009 0.000 0.010 0.000 new_indexing.py:440(__iter__)\n", - " 6117 
0.008 0.000 0.008 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.005 0.005 0.279 0.279 core.py:549(_get_selection)\n", - " 2039 0.005 0.000 0.010 0.000 arrayprint.py:381(wrapper)\n", - " 4078 0.004 0.000 0.034 0.000 new_indexing.py:466(ix_)\n", - " 2039 0.004 0.000 0.013 0.000 core.py:319(_cdata_shape)\n", - " 2039 0.004 0.000 0.016 0.000 {method 'join' of 'str' objects}\n", - " 16319 0.004 0.000 0.004 0.000 {built-in method builtins.isinstance}\n", - " 2039 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2039 0.003 0.000 0.004 0.000 arrayprint.py:399(array2string)\n", - " 4078 0.003 0.000 0.006 0.000 numerictypes.py:728(issubdtype)\n", - " 4078 0.003 0.000 0.004 0.000 new_indexing.py:471()\n", - " 2039 0.002 0.000 0.018 0.000 core.py:928(_chunk_key)\n", - " 2039 0.002 0.000 0.012 0.000 numeric.py:1905(array_str)\n", - " 4078 0.002 0.000 0.002 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", - " 4084 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.array}\n", - " 8156 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 8165 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 4078 0.001 0.000 0.002 0.000 numerictypes.py:660(issubclass_)\n", - " 4078 0.001 0.000 0.003 0.000 numeric.py:463(asarray)\n", - " 2039 0.001 0.000 0.002 0.000 threading.py:1230(current_thread)\n", - " 4078 0.001 0.000 0.001 0.000 new_indexing.py:550()\n", - " 2039 0.001 0.000 0.002 0.000 new_indexing.py:562()\n", - " 4078 0.001 0.000 0.001 0.000 new_indexing.py:552()\n", - " 2039 0.001 0.000 0.002 0.000 :12(__new__)\n", - " 4078 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 4078 0.001 0.000 0.001 0.000 new_indexing.py:551()\n", + " 1024 0.148 0.000 0.155 0.000 core.py:930(_decode_chunk)\n", + " 1025 0.008 0.000 0.038 0.000 new_indexing.py:494(__iter__)\n", + " 2048 0.008 0.000 0.015 0.000 index_tricks.py:26(ix_)\n", + " 1024 0.006 0.000 0.182 0.000 
core.py:768(_chunk_getitem)\n", + " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", + " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1025 0.004 0.000 0.005 0.000 new_indexing.py:387(__iter__)\n", + " 1 0.003 0.003 0.223 0.223 core.py:548(_get_selection)\n", + " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", + " 2048 0.003 0.000 0.020 0.000 new_indexing.py:413(ix_)\n", + " 1024 0.003 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", + " 8199 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 2048 0.002 0.000 0.002 0.000 new_indexing.py:418()\n", + " 1024 0.001 0.000 0.011 0.000 core.py:927(_chunk_key)\n", + " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", + " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", + " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", + " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", " 1 0.001 0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 2039 0.001 0.000 0.001 0.000 core.py:205(chunk_store)\n", - " 4079 0.001 0.000 0.001 0.000 {method 'append' of 'list' objects}\n", - " 2040 0.001 0.000 0.001 0.000 {built-in method builtins.any}\n", - " 2039 0.000 0.000 0.000 0.000 
{method 'item' of 'numpy.ndarray' objects}\n", - " 2039 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2039 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 2039 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2039 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.001 0.001 new_indexing.py:393(__init__)\n", - " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", - " 1 0.000 0.000 0.280 0.280 core.py:392(__getitem__)\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.001 0.000 0.001 0.000 new_indexing.py:499()\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", + " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", + " 1 0.000 0.000 0.001 0.001 new_indexing.py:340(__init__)\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.224 0.224 core.py:391(__getitem__)\n", " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.001 0.001 new_indexing.py:483(__init__)\n", + " 1 0.000 0.000 0.001 0.001 new_indexing.py:430(__init__)\n", " 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.000 0.000 0.280 0.280 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 1 0.000 0.000 0.224 0.224 {built-in method builtins.exec}\n", + " 4 0.000 
0.000 0.000 0.000 fromnumeric.py:1886(any)\n", " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.280 0.280 core.py:527(get_orthogonal_selection)\n", + " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.280 0.280 :1()\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.224 0.224 core.py:526(get_orthogonal_selection)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", + " 1 0.000 0.000 0.224 0.224 :1()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:537()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 2 0.000 0.000 
0.000 0.000 new_indexing.py:484()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:542()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1629,30 +1613,30 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(195313,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored511297 (499.3K)
Storage ratio195.6
Chunks initialized512/512
" + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507558 (495.7K)
Storage ratio197.0
Chunks initialized256/256
" ], "text/plain": [ "Type : zarr.core.Array\n", "Data type : bool\n", "Shape : (100000000,)\n", - "Chunk shape : (195313,)\n", + "Chunk shape : (390625,)\n", "Order : C\n", "Read-only : False\n", "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", - "No. bytes stored : 511297 (499.3K)\n", - "Storage ratio : 195.6\n", - "Chunks initialized : 512/512" + "No. bytes stored : 507558 (495.7K)\n", + "Storage ratio : 197.0\n", + "Chunks initialized : 256/256" ] }, - "execution_count": 58, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -1664,24 +1648,24 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.01 s, sys: 228 ms, total: 1.24 s\n", - "Wall time: 640 ms\n" + "CPU times: user 852 ms, sys: 140 ms, total: 992 ms\n", + "Wall time: 450 ms\n" ] }, { "data": { "text/plain": [ - "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" + "array([ 35449, 41893, 45592, ..., 99987487, 99990184, 99993538])" ] }, - "execution_count": 59, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -1699,15 +1683,15 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 80 ms, sys: 24 ms, total: 104 ms\n", - "Wall time: 101 ms\n" + "CPU times: user 68 ms, sys: 28 ms, total: 96 ms\n", + "Wall time: 92.7 ms\n" ] }, { @@ -1716,7 +1700,7 @@ "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 60, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1727,15 +1711,15 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"CPU times: user 1.45 s, sys: 664 ms, total: 2.12 s\n", - "Wall time: 1.78 s\n" + "CPU times: user 1.3 s, sys: 268 ms, total: 1.57 s\n", + "Wall time: 1.3 s\n" ] }, { @@ -1744,7 +1728,7 @@ "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 61, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1755,15 +1739,15 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 572 ms, sys: 224 ms, total: 796 ms\n", - "Wall time: 513 ms\n" + "CPU times: user 564 ms, sys: 84 ms, total: 648 ms\n", + "Wall time: 396 ms\n" ] }, { @@ -1772,7 +1756,7 @@ "array([ 0, 10, 20, ..., 99999970, 99999980, 99999990])" ] }, - "execution_count": 62, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1783,15 +1767,15 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 488 ms, sys: 88 ms, total: 576 ms\n", - "Wall time: 273 ms\n" + "CPU times: user 472 ms, sys: 40 ms, total: 512 ms\n", + "Wall time: 213 ms\n" ] }, { @@ -1800,7 +1784,7 @@ "array([ 0, 100, 200, ..., 99999700, 99999800, 99999900])" ] }, - "execution_count": 63, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1811,15 +1795,15 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 472 ms, sys: 64 ms, total: 536 ms\n", - "Wall time: 225 ms\n" + "CPU times: user 432 ms, sys: 48 ms, total: 480 ms\n", + "Wall time: 192 ms\n" ] }, { @@ -1828,7 +1812,7 @@ "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" ] }, - "execution_count": 64, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1846,7 +1830,7 @@ }, { 
"cell_type": "code", - "execution_count": 89, + "execution_count": 106, "metadata": {}, "outputs": [ { @@ -1855,7 +1839,7 @@ "(100000000,)" ] }, - "execution_count": 89, + "execution_count": 106, "metadata": {}, "output_type": "execute_result" } @@ -1866,7 +1850,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 107, "metadata": {}, "outputs": [ { @@ -1875,7 +1859,7 @@ "(100000, 1000)" ] }, - "execution_count": 90, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -1887,30 +1871,30 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typeint64
Shape(100000, 1000)
Chunk shape(1563, 32)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored39862349 (38.0M)
Storage ratio20.1
Chunks initialized2048/2048
" + "
Typezarr.core.Array
Data typeint64
Shape(100000, 1000)
Chunk shape(3125, 32)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored39228864 (37.4M)
Storage ratio20.4
Chunks initialized1024/1024
" ], "text/plain": [ "Type : zarr.core.Array\n", "Data type : int64\n", "Shape : (100000, 1000)\n", - "Chunk shape : (1563, 32)\n", + "Chunk shape : (3125, 32)\n", "Order : C\n", "Read-only : False\n", "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 800000000 (762.9M)\n", - "No. bytes stored : 39862349 (38.0M)\n", - "Storage ratio : 20.1\n", - "Chunks initialized : 2048/2048" + "No. bytes stored : 39228864 (37.4M)\n", + "Storage ratio : 20.4\n", + "Chunks initialized : 1024/1024" ] }, - "execution_count": 91, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } @@ -1929,40 +1913,40 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "ix0 = np.random.binomial(1, 0.5, size=d.shape[0]).astype(bool)\n", - "ix1 = np.random.binomial(1, 0.5, size=d.shape[1]).astype(bool)\n" + "ix1 = np.random.binomial(1, 0.5, size=d.shape[1]).astype(bool)" ] }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 124 ms, sys: 40 ms, total: 164 ms\n", - "Wall time: 164 ms\n" + "CPU times: user 140 ms, sys: 24 ms, total: 164 ms\n", + "Wall time: 165 ms\n" ] }, { "data": { "text/plain": [ - "array([[ 3, 5, 6, ..., 994, 995, 997],\n", - " [ 2003, 2005, 2006, ..., 2994, 2995, 2997],\n", - " [ 3003, 3005, 3006, ..., 3994, 3995, 3997],\n", + "array([[ 0, 1, 3, ..., 995, 998, 999],\n", + " [ 2000, 2001, 2003, ..., 2995, 2998, 2999],\n", + " [ 4000, 4001, 4003, ..., 4995, 4998, 4999],\n", " ..., \n", - " [99995003, 99995005, 99995006, ..., 99995994, 99995995, 99995997],\n", - " [99997003, 99997005, 99997006, ..., 99997994, 99997995, 99997997],\n", - " [99999003, 99999005, 99999006, ..., 99999994, 99999995, 99999997]])" + " [99992000, 99992001, 99992003, ..., 99992995, 99992998, 99992999],\n", + " [99997000, 
99997001, 99997003, ..., 99997995, 99997998, 99997999],\n", + " [99999000, 99999001, 99999003, ..., 99999995, 99999998, 99999999]])" ] }, - "execution_count": 93, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -1973,30 +1957,30 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 832 ms, sys: 84 ms, total: 916 ms\n", - "Wall time: 533 ms\n" + "CPU times: user 860 ms, sys: 84 ms, total: 944 ms\n", + "Wall time: 468 ms\n" ] }, { "data": { "text/plain": [ - "array([[ 3, 5, 6, ..., 994, 995, 997],\n", - " [ 2003, 2005, 2006, ..., 2994, 2995, 2997],\n", - " [ 3003, 3005, 3006, ..., 3994, 3995, 3997],\n", + "array([[ 0, 1, 3, ..., 995, 998, 999],\n", + " [ 2000, 2001, 2003, ..., 2995, 2998, 2999],\n", + " [ 4000, 4001, 4003, ..., 4995, 4998, 4999],\n", " ..., \n", - " [99995003, 99995005, 99995006, ..., 99995994, 99995995, 99995997],\n", - " [99997003, 99997005, 99997006, ..., 99997994, 99997995, 99997997],\n", - " [99999003, 99999005, 99999006, ..., 99999994, 99999995, 99999997]])" + " [99992000, 99992001, 99992003, ..., 99992995, 99992998, 99992999],\n", + " [99997000, 99997001, 99997003, ..., 99997995, 99997998, 99997999],\n", + " [99999000, 99999001, 99999003, ..., 99999995, 99999998, 99999999]])" ] }, - "execution_count": 94, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } @@ -2014,7 +1998,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -2024,30 +2008,30 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 224 ms, sys: 56 ms, total: 280 ms\n", - "Wall time: 277 ms\n" + "CPU times: user 196 ms, sys: 56 ms, total: 252 ms\n", + "Wall time: 250 ms\n" ] }, { "data": { 
"text/plain": [ - "array([[16704459, 16704351, 16704547, ..., 16704405, 16704425, 16704805],\n", - " [10766459, 10766351, 10766547, ..., 10766405, 10766425, 10766805],\n", - " [64625459, 64625351, 64625547, ..., 64625405, 64625425, 64625805],\n", + "array([[50767038, 50767472, 50767242, ..., 50767418, 50767445, 50767947],\n", + " [28829038, 28829472, 28829242, ..., 28829418, 28829445, 28829947],\n", + " [17474038, 17474472, 17474242, ..., 17474418, 17474445, 17474947],\n", " ..., \n", - " [12875459, 12875351, 12875547, ..., 12875405, 12875425, 12875805],\n", - " [58689459, 58689351, 58689547, ..., 58689405, 58689425, 58689805],\n", - " [18138459, 18138351, 18138547, ..., 18138405, 18138425, 18138805]])" + " [ 5185038, 5185472, 5185242, ..., 5185418, 5185445, 5185947],\n", + " [27248038, 27248472, 27248242, ..., 27248418, 27248445, 27248947],\n", + " [72575038, 72575472, 72575242, ..., 72575418, 72575445, 72575947]])" ] }, - "execution_count": 96, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -2058,30 +2042,30 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.06 s, sys: 120 ms, total: 1.18 s\n", - "Wall time: 675 ms\n" + "CPU times: user 1.17 s, sys: 128 ms, total: 1.3 s\n", + "Wall time: 682 ms\n" ] }, { "data": { "text/plain": [ - "array([[16704459, 16704351, 16704547, ..., 16704405, 16704425, 16704805],\n", - " [10766459, 10766351, 10766547, ..., 10766405, 10766425, 10766805],\n", - " [64625459, 64625351, 64625547, ..., 64625405, 64625425, 64625805],\n", + "array([[50767038, 50767472, 50767242, ..., 50767418, 50767445, 50767947],\n", + " [28829038, 28829472, 28829242, ..., 28829418, 28829445, 28829947],\n", + " [17474038, 17474472, 17474242, ..., 17474418, 17474445, 17474947],\n", " ..., \n", - " [12875459, 12875351, 12875547, ..., 12875405, 12875425, 12875805],\n", - " [58689459, 58689351, 
58689547, ..., 58689405, 58689425, 58689805],\n", - " [18138459, 18138351, 18138547, ..., 18138405, 18138425, 18138805]])" + " [ 5185038, 5185472, 5185242, ..., 5185418, 5185445, 5185947],\n", + " [27248038, 27248472, 27248242, ..., 27248418, 27248445, 27248947],\n", + " [72575038, 72575472, 72575242, ..., 72575418, 72575445, 72575947]])" ] }, - "execution_count": 97, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } @@ -2099,22 +2083,22 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "500000" + "10000000" ] }, - "execution_count": 120, + "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "n = int(d.size * .005)\n", + "n = int(d.size * .1)\n", "ix0 = np.random.choice(d.shape[0], size=n, replace=True)\n", "ix1 = np.random.choice(d.shape[1], size=n, replace=True)\n", "n" @@ -2122,82 +2106,24 @@ }, { "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 228 ms, sys: 0 ns, total: 228 ms\n", - "Wall time: 228 ms\n" - ] - }, - { - "data": { - "text/plain": [ - "array([235092, 460446, 351446, ..., 66295, 90139, 174162])" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%time np.lexsort((ix0, ix1))" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 48 ms, sys: 0 ns, total: 48 ms\n", - "Wall time: 46.8 ms\n" - ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, ..., 499997, 499998, 499999])" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ix0_sorted = np.sort(ix0)\n", - "ix1_sorted = np.sort(ix1)\n", - "%time np.lexsort((ix0_sorted, ix1_sorted))" - ] - 
}, - { - "cell_type": "code", - "execution_count": 113, + "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 1.29 ms\n" + "CPU times: user 236 ms, sys: 56 ms, total: 292 ms\n", + "Wall time: 289 ms\n" ] }, { "data": { "text/plain": [ - "array([15735056, 50367996, 82690284, ..., 79292255, 83283781, 38856303])" + "array([71132822, 44407411, 66463897, ..., 16188129, 30562595, 3115554])" ] }, - "execution_count": 113, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } @@ -2208,24 +2134,24 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 117, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 13.5 s, sys: 1.7 s, total: 15.2 s\n", - "Wall time: 6.33 s\n" + "CPU times: user 3.06 s, sys: 296 ms, total: 3.36 s\n", + "Wall time: 2.83 s\n" ] }, { "data": { "text/plain": [ - "array([15735056, 50367996, 82690284, ..., 79292255, 83283781, 38856303])" + "array([71132822, 44407411, 66463897, ..., 16188129, 30562595, 3115554])" ] }, - "execution_count": 114, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -2236,112 +2162,93 @@ }, { "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2048" - ] - }, - "execution_count": 116, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "zd.nchunks" - ] - }, - { - "cell_type": "code", - "execution_count": 115, + "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 2698965 function calls in 5.459 seconds\n", + " 48284 function calls in 2.856 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 49978 2.866 0.000 3.116 0.000 core.py:931(_decode_chunk)\n", - " 49978 0.411 
0.000 4.602 0.000 core.py:769(_chunk_getitem)\n", - " 49979 0.316 0.000 0.752 0.000 new_indexing.py:660(__iter__)\n", - " 149934 0.296 0.000 0.296 0.000 new_indexing.py:677()\n", - " 149940 0.202 0.000 0.202 0.000 core.py:324()\n", - " 99956 0.162 0.000 0.343 0.000 arrayprint.py:381(wrapper)\n", - " 49978 0.129 0.000 0.533 0.000 {method 'join' of 'str' objects}\n", - " 49978 0.115 0.000 0.115 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 99956 0.105 0.000 0.123 0.000 arrayprint.py:399(array2string)\n", - " 1 0.099 0.099 5.454 5.454 core.py:549(_get_selection)\n", - " 49978 0.097 0.000 0.169 0.000 util.py:113(is_total_slice)\n", - " 49980 0.077 0.000 0.279 0.000 core.py:319(_cdata_shape)\n", - " 49978 0.072 0.000 0.072 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 99956 0.061 0.000 0.404 0.000 numeric.py:1905(array_str)\n", - " 299881 0.060 0.000 0.060 0.000 {built-in method builtins.isinstance}\n", - " 149934 0.053 0.000 0.053 0.000 new_indexing.py:672()\n", - " 49978 0.046 0.000 0.579 0.000 core.py:928(_chunk_key)\n", - " 149934 0.041 0.000 0.041 0.000 new_indexing.py:665()\n", - " 99956 0.034 0.000 0.042 0.000 util.py:128()\n", - " 49978 0.028 0.000 0.037 0.000 threading.py:1230(current_thread)\n", - " 149934 0.025 0.000 0.025 0.000 {built-in method _thread.get_ident}\n", - " 49978 0.023 0.000 0.045 0.000 :12(__new__)\n", - " 49978 0.022 0.000 0.022 0.000 {built-in method __new__ of type object at 0x5616c71ce480}\n", - " 99956 0.018 0.000 0.018 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 99965 0.016 0.000 0.016 0.000 {built-in method builtins.len}\n", - " 99956 0.014 0.000 0.014 0.000 {method 'add' of 'set' objects}\n", - " 49981 0.014 0.000 0.042 0.000 {built-in method builtins.all}\n", - " 99956 0.014 0.000 0.014 0.000 {method 'discard' of 'set' objects}\n", - " 49978 0.014 0.000 0.014 0.000 core.py:205(chunk_store)\n", - " 99956 0.013 0.000 0.013 0.000 {built-in method builtins.id}\n", - " 49978 0.009 0.000 0.009 
0.000 threading.py:1304(main_thread)\n", - " 1 0.003 0.003 0.003 0.003 {built-in method numpy.core.multiarray.unravel_index}\n", - " 3 0.001 0.000 0.001 0.000 new_indexing.py:649()\n", - " 1 0.001 0.001 0.005 0.005 new_indexing.py:610(__init__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:307(find_runs)\n", - " 6 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", - " 1 0.000 0.000 5.459 5.459 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.concatenate}\n", - " 6 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 12 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 5.459 5.459 core.py:538(get_coordinate_selection)\n", - " 1 0.000 0.000 5.459 5.459 :1()\n", + " 1 1.401 1.401 1.401 1.401 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 3 0.412 0.137 0.412 0.137 new_indexing.py:601()\n", + " 3 0.259 0.086 0.259 0.086 new_indexing.py:592()\n", + " 1 0.242 0.242 2.414 2.414 new_indexing.py:557(__init__)\n", + " 1024 0.196 0.000 0.377 0.000 core.py:768(_chunk_getitem)\n", + " 1024 0.151 0.000 0.160 0.000 core.py:930(_decode_chunk)\n", + " 1 0.056 0.056 0.056 0.056 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.038 0.038 0.038 0.038 {built-in method numpy.core.multiarray.bincount}\n", + " 3072 0.023 0.000 0.023 0.000 new_indexing.py:636()\n", + " 1 0.012 0.012 2.843 2.843 core.py:537(get_coordinate_selection)\n", + " 1025 0.010 0.000 0.036 0.000 new_indexing.py:618(__iter__)\n", + " 1 0.010 0.010 2.853 2.853 new_indexing.py:648(__getitem__)\n", + " 6 0.006 0.001 0.006 0.001 {method 'reduce' of 'numpy.ufunc' 
objects}\n", + " 3081 0.005 0.000 0.005 0.000 core.py:323()\n", + " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 2048 0.004 0.000 0.008 0.000 arrayprint.py:381(wrapper)\n", + " 1 0.004 0.004 0.417 0.417 core.py:548(_get_selection)\n", + " 1 0.003 0.003 2.856 2.856 :1()\n", + " 1024 0.003 0.000 0.012 0.000 {method 'join' of 'str' objects}\n", + " 2048 0.002 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", + " 1027 0.002 0.000 0.007 0.000 core.py:318(_cdata_shape)\n", + " 2048 0.002 0.000 0.009 0.000 numeric.py:1905(array_str)\n", + " 3084 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.013 0.000 core.py:927(_chunk_key)\n", + " 3072 0.001 0.000 0.001 0.000 new_indexing.py:632()\n", + " 3072 0.001 0.000 0.001 0.000 new_indexing.py:623()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 3072 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", + " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", + " 6 0.000 0.000 0.006 0.001 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 2.856 2.856 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 3 
0.000 0.000 1.401 0.467 fromnumeric.py:55(_wrapfunc)\n", + " 6 0.000 0.000 0.006 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 8 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1 0.000 0.000 0.000 0.000 core.py:333(_nchunks)\n", " 6 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 5.459 5.459 new_indexing.py:689(__getitem__)\n", - " 10 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:527(is_coordinate_selection)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:584()\n", - " 6 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", + " 6 0.000 0.000 0.006 0.001 _methods.py:37(_any)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:531()\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", " 1 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:600(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 function_base.py:5100(append)\n", - " 6 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:580(is_coordinate_selection)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 core.py:364(vindex)\n", + " 1 0.000 0.000 1.401 1.401 fromnumeric.py:826(argsort)\n", + " 1 0.000 0.000 0.000 0.000 core.py:337(nchunks)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", " 1 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 1 0.000 0.000 0.000 
0.000 fromnumeric.py:1380(ravel)\n", - " 6 0.000 0.000 0.000 0.000 new_indexing.py:602()\n", - " 3 0.000 0.000 0.000 0.000 new_indexing.py:614()\n", - " 1 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 core.py:213(shape)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 12 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 6 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", + " 3 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 2 0.000 0.000 0.000 0.000 core.py:152(_refresh_metadata)\n", " 1 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 3 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 core.py:363(vindex)\n", + " 1 0.000 0.000 0.000 0.000 core.py:212(shape)\n", + " 3 0.000 0.000 0.000 0.000 new_indexing.py:561()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 1 0.000 0.000 0.000 0.000 core.py:153(_refresh_metadata)\n", "\n", "\n" ] diff --git a/zarr/core.py b/zarr/core.py index 2a1d5b6d01..6414d40a2a 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,9 +8,8 @@ import numpy as np -from zarr.util import is_total_slice, normalize_array_selection, get_chunks_for_selection, \ - human_readable_size, normalize_resize_args, normalize_storage_path, normalize_shape, \ - normalize_chunks, InfoReporter, get_chunk_selections +from zarr.util import is_total_slice, human_readable_size, normalize_resize_args, \ + normalize_storage_path, normalize_shape, normalize_chunks, InfoReporter from zarr.storage import 
array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes diff --git a/zarr/new_indexing.py b/zarr/new_indexing.py index e083f968c1..b4a2610c19 100644 --- a/zarr/new_indexing.py +++ b/zarr/new_indexing.py @@ -270,18 +270,19 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # precompute number of selected items for each chunk self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') - for dim_chunk_idx in range(self.nchunks): - dim_offset = dim_chunk_idx * self.dim_chunk_len - self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( + for dim_chunk_ix in range(self.nchunks): + dim_offset = dim_chunk_ix * self.dim_chunk_len + self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] ) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) self.nitems = self.chunk_nitems_cumsum[-1] + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] def __iter__(self): # iterate over chunks with at least one item - for dim_chunk_ix in np.nonzero(self.chunk_nitems)[0]: + for dim_chunk_ix in self.dim_chunk_ixs: # find region in chunk dim_offset = dim_chunk_ix * self.dim_chunk_len @@ -359,10 +360,11 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): raise IndexError('selection contains index out of bounds') # handle non-monotonic indices + dim_sel_chunk = dim_sel // dim_chunk_len if np.any(np.diff(dim_sel) < 0): self.is_monotonic = False - # sort indices - self.dim_sort = np.argsort(dim_sel) + # sort indices to group by chunk + self.dim_sort = np.argsort(dim_sel_chunk) self.dim_sel = np.take(dim_sel, self.dim_sort) else: @@ -374,12 +376,11 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - self.nitems = len(dim_sel) + self.nitems = len(self.dim_sel) # precompute number of selected items for each chunk # note: for 
dense integer selections, the division operation here is the bottleneck - self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, - minlength=self.nchunks) + self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] @@ -568,9 +569,6 @@ def __init__(self, selection, array): # attempt to broadcast selection - this will raise error if array dimensions don't match selection = np.broadcast_arrays(*selection) - self.shape = len(selection[0]) if selection[0].shape else 1 - self.drop_axes = None - self.array = array # normalization for dim_sel, dim_len in zip(selection, array.shape): @@ -589,25 +587,26 @@ def __init__(self, selection, array): if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): raise IndexError('index out of bounds') - # handle monotonicity - lexsort = np.lexsort(selection[::-1]) - if np.any(np.diff(lexsort) != 1): - self.is_monotonic = False - self.lexsort = lexsort - self.selection = tuple(np.take(dim_sel, lexsort) for dim_sel in selection) - else: - self.is_monotonic = True - self.lexsort = None - self.selection = selection - - # compute flattened chunk index for each point selected + # compute flattened chunk index for each point in the selection chunks_multi_index = tuple( dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(self.selection, array._chunks) + for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) ) chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) + # group points by chunk + sel_sort = np.argsort(chunks_raveled_indices) + chunks_raveled_indices = chunks_raveled_indices[sel_sort] + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + + # store atrributes + self.selection = selection + self.sel_sort = sel_sort + self.shape = len(selection[0]) if selection[0].shape else 1 + self.drop_axes = None + self.array = array + # precompute number 
of selected items for each chunk self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) @@ -627,12 +626,7 @@ def __iter__(self): else: start = self.chunk_nitems_cumsum[chunk_rix - 1] stop = self.chunk_nitems_cumsum[chunk_rix] - if self.is_monotonic: - out_selection = slice(start, stop) - else: - out_selection = self.lexsort[start:stop] - - # TODO fix bug somewhere around here + out_selection = self.sel_sort[start:stop] chunk_offsets = tuple( dim_chunk_ix * dim_chunk_len diff --git a/zarr/util.py b/zarr/util.py index 8af623142e..9d770ffbb3 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -28,9 +28,9 @@ def normalize_shape(shape): # code to guess chunk shape, adapted from h5py -CHUNK_BASE = 64*1024 # Multiplier by which chunks are adjusted +CHUNK_BASE = 128*1024 # Multiplier by which chunks are adjusted CHUNK_MIN = 128*1024 # Soft lower limit (128k) -CHUNK_MAX = 16*1024*1024 # Hard upper limit (16M) +CHUNK_MAX = 64*1024*1024 # Hard upper limit def guess_chunks(shape, typesize): @@ -134,379 +134,379 @@ def is_total_slice(item, shape): raise TypeError('expected slice or tuple of slices, found %r' % item) -class BoolArraySelection(object): - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # check number of dimensions, only support indexing with 1d array - if len(dim_sel.shape) > 1: - raise IndexError('can only index with 1-dimensional Boolean array') - - # check shape - if dim_sel.shape[0] != dim_len: - raise IndexError('Boolean array has wrong length; expected %s, found %s' % - (dim_len, dim_sel.shape[0])) - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - - # precompute number of selected items for each chunk - self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') - for dim_chunk_idx in range(self.nchunks): - dim_chunk_offset = dim_chunk_idx * 
self.dim_chunk_len - self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( - self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] - ) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = self.chunk_nitems_cumsum[-1] - - def get_chunk_sel(self, dim_chunk_idx): - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] - # pad out if final chunk - if dim_chunk_sel.shape[0] < self.dim_chunk_len: - tmp = np.zeros(self.dim_chunk_len, dtype=bool) - tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel - dim_chunk_sel = tmp - return dim_chunk_sel - - def get_out_sel(self, dim_chunk_idx): - if dim_chunk_idx == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_idx] - return slice(start, stop) - - def get_chunk_ranges(self): - return np.nonzero(self.chunk_nitems)[0] - - -class IntArraySelection(object): - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # has to be a numpy array so we can do bincount - dim_sel = np.asanyarray(dim_sel) - - # check number of dimensions, only support indexing with 1d array - if len(dim_sel.shape) > 1: - raise IndexError('can only index with 1-dimensional integer array') - - # handle wraparound - loc_neg = dim_sel < 0 - if np.any(loc_neg): - dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len - - # handle out of bounds - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('index out of bounds') - - # validate monotonically increasing - if np.any(np.diff(dim_sel) < 0): - raise NotImplementedError('only monotonically increasing indices are supported') - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - - # precompute number of selected items for each chunk - # note: for dense integer selections, the division operation 
here is the bottleneck - self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = len(dim_sel) - - def get_chunk_sel(self, dim_chunk_idx): - # need to slice out relevant indices from the total selection, then subtract the chunk - # offset - - dim_out_sel = self.get_out_sel(dim_chunk_idx) - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_chunk_offset - - return dim_chunk_sel - - def get_out_sel(self, dim_chunk_idx): - if dim_chunk_idx == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_idx] - return slice(start, stop) - - def get_chunk_ranges(self): - return np.nonzero(self.chunk_nitems)[0] - - -# TODO support slice with step via integer selection (convert to np.arange) - - -def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): - """Convenience function to normalize a selection within a single axis - of size `dim_len` for an array with chunk length `dim_chunk_len`.""" - - # normalize list to array - if isinstance(dim_sel, list): - dim_sel = np.asarray(dim_sel) - - if isinstance(dim_sel, numbers.Integral): - - # normalize type to int - dim_sel = int(dim_sel) - - # handle wraparound - if dim_sel < 0: - dim_sel = dim_len + dim_sel - - # handle out of bounds - if dim_sel >= dim_len or dim_sel < 0: - raise IndexError('index out of bounds: %s' % dim_sel) - - return dim_sel - - elif isinstance(dim_sel, slice): - - # handle slice with step - if dim_sel.step is not None and dim_sel.step != 1: - raise NotImplementedError('slice with step not implemented') - - # handle slice with None bound - start = 0 if dim_sel.start is None else dim_sel.start - stop = dim_len if dim_sel.stop is None else dim_sel.stop - - # handle wraparound - if start < 0: - start = dim_len + start - if stop < 0: - stop = dim_len + stop - - # handle zero-length axis 
- if start == stop == dim_len == 0: - return slice(0, 0) - - # handle out of bounds - if start < 0: - raise IndexError('start index out of bounds: %s' % dim_sel.start) - if stop < 0: - raise IndexError('stop index out of bounds: %s' % dim_sel.stop) - if start >= dim_len: - raise IndexError('start index out of bounds: %ss' % dim_sel.start) - if stop > dim_len: - stop = dim_len - if stop < start: - stop = start - - return slice(start, stop) - - elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): - - if dim_sel.dtype == bool: - return BoolArraySelection(dim_sel, dim_len, dim_chunk_len) - - elif dim_sel.dtype.kind in 'ui': - return IntArraySelection(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError('unsupported index item type: %r' % dim_sel) - - else: - raise IndexError('unsupported index item type: %r' % dim_sel) - - -# noinspection PyTypeChecker -def normalize_array_selection(selection, shape, chunks): - """Convenience function to normalize a selection within an array with - the given `shape`.""" - - # ensure tuple - if not isinstance(selection, tuple): - selection = (selection,) - - # handle ellipsis - n_ellipsis = sum(1 for i in selection if i is Ellipsis) - if n_ellipsis > 1: - raise IndexError("an index can only have a single ellipsis ('...')") - elif n_ellipsis == 1: - n_items_l = selection.index(Ellipsis) # items to left of ellipsis - n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis - n_items = len(selection) - 1 # all non-ellipsis items - if n_items >= len(shape): - # ellipsis does nothing, just remove it - selection = tuple(i for i in selection if i != Ellipsis) - else: - # replace ellipsis with as many slices are needed for number of dims - new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) - if n_items_r: - new_item += selection[-n_items_r:] - selection = new_item - - # check dimensionality - if len(selection) > len(shape): - raise IndexError('too many indices for array') - - # 
determine start and stop indices for all axes - selection = tuple(normalize_dim_selection(i, l, c) for i, l, c in zip(selection, shape, chunks)) - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += tuple(slice(0, l) for l in shape[len(selection):]) - - return selection - - -def get_chunks_for_selection(selection, chunks): - """Convenience function to find chunks overlapping an array selection. N.B., - assumes selection has already been normalized.""" - - # indices of chunks overlapping the selection - chunk_ranges = [] - - # shape of the selection - sel_shape = [] - - # iterate over dimensions of the array - for dim_sel, dim_chunk_len in zip(selection, chunks): - - # dim_sel: selection for current dimension - # dim_chunk_len: length of chunk along current dimension - - dim_sel_len = None - - if isinstance(dim_sel, int): - - # dim selection is an integer, i.e., single item, so only need single chunk index for - # this dimension - dim_chunk_range = [dim_sel//dim_chunk_len] - - elif isinstance(dim_sel, slice): - - # dim selection is a slice, need range of chunk indices including start and stop of - # selection - dim_chunk_from = dim_sel.start//dim_chunk_len - dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) - dim_chunk_range = range(dim_chunk_from, dim_chunk_to) - dim_sel_len = dim_sel.stop - dim_sel.start - - elif isinstance(dim_sel, BoolArraySelection): - - # dim selection is a boolean array, delegate this to the BooleanSelection class - dim_chunk_range = dim_sel.get_chunk_ranges() - dim_sel_len = dim_sel.nitems - - elif isinstance(dim_sel, IntArraySelection): - - # dim selection is an integer array, delegate this to the integerSelection class - dim_chunk_range = dim_sel.get_chunk_ranges() - dim_sel_len = dim_sel.nitems - - else: - raise RuntimeError('unexpected selection type') - - chunk_ranges.append(dim_chunk_range) - if dim_sel_len is not None: - sel_shape.append(dim_sel_len) - - return chunk_ranges, 
tuple(sel_shape) - - -def get_chunk_selections(selection, chunk_coords, chunks, n_advanced_selection): - - # chunk_coords: holds the index along each dimension for the current chunk within the - # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. - - chunk_selection = [] - out_selection = [] - - # iterate over dimensions (axes) of the array - for dim_sel, dim_chunk_idx, dim_chunk_len in zip(selection, chunk_coords, chunks): - - # dim_sel: selection for current dimension - # dim_chunk_idx: chunk index along current dimension - # dim_chunk_len: chunk length along current dimension - - # selection for current chunk along current dimension - dim_chunk_sel = None - - # selection into output array to store data from current chunk - dim_out_sel = None - - # calculate offset for current chunk along current dimension - this is used to - # determine the values to be extracted from the current chunk - dim_chunk_offset = dim_chunk_idx * dim_chunk_len - - # handle integer selection, i.e., single item - if isinstance(dim_sel, int): - - dim_chunk_sel = dim_sel - dim_chunk_offset - - # N.B., leave dim_out_sel as None, as this dimension has been dropped in the - # output array because of single value index - - # handle slice selection, i.e., contiguous range of items - elif isinstance(dim_sel, slice): - - if dim_sel.start <= dim_chunk_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - dim_out_offset = dim_chunk_offset - dim_sel.start - - else: - # selection starts within current chunk - dim_chunk_sel_start = dim_sel.start - dim_chunk_offset - dim_out_offset = 0 - - if dim_sel.stop > dim_chunk_offset + dim_chunk_len: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start - 
dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - elif isinstance(dim_sel, (BoolArraySelection, IntArraySelection)): - - # get selection to extract data for the current chunk - dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) - - # figure out where to put these items in the output array - dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) - - else: - raise RuntimeError('unexpected selection type') - - # add to chunk selection - chunk_selection.append(dim_chunk_sel) - - # add to output selection - if dim_out_sel is not None: - out_selection.append(dim_out_sel) - - # normalise for indexing into numpy arrays - chunk_selection = tuple(chunk_selection) - out_selection = tuple(out_selection) - - # handle advanced indexing arrays orthogonally - if n_advanced_selection > 0: - # numpy doesn't support orthogonal indexing directly as yet, so need to work - # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices - # or integers, so need to convert slices and integers into ranges. 
- chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) - if isinstance(dim_chunk_sel, slice) - else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) - else dim_chunk_sel - for dim_chunk_sel in chunk_selection] - chunk_selection = np.ix_(*chunk_selection) - - return chunk_selection, out_selection +# class BoolArraySelection(object): +# +# def __init__(self, dim_sel, dim_len, dim_chunk_len): +# +# # check number of dimensions, only support indexing with 1d array +# if len(dim_sel.shape) > 1: +# raise IndexError('can only index with 1-dimensional Boolean array') +# +# # check shape +# if dim_sel.shape[0] != dim_len: +# raise IndexError('Boolean array has wrong length; expected %s, found %s' % +# (dim_len, dim_sel.shape[0])) +# +# # store attributes +# self.dim_sel = dim_sel +# self.dim_len = dim_len +# self.dim_chunk_len = dim_chunk_len +# self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) +# +# # precompute number of selected items for each chunk +# self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') +# for dim_chunk_idx in range(self.nchunks): +# dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len +# self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( +# self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] +# ) +# self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) +# self.nitems = self.chunk_nitems_cumsum[-1] +# +# def get_chunk_sel(self, dim_chunk_idx): +# dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len +# dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] +# # pad out if final chunk +# if dim_chunk_sel.shape[0] < self.dim_chunk_len: +# tmp = np.zeros(self.dim_chunk_len, dtype=bool) +# tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel +# dim_chunk_sel = tmp +# return dim_chunk_sel +# +# def get_out_sel(self, dim_chunk_idx): +# if dim_chunk_idx == 0: +# start = 0 +# else: +# start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] +# stop = self.chunk_nitems_cumsum[dim_chunk_idx] +# 
return slice(start, stop) +# +# def get_chunk_ranges(self): +# return np.nonzero(self.chunk_nitems)[0] +# +# +# class IntArraySelection(object): +# +# def __init__(self, dim_sel, dim_len, dim_chunk_len): +# +# # has to be a numpy array so we can do bincount +# dim_sel = np.asanyarray(dim_sel) +# +# # check number of dimensions, only support indexing with 1d array +# if len(dim_sel.shape) > 1: +# raise IndexError('can only index with 1-dimensional integer array') +# +# # handle wraparound +# loc_neg = dim_sel < 0 +# if np.any(loc_neg): +# dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len +# +# # handle out of bounds +# if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): +# raise IndexError('index out of bounds') +# +# # validate monotonically increasing +# if np.any(np.diff(dim_sel) < 0): +# raise NotImplementedError('only monotonically increasing indices are supported') +# +# # store attributes +# self.dim_sel = dim_sel +# self.dim_len = dim_len +# self.dim_chunk_len = dim_chunk_len +# self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) +# +# # precompute number of selected items for each chunk +# # note: for dense integer selections, the division operation here is the bottleneck +# self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) +# self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) +# self.nitems = len(dim_sel) +# +# def get_chunk_sel(self, dim_chunk_idx): +# # need to slice out relevant indices from the total selection, then subtract the chunk +# # offset +# +# dim_out_sel = self.get_out_sel(dim_chunk_idx) +# dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len +# dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_chunk_offset +# +# return dim_chunk_sel +# +# def get_out_sel(self, dim_chunk_idx): +# if dim_chunk_idx == 0: +# start = 0 +# else: +# start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] +# stop = self.chunk_nitems_cumsum[dim_chunk_idx] +# return slice(start, stop) +# +# def get_chunk_ranges(self): 
+# return np.nonzero(self.chunk_nitems)[0] +# +# +# # TODO support slice with step via integer selection (convert to np.arange) +# +# +# def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): +# """Convenience function to normalize a selection within a single axis +# of size `dim_len` for an array with chunk length `dim_chunk_len`.""" +# +# # normalize list to array +# if isinstance(dim_sel, list): +# dim_sel = np.asarray(dim_sel) +# +# if isinstance(dim_sel, numbers.Integral): +# +# # normalize type to int +# dim_sel = int(dim_sel) +# +# # handle wraparound +# if dim_sel < 0: +# dim_sel = dim_len + dim_sel +# +# # handle out of bounds +# if dim_sel >= dim_len or dim_sel < 0: +# raise IndexError('index out of bounds: %s' % dim_sel) +# +# return dim_sel +# +# elif isinstance(dim_sel, slice): +# +# # handle slice with step +# if dim_sel.step is not None and dim_sel.step != 1: +# raise NotImplementedError('slice with step not implemented') +# +# # handle slice with None bound +# start = 0 if dim_sel.start is None else dim_sel.start +# stop = dim_len if dim_sel.stop is None else dim_sel.stop +# +# # handle wraparound +# if start < 0: +# start = dim_len + start +# if stop < 0: +# stop = dim_len + stop +# +# # handle zero-length axis +# if start == stop == dim_len == 0: +# return slice(0, 0) +# +# # handle out of bounds +# if start < 0: +# raise IndexError('start index out of bounds: %s' % dim_sel.start) +# if stop < 0: +# raise IndexError('stop index out of bounds: %s' % dim_sel.stop) +# if start >= dim_len: +# raise IndexError('start index out of bounds: %ss' % dim_sel.start) +# if stop > dim_len: +# stop = dim_len +# if stop < start: +# stop = start +# +# return slice(start, stop) +# +# elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): +# +# if dim_sel.dtype == bool: +# return BoolArraySelection(dim_sel, dim_len, dim_chunk_len) +# +# elif dim_sel.dtype.kind in 'ui': +# return IntArraySelection(dim_sel, dim_len, dim_chunk_len) +# +# else: +# raise 
IndexError('unsupported index item type: %r' % dim_sel) +# +# else: +# raise IndexError('unsupported index item type: %r' % dim_sel) +# +# +# # noinspection PyTypeChecker +# def normalize_array_selection(selection, shape, chunks): +# """Convenience function to normalize a selection within an array with +# the given `shape`.""" +# +# # ensure tuple +# if not isinstance(selection, tuple): +# selection = (selection,) +# +# # handle ellipsis +# n_ellipsis = sum(1 for i in selection if i is Ellipsis) +# if n_ellipsis > 1: +# raise IndexError("an index can only have a single ellipsis ('...')") +# elif n_ellipsis == 1: +# n_items_l = selection.index(Ellipsis) # items to left of ellipsis +# n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis +# n_items = len(selection) - 1 # all non-ellipsis items +# if n_items >= len(shape): +# # ellipsis does nothing, just remove it +# selection = tuple(i for i in selection if i != Ellipsis) +# else: +# # replace ellipsis with as many slices are needed for number of dims +# new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) +# if n_items_r: +# new_item += selection[-n_items_r:] +# selection = new_item +# +# # check dimensionality +# if len(selection) > len(shape): +# raise IndexError('too many indices for array') +# +# # determine start and stop indices for all axes +# selection = tuple(normalize_dim_selection(i, l, c) for i, l, c in zip(selection, shape, chunks)) +# +# # fill out selection if not completely specified +# if len(selection) < len(shape): +# selection += tuple(slice(0, l) for l in shape[len(selection):]) +# +# return selection +# +# +# def get_chunks_for_selection(selection, chunks): +# """Convenience function to find chunks overlapping an array selection. 
N.B., +# assumes selection has already been normalized.""" +# +# # indices of chunks overlapping the selection +# chunk_ranges = [] +# +# # shape of the selection +# sel_shape = [] +# +# # iterate over dimensions of the array +# for dim_sel, dim_chunk_len in zip(selection, chunks): +# +# # dim_sel: selection for current dimension +# # dim_chunk_len: length of chunk along current dimension +# +# dim_sel_len = None +# +# if isinstance(dim_sel, int): +# +# # dim selection is an integer, i.e., single item, so only need single chunk index for +# # this dimension +# dim_chunk_range = [dim_sel//dim_chunk_len] +# +# elif isinstance(dim_sel, slice): +# +# # dim selection is a slice, need range of chunk indices including start and stop of +# # selection +# dim_chunk_from = dim_sel.start//dim_chunk_len +# dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) +# dim_chunk_range = range(dim_chunk_from, dim_chunk_to) +# dim_sel_len = dim_sel.stop - dim_sel.start +# +# elif isinstance(dim_sel, BoolArraySelection): +# +# # dim selection is a boolean array, delegate this to the BooleanSelection class +# dim_chunk_range = dim_sel.get_chunk_ranges() +# dim_sel_len = dim_sel.nitems +# +# elif isinstance(dim_sel, IntArraySelection): +# +# # dim selection is an integer array, delegate this to the integerSelection class +# dim_chunk_range = dim_sel.get_chunk_ranges() +# dim_sel_len = dim_sel.nitems +# +# else: +# raise RuntimeError('unexpected selection type') +# +# chunk_ranges.append(dim_chunk_range) +# if dim_sel_len is not None: +# sel_shape.append(dim_sel_len) +# +# return chunk_ranges, tuple(sel_shape) +# +# +# def get_chunk_selections(selection, chunk_coords, chunks, n_advanced_selection): +# +# # chunk_coords: holds the index along each dimension for the current chunk within the +# # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
+# +# chunk_selection = [] +# out_selection = [] +# +# # iterate over dimensions (axes) of the array +# for dim_sel, dim_chunk_idx, dim_chunk_len in zip(selection, chunk_coords, chunks): +# +# # dim_sel: selection for current dimension +# # dim_chunk_idx: chunk index along current dimension +# # dim_chunk_len: chunk length along current dimension +# +# # selection for current chunk along current dimension +# dim_chunk_sel = None +# +# # selection into output array to store data from current chunk +# dim_out_sel = None +# +# # calculate offset for current chunk along current dimension - this is used to +# # determine the values to be extracted from the current chunk +# dim_chunk_offset = dim_chunk_idx * dim_chunk_len +# +# # handle integer selection, i.e., single item +# if isinstance(dim_sel, int): +# +# dim_chunk_sel = dim_sel - dim_chunk_offset +# +# # N.B., leave dim_out_sel as None, as this dimension has been dropped in the +# # output array because of single value index +# +# # handle slice selection, i.e., contiguous range of items +# elif isinstance(dim_sel, slice): +# +# if dim_sel.start <= dim_chunk_offset: +# # selection starts before current chunk +# dim_chunk_sel_start = 0 +# dim_out_offset = dim_chunk_offset - dim_sel.start +# +# else: +# # selection starts within current chunk +# dim_chunk_sel_start = dim_sel.start - dim_chunk_offset +# dim_out_offset = 0 +# +# if dim_sel.stop > dim_chunk_offset + dim_chunk_len: +# # selection ends after current chunk +# dim_chunk_sel_stop = dim_chunk_len +# +# else: +# # selection ends within current chunk +# dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset +# +# dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) +# dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start +# dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) +# +# elif isinstance(dim_sel, (BoolArraySelection, IntArraySelection)): +# +# # get selection to extract data for the current chunk +# dim_chunk_sel = 
dim_sel.get_chunk_sel(dim_chunk_idx) +# +# # figure out where to put these items in the output array +# dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) +# +# else: +# raise RuntimeError('unexpected selection type') +# +# # add to chunk selection +# chunk_selection.append(dim_chunk_sel) +# +# # add to output selection +# if dim_out_sel is not None: +# out_selection.append(dim_out_sel) +# +# # normalise for indexing into numpy arrays +# chunk_selection = tuple(chunk_selection) +# out_selection = tuple(out_selection) +# +# # handle advanced indexing arrays orthogonally +# if n_advanced_selection > 0: +# # numpy doesn't support orthogonal indexing directly as yet, so need to work +# # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices +# # or integers, so need to convert slices and integers into ranges. +# chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) +# if isinstance(dim_chunk_sel, slice) +# else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) +# else dim_chunk_sel +# for dim_chunk_sel in chunk_selection] +# chunk_selection = np.ix_(*chunk_selection) +# +# return chunk_selection, out_selection def normalize_resize_args(old_shape, *args): From b94fa67b8668818d61ba03094717bcdaebff7234 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 6 Nov 2017 00:29:15 +0000 Subject: [PATCH 33/67] tidy up --- zarr/core.py | 2 +- zarr/indexing.py | 963 ++++++++++++++++-------------------- zarr/new_indexing.py | 659 ------------------------ zarr/tests/test_core.py | 600 ---------------------- zarr/tests/test_indexing.py | 695 ++++++++++++++++++++++++++ zarr/tests/test_util.py | 83 +--- zarr/util.py | 375 -------------- 7 files changed, 1116 insertions(+), 2261 deletions(-) delete mode 100644 zarr/new_indexing.py create mode 100644 zarr/tests/test_indexing.py diff --git a/zarr/core.py b/zarr/core.py index 6414d40a2a..af6df3aca2 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -16,7 +16,7 @@ from zarr.errors import PermissionError, 
err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.new_indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer +from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer class Array(object): diff --git a/zarr/indexing.py b/zarr/indexing.py index e6d56c46f8..2b412662c4 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -1,117 +1,68 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division import numbers +import itertools +import collections import numpy as np -def replace_ellipsis(selection, shape): - - # count number of ellipsis present - n_ellipsis = sum(1 for i in selection if i is Ellipsis) - - if n_ellipsis > 1: - # more than 1 is an error - raise IndexError("an index can only have a single ellipsis ('...')") - - elif n_ellipsis == 1: - # locate the ellipsis, count how many items to left and right - n_items_l = selection.index(Ellipsis) # items to left of ellipsis - n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis - n_items = len(selection) - 1 # all non-ellipsis items - - if n_items >= len(shape): - # ellipsis does nothing, just remove it - selection = tuple(i for i in selection if i != Ellipsis) - - else: - # replace ellipsis with as many slices are needed for number of dims - new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) - if n_items_r: - new_item += selection[-n_items_r:] - selection = new_item - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += tuple(slice(0, l) for l in shape[len(selection):]) - - return selection - - -class OIndex(object): - - def __init__(self, array): - self.array = array - - def __getitem__(self, selection): - return self.array.get_orthogonal_selection(selection) - - def __setitem__(self, selection, value): - return self.array.set_orthogonal_selection(selection, 
value) - - -def is_coordinate_selection(selection, array): - return ( - (len(selection) == array.ndim) and - all( - [(isinstance(dim_sel, numbers.Integral) or - (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) - for dim_sel in selection] - ) - ) +def normalize_integer_selection(dim_sel, dim_len): + # normalize type to int + dim_sel = int(dim_sel) -def is_mask_selection(selection, array): - return ( - hasattr(selection, 'dtype') and - selection.dtype == bool and - hasattr(selection, 'shape') and - len(selection.shape) == len(array.shape) - ) + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise IndexError('index out of bounds') -def replace_lists(selection): - return tuple( - np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel - for dim_sel in selection - ) + return dim_sel -class VIndex(object): +ChunkDimProjection = collections.namedtuple('ChunkDimProjection', + ('dim_chunk_ix', 'dim_chunk_sel', 'dim_out_sel')) +"""A mapping from chunk to output array for a single dimension. - def __init__(self, array): - self.array = array +Parameters +---------- +dim_chunk_ix + Index of chunk. +dim_chunk_sel + Selection of items from chunk array. +dim_out_sel + Selection of items in target (output) array. 
- def __getitem__(self, selection): - if not isinstance(selection, tuple): - selection = tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array): - return self.array.get_coordinate_selection(selection) - # elif is_mask_selection(selection, self.array): - # return self.array.get_mask_selection(selection) - else: - raise IndexError('unsupported selection') +""" - # def __setitem__(self, selection, value): - # return self.array.set_orthogonal_selection(selection, value) +class IntDimIndexer(object): -def normalize_integer_selection(dim_sel, dim_len): + def __init__(self, dim_sel, dim_len, dim_chunk_len): - # normalize type to int - dim_sel = int(dim_sel) + # check type + if not isinstance(dim_sel, numbers.Integral): + raise ValueError('selection must be an integer') - # handle wraparound - if dim_sel < 0: - dim_sel = dim_len + dim_sel + # normalize + dim_sel = normalize_integer_selection(dim_sel, dim_len) - # handle out of bounds - if dim_sel >= dim_len or dim_sel < 0: - raise IndexError('index out of bounds: %s' % dim_sel) + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = 1 - return dim_sel + def __iter__(self): + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) def normalize_slice_selection(dim_sel, dim_len): @@ -142,418 +93,456 @@ def normalize_slice_selection(dim_sel, dim_len): return slice(start, stop, step) -class IndexerBase(object): - - def __init__(self, selection, array): - self.selection = selection - self.array = array - self.squeeze_axes = None +class SliceDimIndexer(object): - def __iter__(self): - return iter(self.selection) + def __init__(self, dim_sel, dim_len, dim_chunk_len): - def __len__(self): - return len(self.selection) + # check type + if 
not isinstance(dim_sel, slice): + raise ValueError('selection must be a slice') + # normalize + dim_sel = normalize_slice_selection(dim_sel, dim_len) -# noinspection PyProtectedMember -class BasicIndexer(IndexerBase): + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = dim_sel.stop - dim_sel.start - def __init__(self, selection, array): + def __iter__(self): - # ensure tuple - if not isinstance(selection, tuple): - selection = (selection,) + dim_chunk_from = self.dim_sel.start // self.dim_chunk_len + dim_chunk_to = int(np.ceil(self.dim_sel.stop / self.dim_chunk_len)) - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) + for dim_chunk_ix in range(dim_chunk_from, dim_chunk_to): - # validation - check dimensionality - if len(selection) > len(array._shape): - raise IndexError('too many indices for array') - if len(selection) < len(array._shape): - raise IndexError('not enough indices for array') + dim_offset = dim_chunk_ix * self.dim_chunk_len - # TODO refactor with OrthogonalIndexer + if self.dim_sel.start <= dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + dim_out_offset = dim_offset - self.dim_sel.start - # normalization - selection = self.normalize_selection(selection, array) + else: + # selection starts within current chunk + dim_chunk_sel_start = self.dim_sel.start - dim_offset + dim_out_offset = 0 - # complete initialisation - super(BasicIndexer, self).__init__(selection, array) + if self.dim_sel.stop > (dim_offset + self.dim_chunk_len): + # selection ends after current chunk + dim_chunk_sel_stop = self.dim_chunk_len - def normalize_selection(self, selection, array): - # normalize each dimension - selection = tuple(self.normalize_dim_selection(s, l) - for s, l in zip(selection, array._shape)) - return selection + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.dim_sel.stop - dim_offset - def 
normalize_dim_selection(self, dim_sel, dim_len): + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) + dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - if isinstance(dim_sel, numbers.Integral): + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - dim_sel = normalize_integer_selection(dim_sel, dim_len) - return dim_sel - elif isinstance(dim_sel, slice): +def replace_ellipsis(selection, shape): - dim_sel = normalize_slice_selection(dim_sel, dim_len) + selection = ensure_tuple(selection) - # handle slice with step - if dim_sel.step is not None and dim_sel.step != 1: - raise IndexError('slice with step not supported via basic indexing; use ' - 'orthogonal indexing instead') + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) - return dim_sel + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") - else: - raise IndexError('unsupported index item type: %r' % dim_sel) + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items - def get_overlapping_chunks(self): - """Convenience function to find chunks overlapping an array selection. 
N.B., - assumes selection has already been normalized.""" + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) - # indices of chunks overlapping the selection - chunk_ranges = [] + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item - # shape of the selection - sel_shape = [] + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) - # iterate over dimensions of the array - for dim_sel, dim_chunk_len in zip(self.selection, self.array._chunks): + return selection - # dim_sel: selection for current dimension - # dim_chunk_len: length of chunk along current dimension - dim_sel_len = None +def ensure_tuple(v): + if not isinstance(v, tuple): + v = (v,) + return v - if isinstance(dim_sel, int): - # dim selection is an integer, i.e., single item, so only need single chunk index - # for this dimension - dim_chunk_range = [dim_sel//dim_chunk_len] +ChunkProjection = collections.namedtuple('ChunkProjection', + ('chunk_coords', 'chunk_selection', 'out_selection')) +"""A mapping of items from chunk to output array. Can be used to extract items from the chunk +array for loading into an output array. Can also be used to extract items from a value array for +setting/updating in a chunk array. - elif isinstance(dim_sel, slice): +Parameters +---------- +chunk_coords + Indices of chunk. +chunk_selection + Selection of items from chunk array. +out_selection + Selection of items in target (output) array. 
- # dim selection is a slice, need range of chunk indices including start and stop of - # selection - dim_chunk_from = dim_sel.start//dim_chunk_len - dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) - dim_chunk_range = range(dim_chunk_from, dim_chunk_to) - dim_sel_len = dim_sel.stop - dim_sel.start +""" - else: - raise RuntimeError('unexpected selection type') - chunk_ranges.append(dim_chunk_range) - if dim_sel_len is not None: - sel_shape.append(dim_sel_len) +def check_selection_length(selection, shape): + if len(selection) > len(shape): + raise IndexError('too many indices for array') + if len(selection) < len(shape): + raise IndexError('not enough indices for array') - return chunk_ranges, tuple(sel_shape) - def get_chunk_projection(self, chunk_coords): +# noinspection PyProtectedMember +class BasicIndexer(object): - # chunk_coords: holds the index along each dimension for the current chunk within the - # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
+ def __init__(self, selection, array): - chunk_selection = [] - out_selection = [] + # ensure tuple + selection = ensure_tuple(selection) - # iterate over dimensions (axes) of the array - for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, - self.array._chunks): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + check_selection_length(selection, array._shape) - # dim_sel: selection for current dimension - # dim_chunk_idx: chunk index along current dimension - # dim_chunk_len: chunk length along current dimension + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - # selection into output array to store data from current chunk - dim_out_sel = None + if isinstance(dim_sel, int): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - # calculate offset for current chunk along current dimension - this is used to - # determine the values to be extracted from the current chunk - dim_chunk_offset = dim_chunk_idx * dim_chunk_len + elif isinstance(dim_sel, slice): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - # handle integer selection, i.e., single item - if isinstance(dim_sel, int): + else: + raise IndexError('bad selection type') - dim_chunk_sel = dim_sel - dim_chunk_offset + dim_indexers.append(dim_indexer) - # N.B., leave dim_out_sel as None, as this dimension has been dropped in the - # output array because of single value index + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers + if not isinstance(s, IntDimIndexer)) + self.drop_axes = None - # handle slice selection, i.e., contiguous range of items - elif isinstance(dim_sel, slice): + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): - if dim_sel.start <= dim_chunk_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - dim_out_offset = 
dim_chunk_offset - dim_sel.start + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) - else: - # selection starts within current chunk - dim_chunk_sel_start = dim_sel.start - dim_chunk_offset - dim_out_offset = 0 + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - if dim_sel.stop > dim_chunk_offset + dim_chunk_len: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - else: - # selection ends within current chunk - dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset +class BoolArrayDimIndexer(object): - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + def __init__(self, dim_sel, dim_len, dim_chunk_len): - # TODO refactor code with OrthogonalIndexer + # check number of dimensions + if len(dim_sel.shape) > 1: + raise IndexError('selection must be a 1d array') - else: - raise RuntimeError('unexpected selection type') + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError('selection has the wrong length') - # add to chunk selection - chunk_selection.append(dim_chunk_sel) + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - # add to output selection - if dim_out_sel is not None: - out_selection.append(dim_out_sel) + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') + for dim_chunk_ix in range(self.nchunks): + dim_offset = dim_chunk_ix * self.dim_chunk_len + self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( + self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = 
np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] - # normalise for indexing into numpy arrays - chunk_selection = tuple(chunk_selection) - out_selection = tuple(out_selection) + def __iter__(self): - return chunk_selection, out_selection + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] -# noinspection PyProtectedMember -class OrthogonalIndexer(IndexerBase): + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp - def __init__(self, selection, array): + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) - # ensure tuple - if not isinstance(selection, tuple): - selection = (selection,) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - # validation - check dimensionality - if len(selection) > len(array._shape): - raise IndexError('too many indices for array') - if len(selection) < len(array._shape): - raise IndexError('not enough indices for array') +class IntArrayDimIndexer(object): + """Integer array selection against a single dimension.""" - # normalization - selection = self.normalize_selection(selection, array) + def __init__(self, dim_sel, dim_len, dim_chunk_len): - # super initialisation - super(OrthogonalIndexer, self).__init__(selection, array) + # ensure array + dim_sel = np.asanyarray(dim_sel) - # figure out if we're going to be doing advanced indexing on chunks, if so then - # chunk selections will need 
special handling - self.is_advanced = any([not isinstance(dim_sel, (int, slice)) - for dim_sel in selection]) + # check number of dimensions + if dim_sel.ndim != 1: + raise IndexError('selection must be a 1d array') - # locate axes that need to get squeezed out later if doing advanced selection - if self.is_advanced: - self.squeeze_axes = tuple([i for i, dim_sel in enumerate(selection) - if isinstance(dim_sel, int)]) - else: - self.squeeze_axes = None + # check dtype + if dim_sel.dtype.kind not in 'ui': + raise IndexError('selection must be an integer array') - def normalize_selection(self, selection, array): - # normalize each dimension - selection = tuple(self.normalize_dim_selection(s, l, c) - for s, l, c in zip(selection, array._shape, array._chunks)) - return selection + # handle wraparound + loc_neg = dim_sel < 0 + if np.any(loc_neg): + dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len - def normalize_dim_selection(self, dim_sel, dim_len, dim_chunk_len): + # handle out of bounds + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('selection contains index out of bounds') - # normalize list to array - if isinstance(dim_sel, list): - dim_sel = np.asarray(dim_sel) + # handle non-monotonic indices + dim_sel_chunk = dim_sel // dim_chunk_len + if np.any(np.diff(dim_sel) < 0): + self.is_monotonic = False + # sort indices to group by chunk + self.dim_sort = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_sort) - if isinstance(dim_sel, numbers.Integral): + else: + self.is_monotonic = True + self.dim_sort = None + self.dim_sel = dim_sel - dim_sel = normalize_integer_selection(dim_sel, dim_len) - return dim_sel + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + self.nitems = len(self.dim_sel) - elif isinstance(dim_sel, slice): + # precompute number of selected items for each chunk + # note: for dense integer selections, the division 
operation here is the bottleneck + self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] - dim_sel = normalize_slice_selection(dim_sel, dim_len) + def __iter__(self): - # handle slice with step - if dim_sel.step != 1: - dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) - return IntArrayDimSelection(dim_sel, dim_len, dim_chunk_len) + for dim_chunk_ix in self.dim_chunk_ixs: - return dim_sel + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.is_monotonic: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_sort[start:stop] - elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset - if dim_sel.dtype == bool: - return BoolArrayDimSelection(dim_sel, dim_len, dim_chunk_len) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - elif dim_sel.dtype.kind in 'ui': - return IntArrayDimSelection(dim_sel, dim_len, dim_chunk_len) - else: - raise IndexError('unsupported index item type: %r' % dim_sel) +def slice_to_range(s): + return range(s.start, s.stop, 1 if s.step is None else s.step) - else: - raise IndexError('unsupported index item type: %r' % dim_sel) - def get_overlapping_chunks(self): - """Convenience function to find chunks overlapping an array selection. 
N.B., - assumes selection has already been normalized.""" +def ix_(*selection): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, with support for + slices and single ints.""" - # indices of chunks overlapping the selection - chunk_ranges = [] + # replace slice and int as these are not supported by numpy ix_() + selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) + else [dim_sel] if isinstance(dim_sel, int) + else dim_sel + for dim_sel in selection] - # shape of the selection - sel_shape = [] + selection = np.ix_(*selection) - # iterate over dimensions of the array - for dim_sel, dim_chunk_len in zip(self.selection, self.array._chunks): + return selection - # dim_sel: selection for current dimension - # dim_chunk_len: length of chunk along current dimension - dim_sel_len = None +def oindex(a, selection): + """Implementation of orthogonal indexing with slices and ints.""" + drop_axes = tuple([i for i, s in enumerate(selection) if isinstance(s, int)]) + selection = ix_(*selection) + result = a[selection] + if drop_axes: + result = result.squeeze(axis=drop_axes) + return result - if isinstance(dim_sel, int): - # dim selection is an integer, i.e., single item, so only need single chunk index for - # this dimension - dim_chunk_range = [dim_sel//dim_chunk_len] +class OrthogonalIndexer(object): - elif isinstance(dim_sel, slice): + def __init__(self, selection, array): - # dim selection is a slice, need range of chunk indices including start and stop of - # selection - dim_chunk_from = dim_sel.start//dim_chunk_len - dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) - dim_chunk_range = range(dim_chunk_from, dim_chunk_to) - dim_sel_len = dim_sel.stop - dim_sel.start + # ensure tuple + selection = ensure_tuple(selection) - elif isinstance(dim_sel, BoolArrayDimSelection): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) - # dim selection is a boolean array, delegate this to the BooleanSelection class - 
dim_chunk_range = dim_sel.get_overlapping_chunks() - dim_sel_len = dim_sel.nitems + # normalize list to array + selection = replace_lists(selection) - elif isinstance(dim_sel, IntArrayDimSelection): + # validation - check dimensionality + if len(selection) > len(array._shape): + raise IndexError('too many indices for array') + if len(selection) < len(array._shape): + raise IndexError('not enough indices for array') - # dim selection is an integer array, delegate this to the integerSelection class - dim_chunk_range = dim_sel.get_overlapping_chunks() - dim_sel_len = dim_sel.nitems + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - else: - raise RuntimeError('unexpected selection type') + if isinstance(dim_sel, numbers.Integral): - chunk_ranges.append(dim_chunk_range) - if dim_sel_len is not None: - sel_shape.append(dim_sel_len) + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - return chunk_ranges, tuple(sel_shape) + elif isinstance(dim_sel, slice): - def get_chunk_projection(self, chunk_coords): + # normalize so we can check for step + dim_sel = normalize_slice_selection(dim_sel, dim_len) - # chunk_coords: holds the index along each dimension for the current chunk within the - # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
+ # handle slice with step + if dim_sel.step != 1: + dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + else: + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - chunk_selection = [] - out_selection = [] + elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): - # iterate over dimensions (axes) of the array - for dim_sel, dim_chunk_idx, dim_chunk_len in zip(self.selection, chunk_coords, - self.array._chunks): + if dim_sel.dtype == bool: + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - # dim_sel: selection for current dimension - # dim_chunk_idx: chunk index along current dimension - # dim_chunk_len: chunk length along current dimension + elif dim_sel.dtype.kind in 'ui': + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - # selection into output array to store data from current chunk - dim_out_sel = None + else: + raise IndexError('bad selection type') - # calculate offset for current chunk along current dimension - this is used to - # determine the values to be extracted from the current chunk - dim_chunk_offset = dim_chunk_idx * dim_chunk_len + else: + raise IndexError('bad selection type') - # handle integer selection, i.e., single item - if isinstance(dim_sel, int): + dim_indexers.append(dim_indexer) - dim_chunk_sel = dim_sel - dim_chunk_offset + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers + if not isinstance(s, IntDimIndexer)) + self.is_advanced = any([not isinstance(dim_indexer, (IntDimIndexer, SliceDimIndexer)) + for dim_indexer in self.dim_indexers]) + if self.is_advanced: + self.drop_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) + if isinstance(dim_indexer, IntDimIndexer)]) + else: + self.drop_axes = None - # N.B., leave dim_out_sel as None, as this dimension has been dropped in the - # output array because of single value index + def __iter__(self): 
+ for dim_projections in itertools.product(*self.dim_indexers): - # handle slice selection, i.e., contiguous range of items - elif isinstance(dim_sel, slice): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) - if dim_sel.start <= dim_chunk_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - dim_out_offset = dim_chunk_offset - dim_sel.start + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # numpy doesn't support orthogonal indexing directly as yet, so need to work + # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices + # or integers, so need to convert slices and integers into ranges. + chunk_selection = ix_(*chunk_selection) - else: - # selection starts within current chunk - dim_chunk_sel_start = dim_sel.start - dim_chunk_offset - dim_out_offset = 0 + # special case for non-monotonic indices + if any([not isinstance(s, (int, slice)) for s in out_selection]): + out_selection = ix_(*out_selection) - if dim_sel.stop > dim_chunk_offset + dim_chunk_len: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - else: - # selection ends within current chunk - dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) +class OIndex(object): - elif isinstance(dim_sel, (BoolArrayDimSelection, IntArrayDimSelection)): + def __init__(self, array): + self.array = array - # get selection to extract data for the current chunk - dim_chunk_sel = dim_sel.get_chunk_sel(dim_chunk_idx) + def __getitem__(self, selection): + return 
self.array.get_orthogonal_selection(selection) - # figure out where to put these items in the output array - dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) + def __setitem__(self, selection, value): + return self.array.set_orthogonal_selection(selection, value) - else: - raise RuntimeError('unexpected selection type') - # add to chunk selection - chunk_selection.append(dim_chunk_sel) +def is_coordinate_selection(selection, array): + return ( + (len(selection) == len(array._shape)) and + all( + [(isinstance(dim_sel, numbers.Integral) or + (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) + for dim_sel in selection] + ) + ) - # add to output selection - if dim_out_sel is not None: - out_selection.append(dim_out_sel) - # normalise for indexing into numpy arrays - chunk_selection = tuple(chunk_selection) - out_selection = tuple(out_selection) +def is_mask_selection(selection, array): + return ( + hasattr(selection, 'dtype') and + selection.dtype == bool and + hasattr(selection, 'shape') and + len(selection.shape) == len(array.shape) + ) - # handle advanced indexing arrays orthogonally - if self.is_advanced: - # numpy doesn't support orthogonal indexing directly as yet, so need to work - # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices - # or integers, so need to convert slices and integers into ranges. 
- chunk_selection = ix_(*chunk_selection) - return chunk_selection, out_selection +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel + for dim_sel in selection + ) # noinspection PyProtectedMember -class CoordinateIndexer(IndexerBase): +class CoordinateIndexer(object): def __init__(self, selection, array): # some initial normalization - if not isinstance(selection, tuple): - selection = tuple(selection) + selection = ensure_tuple(selection) + selection = tuple([i] if isinstance(i, numbers.Integral) else i + for i in selection) selection = replace_lists(selection) # validation @@ -561,42 +550,15 @@ def __init__(self, selection, array): # TODO refactor error messages for consistency raise IndexError('invalid coordinate selection') - # more normalization - selection = self.normalize_selection(selection, array) - - # super initialisation - super(CoordinateIndexer, self).__init__(selection, array) - - # compute flattened chunk indices for each point selected - chunks_multi_index = tuple( - dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) - ) - chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, - dims=array._cdata_shape) - - # validated that indices are monotonically increasing - if np.any(np.diff(chunks_raveled_indices) < 0): - raise NotImplementedError('only monotonically increasing indices are supported') - - # compute various useful things - self.chunk_nitems = np.bincount(chunks_raveled_indices) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = len(chunks_raveled_indices) - self.sel_shape = (self.nitems,) - self.chunk_ranges = np.unravel_index(np.unique(chunks_raveled_indices), - dims=array._cdata_shape) - - def normalize_selection(self, selection, array): - # attempt to broadcast selection - this will raise error if array dimensions don't match selection = np.broadcast_arrays(*selection) + # normalization for dim_sel, dim_len in 
zip(selection, array.shape): # check number of dimensions, only support indexing with 1d array if len(dim_sel.shape) > 1: - raise IndexError('can only index with integer or 1-dimensional integer array') + raise IndexError('selection must be 1-dimensional integer array') # handle wraparound loc_neg = dim_sel < 0 @@ -608,162 +570,73 @@ def normalize_selection(self, selection, array): if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): raise IndexError('index out of bounds') - return selection - - def get_overlapping_chunks(self): - """Convenience function to find chunks overlapping an array selection. N.B., - assumes selection has already been normalized.""" - - return self.chunk_ranges, self.sel_shape - - def get_chunk_projection(self, chunk_coords): - - # chunk_coords: holds the index along each dimension for the current chunk within the - # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. - - chunk_idx = np.ravel_multi_index(*chunk_coords, dims=self.array._cdata_shape) - if chunk_idx == 0: - out_start = 0 - else: - out_start = self.chunk_nitems_cumsum[chunk_idx - 1] - out_stop = self.chunk_nitems_cumsum[chunk_idx] - out_selection = slice(out_start, out_stop) - - chunk_offsets = tuple( - dim_chunk_idx * dim_chunk_len - for dim_chunk_idx, dim_chunk_len in zip(chunk_coords, self.array._chunks) - ) - chunk_selection = tuple( - dim_sel[out_selection] - dim_chunk_offset - for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + # compute flattened chunk index for each point in the selection + chunks_multi_index = tuple( + dim_sel // dim_chunk_len + for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) ) + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, + dims=array._cdata_shape) - return chunk_selection, out_selection - - -def slice_to_range(dim_sel): - return range(dim_sel.start, dim_sel.stop, 1 if dim_sel.step is None else dim_sel.step) - - -def ix_(*selection): - """Convert an orthogonal 
selection to a numpy advanced (fancy) selection, with support for - slices and single ints.""" - - # replace slice and int as these are not supported by numpy ix_() - selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) - else [dim_sel] if isinstance(dim_sel, int) - else dim_sel - for dim_sel in selection] - - selection = np.ix_(*selection) - - return selection - - -class IntArrayDimSelection(object): - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # has to be a numpy array so we can do bincount - dim_sel = np.asanyarray(dim_sel) - - # check number of dimensions, only support indexing with 1d array - if len(dim_sel.shape) > 1: - raise IndexError('can only index with 1-dimensional integer array') - - # handle wraparound - loc_neg = dim_sel < 0 - if np.any(loc_neg): - dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len - - # handle out of bounds - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('index out of bounds') - - # validate monotonically increasing - if np.any(np.diff(dim_sel) < 0): - raise NotImplementedError('only monotonically increasing indices are supported') + # group points by chunk + sel_sort = np.argsort(chunks_raveled_indices) + chunks_raveled_indices = chunks_raveled_indices[sel_sort] + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + # store atrributes + self.selection = selection + self.sel_sort = sel_sort + self.shape = len(selection[0]) if selection[0].shape else 1 + self.drop_axes = None + self.array = array # precompute number of selected items for each chunk - # note: for dense integer selections, the division operation here is the bottleneck - self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) + self.chunk_nitems = np.bincount(chunks_raveled_indices, 
minlength=array.nchunks) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = len(dim_sel) - - def get_chunk_sel(self, dim_chunk_idx): - # need to slice out relevant indices from the total selection, then subtract the chunk - # offset + self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] - dim_out_sel = self.get_out_sel(dim_chunk_idx) - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_out_sel] - dim_chunk_offset - - return dim_chunk_sel - - def get_out_sel(self, dim_chunk_idx): - if dim_chunk_idx == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_idx] - return slice(start, stop) + # unravel + self.chunk_mixs = np.unravel_index(self.chunk_rixs, dims=array._cdata_shape) - def get_overlapping_chunks(self): - return np.nonzero(self.chunk_nitems)[0] + def __iter__(self): + # iterate over chunks + for i, chunk_rix in enumerate(self.chunk_rixs): -class BoolArrayDimSelection(object): + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + out_selection = self.sel_sort[start:stop] - def __init__(self, dim_sel, dim_len, dim_chunk_len): + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) + ) + chunk_selection = tuple( + dim_sel[start:stop] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + ) - # check number of dimensions, only support indexing with 1d array - if len(dim_sel.shape) > 1: - raise IndexError('can only index with 1-dimensional Boolean array') + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - # check shape - if dim_sel.shape[0] != dim_len: - raise IndexError('Boolean array has wrong length; expected %s, found %s' % - (dim_len, dim_sel.shape[0])) - # store 
attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) +class VIndex(object): - # precompute number of selected items for each chunk - self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') - for dim_chunk_idx in range(self.nchunks): - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( - self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] - ) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = self.chunk_nitems_cumsum[-1] + def __init__(self, array): + self.array = array - def get_chunk_sel(self, dim_chunk_idx): - dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] - # pad out if final chunk - if dim_chunk_sel.shape[0] < self.dim_chunk_len: - tmp = np.zeros(self.dim_chunk_len, dtype=bool) - tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel - dim_chunk_sel = tmp - return dim_chunk_sel - - def get_out_sel(self, dim_chunk_idx): - if dim_chunk_idx == 0: - start = 0 + def __getitem__(self, selection): + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.get_coordinate_selection(selection) + # elif is_mask_selection(selection, self.array): + # return self.array.get_mask_selection(selection) else: - start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_idx] - return slice(start, stop) + raise IndexError('unsupported selection') - def get_overlapping_chunks(self): - return np.nonzero(self.chunk_nitems)[0] + def __setitem__(self, selection, value): + return self.array.set_orthogonal_selection(selection, value) diff --git a/zarr/new_indexing.py b/zarr/new_indexing.py deleted file mode 100644 index b4a2610c19..0000000000 --- a/zarr/new_indexing.py +++ 
/dev/null @@ -1,659 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function, division -import numbers -import itertools -import collections - - -import numpy as np - - -def normalize_integer_selection(dim_sel, dim_len): - - # normalize type to int - dim_sel = int(dim_sel) - - # handle wraparound - if dim_sel < 0: - dim_sel = dim_len + dim_sel - - # handle out of bounds - if dim_sel >= dim_len or dim_sel < 0: - raise IndexError('index out of bounds') - - return dim_sel - - -ChunkDimProjection = collections.namedtuple('ChunkDimProjection', - ('dim_chunk_ix', 'dim_chunk_sel', 'dim_out_sel')) -"""A mapping from chunk to output array for a single dimension. - -Parameters ----------- -dim_chunk_ix - Index of chunk. -dim_chunk_sel - Selection of items from chunk array. -dim_out_sel - Selection of items in target (output) array. - -""" - - -class IntDimIndexer(object): - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # check type - if not isinstance(dim_sel, numbers.Integral): - raise ValueError('selection must be an integer') - - # normalize - dim_sel = normalize_integer_selection(dim_sel, dim_len) - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nitems = 1 - - def __iter__(self): - dim_chunk_ix = self.dim_sel // self.dim_chunk_len - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel - dim_offset - dim_out_sel = None - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def normalize_slice_selection(dim_sel, dim_len): - - # handle slice with None bound - start = 0 if dim_sel.start is None else dim_sel.start - stop = dim_len if dim_sel.stop is None else dim_sel.stop - step = 1 if dim_sel.step is None else dim_sel.step - - # handle wraparound - if start < 0: - start = dim_len + start - if stop < 0: - stop = dim_len + stop - - # handle out of bounds - if start < 0: - raise IndexError('start index out of bounds: %s' % 
dim_sel.start) - if stop < 0: - raise IndexError('stop index out of bounds: %s' % dim_sel.stop) - if start >= dim_len and dim_len > 0: - raise IndexError('start index out of bounds: %ss' % dim_sel.start) - if stop > dim_len: - stop = dim_len - if stop < start: - stop = start - - return slice(start, stop, step) - - -class SliceDimIndexer(object): - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # check type - if not isinstance(dim_sel, slice): - raise ValueError('selection must be a slice') - - # normalize - dim_sel = normalize_slice_selection(dim_sel, dim_len) - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nitems = dim_sel.stop - dim_sel.start - - def __iter__(self): - - dim_chunk_from = self.dim_sel.start // self.dim_chunk_len - dim_chunk_to = int(np.ceil(self.dim_sel.stop / self.dim_chunk_len)) - - for dim_chunk_ix in range(dim_chunk_from, dim_chunk_to): - - dim_offset = dim_chunk_ix * self.dim_chunk_len - - if self.dim_sel.start <= dim_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - dim_out_offset = dim_offset - self.dim_sel.start - - else: - # selection starts within current chunk - dim_chunk_sel_start = self.dim_sel.start - dim_offset - dim_out_offset = 0 - - if self.dim_sel.stop > (dim_offset + self.dim_chunk_len): - # selection ends after current chunk - dim_chunk_sel_stop = self.dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = self.dim_sel.stop - dim_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def replace_ellipsis(selection, shape): - - # count number of ellipsis present - n_ellipsis = sum(1 for i in selection if i is Ellipsis) - - if n_ellipsis > 1: - # more than 1 is an 
error - raise IndexError("an index can only have a single ellipsis ('...')") - - elif n_ellipsis == 1: - # locate the ellipsis, count how many items to left and right - n_items_l = selection.index(Ellipsis) # items to left of ellipsis - n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis - n_items = len(selection) - 1 # all non-ellipsis items - - if n_items >= len(shape): - # ellipsis does nothing, just remove it - selection = tuple(i for i in selection if i != Ellipsis) - - else: - # replace ellipsis with as many slices are needed for number of dims - new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) - if n_items_r: - new_item += selection[-n_items_r:] - selection = new_item - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += tuple(slice(0, l) for l in shape[len(selection):]) - - return selection - - -def ensure_tuple(v): - if not isinstance(v, tuple): - v = (v,) - return v - - -ChunkProjection = collections.namedtuple('ChunkProjection', - ('chunk_coords', 'chunk_selection', 'out_selection')) -"""A mapping of items from chunk to output array. Can be used to extract items from the chunk -array for loading into an output array. Can also be used to extract items from a value array for -setting/updating in a chunk array. - -Parameters ----------- -chunk_coords - Indices of chunk. -chunk_selection - Selection of items from chunk array. -out_selection - Selection of items in target (output) array. 
- -""" - - -# noinspection PyProtectedMember -class BasicIndexer(object): - - def __init__(self, selection, array): - - # ensure tuple - selection = ensure_tuple(selection) - - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - - # validation - check dimensionality - if len(selection) > len(array._shape): - raise IndexError('too many indices for array') - if len(selection) < len(array._shape): - raise IndexError('not enough indices for array') - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - - if isinstance(dim_sel, numbers.Integral): - dim_sel = normalize_integer_selection(dim_sel, dim_len) - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif isinstance(dim_sel, slice): - dim_sel = normalize_slice_selection(dim_sel, dim_len) - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError('bad selection type') - - dim_indexers.append(dim_indexer) - - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers - if not isinstance(s, IntDimIndexer)) - self.drop_axes = None - - def __iter__(self): - for dim_projections in itertools.product(*self.dim_indexers): - - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -class BoolArrayDimIndexer(object): - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # check number of dimensions - if len(dim_sel.shape) > 1: - raise IndexError('selection must be a 1d array') - - # check shape - if dim_sel.shape[0] != dim_len: - raise IndexError('selection has the wrong length') - - # store attributes - self.dim_sel = dim_sel - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - 
self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - - # precompute number of selected items for each chunk - self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') - for dim_chunk_ix in range(self.nchunks): - dim_offset = dim_chunk_ix * self.dim_chunk_len - self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( - self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] - ) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.nitems = self.chunk_nitems_cumsum[-1] - self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] - - def __iter__(self): - - # iterate over chunks with at least one item - for dim_chunk_ix in self.dim_chunk_ixs: - - # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_offset:dim_offset + self.dim_chunk_len] - - # pad out if final chunk - if dim_chunk_sel.shape[0] < self.dim_chunk_len: - tmp = np.zeros(self.dim_chunk_len, dtype=bool) - tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel - dim_chunk_sel = tmp - - # find region in output - if dim_chunk_ix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_ix] - dim_out_sel = slice(start, stop) - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -# def find_runs(x): -# """Find runs of consecutive items in an array.""" -# -# # ensure array -# x = np.asanyarray(x) -# if x.ndim != 1: -# raise ValueError('only 1D array supported') -# n = x.shape[0] -# -# # handle empty array -# if n == 0: -# return np.array([]), np.array([]), np.array([]) -# -# else: -# # find run starts -# loc_run_start = np.empty(n, dtype=bool) -# loc_run_start[0] = True -# np.not_equal(x[:-1], x[1:], out=loc_run_start[1:]) -# run_starts = np.nonzero(loc_run_start)[0] -# -# # find run values -# run_values = x[loc_run_start] -# -# # find run lengths -# run_lengths = np.diff(np.append(run_starts, n)) -# -# return run_values, run_starts, run_lengths - - -class 
IntArrayDimIndexer(object): - """Integer array selection against a single dimension.""" - - def __init__(self, dim_sel, dim_len, dim_chunk_len): - - # ensure array - dim_sel = np.asanyarray(dim_sel) - - # check number of dimensions - if dim_sel.ndim != 1: - raise IndexError('selection must be a 1d array') - - # check dtype - if dim_sel.dtype.kind not in 'ui': - raise IndexError('selection must be an integer array') - - # handle wraparound - loc_neg = dim_sel < 0 - if np.any(loc_neg): - dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len - - # handle out of bounds - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('selection contains index out of bounds') - - # handle non-monotonic indices - dim_sel_chunk = dim_sel // dim_chunk_len - if np.any(np.diff(dim_sel) < 0): - self.is_monotonic = False - # sort indices to group by chunk - self.dim_sort = np.argsort(dim_sel_chunk) - self.dim_sel = np.take(dim_sel, self.dim_sort) - - else: - self.is_monotonic = True - self.dim_sort = None - self.dim_sel = dim_sel - - # store attributes - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - self.nitems = len(self.dim_sel) - - # precompute number of selected items for each chunk - # note: for dense integer selections, the division operation here is the bottleneck - self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] - - def __iter__(self): - - for dim_chunk_ix in self.dim_chunk_ixs: - - # find region in output - if dim_chunk_ix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_ix] - if self.is_monotonic: - dim_out_sel = slice(start, stop) - else: - dim_out_sel = self.dim_sort[start:stop] - - # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = 
self.dim_sel[start:stop] - dim_offset - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def slice_to_range(s): - return range(s.start, s.stop, 1 if s.step is None else s.step) - - -def ix_(*selection): - """Convert an orthogonal selection to a numpy advanced (fancy) selection, with support for - slices and single ints.""" - - # replace slice and int as these are not supported by numpy ix_() - selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) - else [dim_sel] if isinstance(dim_sel, int) - else dim_sel - for dim_sel in selection] - - selection = np.ix_(*selection) - - return selection - - -class OrthogonalIndexer(object): - - def __init__(self, selection, array): - - # ensure tuple - selection = ensure_tuple(selection) - - # handle ellipsis - selection = replace_ellipsis(selection, array._shape) - - # normalize list to array - selection = replace_lists(selection) - - # validation - check dimensionality - if len(selection) > len(array._shape): - raise IndexError('too many indices for array') - if len(selection) < len(array._shape): - raise IndexError('not enough indices for array') - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - - if isinstance(dim_sel, numbers.Integral): - dim_sel = normalize_integer_selection(dim_sel, dim_len) - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif isinstance(dim_sel, slice): - - # normalize - dim_sel = normalize_slice_selection(dim_sel, dim_len) - - # handle slice with step - if dim_sel.step != 1: - dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - else: - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): - - if dim_sel.dtype == bool: - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif 
dim_sel.dtype.kind in 'ui': - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError('bad selection type') - - else: - raise IndexError('bad selection type') - - dim_indexers.append(dim_indexer) - - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers - if not isinstance(s, IntDimIndexer)) - self.is_advanced = any([not isinstance(dim_indexer, (IntDimIndexer, SliceDimIndexer)) - for dim_indexer in self.dim_indexers]) - if self.is_advanced: - self.drop_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) - if isinstance(dim_indexer, IntDimIndexer)]) - else: - self.drop_axes = None - - def __iter__(self): - for dim_projections in itertools.product(*self.dim_indexers): - - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) - - # handle advanced indexing arrays orthogonally - if self.is_advanced: - # numpy doesn't support orthogonal indexing directly as yet, so need to work - # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices - # or integers, so need to convert slices and integers into ranges. 
- chunk_selection = ix_(*chunk_selection) - - # special case for non-monotonic indices - if any([not isinstance(s, (int, slice)) for s in out_selection]): - out_selection = ix_(*out_selection) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -class OIndex(object): - - def __init__(self, array): - self.array = array - - def __getitem__(self, selection): - return self.array.get_orthogonal_selection(selection) - - def __setitem__(self, selection, value): - return self.array.set_orthogonal_selection(selection, value) - - -def is_coordinate_selection(selection, array): - return ( - (len(selection) == len(array._shape)) and - all( - [(isinstance(dim_sel, numbers.Integral) or - (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) - for dim_sel in selection] - ) - ) - - -def is_mask_selection(selection, array): - return ( - hasattr(selection, 'dtype') and - selection.dtype == bool and - hasattr(selection, 'shape') and - len(selection.shape) == len(array.shape) - ) - - -def replace_lists(selection): - return tuple( - np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel - for dim_sel in selection - ) - - -# noinspection PyProtectedMember -class CoordinateIndexer(object): - - def __init__(self, selection, array): - - # some initial normalization - selection = ensure_tuple(selection) - selection = tuple([i] if isinstance(i, numbers.Integral) else i - for i in selection) - selection = replace_lists(selection) - - # validation - if not is_coordinate_selection(selection, array): - # TODO refactor error messages for consistency - raise IndexError('invalid coordinate selection') - - # attempt to broadcast selection - this will raise error if array dimensions don't match - selection = np.broadcast_arrays(*selection) - - # normalization - for dim_sel, dim_len in zip(selection, array.shape): - - # check number of dimensions, only support indexing with 1d array - if len(dim_sel.shape) > 1: - raise IndexError('selection must be 1-dimensional 
integer array') - - # handle wraparound - loc_neg = dim_sel < 0 - if np.any(loc_neg): - # TODO need to take a copy here, or OK to replace? - dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len - - # handle out of bounds - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('index out of bounds') - - # compute flattened chunk index for each point in the selection - chunks_multi_index = tuple( - dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) - ) - chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, - dims=array._cdata_shape) - - # group points by chunk - sel_sort = np.argsort(chunks_raveled_indices) - chunks_raveled_indices = chunks_raveled_indices[sel_sort] - selection = tuple(dim_sel[sel_sort] for dim_sel in selection) - - # store atrributes - self.selection = selection - self.sel_sort = sel_sort - self.shape = len(selection[0]) if selection[0].shape else 1 - self.drop_axes = None - self.array = array - - # precompute number of selected items for each chunk - self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) - self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] - - # unravel - self.chunk_mixs = np.unravel_index(self.chunk_rixs, dims=array._cdata_shape) - - def __iter__(self): - - # iterate over chunks - for i, chunk_rix in enumerate(self.chunk_rixs): - - chunk_coords = tuple(m[i] for m in self.chunk_mixs) - if chunk_rix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[chunk_rix - 1] - stop = self.chunk_nitems_cumsum[chunk_rix] - out_selection = self.sel_sort[start:stop] - - chunk_offsets = tuple( - dim_chunk_ix * dim_chunk_len - for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) - ) - chunk_selection = tuple( - dim_sel[start:stop] - dim_chunk_offset - for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) - ) - - yield ChunkProjection(chunk_coords, 
chunk_selection, out_selection) - - -class VIndex(object): - - def __init__(self, array): - self.array = array - - def __getitem__(self, selection): - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array): - return self.array.get_coordinate_selection(selection) - # elif is_mask_selection(selection, self.array): - # return self.array.get_mask_selection(selection) - else: - raise IndexError('unsupported selection') - - def __setitem__(self, selection, value): - return self.array.set_orthogonal_selection(selection, value) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index cca98c0227..737830e04d 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -18,20 +18,9 @@ from zarr.errors import PermissionError from zarr.compat import PY2 from zarr.util import buffer_size -from zarr.indexing import ix_ from numcodecs import Delta, FixedScaleOffset, Zlib, Blosc, BZ2 -def oindex(a, selection): - """Implementation of orthogonal indexing with slices and ints.""" - squeeze_axes = tuple([i for i, s in enumerate(selection) if isinstance(s, int)]) - selection = ix_(*selection) - result = a[selection] - if squeeze_axes: - result = result.squeeze(axis=squeeze_axes) - return result - - # noinspection PyMethodMayBeStatic class TestArray(unittest.TestCase): @@ -734,595 +723,6 @@ def test_nchunks_initialized(self): z[:] = 42 eq(10, z.nchunks_initialized) - def _test_orthogonal_indexing_1d_common(self, a, z, ix): - expect = a[ix] - actual = z.get_orthogonal_selection(ix) - assert_array_equal(expect, actual) - actual = z.oindex[ix] - assert_array_equal(expect, actual) - # for 1d arrays, also available via __getitem__ - actual = z[ix] - assert_array_equal(expect, actual) - - # noinspection PyStatementEffect - def test_orthogonal_indexing_1d_bool(self): - - # setup - a = np.arange(1050, dtype=int) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - 
np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - self._test_orthogonal_indexing_1d_common(a, z, ix) - - # test errors - with assert_raises(IndexError): - z.oindex[np.zeros(50, dtype=bool)] # too short - with assert_raises(IndexError): - z.oindex[np.zeros(2000, dtype=bool)] # too long - with assert_raises(IndexError): - z.oindex[[[True, False], [False, True]]] # too many dimensions - - # noinspection PyStatementEffect - def test_orthogonal_indexing_1d_int(self): - - # setup - a = np.arange(1050, dtype=int) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - self._test_orthogonal_indexing_1d_common(a, z, ix) - ix.sort() - self._test_orthogonal_indexing_1d_common(a, z, ix) - - # test wraparound - ix = [0, 3, 10, -23, -12, -1] - expect = a[ix] - actual = z.oindex[ix] - assert_array_equal(expect, actual) - - # test not sorted - ix = [3, 105, 23, 127] # not monotonically increasing - expect = a[ix] - actual = z.oindex[ix] - assert_array_equal(expect, actual) - - # test errors - with assert_raises(IndexError): - ix = [a.shape[0] + 1] # out of bounds - z.oindex[ix] - with assert_raises(IndexError): - ix = [-(a.shape[0] + 1)] # out of bounds - z.oindex[ix] - with assert_raises(IndexError): - ix = [[2, 4], [6, 8]] # too many dimensions - z.oindex[ix] - - def test_orthogonal_indexing_1d_slice_with_step(self): - - # setup - a = np.arange(1050, dtype=int) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - selections = [ - slice(0, 1050), - slice(0, 1050, 1), - slice(0, 1050, 10), - slice(0, 1050, 100), - slice(0, 1050, 1000), - slice(50, 150, 1), - slice(50, 150, 10), - slice(50, 150, 100), - ] - for selection in selections: - expect = a[selection] 
- actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - # for 1d arrays also available via __getitem__ - actual = z[selection] - assert_array_equal(expect, actual) - - def _test_orthogonal_indexing_2d_common(self, a, z, ix0, ix1): - - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / slice - (ix0, slice(1, 5)), - (ix0, slice(1, 5, 2)), - (slice(250, 350), ix1), - (slice(250, 350, 10), ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - ] - - for selection in selections: - expect = oindex(a, selection) - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - - def test_orthogonal_indexing_2d_bool(self): - - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) - - # main tests - self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) - - # mixed int array / bool array - selections = ( - (ix0, np.nonzero(ix1)[0]), - (np.nonzero(ix0)[0], ix1), - ) - for selection in selections: - expect = oindex(a, selection) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - - def test_orthogonal_indexing_2d_int(self): - - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), 
replace=True) - self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) - ix0.sort() - ix1.sort() - self._test_orthogonal_indexing_2d_common(a, z, ix0, ix1) - - def _test_orthogonal_indexing_3d_common(self, a, z, ix0, ix1, ix2): - - selections = [ - # index all axes with array - (ix0, ix1, ix2), - # mixed indexing with single array / slices - (ix0, slice(15, 25), slice(1, 5)), - (slice(50, 70), ix1, slice(1, 5)), - (slice(50, 70), slice(15, 25), ix2), - (ix0, slice(15, 25, 5), slice(1, 5, 2)), - (slice(50, 70, 3), ix1, slice(1, 5, 2)), - (slice(50, 70, 3), slice(15, 25, 5), ix2), - # mixed indexing with single array / ints - (ix0, 42, 4), - (84, ix1, 4), - (84, 42, ix2), - # mixed indexing with single array / slice / int - (ix0, slice(15, 25), 4), - (42, ix1, slice(1, 5)), - (slice(50, 70), 42, ix2), - # mixed indexing with two array / slice - (ix0, ix1, slice(1, 5)), - (slice(50, 70), ix1, ix2), - (ix0, slice(15, 25), ix2), - # mixed indexing with two array / integer - (ix0, ix1, 4), - (42, ix1, ix2), - (ix0, 42, ix2), - ] - for selection in selections: - expect = oindex(a, selection) - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - - def test_orthogonal_indexing_3d_bool(self): - - # setup - a = np.arange(100000, dtype=int).reshape(200, 50, 10) - z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) - - def test_orthogonal_indexing_edge_cases(self): - - a = np.arange(6).reshape(1, 2, 3) - z = self.create_array(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) - z[:] = a - - expect = 
a[ix_([0], range(2), [0, 1, 2])].squeeze(axis=0) - actual = z.oindex[0, :, [0, 1, 2]] - assert_array_equal(expect, actual) - - expect = a[ix_([0], range(2), [True, True, True])].squeeze(axis=0) - actual = z.oindex[0, :, [True, True, True]] - assert_array_equal(expect, actual) - - def test_orthogonal_indexing_3d_int(self): - - # setup - a = np.arange(100000, dtype=int).reshape(200, 50, 10) - z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) - self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) - ix0.sort() - ix1.sort() - ix2.sort() - self._test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) - - def _test_orthogonal_indexing_1d_common_set(self, v, a, z, ix): - a[:] = 0 - a[ix] = v[ix] - z[:] = 0 - z.oindex[ix] = v[ix] - assert_array_equal(a, z[:]) - z[:] = 0 - z.set_orthogonal_selection(ix, v[ix]) - assert_array_equal(a, z[:]) - # also available via __getitem__ for 1d arrays - z[:] = 0 - z[ix] = v[ix] - assert_array_equal(a, z[:]) - - def test_orthogonal_indexing_1d_bool_set(self): - - # setup - v = np.arange(1050, dtype=int) - a = np.empty(v.shape, dtype=int) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) - - def test_orthogonal_indexing_1d_int_set(self): - - # setup - v = np.arange(1050, dtype=int) - a = np.empty(v.shape, dtype=v.dtype) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of 
sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) - ix.sort() - self._test_orthogonal_indexing_1d_common_set(v, a, z, ix) - - def _test_orthogonal_indexing_2d_common_set(self, v, a, z, ix0, ix1): - - selections = ( - # index both axes with array - (ix0, ix1), - # mixed indexing with array / slice or int - (ix0, slice(1, 5)), - (slice(250, 350), ix1), - (ix0, 4), - (42, ix1), - ) - for selection in selections: - a[:] = 0 - a[ix_(*selection)] = v[ix_(*selection)] - z[:] = 0 - z.oindex[selection] = oindex(v, selection) - assert_array_equal(a, z[:]) - z[:] = 0 - z.set_orthogonal_selection(selection, oindex(v, selection)) - assert_array_equal(a, z[:]) - - def test_orthogonal_indexing_2d_bool_set(self): - - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) - - def test_orthogonal_indexing_2d_int_set(self): - - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) - ix0.sort() - ix1.sort() - self._test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) - - def _test_orthogonal_indexing_3d_common_set(self, v, a, z, ix0, ix1, ix2): - - selections = ( - # 
index all axes with bool array - (ix0, ix1, ix2), - # mixed indexing with single bool array / slice or int - (ix0, slice(15, 25), slice(1, 5)), - (slice(50, 70), ix1, slice(1, 5)), - (slice(50, 70), slice(15, 25), ix2), - (ix0, 42, 4), - (84, ix1, 4), - (84, 42, ix2), - (ix0, slice(15, 25), 4), - (slice(50, 70), ix1, 4), - (slice(50, 70), 42, ix2), - # indexing with two arrays / slice - (ix0, ix1, slice(1, 5)), - # indexing with two arrays / integer - (ix0, ix1, 4), - ) - for selection in selections: - a[:] = 0 - a[ix_(*selection)] = v[ix_(*selection)] - z[:] = 0 - z.oindex[selection] = oindex(v, selection) - assert_array_equal(a, z[:]) - z[:] = 0 - z.set_orthogonal_selection(selection, oindex(v, selection)) - assert_array_equal(a, z[:]) - - def test_orthogonal_indexing_3d_bool_set(self): - - # setup - v = np.arange(100000, dtype=int).reshape(200, 50, 10) - a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) - - def test_orthogonal_indexing_3d_int_set(self): - - # setup - v = np.arange(100000, dtype=int).reshape(200, 50, 10) - a = np.empty_like(v) - z = self.create_array(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) - self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) - ix0.sort() - ix1.sort() - ix2.sort() - 
self._test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) - - # noinspection PyStatementEffect - def test_coordinate_indexing_1d(self): - - # setup - a = np.arange(1050, dtype=int) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) - ix.sort() - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) - - # test single item - ix = 42 - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - - # test wraparound - ix = [0, 3, 10, -23, -12, -1] - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - - # test out of order - ix = [3, 105, 23, 127] # not monotonically increasing - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - - # test errors - with assert_raises(IndexError): - ix = [a.shape[0] + 1] # out of bounds - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = [-(a.shape[0] + 1)] # out of bounds - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = [[2, 4], [6, 8]] # too many dimensions - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = slice(5, 15) - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = Ellipsis - z.get_coordinate_selection(ix) - - def test_coordinate_indexing_2d(self): - - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - 
for p in 0.5, 0.1, 0.01: - n = int(a.size * p) - ix0 = np.random.choice(a.shape[0], size=n, replace=True) - ix1 = np.random.choice(a.shape[1], size=n, replace=True) - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - (42, 4), - ] - - for selection in selections: - expect = a[selection] - actual = z.get_coordinate_selection(selection) - assert_array_equal(expect, actual) - actual = z.vindex[selection] - assert_array_equal(expect, actual) - - srt = np.lexsort((ix0, ix1)) - ix0 = ix0[srt] - ix1 = ix1[srt] - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - (42, 4), - ] - - for selection in selections: - expect = a[selection] - actual = z.get_coordinate_selection(selection) - assert_array_equal(expect, actual) - actual = z.vindex[selection] - assert_array_equal(expect, actual) - - # not monotonically increasing (first dim) - ix0 = [3, 3, 4, 2, 5] - ix1 = [1, 3, 5, 7, 9] - expect = a[ix0, ix1] - actual = z.get_coordinate_selection((ix0, ix1)) - assert_array_equal(expect, actual) - - # not monotonically increasing (second dim) - ix0 = [1, 1, 2, 2, 5] - ix1 = [1, 3, 2, 1, 0] - expect = a[ix0, ix1] - actual = z.get_coordinate_selection((ix0, ix1)) - assert_array_equal(expect, actual) - - with assert_raises(IndexError): - selection = slice(5, 15), [1, 2, 3] - z.get_coordinate_selection(selection) - with assert_raises(IndexError): - selection = [1, 2, 3], slice(5, 15) - z.get_coordinate_selection(selection) - with assert_raises(IndexError): - selection = Ellipsis, [1, 2, 3] - z.get_coordinate_selection(selection) - - def test_get_selection_out(self): - - # basic selections - a = np.arange(1050) - z = self.create_array(shape=1050, chunks=100, dtype=a.dtype) - z[:] = a - selections = [ - slice(50, 150), - slice(0, 1050), - slice(1, 2), - ] - for selection in selections: - expect = a[selection] - out = 
self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) - z.get_basic_selection(selection, out=out) - assert_array_equal(expect, out[:]) - - # orthogonal selections - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = self.create_array(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a - np.random.seed(42) - # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / slice - (ix0, slice(1, 5)), - (slice(250, 350), ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), - # mixed int array / bool array - (ix0, np.nonzero(ix1)[0]), - (np.nonzero(ix0)[0], ix1), - ] - for selection in selections: - expect = oindex(a, selection) - # out = self.create_array(shape=expect.shape, chunks=10, dtype=expect.dtype, - # fill_value=0) - out = np.zeros(expect.shape, dtype=expect.dtype) - z.get_orthogonal_selection(selection, out=out) - assert_array_equal(expect, out[:]) - - # TODO coordinate selection - - # TODO mask selection - - # TODO selection with fields - class TestArrayWithPath(TestArray): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py new file mode 100644 index 0000000000..99bb9ed7fa --- /dev/null +++ b/zarr/tests/test_indexing.py @@ -0,0 +1,695 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, division + + +import numpy as np +from numpy.testing import assert_array_equal +from nose.tools import assert_raises, eq_ as eq + + +from zarr.indexing import normalize_integer_selection, normalize_slice_selection, \ + replace_ellipsis, ix_, oindex +import zarr + + +def test_normalize_integer_selection(): + + eq(1, normalize_integer_selection(1, 100)) + eq(99, normalize_integer_selection(-1, 100)) + with assert_raises(IndexError): + 
normalize_integer_selection(100, 100) + with assert_raises(IndexError): + normalize_integer_selection(1000, 100) + with assert_raises(IndexError): + normalize_integer_selection(-1000, 100) + + +def test_normalize_slice_selection(): + + eq(slice(0, 100, 1), normalize_slice_selection(slice(None), 100)) + eq(slice(0, 100, 1), normalize_slice_selection(slice(None, 100), 100)) + eq(slice(0, 100, 1), normalize_slice_selection(slice(0, None), 100)) + eq(slice(0, 100, 1), normalize_slice_selection(slice(0, 1000), 100)) + eq(slice(99, 100, 1), normalize_slice_selection(slice(-1, None), 100)) + eq(slice(98, 99, 1), normalize_slice_selection(slice(-2, -1), 100)) + eq(slice(10, 10, 1), normalize_slice_selection(slice(10, 0), 100)) + with assert_raises(IndexError): + normalize_slice_selection(slice(100, None), 100) + with assert_raises(IndexError): + normalize_slice_selection(slice(1000, 2000), 100) + with assert_raises(IndexError): + normalize_slice_selection(slice(-1000, 0), 100) + + +def test_replace_ellipsis(): + + # 1D, single item + eq((0,), replace_ellipsis(0, (100,))) + + # 1D + eq((slice(None),), replace_ellipsis(Ellipsis, (100,))) + eq((slice(None),), replace_ellipsis(slice(None), (100,))) + eq((slice(None, 100),), replace_ellipsis(slice(None, 100), (100,))) + eq((slice(0, None),), replace_ellipsis(slice(0, None), (100,))) + eq((slice(None),), replace_ellipsis((slice(None), Ellipsis), (100,))) + eq((slice(None),), replace_ellipsis((Ellipsis, slice(None)), (100,))) + + # 2D, single item + eq((0, 0), replace_ellipsis((0, 0), (100, 100))) + eq((-1, 1), replace_ellipsis((-1, 1), (100, 100))) + + # 2D, single col/row + eq((0, slice(None)), replace_ellipsis((0, slice(None)), (100, 100))) + eq((0, slice(None)), replace_ellipsis((0,), (100, 100))) + eq((slice(None), 0), replace_ellipsis((slice(None), 0), (100, 100))) + + # 2D slice + eq((slice(None), slice(None)), + replace_ellipsis(Ellipsis, (100, 100))) + eq((slice(None), slice(None)), + replace_ellipsis(slice(None), (100, 
100))) + eq((slice(None), slice(None)), + replace_ellipsis((slice(None), slice(None)), (100, 100))) + eq((slice(None), slice(None)), + replace_ellipsis((Ellipsis, slice(None)), (100, 100))) + eq((slice(None), slice(None)), + replace_ellipsis((slice(None), Ellipsis), (100, 100))) + eq((slice(None), slice(None)), + replace_ellipsis((slice(None), Ellipsis, slice(None)), (100, 100))) + eq((slice(None), slice(None)), + replace_ellipsis((Ellipsis, slice(None), slice(None)), (100, 100))) + eq((slice(None), slice(None)), + replace_ellipsis((slice(None), slice(None), Ellipsis), (100, 100))) + + +def _test_orthogonal_indexing_1d_common(a, z, ix): + expect = a[ix] + actual = z.get_orthogonal_selection(ix) + assert_array_equal(expect, actual) + actual = z.oindex[ix] + assert_array_equal(expect, actual) + # for 1d arrays, also available via __getitem__ + actual = z[ix] + assert_array_equal(expect, actual) + + +# noinspection PyStatementEffect +def test_orthogonal_indexing_1d_bool(): + + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_orthogonal_indexing_1d_common(a, z, ix) + + # test errors + with assert_raises(IndexError): + z.oindex[np.zeros(50, dtype=bool)] # too short + with assert_raises(IndexError): + z.oindex[np.zeros(2000, dtype=bool)] # too long + with assert_raises(IndexError): + z.oindex[[[True, False], [False, True]]] # too many dimensions + + +# noinspection PyStatementEffect +def test_orthogonal_indexing_1d_int(): + + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_orthogonal_indexing_1d_common(a, 
z, ix) + ix.sort() + _test_orthogonal_indexing_1d_common(a, z, ix) + + # test wraparound + ix = [0, 3, 10, -23, -12, -1] + expect = a[ix] + actual = z.oindex[ix] + assert_array_equal(expect, actual) + + # explicit test not sorted + ix = [3, 105, 23, 127] # not monotonically increasing + expect = a[ix] + actual = z.oindex[ix] + assert_array_equal(expect, actual) + + # test errors + with assert_raises(IndexError): + ix = [a.shape[0] + 1] # out of bounds + z.oindex[ix] + with assert_raises(IndexError): + ix = [-(a.shape[0] + 1)] # out of bounds + z.oindex[ix] + with assert_raises(IndexError): + ix = [[2, 4], [6, 8]] # too many dimensions + z.oindex[ix] + + +def test_orthogonal_indexing_1d_slice_with_step(): + + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + selections = [ + slice(0, 1050), + slice(0, 1050, 1), + slice(0, 1050, 10), + slice(0, 1050, 100), + slice(0, 1050, 1000), + slice(50, 150, 1), + slice(50, 150, 10), + slice(50, 150, 100), + ] + for selection in selections: + expect = a[selection] + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + # for 1d arrays also available via __getitem__ + actual = z[selection] + assert_array_equal(expect, actual) + + +def _test_orthogonal_indexing_2d_common(a, z, ix0, ix1): + + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (ix0, slice(1, 5, 2)), + (slice(250, 350), ix1), + (slice(250, 350, 10), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + + for selection in selections: + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + + +def test_orthogonal_indexing_2d_bool(): + + # setup + a = np.arange(10000, 
dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + + # main tests + _test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + + # mixed int array / bool array + selections = ( + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ) + for selection in selections: + expect = oindex(a, selection) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + + +def test_orthogonal_indexing_2d_int(): + + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + _test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + + +def _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2): + + selections = [ + # index all axes with array + (ix0, ix1, ix2), + # mixed indexing with single array / slices + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, slice(15, 25, 5), slice(1, 5, 2)), + (slice(50, 70, 3), ix1, slice(1, 5, 2)), + (slice(50, 70, 3), slice(15, 25, 5), ix2), + # mixed indexing with single array / ints + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + # mixed indexing with single array / slice / int + (ix0, slice(15, 25), 4), + (42, ix1, slice(1, 5)), + (slice(50, 70), 42, ix2), + # mixed indexing with two array / slice + (ix0, ix1, slice(1, 5)), + (slice(50, 70), ix1, ix2), + (ix0, slice(15, 25), ix2), + # 
mixed indexing with two array / integer + (ix0, ix1, 4), + (42, ix1, ix2), + (ix0, 42, ix2), + ] + for selection in selections: + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + + +def test_orthogonal_indexing_3d_bool(): + + # setup + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) + _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_edge_cases(): + + a = np.arange(6).reshape(1, 2, 3) + z = zarr.create(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) + z[:] = a + + expect = a[ix_([0], range(2), [0, 1, 2])].squeeze(axis=0) + actual = z.oindex[0, :, [0, 1, 2]] + assert_array_equal(expect, actual) + + expect = a[ix_([0], range(2), [True, True, True])].squeeze(axis=0) + actual = z.oindex[0, :, [True, True, True]] + assert_array_equal(expect, actual) + + +def test_orthogonal_indexing_3d_int(): + + # setup + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) + _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() + ix2.sort() + _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + + +def 
_test_orthogonal_indexing_1d_common_set(v, a, z, ix): + a[:] = 0 + a[ix] = v[ix] + z[:] = 0 + z.oindex[ix] = v[ix] + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_orthogonal_selection(ix, v[ix]) + assert_array_equal(a, z[:]) + # also available via __getitem__ for 1d arrays + z[:] = 0 + z[ix] = v[ix] + assert_array_equal(a, z[:]) + + +def test_orthogonal_indexing_1d_bool_set(): + + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_orthogonal_indexing_1d_common_set(v, a, z, ix) + + +def test_orthogonal_indexing_1d_int_set(): + + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_orthogonal_indexing_1d_common_set(v, a, z, ix) + ix.sort() + _test_orthogonal_indexing_1d_common_set(v, a, z, ix) + + +def _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1): + + selections = ( + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice or int + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + (ix0, 4), + (42, ix1), + ) + for selection in selections: + a[:] = 0 + a[ix_(*selection)] = v[ix_(*selection)] + z[:] = 0 + z.oindex[selection] = oindex(v, selection) + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_orthogonal_selection(selection, oindex(v, selection)) + assert_array_equal(a, z[:]) + + +def test_orthogonal_indexing_2d_bool_set(): + + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different 
degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + + +def test_orthogonal_indexing_2d_int_set(): + + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + + +def _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2): + + selections = ( + # index all axes with bool array + (ix0, ix1, ix2), + # mixed indexing with single bool array / slice or int + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + (ix0, slice(15, 25), 4), + (slice(50, 70), ix1, 4), + (slice(50, 70), 42, ix2), + # indexing with two arrays / slice + (ix0, ix1, slice(1, 5)), + # indexing with two arrays / integer + (ix0, ix1, 4), + ) + for selection in selections: + a[:] = 0 + a[ix_(*selection)] = v[ix_(*selection)] + z[:] = 0 + z.oindex[selection] = oindex(v, selection) + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_orthogonal_selection(selection, oindex(v, selection)) + assert_array_equal(a, z[:]) + + +def test_orthogonal_indexing_3d_bool_set(): + + # setup + v = np.arange(100000, dtype=int).reshape(200, 50, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = 
np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) + _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_3d_int_set(): + + # setup + v = np.arange(100000, dtype=int).reshape(200, 50, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) + _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() + ix2.sort() + _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + + +# noinspection PyStatementEffect +def test_coordinate_indexing_1d(): + + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) + ix.sort() + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) + + # test single item + ix = 42 + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + + # test wraparound + ix = [0, 3, 10, -23, -12, -1] + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + + # test out of order + ix = [3, 105, 23, 127] # not monotonically 
increasing + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + + # test errors + with assert_raises(IndexError): + ix = [a.shape[0] + 1] # out of bounds + z.get_coordinate_selection(ix) + with assert_raises(IndexError): + ix = [-(a.shape[0] + 1)] # out of bounds + z.get_coordinate_selection(ix) + with assert_raises(IndexError): + ix = [[2, 4], [6, 8]] # too many dimensions + z.get_coordinate_selection(ix) + with assert_raises(IndexError): + ix = slice(5, 15) + z.get_coordinate_selection(ix) + with assert_raises(IndexError): + ix = Ellipsis + z.get_coordinate_selection(ix) + + +def test_coordinate_indexing_2d(): + + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + (42, 4), + ] + + for selection in selections: + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + srt = np.lexsort((ix0, ix1)) + ix0 = ix0[srt] + ix1 = ix1[srt] + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + (42, 4), + ] + + for selection in selections: + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + # not monotonically increasing (first dim) + ix0 = [3, 3, 4, 2, 5] + ix1 = [1, 3, 5, 7, 9] + expect = a[ix0, ix1] + actual = z.get_coordinate_selection((ix0, ix1)) + 
assert_array_equal(expect, actual) + + # not monotonically increasing (second dim) + ix0 = [1, 1, 2, 2, 5] + ix1 = [1, 3, 2, 1, 0] + expect = a[ix0, ix1] + actual = z.get_coordinate_selection((ix0, ix1)) + assert_array_equal(expect, actual) + + with assert_raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.get_coordinate_selection(selection) + with assert_raises(IndexError): + selection = [1, 2, 3], slice(5, 15) + z.get_coordinate_selection(selection) + with assert_raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_coordinate_selection(selection) + + +def test_get_selection_out(): + + # basic selections + a = np.arange(1050) + z = zarr.create(shape=1050, chunks=100, dtype=a.dtype) + z[:] = a + selections = [ + slice(50, 150), + slice(0, 1050), + slice(1, 2), + ] + for selection in selections: + expect = a[selection] + out = zarr.create(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) + z.get_basic_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + # orthogonal selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + # mixed int array / bool array + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ] + for selection in selections: + expect = oindex(a, selection) + # out = zarr.create(shape=expect.shape, chunks=10, dtype=expect.dtype, + # fill_value=0) + out = np.zeros(expect.shape, dtype=expect.dtype) + z.get_orthogonal_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + +# TODO 
mask selection + +# TODO selection with fields + + diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index 0cdbca77f3..3f41ccefa7 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -5,9 +5,8 @@ from nose.tools import eq_ as eq, assert_raises, assert_true, assert_false, \ assert_is_instance -from zarr.util import normalize_shape, normalize_chunks, is_total_slice, normalize_dim_selection, \ - normalize_array_selection, normalize_resize_args, human_readable_size, normalize_order, \ - guess_chunks, info_html_report, info_text_report +from zarr.util import normalize_shape, normalize_chunks, is_total_slice, normalize_resize_args, \ + human_readable_size, normalize_order, guess_chunks, info_html_report, info_text_report def test_normalize_shape(): @@ -66,84 +65,6 @@ def test_is_total_slice(): is_total_slice('foo', (100,)) -def test_normalize_axis_selection(): - - # single item - eq(1, normalize_dim_selection(1, 100, 10)) - eq(99, normalize_dim_selection(-1, 100, 10)) - with assert_raises(IndexError): - normalize_dim_selection(100, 100, 10) - with assert_raises(IndexError): - normalize_dim_selection(1000, 100, 10) - with assert_raises(IndexError): - normalize_dim_selection(-1000, 100, 10) - - # slice - eq(slice(0, 100), normalize_dim_selection(slice(None), 100, 10)) - eq(slice(0, 100), normalize_dim_selection(slice(None, 100), 100, 10)) - eq(slice(0, 100), normalize_dim_selection(slice(0, None), 100, 10)) - eq(slice(0, 100), normalize_dim_selection(slice(0, 1000), 100, 10)) - eq(slice(99, 100), normalize_dim_selection(slice(-1, None), 100, 10)) - eq(slice(98, 99), normalize_dim_selection(slice(-2, -1), 100, 10)) - eq(slice(10, 10), normalize_dim_selection(slice(10, 0), 100, 10)) - with assert_raises(IndexError): - normalize_dim_selection(slice(100, None), 100, 10) - with assert_raises(IndexError): - normalize_dim_selection(slice(1000, 2000), 100, 10) - with assert_raises(IndexError): - normalize_dim_selection(slice(-1000, 0), 100, 10) - - 
with assert_raises(IndexError): - normalize_dim_selection('foo', 100, 10) - - with assert_raises(NotImplementedError): - normalize_dim_selection(slice(0, 100, 2), 100, 10) - - -def test_normalize_array_selection(): - - # 1D, single item - eq((0,), normalize_array_selection(0, (100,), (10,))) - - # 1D, slice - eq((slice(0, 100),), normalize_array_selection(Ellipsis, (100,), (10,))) - eq((slice(0, 100),), normalize_array_selection(slice(None), (100,), (10,))) - eq((slice(0, 100),), normalize_array_selection(slice(None, 100), (100,), (10,))) - eq((slice(0, 100),), normalize_array_selection(slice(0, None), (100,), (10,))) - eq((slice(0, 100),), normalize_array_selection((slice(None), Ellipsis), (100,), (10,))) - eq((slice(0, 100),), normalize_array_selection((Ellipsis, slice(None)), (100,), (10,))) - - # 2D, single item - eq((0, 0), normalize_array_selection((0, 0), (100, 100), (10, 10))) - eq((99, 1), normalize_array_selection((-1, 1), (100, 100), (10, 10))) - - # 2D, single col/row - eq((0, slice(0, 100)), normalize_array_selection((0, slice(None)), (100, 100), (10, 10))) - eq((0, slice(0, 100)), normalize_array_selection((0,), (100, 100), (10, 10))) - eq((slice(0, 100), 0), normalize_array_selection((slice(None), 0), (100, 100), (10, 10))) - - # 2D slice - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection(Ellipsis, (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection(slice(None), (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), slice(None)), (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((Ellipsis, slice(None)), (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), Ellipsis), (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), Ellipsis, slice(None)), (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - 
normalize_array_selection((Ellipsis, slice(None), slice(None)), (100, 100), (10, 10))) - eq((slice(0, 100), slice(0, 100)), - normalize_array_selection((slice(None), slice(None), Ellipsis), (100, 100), (10, 10))) - - with assert_raises(IndexError): - normalize_array_selection('foo', (100,), (10,)) - - def test_normalize_resize_args(): # 1D diff --git a/zarr/util.py b/zarr/util.py index 9d770ffbb3..a68f723f3f 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -134,381 +134,6 @@ def is_total_slice(item, shape): raise TypeError('expected slice or tuple of slices, found %r' % item) -# class BoolArraySelection(object): -# -# def __init__(self, dim_sel, dim_len, dim_chunk_len): -# -# # check number of dimensions, only support indexing with 1d array -# if len(dim_sel.shape) > 1: -# raise IndexError('can only index with 1-dimensional Boolean array') -# -# # check shape -# if dim_sel.shape[0] != dim_len: -# raise IndexError('Boolean array has wrong length; expected %s, found %s' % -# (dim_len, dim_sel.shape[0])) -# -# # store attributes -# self.dim_sel = dim_sel -# self.dim_len = dim_len -# self.dim_chunk_len = dim_chunk_len -# self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) -# -# # precompute number of selected items for each chunk -# self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') -# for dim_chunk_idx in range(self.nchunks): -# dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len -# self.chunk_nitems[dim_chunk_idx] = np.count_nonzero( -# self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] -# ) -# self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) -# self.nitems = self.chunk_nitems_cumsum[-1] -# -# def get_chunk_sel(self, dim_chunk_idx): -# dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len -# dim_chunk_sel = self.dim_sel[dim_chunk_offset:dim_chunk_offset + self.dim_chunk_len] -# # pad out if final chunk -# if dim_chunk_sel.shape[0] < self.dim_chunk_len: -# tmp = np.zeros(self.dim_chunk_len, dtype=bool) -# 
tmp[:dim_chunk_sel.shape[0]] = dim_chunk_sel -# dim_chunk_sel = tmp -# return dim_chunk_sel -# -# def get_out_sel(self, dim_chunk_idx): -# if dim_chunk_idx == 0: -# start = 0 -# else: -# start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] -# stop = self.chunk_nitems_cumsum[dim_chunk_idx] -# return slice(start, stop) -# -# def get_chunk_ranges(self): -# return np.nonzero(self.chunk_nitems)[0] -# -# -# class IntArraySelection(object): -# -# def __init__(self, dim_sel, dim_len, dim_chunk_len): -# -# # has to be a numpy array so we can do bincount -# dim_sel = np.asanyarray(dim_sel) -# -# # check number of dimensions, only support indexing with 1d array -# if len(dim_sel.shape) > 1: -# raise IndexError('can only index with 1-dimensional integer array') -# -# # handle wraparound -# loc_neg = dim_sel < 0 -# if np.any(loc_neg): -# dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len -# -# # handle out of bounds -# if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): -# raise IndexError('index out of bounds') -# -# # validate monotonically increasing -# if np.any(np.diff(dim_sel) < 0): -# raise NotImplementedError('only monotonically increasing indices are supported') -# -# # store attributes -# self.dim_sel = dim_sel -# self.dim_len = dim_len -# self.dim_chunk_len = dim_chunk_len -# self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) -# -# # precompute number of selected items for each chunk -# # note: for dense integer selections, the division operation here is the bottleneck -# self.chunk_nitems = np.bincount(self.dim_sel // self.dim_chunk_len, minlength=self.nchunks) -# self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) -# self.nitems = len(dim_sel) -# -# def get_chunk_sel(self, dim_chunk_idx): -# # need to slice out relevant indices from the total selection, then subtract the chunk -# # offset -# -# dim_out_sel = self.get_out_sel(dim_chunk_idx) -# dim_chunk_offset = dim_chunk_idx * self.dim_chunk_len -# dim_chunk_sel = self.dim_sel[dim_out_sel] - 
dim_chunk_offset -# -# return dim_chunk_sel -# -# def get_out_sel(self, dim_chunk_idx): -# if dim_chunk_idx == 0: -# start = 0 -# else: -# start = self.chunk_nitems_cumsum[dim_chunk_idx - 1] -# stop = self.chunk_nitems_cumsum[dim_chunk_idx] -# return slice(start, stop) -# -# def get_chunk_ranges(self): -# return np.nonzero(self.chunk_nitems)[0] -# -# -# # TODO support slice with step via integer selection (convert to np.arange) -# -# -# def normalize_dim_selection(dim_sel, dim_len, dim_chunk_len): -# """Convenience function to normalize a selection within a single axis -# of size `dim_len` for an array with chunk length `dim_chunk_len`.""" -# -# # normalize list to array -# if isinstance(dim_sel, list): -# dim_sel = np.asarray(dim_sel) -# -# if isinstance(dim_sel, numbers.Integral): -# -# # normalize type to int -# dim_sel = int(dim_sel) -# -# # handle wraparound -# if dim_sel < 0: -# dim_sel = dim_len + dim_sel -# -# # handle out of bounds -# if dim_sel >= dim_len or dim_sel < 0: -# raise IndexError('index out of bounds: %s' % dim_sel) -# -# return dim_sel -# -# elif isinstance(dim_sel, slice): -# -# # handle slice with step -# if dim_sel.step is not None and dim_sel.step != 1: -# raise NotImplementedError('slice with step not implemented') -# -# # handle slice with None bound -# start = 0 if dim_sel.start is None else dim_sel.start -# stop = dim_len if dim_sel.stop is None else dim_sel.stop -# -# # handle wraparound -# if start < 0: -# start = dim_len + start -# if stop < 0: -# stop = dim_len + stop -# -# # handle zero-length axis -# if start == stop == dim_len == 0: -# return slice(0, 0) -# -# # handle out of bounds -# if start < 0: -# raise IndexError('start index out of bounds: %s' % dim_sel.start) -# if stop < 0: -# raise IndexError('stop index out of bounds: %s' % dim_sel.stop) -# if start >= dim_len: -# raise IndexError('start index out of bounds: %ss' % dim_sel.start) -# if stop > dim_len: -# stop = dim_len -# if stop < start: -# stop = start -# -# return 
slice(start, stop) -# -# elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): -# -# if dim_sel.dtype == bool: -# return BoolArraySelection(dim_sel, dim_len, dim_chunk_len) -# -# elif dim_sel.dtype.kind in 'ui': -# return IntArraySelection(dim_sel, dim_len, dim_chunk_len) -# -# else: -# raise IndexError('unsupported index item type: %r' % dim_sel) -# -# else: -# raise IndexError('unsupported index item type: %r' % dim_sel) -# -# -# # noinspection PyTypeChecker -# def normalize_array_selection(selection, shape, chunks): -# """Convenience function to normalize a selection within an array with -# the given `shape`.""" -# -# # ensure tuple -# if not isinstance(selection, tuple): -# selection = (selection,) -# -# # handle ellipsis -# n_ellipsis = sum(1 for i in selection if i is Ellipsis) -# if n_ellipsis > 1: -# raise IndexError("an index can only have a single ellipsis ('...')") -# elif n_ellipsis == 1: -# n_items_l = selection.index(Ellipsis) # items to left of ellipsis -# n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis -# n_items = len(selection) - 1 # all non-ellipsis items -# if n_items >= len(shape): -# # ellipsis does nothing, just remove it -# selection = tuple(i for i in selection if i != Ellipsis) -# else: -# # replace ellipsis with as many slices are needed for number of dims -# new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) -# if n_items_r: -# new_item += selection[-n_items_r:] -# selection = new_item -# -# # check dimensionality -# if len(selection) > len(shape): -# raise IndexError('too many indices for array') -# -# # determine start and stop indices for all axes -# selection = tuple(normalize_dim_selection(i, l, c) for i, l, c in zip(selection, shape, chunks)) -# -# # fill out selection if not completely specified -# if len(selection) < len(shape): -# selection += tuple(slice(0, l) for l in shape[len(selection):]) -# -# return selection -# -# -# def get_chunks_for_selection(selection, 
chunks): -# """Convenience function to find chunks overlapping an array selection. N.B., -# assumes selection has already been normalized.""" -# -# # indices of chunks overlapping the selection -# chunk_ranges = [] -# -# # shape of the selection -# sel_shape = [] -# -# # iterate over dimensions of the array -# for dim_sel, dim_chunk_len in zip(selection, chunks): -# -# # dim_sel: selection for current dimension -# # dim_chunk_len: length of chunk along current dimension -# -# dim_sel_len = None -# -# if isinstance(dim_sel, int): -# -# # dim selection is an integer, i.e., single item, so only need single chunk index for -# # this dimension -# dim_chunk_range = [dim_sel//dim_chunk_len] -# -# elif isinstance(dim_sel, slice): -# -# # dim selection is a slice, need range of chunk indices including start and stop of -# # selection -# dim_chunk_from = dim_sel.start//dim_chunk_len -# dim_chunk_to = int(np.ceil(dim_sel.stop/dim_chunk_len)) -# dim_chunk_range = range(dim_chunk_from, dim_chunk_to) -# dim_sel_len = dim_sel.stop - dim_sel.start -# -# elif isinstance(dim_sel, BoolArraySelection): -# -# # dim selection is a boolean array, delegate this to the BooleanSelection class -# dim_chunk_range = dim_sel.get_chunk_ranges() -# dim_sel_len = dim_sel.nitems -# -# elif isinstance(dim_sel, IntArraySelection): -# -# # dim selection is an integer array, delegate this to the integerSelection class -# dim_chunk_range = dim_sel.get_chunk_ranges() -# dim_sel_len = dim_sel.nitems -# -# else: -# raise RuntimeError('unexpected selection type') -# -# chunk_ranges.append(dim_chunk_range) -# if dim_sel_len is not None: -# sel_shape.append(dim_sel_len) -# -# return chunk_ranges, tuple(sel_shape) -# -# -# def get_chunk_selections(selection, chunk_coords, chunks, n_advanced_selection): -# -# # chunk_coords: holds the index along each dimension for the current chunk within the -# # chunk grid. E.g., (0, 0) locates the first (top left) chunk in a 2D chunk grid. 
-# -# chunk_selection = [] -# out_selection = [] -# -# # iterate over dimensions (axes) of the array -# for dim_sel, dim_chunk_idx, dim_chunk_len in zip(selection, chunk_coords, chunks): -# -# # dim_sel: selection for current dimension -# # dim_chunk_idx: chunk index along current dimension -# # dim_chunk_len: chunk length along current dimension -# -# # selection for current chunk along current dimension -# dim_chunk_sel = None -# -# # selection into output array to store data from current chunk -# dim_out_sel = None -# -# # calculate offset for current chunk along current dimension - this is used to -# # determine the values to be extracted from the current chunk -# dim_chunk_offset = dim_chunk_idx * dim_chunk_len -# -# # handle integer selection, i.e., single item -# if isinstance(dim_sel, int): -# -# dim_chunk_sel = dim_sel - dim_chunk_offset -# -# # N.B., leave dim_out_sel as None, as this dimension has been dropped in the -# # output array because of single value index -# -# # handle slice selection, i.e., contiguous range of items -# elif isinstance(dim_sel, slice): -# -# if dim_sel.start <= dim_chunk_offset: -# # selection starts before current chunk -# dim_chunk_sel_start = 0 -# dim_out_offset = dim_chunk_offset - dim_sel.start -# -# else: -# # selection starts within current chunk -# dim_chunk_sel_start = dim_sel.start - dim_chunk_offset -# dim_out_offset = 0 -# -# if dim_sel.stop > dim_chunk_offset + dim_chunk_len: -# # selection ends after current chunk -# dim_chunk_sel_stop = dim_chunk_len -# -# else: -# # selection ends within current chunk -# dim_chunk_sel_stop = dim_sel.stop - dim_chunk_offset -# -# dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) -# dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start -# dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) -# -# elif isinstance(dim_sel, (BoolArraySelection, IntArraySelection)): -# -# # get selection to extract data for the current chunk -# dim_chunk_sel = 
dim_sel.get_chunk_sel(dim_chunk_idx) -# -# # figure out where to put these items in the output array -# dim_out_sel = dim_sel.get_out_sel(dim_chunk_idx) -# -# else: -# raise RuntimeError('unexpected selection type') -# -# # add to chunk selection -# chunk_selection.append(dim_chunk_sel) -# -# # add to output selection -# if dim_out_sel is not None: -# out_selection.append(dim_out_sel) -# -# # normalise for indexing into numpy arrays -# chunk_selection = tuple(chunk_selection) -# out_selection = tuple(out_selection) -# -# # handle advanced indexing arrays orthogonally -# if n_advanced_selection > 0: -# # numpy doesn't support orthogonal indexing directly as yet, so need to work -# # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices -# # or integers, so need to convert slices and integers into ranges. -# chunk_selection = [range(dim_chunk_sel.start, dim_chunk_sel.stop) -# if isinstance(dim_chunk_sel, slice) -# else [dim_chunk_sel] if isinstance(dim_chunk_sel, int) -# else dim_chunk_sel -# for dim_chunk_sel in chunk_selection] -# chunk_selection = np.ix_(*chunk_selection) -# -# return chunk_selection, out_selection - - def normalize_resize_args(old_shape, *args): # normalize new shape argument From 197c6b61368765e0e2d38402a59834a5d405be28 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 6 Nov 2017 13:11:33 +0000 Subject: [PATCH 34/67] rework indexing tests --- notebooks/advanced_indexing.ipynb | 442 +++++++++++++++++++----------- zarr/core.py | 38 ++- zarr/indexing.py | 76 +++-- zarr/tests/test_indexing.py | 284 +++++++++++++++---- 4 files changed, 610 insertions(+), 230 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index e608cf1021..bf535375ba 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -1,8 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced indexing" + ] + }, { "cell_type": "code", - 
"execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -11,7 +18,7 @@ "'2.1.5.dev83'" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -30,104 +37,150 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Demonstrate advanced indexing" + "## Functionality and API" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Indexing with Boolean arrays" + "### Indexing a 1D array with a Boolean array" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ - "a = np.arange(10)" + "a = np.arange(10)\n", + "za = zarr.array(a, chunks=2)\n", + "ix = [False, True, False, True, False, True, False, True, False, True]" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([False, True, True, True, False, False, False, True, True, True], dtype=bool)" + "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 3, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix = np.random.binomial(1, 0.5, size=a.shape[0]).astype(bool)\n", - "ix" + "# get items\n", + "za[ix]" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1, 2, 3, 7, 8, 9])" + "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 4, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "a[ix]" + "# set items\n", + "za[ix] = a[ix] * 10\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1, 2, 3, 7, 8, 9])" + "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 5, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# indexing array can be any 
array-like, e.g., Zarr array\n", + "zix = zarr.array(ix, chunks=2)\n", + "za = zarr.array(a, chunks=2)\n", + "za[zix] # will not load all zix into memory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing a 1D array with an integer array" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "a = np.arange(10)\n", "za = zarr.array(a, chunks=2)\n", + "ix = [1, 3, 5, 7, 9]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([10, 30, 50, 70, 90])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get items\n", "za[ix]" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([ 0, 10, 20, 30, 4, 5, 6, 70, 80, 90])" + "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 6, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# set items\n", "za[ix] = a[ix] * 10\n", "za[:]" ] @@ -136,411 +189,488 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Indexing with integer arrays" + "### Slicing a 1D array with step > 1\n", + "\n", + "Slices with step > 1 are supported. Internally these are converted to an integer array via ``np.arange``." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = np.arange(10)\n", + "za = zarr.array(a, chunks=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([5, 4, 1, 7, 5])" + "array([10, 30, 50, 70, 90])" ] }, - "execution_count": 7, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix = np.random.choice(a.shape[0], size=a.shape[0]//2)\n", - "ix" + "# get items\n", + "za[1::2]" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([5, 4, 1, 7, 5])" + "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 8, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "a[ix]" + "# set items\n", + "za[1::2] = a[1::2] * 10\n", + "za[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Orthogonal (outer) indexing of multi-dimensional arrays\n", + "\n", + "Orthogonal (a.k.a. outer) indexing is supported with either Boolean or integer arrays. This functionality is provided via the ``get/set_orthogonal_selection()`` methods. For convenience, this functionality is also available via the ``oindex[]`` property as has been proposed for numpy." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([5, 4, 1, 7, 5])" + "array([[ 0, 1, 2],\n", + " [ 3, 4, 5],\n", + " [ 6, 7, 8],\n", + " [ 9, 10, 11],\n", + " [12, 13, 14]])" ] }, - "execution_count": 9, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "za = zarr.array(a, chunks=2)\n", - "za[ix]" + "a = np.arange(15).reshape(5, 3)\n", + "za = zarr.array(a, chunks=(3, 2))\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([ 0, 10, 2, 3, 40, 50, 6, 70, 8, 9])" + "array([[ 3, 5],\n", + " [ 9, 11]])" ] }, - "execution_count": 10, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "za[ix] = a[ix] * 10\n", - "za[:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multidimensional indexing\n", - "\n", - "N.B., orthogonal indexing is available. This is different from numpy fancy indexing if more than one dimension is indexed with an array." 
+ "# orthogonal indexing with Boolean arrays\n", + "ix0 = [False, True, False, True, False]\n", + "ix1 = [True, False, True]\n", + "za.get_orthogonal_selection((ix0, ix1))" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 3, 5],\n", + " [ 9, 11]])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "b = np.arange(100).reshape(10, 10)" + "# alternative API\n", + "za.oindex[ix0, ix1]" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([False, True, True, True, False, False, True, False, False, True], dtype=bool)" + "array([[ 3, 5],\n", + " [ 9, 11]])" ] }, - "execution_count": 12, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix0 = np.random.binomial(1, 0.5, size=b.shape[0]).astype(bool)\n", - "ix0" + "# orthogonal indexing with integer arrays\n", + "ix0 = [1, 3]\n", + "ix1 = [0, 2]\n", + "za.get_orthogonal_selection((ix0, ix1))" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([False, False, True, False, True, False, True, True, False, False], dtype=bool)" + "array([[ 3, 5],\n", + " [ 9, 11]])" ] }, - "execution_count": 13, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix1 = np.random.binomial(1, 0.5, size=b.shape[1]).astype(bool)\n", - "ix1" + "# alternative API\n", + "za.oindex[ix0, ix1]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[12, 14, 16, 17],\n", - " [22, 24, 26, 27],\n", - " [32, 34, 36, 37],\n", - " [62, 64, 66, 67],\n", - " [92, 94, 96, 97]])" + "array([[ 3, 4, 5],\n", + " [ 9, 10, 11]])" ] }, - 
"execution_count": 14, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "b[np.ix_(ix0, ix1)]" + "# combine with slice\n", + "za.oindex[[1, 3], :]" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[12, 14, 16, 17],\n", - " [22, 24, 26, 27],\n", - " [32, 34, 36, 37],\n", - " [62, 64, 66, 67],\n", - " [92, 94, 96, 97]])" + "array([[ 0, 2],\n", + " [ 3, 5],\n", + " [ 6, 8],\n", + " [ 9, 11],\n", + " [12, 14]])" ] }, - "execution_count": 15, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "zb = zarr.array(b, chunks=(2, 2))\n", - "zb.oindex[ix0, ix1]" + "# combine with slice\n", + "za.oindex[:, [0, 2]]" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", - " [10, 11, -1, 13, -1, 15, -1, -1, 18, 19],\n", - " [20, 21, -1, 23, -1, 25, -1, -1, 28, 29],\n", - " [30, 31, -1, 33, -1, 35, -1, -1, 38, 39],\n", - " [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],\n", - " [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],\n", - " [60, 61, -1, 63, -1, 65, -1, -1, 68, 69],\n", - " [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],\n", - " [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],\n", - " [90, 91, -1, 93, -1, 95, -1, -1, 98, 99]])" + "array([[ 0, 1, 2],\n", + " [42, 4, 42],\n", + " [ 6, 7, 8],\n", + " [42, 10, 42],\n", + " [12, 13, 14]])" ] }, - "execution_count": 16, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "zb.oindex[ix0, ix1] = -1\n", - "zb[:]" + "# set items via Boolean selection\n", + "ix0 = [False, True, False, True, False]\n", + "ix1 = [True, False, True]\n", + "selection = ix0, ix1\n", + "value = 42\n", + "za.set_orthogonal_selection(selection, value)\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 43, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([8, 1, 9, 8, 9])" + "array([[ 0, 1, 2],\n", + " [44, 4, 44],\n", + " [ 6, 7, 8],\n", + " [44, 10, 44],\n", + " [12, 13, 14]])" ] }, - "execution_count": 17, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix0 = np.random.choice(b.shape[0], size=b.shape[0]//2)\n", - "ix0" + "# alternative API\n", + "za.oindex[ix0, ix1] = 44\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([4, 1, 3, 6, 7])" + "array([[ 0, 1, 2],\n", + " [46, 4, 46],\n", + " [ 6, 7, 8],\n", + " [46, 10, 46],\n", + " [12, 13, 14]])" ] }, - "execution_count": 18, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix1 = np.random.choice(b.shape[1], size=b.shape[1]//2)\n", - "ix1" + "# set items via integer selection\n", + "ix0 = [1, 3]\n", + "ix1 = [0, 2]\n", + "selection = ix0, ix1\n", + "value = 46\n", + "za.set_orthogonal_selection(selection, value)\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[84, 81, 83, 86, 87],\n", - " [14, 11, 13, 16, 17],\n", - " [94, 91, 93, 96, 97],\n", - " [84, 81, 83, 86, 87],\n", - " [94, 91, 93, 96, 97]])" + "array([[ 0, 1, 2],\n", + " [48, 4, 48],\n", + " [ 6, 7, 8],\n", + " [48, 10, 48],\n", + " [12, 13, 14]])" ] }, - "execution_count": 19, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "b[np.ix_(ix0, ix1)]" + "# alternative API\n", + "za.oindex[ix0, ix1] = 48\n", + "za[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Coordinate indexing of multi-dimensional arrays\n", + "\n", + "Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. 
This functionality is provided via the ``get/set_coordinate_selection()`` methods. For convenience, this functionality is also available via the ``vindex[]`` property as has been proposed for numpy." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[84, 81, 83, 86, 87],\n", - " [14, 11, 13, 16, 17],\n", - " [94, 91, 93, 96, 97],\n", - " [84, 81, 83, 86, 87],\n", - " [94, 91, 93, 96, 97]])" + "array([[ 0, 1, 2],\n", + " [ 3, 4, 5],\n", + " [ 6, 7, 8],\n", + " [ 9, 10, 11],\n", + " [12, 13, 14]])" ] }, - "execution_count": 20, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "zb = zarr.array(b, chunks=(2, 2))\n", - "zb.oindex[ix0, ix1]" + "a = np.arange(15).reshape(5, 3)\n", + "za = zarr.array(a, chunks=(3, 2))\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", - " [10, -1, 12, -1, -1, 15, -1, -1, 18, 19],\n", - " [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],\n", - " [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],\n", - " [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],\n", - " [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],\n", - " [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],\n", - " [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],\n", - " [80, -1, 82, -1, -1, 85, -1, -1, 88, 89],\n", - " [90, -1, 92, -1, -1, 95, -1, -1, 98, 99]])" + "array([ 3, 11])" ] }, - "execution_count": 21, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "zb.oindex[ix0, ix1] = -1\n", - "zb[:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Indexing with zarr bool arrays" + "# get items\n", + "ix0 = [1, 3]\n", + "ix1 = [0, 2]\n", + "za.get_coordinate_selection((ix0, ix1))" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 56, "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 3, 11])" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "ix = np.random.binomial(1, 0.5, size=a.shape[0]).astype(bool)\n", - "zix = zarr.array(ix, chunks=2)" + "# alternative API\n", + "za.vindex[ix0, ix1]" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1, 3, 5, 6, 8, 9])" + "array([[ 0, 1, 2],\n", + " [42, 4, 5],\n", + " [ 6, 7, 8],\n", + " [ 9, 10, 42],\n", + " [12, 13, 14]])" ] }, - "execution_count": 23, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "za = zarr.array(a, chunks=2)\n", - "za[ix]" + "# set items\n", + "za.set_coordinate_selection((ix0, ix1), 42)\n", + "za[:]" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1, 3, 5, 6, 8, 9])" + "array([[ 0, 1, 2],\n", + " [44, 4, 44],\n", + " [ 6, 7, 8],\n", + " [44, 10, 44],\n", + " [12, 13, 14]])" ] }, - "execution_count": 24, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# will not load all zix into memory\n", - "za[zix]" + "# alternative API\n", + "za.vindex[ix0, ix1] = 44\n", + "za[:]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/zarr/core.py b/zarr/core.py index af6df3aca2..dd28e04862 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -16,7 +16,8 @@ from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer +from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, 
CoordinateIndexer, \ + MaskIndexer class Array(object): @@ -529,6 +530,7 @@ def _get_basic_selection_nd(self, selection, out=None): return self._get_selection(indexer, out=out) def get_orthogonal_selection(self, selection, out=None): + """TODO""" # refresh metadata if not self._cache_metadata: @@ -540,6 +542,7 @@ def get_orthogonal_selection(self, selection, out=None): return self._get_selection(indexer, out=out) def get_coordinate_selection(self, selection, out=None): + """TODO""" # refresh metadata if not self._cache_metadata: @@ -550,6 +553,18 @@ def get_coordinate_selection(self, selection, out=None): return self._get_selection(indexer, out=out) + def get_mask_selection(self, selection, out=None): + """TODO""" + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # setup indexer + indexer = MaskIndexer(selection, self) + + return self._get_selection(indexer, out=out) + def _get_selection(self, indexer, out=None): # We iterate over all chunks which overlap the selection and thus contain data that needs @@ -673,6 +688,7 @@ def set_basic_selection(self, selection, value): return self._set_basic_selection_nd(selection, value) def set_orthogonal_selection(self, selection, value): + """TODO""" # guard conditions if self._read_only: @@ -688,6 +704,7 @@ def set_orthogonal_selection(self, selection, value): self._set_selection(indexer, value) def set_coordinate_selection(self, selection, value): + """TODO""" # guard conditions if self._read_only: @@ -702,6 +719,22 @@ def set_coordinate_selection(self, selection, value): self._set_selection(indexer, value) + def set_mask_selection(self, selection, value): + """TODO""" + + # guard conditions + if self._read_only: + err_read_only() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = MaskIndexer(selection, self) + + self._set_selection(indexer, value) + def _set_basic_selection_zd(self, selection, value): # special case __setitem__ 
for zero-dimensional array @@ -750,7 +783,8 @@ def _set_selection(self, indexer, value): if not hasattr(value, 'shape'): raise TypeError('value must be an array-like object') if value.shape != sel_shape: - raise ValueError('value has wrong shape for selection') + raise ValueError('value has wrong shape for selection; expected {}, got {}' + .format(sel_shape, value.shape)) # iterate over chunks in range for chunk_coords, chunk_selection, out_selection in indexer: diff --git a/zarr/indexing.py b/zarr/indexing.py index 2b412662c4..8712c4318c 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -8,6 +8,18 @@ import numpy as np +def is_integer(x): + return isinstance(x, numbers.Integral) + + +def is_integer_array(x): + return hasattr(x, 'dtype') and x.dtype.kind in 'ui' + + +def is_bool_array(x): + return hasattr(x, 'dtype') and x.dtype == bool + + def normalize_integer_selection(dim_sel, dim_len): # normalize type to int @@ -45,7 +57,7 @@ class IntDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): # check type - if not isinstance(dim_sel, numbers.Integral): + if not is_integer(dim_sel): raise ValueError('selection must be an integer') # normalize @@ -431,7 +443,7 @@ def __init__(self, selection, array): dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if isinstance(dim_sel, numbers.Integral): + if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -447,16 +459,13 @@ def __init__(self, selection, array): else: dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - elif hasattr(dim_sel, 'dtype') and hasattr(dim_sel, 'shape'): + elif is_integer_array(dim_sel): - if dim_sel.dtype == bool: - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - elif dim_sel.dtype.kind in 'ui': - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + elif 
is_bool_array(dim_sel): - else: - raise IndexError('bad selection type') + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) else: raise IndexError('bad selection type') @@ -511,8 +520,7 @@ def is_coordinate_selection(selection, array): return ( (len(selection) == len(array._shape)) and all( - [(isinstance(dim_sel, numbers.Integral) or - (hasattr(dim_sel, 'dtype') and dim_sel.dtype.kind in 'ui')) + [is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection] ) ) @@ -520,10 +528,9 @@ def is_coordinate_selection(selection, array): def is_mask_selection(selection, array): return ( - hasattr(selection, 'dtype') and - selection.dtype == bool and - hasattr(selection, 'shape') and - len(selection.shape) == len(array.shape) + len(selection) == 1 and + is_bool_array(selection[0]) and + selection[0].shape == array.shape ) @@ -541,8 +548,7 @@ def __init__(self, selection, array): # some initial normalization selection = ensure_tuple(selection) - selection = tuple([i] if isinstance(i, numbers.Integral) else i - for i in selection) + selection = tuple([i] if is_integer(i) else i for i in selection) selection = replace_lists(selection) # validation @@ -586,7 +592,7 @@ def __init__(self, selection, array): # store atrributes self.selection = selection self.sel_sort = sel_sort - self.shape = len(selection[0]) if selection[0].shape else 1 + self.shape = selection[0].shape if selection[0].shape else (1,) self.drop_axes = None self.array = array @@ -623,6 +629,27 @@ def __iter__(self): yield ChunkProjection(chunk_coords, chunk_selection, out_selection) +# noinspection PyProtectedMember +class MaskIndexer(CoordinateIndexer): + + def __init__(self, selection, array): + + # some initial normalization + selection = ensure_tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_mask_selection(selection, array): + # TODO refactor error messages for consistency + raise IndexError('invalid mask selection') + + # convert to indices 
+ selection = np.nonzero(selection[0]) + + # delegate the rest to superclass + super(MaskIndexer, self).__init__(selection, array) + + class VIndex(object): def __init__(self, array): @@ -633,10 +660,17 @@ def __getitem__(self, selection): selection = replace_lists(selection) if is_coordinate_selection(selection, self.array): return self.array.get_coordinate_selection(selection) - # elif is_mask_selection(selection, self.array): - # return self.array.get_mask_selection(selection) + elif is_mask_selection(selection, self.array): + return self.array.get_mask_selection(selection) else: raise IndexError('unsupported selection') def __setitem__(self, selection, value): - return self.array.set_orthogonal_selection(selection, value) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.set_coordinate_selection(selection, value) + elif is_mask_selection(selection, self.array): + return self.array.set_mask_selection(selection, value) + else: + raise IndexError('unsupported selection') diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 99bb9ed7fa..a8d370de6c 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -82,7 +82,7 @@ def test_replace_ellipsis(): replace_ellipsis((slice(None), slice(None), Ellipsis), (100, 100))) -def _test_orthogonal_indexing_1d_common(a, z, ix): +def _test_get_orthogonal_selection_1d_common(a, z, ix): expect = a[ix] actual = z.get_orthogonal_selection(ix) assert_array_equal(expect, actual) @@ -94,7 +94,7 @@ def _test_orthogonal_indexing_1d_common(a, z, ix): # noinspection PyStatementEffect -def test_orthogonal_indexing_1d_bool(): +def test_get_orthogonal_selection_1d_bool(): # setup a = np.arange(1050, dtype=int) @@ -105,7 +105,7 @@ def test_orthogonal_indexing_1d_bool(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - 
_test_orthogonal_indexing_1d_common(a, z, ix) + _test_get_orthogonal_selection_1d_common(a, z, ix) # test errors with assert_raises(IndexError): @@ -117,7 +117,7 @@ def test_orthogonal_indexing_1d_bool(): # noinspection PyStatementEffect -def test_orthogonal_indexing_1d_int(): +def test_get_orthogonal_selection_1d_int(): # setup a = np.arange(1050, dtype=int) @@ -126,11 +126,11 @@ def test_orthogonal_indexing_1d_int(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: + for p in 2, 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - _test_orthogonal_indexing_1d_common(a, z, ix) + _test_get_orthogonal_selection_1d_common(a, z, ix) ix.sort() - _test_orthogonal_indexing_1d_common(a, z, ix) + _test_get_orthogonal_selection_1d_common(a, z, ix) # test wraparound ix = [0, 3, 10, -23, -12, -1] @@ -156,7 +156,7 @@ def test_orthogonal_indexing_1d_int(): z.oindex[ix] -def test_orthogonal_indexing_1d_slice_with_step(): +def test_get_orthogonal_selection_1d_slice_with_step(): # setup a = np.arange(1050, dtype=int) @@ -184,7 +184,7 @@ def test_orthogonal_indexing_1d_slice_with_step(): assert_array_equal(expect, actual) -def _test_orthogonal_indexing_2d_common(a, z, ix0, ix1): +def _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1): selections = [ # index both axes with array @@ -207,7 +207,7 @@ def _test_orthogonal_indexing_2d_common(a, z, ix0, ix1): assert_array_equal(expect, actual) -def test_orthogonal_indexing_2d_bool(): +def test_get_orthogonal_selection_2d_bool(): # setup a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -221,7 +221,7 @@ def test_orthogonal_indexing_2d_bool(): ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) # main tests - _test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1) # mixed int array / bool array selections = ( @@ -234,7 +234,7 @@ def test_orthogonal_indexing_2d_bool(): 
assert_array_equal(expect, actual) -def test_orthogonal_indexing_2d_int(): +def test_get_orthogonal_selection_2d_int(): # setup a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -243,16 +243,16 @@ def test_orthogonal_indexing_2d_int(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: + for p in 2, 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - _test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1) ix0.sort() ix1.sort() - _test_orthogonal_indexing_2d_common(a, z, ix0, ix1) + _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1) -def _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2): +def _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2): selections = [ # index all axes with array @@ -289,7 +289,7 @@ def _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2): assert_array_equal(expect, actual) -def test_orthogonal_indexing_3d_bool(): +def test_get_orthogonal_selection_3d_bool(): # setup a = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -302,7 +302,7 @@ def test_orthogonal_indexing_3d_bool(): ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2) def test_orthogonal_indexing_edge_cases(): @@ -320,7 +320,7 @@ def test_orthogonal_indexing_edge_cases(): assert_array_equal(expect, actual) -def test_orthogonal_indexing_3d_int(): +def test_get_orthogonal_selection_3d_int(): # setup a = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -329,18 +329,18 @@ def test_orthogonal_indexing_3d_int(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 
0.01: + for p in 2, 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) - _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2) ix0.sort() ix1.sort() ix2.sort() - _test_orthogonal_indexing_3d_common(a, z, ix0, ix1, ix2) + _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2) -def _test_orthogonal_indexing_1d_common_set(v, a, z, ix): +def _test_set_orthogonal_selection_1d_common(v, a, z, ix): a[:] = 0 a[ix] = v[ix] z[:] = 0 @@ -355,7 +355,7 @@ def _test_orthogonal_indexing_1d_common_set(v, a, z, ix): assert_array_equal(a, z[:]) -def test_orthogonal_indexing_1d_bool_set(): +def test_set_orthogonal_selection_1d_bool(): # setup v = np.arange(1050, dtype=int) @@ -366,10 +366,10 @@ def test_orthogonal_indexing_1d_bool_set(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - _test_orthogonal_indexing_1d_common_set(v, a, z, ix) + _test_set_orthogonal_selection_1d_common(v, a, z, ix) -def test_orthogonal_indexing_1d_int_set(): +def test_set_orthogonal_selection_1d_int(): # setup v = np.arange(1050, dtype=int) @@ -378,14 +378,14 @@ def test_orthogonal_indexing_1d_int_set(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: + for p in 2, 0.5, 0.1, 0.01: ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - _test_orthogonal_indexing_1d_common_set(v, a, z, ix) + _test_set_orthogonal_selection_1d_common(v, a, z, ix) ix.sort() - _test_orthogonal_indexing_1d_common_set(v, a, z, ix) + _test_set_orthogonal_selection_1d_common(v, a, z, ix) -def _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1): +def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): selections = ( 
# index both axes with array @@ -407,7 +407,7 @@ def _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1): assert_array_equal(a, z[:]) -def test_orthogonal_indexing_2d_bool_set(): +def test_set_orthogonal_selection_2d_bool(): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) @@ -419,10 +419,10 @@ def test_orthogonal_indexing_2d_bool_set(): for p in 0.5, 0.1, 0.01: ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1) -def test_orthogonal_indexing_2d_int_set(): +def test_set_orthogonal_selection_2d_int(): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) @@ -431,16 +431,16 @@ def test_orthogonal_indexing_2d_int_set(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: + for p in 2, 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1) ix0.sort() ix1.sort() - _test_orthogonal_indexing_2d_common_set(v, a, z, ix0, ix1) + _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1) -def _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2): +def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): selections = ( # index all axes with bool array @@ -471,7 +471,7 @@ def _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2): assert_array_equal(a, z[:]) -def test_orthogonal_indexing_3d_bool_set(): +def test_set_orthogonal_selection_3d_bool(): # setup v = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -484,10 +484,10 @@ def test_orthogonal_indexing_3d_bool_set(): ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, 
size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2) -def test_orthogonal_indexing_3d_int_set(): +def test_set_orthogonal_selection_3d_int(): # setup v = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -496,19 +496,19 @@ def test_orthogonal_indexing_3d_int_set(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: + for p in 2, 0.5, 0.1, 0.01: ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) - _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2) ix0.sort() ix1.sort() ix2.sort() - _test_orthogonal_indexing_3d_common_set(v, a, z, ix0, ix1, ix2) + _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2) # noinspection PyStatementEffect -def test_coordinate_indexing_1d(): +def test_get_coordinate_selection_1d(): # setup a = np.arange(1050, dtype=int) @@ -517,8 +517,9 @@ def test_coordinate_indexing_1d(): np.random.seed(42) # test with different degrees of sparseness - for p in 0.5, 0.1, 0.01: - ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) expect = a[ix] actual = z.get_coordinate_selection(ix) assert_array_equal(expect, actual) @@ -567,7 +568,7 @@ def test_coordinate_indexing_1d(): z.get_coordinate_selection(ix) -def test_coordinate_indexing_2d(): +def test_get_coordinate_selection_2d(): # setup a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -576,7 +577,7 @@ def test_coordinate_indexing_2d(): np.random.seed(42) # test with different degrees of sparseness - 
for p in 0.5, 0.1, 0.01: + for p in 2, 0.5, 0.1, 0.01: n = int(a.size * p) ix0 = np.random.choice(a.shape[0], size=n, replace=True) ix1 = np.random.choice(a.shape[1], size=n, replace=True) @@ -640,6 +641,168 @@ def test_coordinate_indexing_2d(): z.get_coordinate_selection(selection) +def test_set_coordinate_selection_1d_int(): + + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) + + a[:] = 0 + a[ix] = v[ix] + z[:] = 0 + z.vindex[ix] = v[ix] + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_coordinate_selection(ix, v[ix]) + assert_array_equal(a, z[:]) + + +def test_set_coordinate_selection_2d_int(): + + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + + selections = ( + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ) + + for selection in selections: + a[:] = 0 + a[selection] = v[selection] + z[:] = 0 + z.vindex[selection] = v[selection] + assert_array_equal(a, z[:]) + z[:] = 0 + z.set_coordinate_selection(selection, v[selection]) + assert_array_equal(a, z[:]) + + +# noinspection PyStatementEffect +def test_get_mask_selection_1d(): + + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + + expect 
= a[ix] + actual = z.get_mask_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) + # for 1d arrays, also available via __getitem__ + actual = z[ix] + assert_array_equal(expect, actual) + + # test errors + with assert_raises(IndexError): + z.vindex[np.zeros(50, dtype=bool)] # too short + with assert_raises(IndexError): + z.vindex[np.zeros(2000, dtype=bool)] # too long + with assert_raises(IndexError): + z.vindex[[[True, False], [False, True]]] # too many dimensions + + +# noinspection PyStatementEffect +def test_get_mask_selection_2d(): + + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + expect = a[ix] + actual = z.get_mask_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) + + # test errors + with assert_raises(IndexError): + z.vindex[np.zeros((1000, 5), dtype=bool)] # too short + with assert_raises(IndexError): + z.vindex[np.zeros((2000, 10), dtype=bool)] # too long + with assert_raises(IndexError): + z.vindex[[True, False]] # wrong no. 
dimensions + + +def test_set_mask_selection_1d(): + + # setup + v = np.arange(1050, dtype=int) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + + a[:] = 0 + z[:] = 0 + a[ix] = v[ix] + z.set_mask_selection(ix, v[ix]) + assert_array_equal(a, z[:]) + z[:] = 0 + z.vindex[ix] = v[ix] + assert_array_equal(a, z[:]) + # for 1d arrays, also available via __setitem__ + z[:] = 0 + z[ix] = v[ix] + assert_array_equal(a, z[:]) + + +def test_set_mask_selection_2d(): + + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + + a[:] = 0 + z[:] = 0 + a[ix] = v[ix] + z.set_mask_selection(ix, v[ix]) + assert_array_equal(a, z[:]) + z[:] = 0 + z.vindex[ix] = v[ix] + assert_array_equal(a, z[:]) + + def test_get_selection_out(): # basic selections @@ -687,9 +850,28 @@ def test_get_selection_out(): z.get_orthogonal_selection(selection, out=out) assert_array_equal(expect, out[:]) + # coordinate selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + expect = a[selection] + out = np.zeros(expect.shape, dtype=expect.dtype) + z.get_coordinate_selection(selection, out=out) + 
assert_array_equal(expect, out[:]) -# TODO mask selection # TODO selection with fields - - From ec64c087ae838e1a15d5040469555f6d12c802eb Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 6 Nov 2017 13:39:48 +0000 Subject: [PATCH 35/67] tidy notebook --- notebooks/advanced_indexing.ipynb | 1384 +++++++++++++++++------------ 1 file changed, 827 insertions(+), 557 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index bf535375ba..2005b40d8f 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -18,7 +18,7 @@ "'2.1.5.dev83'" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -44,12 +44,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Indexing a 1D array with a Boolean array" + "### Indexing a 1D array with a Boolean array\n", + "\n", + "Supported via ``__getitem__`` and ``__setitem__`` just like numpy array." 
] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -69,7 +71,7 @@ "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 48, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -81,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -90,7 +92,7 @@ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 49, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -103,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -112,7 +114,7 @@ "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 50, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -128,12 +130,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Indexing a 1D array with an integer array" + "### Indexing a 1D array with an integer array\n", + "\n", + "Supported via ``__getitem__`` and ``__setitem__`` just like numpy array." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -144,16 +148,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([10, 30, 50, 70, 90])" + "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -165,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -174,7 +178,7 @@ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 20, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -196,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -206,16 +210,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([10, 30, 50, 70, 90])" + "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 21, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -227,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -236,7 +240,7 @@ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 22, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -258,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -271,7 +275,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 51, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -284,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -294,7 +298,7 @@ " [ 9, 11]])" ] }, - "execution_count": 30, + "execution_count": 14, "metadata": {}, 
"output_type": "execute_result" } @@ -308,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -318,7 +322,7 @@ " [ 9, 11]])" ] }, - "execution_count": 31, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -330,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -340,7 +344,7 @@ " [ 9, 11]])" ] }, - "execution_count": 32, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -354,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -364,7 +368,7 @@ " [ 9, 11]])" ] }, - "execution_count": 33, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -376,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -386,7 +390,7 @@ " [ 9, 10, 11]])" ] }, - "execution_count": 38, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -398,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -411,7 +415,7 @@ " [12, 14]])" ] }, - "execution_count": 39, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -423,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -436,7 +440,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 41, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -453,7 +457,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -466,7 +470,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 43, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -479,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 
22, "metadata": {}, "outputs": [ { @@ -492,7 +496,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 44, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -509,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -522,7 +526,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 45, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -537,14 +541,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Coordinate indexing of multi-dimensional arrays\n", + "### Coordinate indexing of multi-dimensional arrays\n", "\n", "Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. This functionality is provided via the ``get/set_coordinate_selection()`` methods. For convenience, this functionality is also available via the ``vindex[]`` property as has been proposed for numpy." ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -557,7 +561,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 52, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -570,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -579,7 +583,7 @@ "array([ 3, 11])" ] }, - "execution_count": 57, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -593,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -602,7 +606,7 @@ "array([ 3, 11])" ] }, - "execution_count": 56, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -614,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -627,7 +631,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 58, + "execution_count": 27, "metadata": {}, 
"output_type": "execute_result" } @@ -640,20 +644,20 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", - " [44, 4, 44],\n", + " [44, 4, 5],\n", " [ 6, 7, 8],\n", - " [44, 10, 44],\n", + " [ 9, 10, 44],\n", " [12, 13, 14]])" ] }, - "execution_count": 59, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -664,12 +668,133 @@ "za[:]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mask indexing of multi-dimensional arrays\n", + "\n", + "Selecting arbitrary points from a multi-dimensional array by a Boolean array is supported. This functionality is provided via the ``get/set_mask_selection()`` methods. For convenience, this functionality is also available via the ``vindex[]`` property as has been proposed for numpy." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2],\n", + " [ 3, 4, 5],\n", + " [ 6, 7, 8],\n", + " [ 9, 10, 11],\n", + " [12, 13, 14]])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = np.arange(15).reshape(5, 3)\n", + "za = zarr.array(a, chunks=(3, 2))\n", + "za[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 3, 11])" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ix = np.zeros_like(a, dtype=bool)\n", + "ix[1, 0] = True\n", + "ix[3, 2] = True\n", + "za.get_mask_selection(ix)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 3, 11])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "za.vindex[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2],\n", + " [42, 4, 5],\n", + " [ 6, 7, 8],\n", + " [ 9, 10, 42],\n", + " [12, 13, 14]])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za.set_mask_selection(ix, 42)\n", + "za[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 1, 2],\n", + " [44, 4, 5],\n", + " [ 6, 7, 8],\n", + " [ 9, 10, 44],\n", + " [12, 13, 14]])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "za.vindex[ix] = 44\n", + "za[:]" + ] }, { "cell_type": "markdown", @@ -680,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -689,7 +814,7 @@ "800000000" ] }, - "execution_count": 25, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -701,15 +826,15 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 508 ms, sys: 28 ms, total: 536 ms\n", - "Wall time: 162 ms\n" + "CPU times: user 520 ms, sys: 44 ms, total: 564 ms\n", + "Wall time: 171 ms\n" ] }, { @@ -732,7 +857,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 26, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -744,15 +869,15 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 120 ms, sys: 60 ms, total: 180 ms\n", - "Wall time: 178 ms\n" + "CPU times: user 116 ms, sys: 60 ms, total: 176 ms\n", + "Wall time: 177 ms\n" ] }, { @@ -761,7 +886,7 @@ "array([ 0, 1, 2, 
..., 99999997, 99999998, 99999999])" ] }, - "execution_count": 27, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -772,15 +897,15 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 520 ms, sys: 32 ms, total: 552 ms\n", - "Wall time: 261 ms\n" + "CPU times: user 492 ms, sys: 80 ms, total: 572 ms\n", + "Wall time: 282 ms\n" ] }, { @@ -789,7 +914,7 @@ "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" ] }, - "execution_count": 29, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -807,16 +932,16 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9998583" + "9995616" ] }, - "execution_count": 82, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -829,24 +954,24 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 312 ms, sys: 0 ns, total: 312 ms\n", - "Wall time: 311 ms\n" + "CPU times: user 348 ms, sys: 8 ms, total: 356 ms\n", + "Wall time: 355 ms\n" ] }, { "data": { "text/plain": [ - "array([ 23, 24, 39, ..., 99999967, 99999978, 99999995])" + "array([ 25, 30, 31, ..., 99999973, 99999982, 99999986])" ] }, - "execution_count": 83, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -857,24 +982,24 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 888 ms, sys: 52 ms, total: 940 ms\n", - "Wall time: 459 ms\n" + "CPU times: user 908 ms, sys: 68 ms, total: 976 ms\n", + "Wall time: 474 ms\n" ] }, { "data": { "text/plain": [ - "array([ 23, 24, 39, ..., 99999967, 99999978, 99999995])" + 
"array([ 25, 30, 31, ..., 99999973, 99999982, 99999986])" ] }, - "execution_count": 84, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -885,87 +1010,90 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 58423 function calls in 0.514 seconds\n", + " 58428 function calls in 0.492 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1025 0.205 0.000 0.205 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.169 0.000 0.179 0.000 core.py:930(_decode_chunk)\n", - " 1024 0.062 0.000 0.261 0.000 core.py:768(_chunk_getitem)\n", - " 1024 0.011 0.000 0.011 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1025 0.008 0.000 0.234 0.000 new_indexing.py:494(__iter__)\n", - " 1024 0.006 0.000 0.216 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", + " 1025 0.203 0.000 0.203 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.158 0.000 0.168 0.000 core.py:964(_decode_chunk)\n", + " 1024 0.051 0.000 0.240 0.000 core.py:802(_chunk_getitem)\n", + " 1024 0.014 0.000 0.014 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1025 0.008 0.000 0.232 0.000 indexing.py:486(__iter__)\n", + " 1024 0.006 0.000 0.214 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.006 0.000 0.006 0.000 core.py:324()\n", " 2048 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1 0.004 0.004 0.499 0.499 core.py:548(_get_selection)\n", + " 1 0.004 0.004 0.475 0.475 core.py:563(_get_selection)\n", " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.003 0.000 0.008 0.000 core.py:319(_cdata_shape)\n", " 1024 0.002 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", - 
" 1025 0.002 0.000 0.003 0.000 new_indexing.py:282(__iter__)\n", - " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", - " 1 0.002 0.002 0.514 0.514 :1()\n", + " 1025 0.002 0.000 0.003 0.000 indexing.py:296(__iter__)\n", + " 1 0.002 0.002 0.016 0.016 indexing.py:269(__init__)\n", + " 6152 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 6151 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.002 0.000 0.219 0.000 new_indexing.py:413(ix_)\n", - " 1 0.001 0.001 0.013 0.013 new_indexing.py:255(__init__)\n", + " 1024 0.002 0.000 0.217 0.000 indexing.py:398(ix_)\n", + " 1024 0.001 0.000 0.011 0.000 core.py:961(_chunk_key)\n", " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.011 0.000 core.py:927(_chunk_key)\n", " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:418()\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:403()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", " 1024 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:499()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:491()\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 
0.000 0.000 {built-in method _thread.get_ident}\n", + " 1024 0.000 0.000 0.014 0.000 numeric.py:380(count_nonzero)\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", - " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", - " 1024 0.000 0.000 0.011 0.000 numeric.py:380(count_nonzero)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1 0.000 0.000 0.492 0.492 :1()\n", " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.013 0.013 new_indexing.py:430(__init__)\n", + " 1 0.000 0.000 0.492 0.492 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.016 0.016 indexing.py:425(__init__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.514 0.514 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.512 
0.512 core.py:391(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.512 0.512 core.py:526(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.492 0.492 core.py:527(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.492 0.492 core.py:392(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 1 0.000 
0.000 0.000 0.000 new_indexing.py:489()\n", "\n", "\n" ] @@ -975,6 +1103,13 @@ "cProfile.run('zc[ix_dense_bool]', sort='time')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Method ``nonzero`` is being called internally within numpy to convert bool to int selections, no way to avoid." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -984,17 +1119,16 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(10000000,\n", - " array([38852033, 29570639, 6153807, ..., 51604068, 33056119, 29899374]))" + "10000000" ] }, - "execution_count": 86, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1003,29 +1137,29 @@ "ix_dense_int = np.random.choice(c.shape[0], size=c.shape[0]//10, replace=True)\n", "ix_dense_int_sorted = ix_dense_int.copy()\n", "ix_dense_int_sorted.sort()\n", - "len(ix_dense_int), ix_dense_int" + "len(ix_dense_int)" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 60 ms, sys: 32 ms, total: 92 ms\n", - "Wall time: 91 ms\n" + "CPU times: user 60 ms, sys: 4 ms, total: 64 ms\n", + "Wall time: 64.1 ms\n" ] }, { "data": { "text/plain": [ - "array([ 6, 9, 15, ..., 99999956, 99999964, 99999985])" + "array([ 6, 23, 34, ..., 99999974, 99999986, 99999992])" ] }, - "execution_count": 87, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1036,24 +1170,24 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 576 ms, sys: 104 ms, total: 680 ms\n", - "Wall time: 428 ms\n" + "CPU times: user 560 ms, sys: 100 ms, total: 660 ms\n", + "Wall time: 386 ms\n" ] }, { "data": { "text/plain": [ - "array([ 6, 9, 15, ..., 99999956, 99999964, 99999985])" + 
"array([ 6, 23, 34, ..., 99999974, 99999986, 99999992])" ] }, - "execution_count": 88, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -1064,24 +1198,24 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 144 ms, sys: 20 ms, total: 164 ms\n", - "Wall time: 162 ms\n" + "CPU times: user 108 ms, sys: 28 ms, total: 136 ms\n", + "Wall time: 135 ms\n" ] }, { "data": { "text/plain": [ - "array([38852033, 29570639, 6153807, ..., 51604068, 33056119, 29899374])" + "array([95165047, 93422705, 3887249, ..., 41392662, 20111139, 95001327])" ] }, - "execution_count": 89, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -1092,24 +1226,24 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.34 s, sys: 156 ms, total: 2.49 s\n", - "Wall time: 2.18 s\n" + "CPU times: user 2.11 s, sys: 84 ms, total: 2.19 s\n", + "Wall time: 1.86 s\n" ] }, { "data": { "text/plain": [ - "array([38852033, 29570639, 6153807, ..., 51604068, 33056119, 29899374])" + "array([95165047, 93422705, 3887249, ..., 41392662, 20111139, 95001327])" ] }, - "execution_count": 90, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1120,90 +1254,92 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 55379 function calls in 0.491 seconds\n", + " 55382 function calls in 0.415 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.183 0.183 0.237 0.237 new_indexing.py:340(__init__)\n", - " 1024 0.099 0.000 0.107 0.000 core.py:930(_decode_chunk)\n", - " 1024 0.065 0.000 0.191 0.000 
core.py:768(_chunk_getitem)\n", - " 1 0.026 0.026 0.026 0.026 {built-in method numpy.core.multiarray.bincount}\n", - " 1025 0.025 0.000 0.025 0.000 new_indexing.py:387(__iter__)\n", - " 1 0.024 0.024 0.024 0.024 function_base.py:1848(diff)\n", - " 1 0.007 0.007 0.245 0.245 core.py:548(_get_selection)\n", - " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", - " 1025 0.006 0.000 0.046 0.000 new_indexing.py:494(__iter__)\n", - " 2048 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1024 0.004 0.000 0.008 0.000 index_tricks.py:26(ix_)\n", - " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1 0.004 0.004 0.241 0.241 new_indexing.py:430(__init__)\n", - " 1 0.004 0.004 0.491 0.491 :1()\n", - " 1024 0.003 0.000 0.009 0.000 {method 'join' of 'str' objects}\n", + " 1 0.158 0.158 0.216 0.216 indexing.py:325(__init__)\n", + " 1024 0.085 0.000 0.089 0.000 core.py:964(_decode_chunk)\n", + " 1024 0.042 0.000 0.145 0.000 core.py:802(_chunk_getitem)\n", + " 1025 0.031 0.000 0.031 0.000 indexing.py:372(__iter__)\n", + " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.025 0.025 0.025 0.025 function_base.py:1848(diff)\n", + " 1025 0.005 0.000 0.049 0.000 indexing.py:486(__iter__)\n", + " 2048 0.004 0.000 0.004 0.000 core.py:324()\n", + " 1 0.003 0.003 0.415 0.415 core.py:527(get_orthogonal_selection)\n", + " 1024 0.003 0.000 0.007 0.000 index_tricks.py:26(ix_)\n", " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.003 0.003 0.488 0.488 core.py:526(get_orthogonal_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", - " 6151 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 2048 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.002 0.000 0.004 0.000 arrayprint.py:381(wrapper)\n", + " 1 0.002 0.002 0.195 0.195 
core.py:563(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 core.py:319(_cdata_shape)\n", + " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", + " 6152 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.001 0.000 0.011 0.000 new_indexing.py:413(ix_)\n", - " 1024 0.001 0.000 0.010 0.000 core.py:927(_chunk_key)\n", + " 1024 0.001 0.000 0.009 0.000 indexing.py:398(ix_)\n", " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.001 0.000 0.007 0.000 core.py:961(_chunk_key)\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:403()\n", " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:418()\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", - " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:499()\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", + " 1024 0.000 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:491()\n", + " 1 0.000 0.000 0.216 0.216 indexing.py:425(__init__)\n", + " 3081 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", " 1030 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", " 2048 0.000 0.000 0.000 0.000 {built-in 
method builtins.issubclass}\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", - " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 1 0.000 0.000 0.415 0.415 :1()\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.491 0.491 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.415 0.415 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.415 0.415 core.py:392(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.488 0.488 core.py:391(__getitem__)\n", " 2 0.000 0.000 0.000 
0.000 fromnumeric.py:55(_wrapfunc)\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1216,95 
+1352,97 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 69723 function calls in 2.217 seconds\n", + " 69726 function calls in 1.841 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.417 1.417 1.417 1.417 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1 0.198 0.198 1.834 1.834 new_indexing.py:340(__init__)\n", - " 1 0.169 0.169 0.169 0.169 {method 'take' of 'numpy.ndarray' objects}\n", - " 1024 0.167 0.000 0.306 0.000 core.py:768(_chunk_getitem)\n", - " 1024 0.116 0.000 0.122 0.000 core.py:930(_decode_chunk)\n", - " 1025 0.026 0.000 0.027 0.000 new_indexing.py:387(__iter__)\n", - " 1 0.024 0.024 0.024 0.024 function_base.py:1848(diff)\n", - " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 1025 0.008 0.000 0.056 0.000 new_indexing.py:494(__iter__)\n", - " 1 0.007 0.007 2.213 2.213 core.py:391(__getitem__)\n", - " 2048 0.007 0.000 0.013 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.005 0.000 0.005 0.000 core.py:323()\n", - " 1 0.004 0.004 1.839 1.839 new_indexing.py:430(__init__)\n", - " 1 0.004 0.004 2.217 2.217 :1()\n", + " 1 1.160 1.160 1.160 1.160 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1 0.164 0.164 1.490 1.490 indexing.py:325(__init__)\n", + " 1024 0.150 0.000 0.285 0.000 core.py:802(_chunk_getitem)\n", + " 1 0.128 0.128 0.128 0.128 {method 'take' of 'numpy.ndarray' objects}\n", + " 1024 0.113 0.000 0.120 0.000 core.py:964(_decode_chunk)\n", + " 1025 0.033 0.000 0.034 0.000 indexing.py:372(__iter__)\n", + " 1 0.024 0.024 0.024 0.024 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.012 0.012 0.012 0.012 function_base.py:1848(diff)\n", + " 1025 0.006 0.000 0.059 0.000 indexing.py:486(__iter__)\n", + " 2048 0.006 0.000 0.012 0.000 index_tricks.py:26(ix_)\n", " 3072 0.004 0.000 0.004 0.000 {method 
'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.003 0.003 0.365 0.365 core.py:548(_get_selection)\n", + " 2048 0.004 0.000 0.004 0.000 core.py:324()\n", + " 1 0.003 0.003 1.840 1.840 core.py:527(get_orthogonal_selection)\n", + " 1024 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.003 0.003 2.206 2.206 core.py:526(get_orthogonal_selection)\n", + " 1 0.003 0.003 0.346 0.346 core.py:563(_get_selection)\n", " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 2048 0.002 0.000 0.018 0.000 new_indexing.py:413(ix_)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.007 0.000 core.py:318(_cdata_shape)\n", - " 8199 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.001 0.000 0.002 0.000 new_indexing.py:418()\n", - " 1024 0.001 0.000 0.009 0.000 core.py:927(_chunk_key)\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 2048 0.002 0.000 0.015 0.000 indexing.py:398(ix_)\n", + " 8200 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.002 0.000 0.006 0.000 core.py:319(_cdata_shape)\n", + " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", + " 2048 0.001 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 2048 0.001 0.000 0.002 0.000 indexing.py:403()\n", + " 1024 0.001 0.000 0.008 0.000 core.py:961(_chunk_key)\n", + " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", " 2054 0.001 0.000 0.001 0.000 {built-in method 
numpy.core.multiarray.array}\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", " 2048 0.001 0.000 0.001 0.000 numeric.py:463(asarray)\n", " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:499()\n", + " 1 0.001 0.001 1.491 1.491 indexing.py:425(__init__)\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:491()\n", + " 1024 0.000 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1 0.000 0.000 1.841 1.841 core.py:392(__getitem__)\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", - " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 1 0.000 0.000 1.841 1.841 :1()\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 
threading.py:1304(main_thread)\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 2.217 2.217 {built-in method builtins.exec}\n", + " 1 0.000 0.000 1.841 1.841 {built-in method builtins.exec}\n", " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 4 0.000 0.000 1.587 0.397 fromnumeric.py:55(_wrapfunc)\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 4 0.000 0.000 1.288 0.322 fromnumeric.py:55(_wrapfunc)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", - " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.128 0.128 fromnumeric.py:70(take)\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.169 0.169 fromnumeric.py:70(take)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 1.160 1.160 fromnumeric.py:826(argsort)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", + " 2 0.000 0.000 0.000 0.000 
indexing.py:194(ensure_tuple)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", - " 1 0.000 0.000 1.417 1.417 fromnumeric.py:826(argsort)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", "\n", "\n" ] @@ -1314,6 +1452,13 @@ "cProfile.run('zc[ix_dense_int]', sort='time')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When indices are not sorted, zarr needs to partially sort them so the occur in chunk order, so we only have to visit each chunk once. This sorting dominates the processing time and is unavoidable AFAIK." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1323,16 +1468,16 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "10033" + "9985" ] }, - "execution_count": 94, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1345,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -1353,16 +1498,16 @@ "output_type": "stream", "text": [ "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", - "Wall time: 21.6 ms\n" + "Wall time: 21.3 ms\n" ] }, { "data": { "text/plain": [ - "array([ 35449, 41893, 45592, ..., 99987487, 99990184, 99993538])" + "array([ 4039, 4499, 7512, ..., 99943621, 99959317, 99987208])" ] }, - "execution_count": 95, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1373,24 +1518,24 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 440 ms, sys: 56 ms, total: 496 ms\n", - "Wall time: 222 ms\n" + "CPU times: user 436 ms, sys: 56 ms, total: 492 ms\n", + "Wall time: 210 ms\n" ] }, { "data": { "text/plain": [ - "array([ 35449, 41893, 45592, ..., 99987487, 99990184, 99993538])" + "array([ 4039, 4499, 7512, ..., 99943621, 99959317, 99987208])" ] }, - "execution_count": 96, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1401,86 +1546,89 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 58423 function calls in 0.259 seconds\n", + " 58373 function calls in 0.243 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.137 0.000 0.144 0.000 core.py:930(_decode_chunk)\n", - " 1024 0.026 0.000 0.026 
0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1025 0.023 0.000 0.023 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.008 0.000 0.172 0.000 core.py:768(_chunk_getitem)\n", - " 1025 0.007 0.000 0.052 0.000 new_indexing.py:494(__iter__)\n", - " 1024 0.006 0.000 0.034 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", - " 1 0.005 0.005 0.032 0.032 new_indexing.py:255(__init__)\n", - " 2048 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.003 0.003 0.227 0.227 core.py:548(_get_selection)\n", - " 1025 0.003 0.000 0.003 0.000 new_indexing.py:282(__iter__)\n", - " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.002 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 6151 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.002 0.000 0.037 0.000 new_indexing.py:413(ix_)\n", - " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.011 0.000 core.py:927(_chunk_key)\n", - " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.027 0.000 numeric.py:380(count_nonzero)\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:418()\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:499()\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method 
numpy.core.multiarray.array}\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.260 0.260 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.032 0.032 new_indexing.py:430(__init__)\n", + " 1023 0.140 0.000 0.148 0.000 core.py:964(_decode_chunk)\n", + " 1024 0.024 0.000 0.024 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1023 0.007 0.000 0.177 0.000 core.py:802(_chunk_getitem)\n", + " 1024 0.007 0.000 0.052 0.000 indexing.py:486(__iter__)\n", + " 1023 0.006 0.000 0.034 0.000 index_tricks.py:26(ix_)\n", + " 2046 0.006 0.000 0.006 0.000 core.py:324()\n", + " 2046 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1023 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1 0.003 0.003 0.232 0.232 core.py:563(_get_selection)\n", + " 1023 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", + " 1023 0.003 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", + " 1023 0.003 0.000 0.008 0.000 core.py:319(_cdata_shape)\n", + " 1024 0.002 0.000 0.003 
0.000 indexing.py:296(__iter__)\n", + " 1023 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 6146 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1023 0.002 0.000 0.038 0.000 indexing.py:398(ix_)\n", + " 1023 0.001 0.000 0.011 0.000 core.py:961(_chunk_key)\n", + " 1023 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 1 0.001 0.001 0.011 0.011 indexing.py:269(__init__)\n", + " 1023 0.001 0.000 0.008 0.000 numeric.py:1905(array_str)\n", + " 1023 0.001 0.000 0.001 0.000 indexing.py:403()\n", + " 2046 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", + " 1023 0.001 0.000 0.001 0.000 indexing.py:501()\n", + " 2046 0.001 0.000 0.001 0.000 indexing.py:489()\n", + " 1023 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1023 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 3078 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2046 0.001 0.000 0.001 0.000 indexing.py:491()\n", + " 2046 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 1023 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1023 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2046 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1023 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 2046 0.000 0.000 0.000 0.000 indexing.py:490()\n", + " 1023 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1023 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.010 0.000 numeric.py:380(count_nonzero)\n", + " 1023 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1023 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1023 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1023 0.000 0.000 
0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.011 0.011 indexing.py:425(__init__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", + " 1 0.000 0.000 0.243 0.243 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.259 0.259 core.py:526(get_orthogonal_selection)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.259 0.259 core.py:391(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 1 0.000 0.000 0.259 0.259 :1()\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.243 0.243 core.py:392(__getitem__)\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.243 0.243 core.py:527(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", + " 1 0.000 0.000 0.243 0.243 :1()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", + " 2 0.000 0.000 0.000 0.000 
indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1500,17 +1648,16 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(10000,\n", - " array([49021295, 65674535, 71257616, ..., 12130114, 48117886, 98926729]))" + "10000" ] }, - "execution_count": 98, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -1519,12 +1666,12 @@ "ix_sparse_int = np.random.choice(c.shape[0], size=c.shape[0]//10000, replace=True)\n", "ix_sparse_int_sorted = ix_sparse_int.copy()\n", "ix_sparse_int_sorted.sort()\n", - "len(ix_sparse_int), ix_sparse_int" + "len(ix_sparse_int)" ] }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -1532,16 +1679,16 @@ "output_type": "stream", "text": [ "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 245 µs\n" + "Wall time: 136 µs\n" ] }, { "data": { "text/plain": [ - "array([ 14556, 48679, 54538, ..., 99958362, 99994365, 99999645])" + "array([ 7736, 25765, 27155, ..., 99982813, 99983779, 99986450])" ] }, - "execution_count": 99, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -1552,7 +1699,7 @@ }, { "cell_type": "code", - 
"execution_count": 100, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -1560,16 +1707,16 @@ "output_type": "stream", "text": [ "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 233 µs\n" + "Wall time: 597 µs\n" ] }, { "data": { "text/plain": [ - "array([49021295, 65674535, 71257616, ..., 12130114, 48117886, 98926729])" + "array([11023673, 52339189, 27001951, ..., 37185717, 7541357, 28437835])" ] }, - "execution_count": 100, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1580,24 +1727,24 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 388 ms, sys: 60 ms, total: 448 ms\n", - "Wall time: 172 ms\n" + "CPU times: user 412 ms, sys: 40 ms, total: 452 ms\n", + "Wall time: 171 ms\n" ] }, { "data": { "text/plain": [ - "array([ 14556, 48679, 54538, ..., 99958362, 99994365, 99999645])" + "array([ 7736, 25765, 27155, ..., 99982813, 99983779, 99986450])" ] }, - "execution_count": 101, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1608,24 +1755,24 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 456 ms, sys: 32 ms, total: 488 ms\n", - "Wall time: 182 ms\n" + "CPU times: user 384 ms, sys: 64 ms, total: 448 ms\n", + "Wall time: 172 ms\n" ] }, { "data": { "text/plain": [ - "array([49021295, 65674535, 71257616, ..., 12130114, 48117886, 98926729])" + "array([11023673, 52339189, 27001951, ..., 37185717, 7541357, 28437835])" ] }, - "execution_count": 102, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1636,94 +1783,96 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 69723 function 
calls in 0.224 seconds\n", + " 69726 function calls in 0.218 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.148 0.000 0.155 0.000 core.py:930(_decode_chunk)\n", - " 1025 0.008 0.000 0.038 0.000 new_indexing.py:494(__iter__)\n", + " 1024 0.141 0.000 0.149 0.000 core.py:964(_decode_chunk)\n", + " 1025 0.008 0.000 0.038 0.000 indexing.py:486(__iter__)\n", " 2048 0.008 0.000 0.015 0.000 index_tricks.py:26(ix_)\n", - " 1024 0.006 0.000 0.182 0.000 core.py:768(_chunk_getitem)\n", - " 2048 0.006 0.000 0.006 0.000 core.py:323()\n", + " 1024 0.006 0.000 0.176 0.000 core.py:802(_chunk_getitem)\n", + " 2048 0.006 0.000 0.006 0.000 core.py:324()\n", " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1025 0.004 0.000 0.005 0.000 new_indexing.py:387(__iter__)\n", - " 1 0.003 0.003 0.223 0.223 core.py:548(_get_selection)\n", + " 1025 0.004 0.000 0.005 0.000 indexing.py:372(__iter__)\n", + " 1 0.003 0.003 0.217 0.217 core.py:563(_get_selection)\n", " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", - " 2048 0.003 0.000 0.020 0.000 new_indexing.py:413(ix_)\n", + " 2048 0.003 0.000 0.020 0.000 indexing.py:398(ix_)\n", + " 1024 0.003 0.000 0.008 0.000 core.py:319(_cdata_shape)\n", " 1024 0.003 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.008 0.000 core.py:318(_cdata_shape)\n", - " 8199 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", + " 8200 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.002 0.000 0.002 0.000 new_indexing.py:418()\n", - " 1024 0.001 0.000 0.011 
0.000 core.py:927(_chunk_key)\n", - " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", + " 2048 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", + " 2048 0.002 0.000 0.002 0.000 indexing.py:403()\n", + " 1024 0.001 0.000 0.011 0.000 core.py:961(_chunk_key)\n", + " 1024 0.001 0.000 0.008 0.000 numeric.py:1905(array_str)\n", " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:497()\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55c5e1118480}\n", - " 1024 0.001 0.000 0.001 0.000 new_indexing.py:509()\n", " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", + " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", " 1 0.001 0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:491()\n", " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.001 0.000 0.001 0.000 new_indexing.py:499()\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 new_indexing.py:498()\n", - " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", - " 1 0.000 0.000 0.001 0.001 new_indexing.py:340(__init__)\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", + " 1 0.000 0.000 0.001 0.001 
indexing.py:325(__init__)\n", " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.224 0.224 core.py:391(__getitem__)\n", + " 1 0.000 0.000 0.218 0.218 core.py:392(__getitem__)\n", " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.001 0.001 new_indexing.py:430(__init__)\n", + " 1 0.000 0.000 0.001 0.001 indexing.py:425(__init__)\n", " 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", - " 1 0.000 0.000 0.224 0.224 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.000 0.000 0.218 0.218 {built-in method builtins.exec}\n", " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.218 0.218 core.py:527(get_orthogonal_selection)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 
indexing.py:159(replace_ellipsis)\n", " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.224 0.224 core.py:526(get_orthogonal_selection)\n", " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:147(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", - " 1 0.000 0.000 0.224 0.224 :1()\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", + " 1 0.000 0.000 0.218 0.218 :1()\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:150()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:484()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:486()\n", - " 1 0.000 0.000 0.000 0.000 new_indexing.py:489()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 1 0.000 0.000 0.000 0.000 
indexing.py:481()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1734,6 +1883,13 @@ "cProfile.run('zc[ix_sparse_int]', sort='time')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For sparse selections, processing time is dominated by decompression, so we can't do any better." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1743,13 +1899,13 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507558 (495.7K)
Storage ratio197.0
Chunks initialized256/256
" + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507490 (495.6K)
Storage ratio197.0
Chunks initialized256/256
" ], "text/plain": [ "Type : zarr.core.Array\n", @@ -1761,12 +1917,12 @@ "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", - "No. bytes stored : 507558 (495.7K)\n", + "No. bytes stored : 507490 (495.6K)\n", "Storage ratio : 197.0\n", "Chunks initialized : 256/256" ] }, - "execution_count": 104, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -1778,24 +1934,24 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 852 ms, sys: 140 ms, total: 992 ms\n", - "Wall time: 450 ms\n" + "CPU times: user 920 ms, sys: 136 ms, total: 1.06 s\n", + "Wall time: 503 ms\n" ] }, { "data": { "text/plain": [ - "array([ 35449, 41893, 45592, ..., 99987487, 99990184, 99993538])" + "array([ 4039, 4499, 7512, ..., 99943621, 99959317, 99987208])" ] }, - "execution_count": 105, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -1813,15 +1969,15 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 68 ms, sys: 28 ms, total: 96 ms\n", - "Wall time: 92.7 ms\n" + "CPU times: user 68 ms, sys: 24 ms, total: 92 ms\n", + "Wall time: 90.4 ms\n" ] }, { @@ -1830,7 +1986,7 @@ "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 53, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } @@ -1841,15 +1997,15 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.3 s, sys: 268 ms, total: 1.57 s\n", - "Wall time: 1.3 s\n" + "CPU times: user 1.34 s, sys: 236 ms, total: 1.57 s\n", + "Wall time: 1.29 s\n" ] }, { @@ -1858,7 +2014,7 @@ 
"array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 54, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } @@ -1869,15 +2025,15 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 564 ms, sys: 84 ms, total: 648 ms\n", - "Wall time: 396 ms\n" + "CPU times: user 548 ms, sys: 116 ms, total: 664 ms\n", + "Wall time: 400 ms\n" ] }, { @@ -1886,7 +2042,7 @@ "array([ 0, 10, 20, ..., 99999970, 99999980, 99999990])" ] }, - "execution_count": 55, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } @@ -1897,15 +2053,15 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 472 ms, sys: 40 ms, total: 512 ms\n", - "Wall time: 213 ms\n" + "CPU times: user 456 ms, sys: 40 ms, total: 496 ms\n", + "Wall time: 214 ms\n" ] }, { @@ -1914,7 +2070,7 @@ "array([ 0, 100, 200, ..., 99999700, 99999800, 99999900])" ] }, - "execution_count": 56, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } @@ -1925,15 +2081,15 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 432 ms, sys: 48 ms, total: 480 ms\n", - "Wall time: 192 ms\n" + "CPU times: user 440 ms, sys: 36 ms, total: 476 ms\n", + "Wall time: 179 ms\n" ] }, { @@ -1942,7 +2098,7 @@ "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" ] }, - "execution_count": 57, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -1951,6 +2107,111 @@ "%time zc[::1000]" ] }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 55382 function calls 
in 1.351 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.620 0.620 0.815 0.815 indexing.py:325(__init__)\n", + " 1024 0.130 0.000 0.135 0.000 core.py:964(_decode_chunk)\n", + " 1025 0.127 0.000 0.128 0.000 indexing.py:372(__iter__)\n", + " 1024 0.123 0.000 0.275 0.000 core.py:802(_chunk_getitem)\n", + " 1 0.121 0.121 0.121 0.121 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.087 0.087 0.087 0.087 {built-in method numpy.core.multiarray.arange}\n", + " 1 0.054 0.054 0.054 0.054 function_base.py:1848(diff)\n", + " 1 0.020 0.020 1.350 1.350 core.py:527(get_orthogonal_selection)\n", + " 4 0.020 0.005 0.020 0.005 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1025 0.007 0.000 0.149 0.000 indexing.py:486(__iter__)\n", + " 2048 0.005 0.000 0.005 0.000 core.py:324()\n", + " 1024 0.004 0.000 0.008 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 0.003 0.003 0.427 0.427 core.py:563(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.006 0.000 core.py:319(_cdata_shape)\n", + " 6152 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1024 0.001 0.000 0.011 0.000 indexing.py:398(ix_)\n", + " 1024 0.001 0.000 0.009 0.000 core.py:961(_chunk_key)\n", + " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 1 0.001 0.001 0.902 0.902 indexing.py:425(__init__)\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:403()\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 1 0.001 0.001 1.350 1.350 core.py:392(__getitem__)\n", + " 1 0.001 0.001 1.351 1.351 :1()\n", + " 2048 0.001 
0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", + " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:491()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 1030 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 1.351 1.351 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.020 0.005 fromnumeric.py:1886(any)\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 4 0.000 0.000 0.020 0.005 {method 'any' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 1 0.000 0.000 0.000 0.000 
indexing.py:159(replace_ellipsis)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", + " 4 0.000 0.000 0.020 0.005 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:80(normalize_slice_selection)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc[::2]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here there are various setup operations that need to be done on the integer array, can't see way to avoid ATM." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1960,7 +2221,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -1969,7 +2230,7 @@ "(100000000,)" ] }, - "execution_count": 106, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } @@ -1980,7 +2241,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -1989,7 +2250,7 @@ "(100000, 1000)" ] }, - "execution_count": 107, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } @@ -2001,7 +2262,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -2024,7 +2285,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 108, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } @@ -2043,7 +2304,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -2053,30 +2314,30 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 140 ms, sys: 24 ms, total: 164 ms\n", - "Wall time: 165 ms\n" + "CPU times: user 140 ms, sys: 8 ms, total: 148 ms\n", + "Wall time: 146 ms\n" ] }, { "data": { "text/plain": [ - "array([[ 0, 1, 3, ..., 995, 998, 999],\n", - " [ 2000, 2001, 2003, ..., 2995, 2998, 2999],\n", - " [ 4000, 4001, 4003, ..., 4995, 4998, 4999],\n", + "array([[ 2, 4, 6, ..., 993, 994, 999],\n", + " [ 9002, 9004, 9006, ..., 9993, 9994, 9999],\n", + " [ 10002, 10004, 10006, ..., 10993, 10994, 10999],\n", " ..., \n", - " [99992000, 99992001, 99992003, ..., 99992995, 99992998, 99992999],\n", - " [99997000, 99997001, 99997003, ..., 99997995, 99997998, 99997999],\n", - " [99999000, 99999001, 99999003, ..., 99999995, 99999998, 99999999]])" + " [99997002, 99997004, 99997006, ..., 
99997993, 99997994, 99997999],\n", + " [99998002, 99998004, 99998006, ..., 99998993, 99998994, 99998999],\n", + " [99999002, 99999004, 99999006, ..., 99999993, 99999994, 99999999]])" ] }, - "execution_count": 110, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -2087,30 +2348,30 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 860 ms, sys: 84 ms, total: 944 ms\n", - "Wall time: 468 ms\n" + "CPU times: user 748 ms, sys: 56 ms, total: 804 ms\n", + "Wall time: 409 ms\n" ] }, { "data": { "text/plain": [ - "array([[ 0, 1, 3, ..., 995, 998, 999],\n", - " [ 2000, 2001, 2003, ..., 2995, 2998, 2999],\n", - " [ 4000, 4001, 4003, ..., 4995, 4998, 4999],\n", + "array([[ 2, 4, 6, ..., 993, 994, 999],\n", + " [ 9002, 9004, 9006, ..., 9993, 9994, 9999],\n", + " [ 10002, 10004, 10006, ..., 10993, 10994, 10999],\n", " ..., \n", - " [99992000, 99992001, 99992003, ..., 99992995, 99992998, 99992999],\n", - " [99997000, 99997001, 99997003, ..., 99997995, 99997998, 99997999],\n", - " [99999000, 99999001, 99999003, ..., 99999995, 99999998, 99999999]])" + " [99997002, 99997004, 99997006, ..., 99997993, 99997994, 99997999],\n", + " [99998002, 99998004, 99998006, ..., 99998993, 99998994, 99998999],\n", + " [99999002, 99999004, 99999006, ..., 99999993, 99999994, 99999999]])" ] }, - "execution_count": 111, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -2128,7 +2389,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -2138,30 +2399,30 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 196 ms, sys: 56 ms, total: 252 ms\n", - "Wall time: 250 ms\n" + "CPU times: user 196 ms, sys: 24 ms, total: 220 
ms\n", + "Wall time: 219 ms\n" ] }, { "data": { "text/plain": [ - "array([[50767038, 50767472, 50767242, ..., 50767418, 50767445, 50767947],\n", - " [28829038, 28829472, 28829242, ..., 28829418, 28829445, 28829947],\n", - " [17474038, 17474472, 17474242, ..., 17474418, 17474445, 17474947],\n", + "array([[90796980, 90796608, 90796172, ..., 90796527, 90796979, 90796445],\n", + " [50263980, 50263608, 50263172, ..., 50263527, 50263979, 50263445],\n", + " [47678980, 47678608, 47678172, ..., 47678527, 47678979, 47678445],\n", " ..., \n", - " [ 5185038, 5185472, 5185242, ..., 5185418, 5185445, 5185947],\n", - " [27248038, 27248472, 27248242, ..., 27248418, 27248445, 27248947],\n", - " [72575038, 72575472, 72575242, ..., 72575418, 72575445, 72575947]])" + " [34172980, 34172608, 34172172, ..., 34172527, 34172979, 34172445],\n", + " [56793980, 56793608, 56793172, ..., 56793527, 56793979, 56793445],\n", + " [12456980, 12456608, 12456172, ..., 12456527, 12456979, 12456445]])" ] }, - "execution_count": 113, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -2172,30 +2433,30 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.17 s, sys: 128 ms, total: 1.3 s\n", - "Wall time: 682 ms\n" + "CPU times: user 1.11 s, sys: 68 ms, total: 1.18 s\n", + "Wall time: 604 ms\n" ] }, { "data": { "text/plain": [ - "array([[50767038, 50767472, 50767242, ..., 50767418, 50767445, 50767947],\n", - " [28829038, 28829472, 28829242, ..., 28829418, 28829445, 28829947],\n", - " [17474038, 17474472, 17474242, ..., 17474418, 17474445, 17474947],\n", + "array([[90796980, 90796608, 90796172, ..., 90796527, 90796979, 90796445],\n", + " [50263980, 50263608, 50263172, ..., 50263527, 50263979, 50263445],\n", + " [47678980, 47678608, 47678172, ..., 47678527, 47678979, 47678445],\n", " ..., \n", - " [ 5185038, 5185472, 5185242, ..., 5185418, 5185445, 
5185947],\n", - " [27248038, 27248472, 27248242, ..., 27248418, 27248445, 27248947],\n", - " [72575038, 72575472, 72575242, ..., 72575418, 72575445, 72575947]])" + " [34172980, 34172608, 34172172, ..., 34172527, 34172979, 34172445],\n", + " [56793980, 56793608, 56793172, ..., 56793527, 56793979, 56793445],\n", + " [12456980, 12456608, 12456172, ..., 12456527, 12456979, 12456445]])" ] }, - "execution_count": 114, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -2213,7 +2474,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 83, "metadata": {}, "outputs": [ { @@ -2222,7 +2483,7 @@ "10000000" ] }, - "execution_count": 115, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -2236,24 +2497,24 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 236 ms, sys: 56 ms, total: 292 ms\n", - "Wall time: 289 ms\n" + "CPU times: user 256 ms, sys: 12 ms, total: 268 ms\n", + "Wall time: 265 ms\n" ] }, { "data": { "text/plain": [ - "array([71132822, 44407411, 66463897, ..., 16188129, 30562595, 3115554])" + "array([ 6452573, 65841096, 70323990, ..., 44175624, 34778721, 67807976])" ] }, - "execution_count": 116, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -2264,24 +2525,24 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.06 s, sys: 296 ms, total: 3.36 s\n", - "Wall time: 2.83 s\n" + "CPU times: user 2.62 s, sys: 116 ms, total: 2.73 s\n", + "Wall time: 2.29 s\n" ] }, { "data": { "text/plain": [ - "array([71132822, 44407411, 66463897, ..., 16188129, 30562595, 3115554])" + "array([ 6452573, 65841096, 70323990, ..., 44175624, 34778721, 67807976])" ] }, - "execution_count": 117, + "execution_count": 85, 
"metadata": {}, "output_type": "execute_result" } @@ -2292,92 +2553,94 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 48284 function calls in 2.856 seconds\n", + " 48293 function calls in 2.312 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.401 1.401 1.401 1.401 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 3 0.412 0.137 0.412 0.137 new_indexing.py:601()\n", - " 3 0.259 0.086 0.259 0.086 new_indexing.py:592()\n", - " 1 0.242 0.242 2.414 2.414 new_indexing.py:557(__init__)\n", - " 1024 0.196 0.000 0.377 0.000 core.py:768(_chunk_getitem)\n", - " 1024 0.151 0.000 0.160 0.000 core.py:930(_decode_chunk)\n", - " 1 0.056 0.056 0.056 0.056 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.038 0.038 0.038 0.038 {built-in method numpy.core.multiarray.bincount}\n", - " 3072 0.023 0.000 0.023 0.000 new_indexing.py:636()\n", - " 1 0.012 0.012 2.843 2.843 core.py:537(get_coordinate_selection)\n", - " 1025 0.010 0.000 0.036 0.000 new_indexing.py:618(__iter__)\n", - " 1 0.010 0.010 2.853 2.853 new_indexing.py:648(__getitem__)\n", - " 6 0.006 0.001 0.006 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 3081 0.005 0.000 0.005 0.000 core.py:323()\n", - " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1 1.161 1.161 1.161 1.161 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 3 0.275 0.092 0.275 0.092 indexing.py:590()\n", + " 3 0.223 0.074 0.223 0.074 indexing.py:581()\n", + " 1024 0.174 0.000 0.358 0.000 core.py:802(_chunk_getitem)\n", + " 1 0.167 0.167 1.914 1.914 indexing.py:547(__init__)\n", + " 1024 0.155 0.000 0.164 0.000 core.py:964(_decode_chunk)\n", + " 1 0.044 0.044 0.044 0.044 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.039 0.039 0.039 0.039 {built-in method 
numpy.core.multiarray.bincount}\n", + " 3072 0.022 0.000 0.022 0.000 indexing.py:625()\n", + " 1025 0.009 0.000 0.034 0.000 indexing.py:607(__iter__)\n", + " 6 0.005 0.001 0.005 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 3081 0.005 0.000 0.005 0.000 core.py:324()\n", " 1024 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 2048 0.004 0.000 0.008 0.000 arrayprint.py:381(wrapper)\n", - " 1 0.004 0.004 0.417 0.417 core.py:548(_get_selection)\n", - " 1 0.003 0.003 2.856 2.856 :1()\n", + " 2048 0.003 0.000 0.007 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1 0.003 0.003 0.396 0.396 core.py:563(_get_selection)\n", " 1024 0.003 0.000 0.012 0.000 {method 'join' of 'str' objects}\n", " 2048 0.002 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", - " 1027 0.002 0.000 0.007 0.000 core.py:318(_cdata_shape)\n", - " 2048 0.002 0.000 0.009 0.000 numeric.py:1905(array_str)\n", + " 1027 0.002 0.000 0.006 0.000 core.py:319(_cdata_shape)\n", + " 2048 0.001 0.000 0.009 0.000 numeric.py:1905(array_str)\n", " 3084 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.001 0.000 0.013 0.000 core.py:927(_chunk_key)\n", - " 3072 0.001 0.000 0.001 0.000 new_indexing.py:632()\n", - " 3072 0.001 0.000 0.001 0.000 new_indexing.py:623()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1 0.001 0.001 2.312 2.312 core.py:539(get_coordinate_selection)\n", + " 1024 0.001 0.000 0.013 0.000 core.py:961(_chunk_key)\n", + " 3072 0.001 0.000 0.001 0.000 indexing.py:621()\n", + " 3072 0.001 0.000 0.001 0.000 indexing.py:612()\n", + " 1 0.001 0.001 2.312 2.312 indexing.py:658(__getitem__)\n", " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", " 3072 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method 
__new__ of type object at 0x55c5e1118480}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:204(chunk_store)\n", - " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 2055 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1 0.000 0.000 2.312 2.312 :1()\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 6 0.000 0.000 0.006 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 2.856 2.856 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 3 0.000 0.000 1.401 0.467 fromnumeric.py:55(_wrapfunc)\n", - " 6 0.000 0.000 0.006 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 6 0.000 0.000 0.005 0.001 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 2.312 2.312 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 3 0.000 0.000 1.161 0.387 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 core.py:334(_nchunks)\n", + " 6 0.000 0.000 0.005 0.001 {method 'any' of 'numpy.ndarray' objects}\n", " 8 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 0.000 0.000 core.py:333(_nchunks)\n", - " 6 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", " 6 0.000 0.000 0.000 0.000 
numeric.py:534(asanyarray)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:527(is_coordinate_selection)\n", + " 6 0.000 0.000 0.005 0.001 _methods.py:37(_any)\n", + " 12 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:519(is_coordinate_selection)\n", " 1 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 6 0.000 0.000 0.006 0.001 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:531()\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:547(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 1 0.000 0.000 1.401 1.401 fromnumeric.py:826(argsort)\n", - " 1 0.000 0.000 0.000 0.000 core.py:337(nchunks)\n", + " 1 0.000 0.000 1.161 1.161 fromnumeric.py:826(argsort)\n", " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 core.py:338(nchunks)\n", + " 6 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 1 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 12 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 new_indexing.py:180(ensure_tuple)\n", - " 6 0.000 0.000 0.000 0.000 new_indexing.py:549()\n", - " 3 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 2 0.000 0.000 0.000 0.000 core.py:152(_refresh_metadata)\n", - " 1 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", " 3 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 6 0.000 0.000 0.000 
0.000 indexing.py:11(is_integer)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:523()\n", + " 6 0.000 0.000 0.000 0.000 indexing.py:539()\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:551()\n", + " 1 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 core.py:363(vindex)\n", - " 1 0.000 0.000 0.000 0.000 core.py:212(shape)\n", - " 3 0.000 0.000 0.000 0.000 new_indexing.py:561()\n", + " 1 0.000 0.000 0.000 0.000 core.py:213(shape)\n", + " 1 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", + " 3 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 1 0.000 0.000 0.000 0.000 core.py:364(vindex)\n", + " 2 0.000 0.000 0.000 0.000 core.py:153(_refresh_metadata)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -2388,6 +2651,13 @@ "cProfile.run('zd.vindex[ix0, ix1]', sort='time')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Points need to be partially sorted so all points in the same chunk are grouped and processed together. This requires ``argsort`` which dominates time." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2499,7 +2769,7 @@ "metadata": {}, "outputs": [], "source": [ - "# # this is pathological, takes > 1 minute \n", + "# # this is pathological, takes minutes \n", "# %time hc[ix_dense_bool]" ] }, From 983355dbffa70b5fb57e1f02e36195f4e1b2e63c Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 6 Nov 2017 17:05:57 +0000 Subject: [PATCH 36/67] WIP index with fields --- zarr/tests/test_indexing.py | 115 +++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index a8d370de6c..f41c66df43 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -874,4 +874,117 @@ def test_get_selection_out(): assert_array_equal(expect, out[:]) -# TODO selection with fields +def test_get_selections_with_fields(): + + a = [[('aaa', 1, 4.2), + ('bbb', 2, 8.4), + ('ccc', 3, 12.6)]] + a = np.array(a, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype) + z[:] = a + + # single field + fields = 'foo' + + # no selection + expect = a[fields] + actual = z.get_basic_selection(fields=fields) + assert_array_equal(expect, actual) + # alternative API + actual = z[fields] + assert_array_equal(expect, actual) + + # basic selection with slice + expect = a[0:2][fields] + actual = z.get_basic_selection(slice(0, 2), fields=fields) + assert_array_equal(expect, actual) + # alternative API + actual = z[0:2, fields] + assert_array_equal(expect, actual) + + # orthogonal selection + ix = [0, 2] + expect = a[ix][fields] + actual = z.get_orthogonal_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative APIs + actual = z[ix, fields] + assert_array_equal(expect, actual) + actual = z.oindex[ix, fields] + assert_array_equal(expect, actual) + + # coordinate selection + ix = [0, 2] + expect = a[ix][fields] + actual = z.get_coordinate_selection(ix, fields=fields) 
+ assert_array_equal(expect, actual) + # alternative APIs + actual = z[ix, fields] + assert_array_equal(expect, actual) + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + + # mask selection + ix = [True, False, True] + expect = a[ix][fields] + actual = z.get_mask_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative APIs + actual = z[ix, fields] + assert_array_equal(expect, actual) + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + + # multiple field + fields = ['foo', 'bar'] + + # no selection + expect = a[fields] + actual = z.get_basic_selection(fields=fields) + assert_array_equal(expect, actual) + # alternative API + actual = z[fields] + assert_array_equal(expect, actual) + actual = z[tuple(fields)] + assert_array_equal(expect, actual) + + # basic selection with slice + expect = a[0:2][fields] + actual = z.get_basic_selection(slice(0, 2), fields=fields) + assert_array_equal(expect, actual) + # alternative API + actual = z[0:2, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # orthogonal selection + ix = [0, 2] + expect = a[ix][fields] + actual = z.get_orthogonal_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative APIs + actual = z[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + actual = z.oindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # coordinate selection + ix = [0, 2] + expect = a[ix][fields] + actual = z.get_coordinate_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative APIs + actual = z[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # mask selection + ix = [True, False, True] + expect = a[ix][fields] + actual = z.get_mask_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative APIs + actual = z[ix, fields[0], fields[1]] + 
assert_array_equal(expect, actual) + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) From c814c97e1499c27a328588cb13c967390381fb80 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 7 Nov 2017 01:02:39 +0000 Subject: [PATCH 37/67] add support for fields with selection, resolves #112 --- zarr/core.py | 164 +++++++++++--------- zarr/indexing.py | 78 ++++++++-- zarr/tests/test_indexing.py | 295 ++++++++++++++++++++++-------------- 3 files changed, 337 insertions(+), 200 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index dd28e04862..fb9fa3cad5 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -17,7 +17,15 @@ from zarr.compat import reduce from zarr.codecs import AsType, get_codec from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer, \ - MaskIndexer + MaskIndexer, check_fields, pop_fields, ensure_tuple + + +def is_scalar(value, dtype): + if np.isscalar(value): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False class Array(object): @@ -465,19 +473,10 @@ def __getitem__(self, selection): """ - if len(self._shape) == 0: - return self._get_basic_selection_zd(selection) - - elif len(self._shape) == 1: - # safe to do "fancy" indexing, no ambiguity - return self.get_orthogonal_selection(selection) - - else: - # "fancy" indexing can be ambiguous/hard to understand for multidimensional arrays, - # force people to go through explicit methods - return self.get_basic_selection(selection) + fields, selection = pop_fields(selection) + return self.get_basic_selection(selection, fields=fields) - def get_basic_selection(self, selection, out=None): + def get_basic_selection(self, selection, out=None, fields=None): """TODO""" # refresh metadata @@ -486,15 +485,16 @@ def get_basic_selection(self, selection, out=None): # handle zero-dimensional arrays if self._shape == (): - return self._get_basic_selection_zd(selection, out=out) + 
return self._get_basic_selection_zd(selection=selection, out=out, fields=fields) else: - return self._get_basic_selection_nd(selection, out=out) + return self._get_basic_selection_nd(selection=selection, out=out, fields=fields) - def _get_basic_selection_zd(self, selection, out=None): + def _get_basic_selection_zd(self, selection, out=None, fields=None): # special case basic selection for zero-dimensional array # check selection is valid - if selection not in ((), Ellipsis): + selection = ensure_tuple(selection) + if selection not in ((), (Ellipsis,)): raise IndexError('too many indices for array') try: @@ -519,17 +519,21 @@ def _get_basic_selection_zd(self, selection, out=None): else: out[selection] = chunk[selection] + # handle fields + if fields: + out = out[fields] + return out - def _get_basic_selection_nd(self, selection, out=None): + def _get_basic_selection_nd(self, selection, out=None, fields=None): # implementation of basic selection for array with at least one dimension # setup indexer indexer = BasicIndexer(selection, self) - return self._get_selection(indexer, out=out) + return self._get_selection(indexer=indexer, out=out, fields=fields) - def get_orthogonal_selection(self, selection, out=None): + def get_orthogonal_selection(self, selection, out=None, fields=None): """TODO""" # refresh metadata @@ -539,9 +543,9 @@ def get_orthogonal_selection(self, selection, out=None): # setup indexer indexer = OrthogonalIndexer(selection, self) - return self._get_selection(indexer, out=out) + return self._get_selection(indexer=indexer, out=out, fields=fields) - def get_coordinate_selection(self, selection, out=None): + def get_coordinate_selection(self, selection, out=None, fields=None): """TODO""" # refresh metadata @@ -551,9 +555,9 @@ def get_coordinate_selection(self, selection, out=None): # setup indexer indexer = CoordinateIndexer(selection, self) - return self._get_selection(indexer, out=out) + return self._get_selection(indexer=indexer, out=out, 
fields=fields) - def get_mask_selection(self, selection, out=None): + def get_mask_selection(self, selection, out=None, fields=None): """TODO""" # refresh metadata @@ -563,9 +567,9 @@ def get_mask_selection(self, selection, out=None): # setup indexer indexer = MaskIndexer(selection, self) - return self._get_selection(indexer, out=out) + return self._get_selection(indexer=indexer, out=out, fields=fields) - def _get_selection(self, indexer, out=None): + def _get_selection(self, indexer, out=None, fields=None): # We iterate over all chunks which overlap the selection and thus contain data that needs # to be extracted. Each chunk is processed in turn, extracting the necessary data and @@ -574,17 +578,20 @@ def _get_selection(self, indexer, out=None): # N.B., it is an important optimisation that we only visit chunks which overlap the # selection. This minimises the nuimber of iterations in the main for loop. + # check fields are sensible + out_dtype = check_fields(fields, self._dtype) + # determine output shape - sel_shape = indexer.shape + out_shape = indexer.shape # setup output array if out is None: - out = np.empty(sel_shape, dtype=self._dtype, order=self._order) + out = np.empty(out_shape, dtype=out_dtype, order=self._order) else: # validate 'out' parameter if not hasattr(out, 'shape'): raise TypeError('out must be an array-like object') - if out.shape != sel_shape: + if out.shape != out_shape: raise ValueError('out has wrong shape for selection') # iterate over chunks @@ -592,7 +599,7 @@ def _get_selection(self, indexer, out=None): # load chunk selection into output array self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection, - drop_axes=indexer.drop_axes) + drop_axes=indexer.drop_axes, fields=fields) if out.shape: return out @@ -658,19 +665,10 @@ def __setitem__(self, selection, value): """ - if len(self._shape) == 0: - self._set_basic_selection_zd(selection, value) + fields, selection = pop_fields(selection) + self.set_basic_selection(selection, 
value, fields=fields) - elif len(self._shape) == 1: - # safe to do "fancy" indexing, no ambiguity - self.set_orthogonal_selection(selection, value) - - else: - # "fancy" indexing can be ambiguous/hard to understand for multidimensional arrays, - # force people to go through explicit methods - self.set_basic_selection(selection, value) - - def set_basic_selection(self, selection, value): + def set_basic_selection(self, selection, value, fields=None): """TODO""" # guard conditions @@ -683,11 +681,11 @@ def set_basic_selection(self, selection, value): # handle zero-dimensional arrays if self._shape == (): - return self._set_basic_selection_zd(selection, value) + return self._set_basic_selection_zd(selection, value, fields=fields) else: - return self._set_basic_selection_nd(selection, value) + return self._set_basic_selection_nd(selection, value, fields=fields) - def set_orthogonal_selection(self, selection, value): + def set_orthogonal_selection(self, selection, value, fields=None): """TODO""" # guard conditions @@ -701,9 +699,9 @@ def set_orthogonal_selection(self, selection, value): # setup indexer indexer = OrthogonalIndexer(selection, self) - self._set_selection(indexer, value) + self._set_selection(indexer, value, fields=fields) - def set_coordinate_selection(self, selection, value): + def set_coordinate_selection(self, selection, value, fields=None): """TODO""" # guard conditions @@ -717,9 +715,9 @@ def set_coordinate_selection(self, selection, value): # setup indexer indexer = CoordinateIndexer(selection, self) - self._set_selection(indexer, value) + self._set_selection(indexer, value, fields=fields) - def set_mask_selection(self, selection, value): + def set_mask_selection(self, selection, value, fields=None): """TODO""" # guard conditions @@ -733,13 +731,17 @@ def set_mask_selection(self, selection, value): # setup indexer indexer = MaskIndexer(selection, self) - self._set_selection(indexer, value) + self._set_selection(indexer, value, fields=fields) - def 
_set_basic_selection_zd(self, selection, value): + def _set_basic_selection_zd(self, selection, value, fields=None): # special case __setitem__ for zero-dimensional array + if fields: + raise IndexError('fields not supported for 0d array') + # check item is valid - if selection not in ((), Ellipsis): + selection = ensure_tuple(selection) + if selection not in ((), (Ellipsis,)): raise IndexError('too many indices for array') # setup data to store @@ -756,15 +758,15 @@ def _set_basic_selection_zd(self, selection, value): cdata = self._encode_chunk(arr) self.chunk_store[ckey] = cdata - def _set_basic_selection_nd(self, selection, value): + def _set_basic_selection_nd(self, selection, value, fields=None): # implementation of __setitem__ for array with at least one dimension # setup indexer indexer = BasicIndexer(selection, self) - self._set_selection(indexer, value) + self._set_selection(indexer, value, fields=fields) - def _set_selection(self, indexer, value): + def _set_selection(self, indexer, value, fields=None): # We iterate over all chunks which overlap the selection and thus contain data that needs # to be replaced. Each chunk is processed in turn, extracting the necessary data from the @@ -773,15 +775,20 @@ def _set_selection(self, indexer, value): # N.B., it is an important optimisation that we only visit chunks which overlap the # selection. This minimises the nuimber of iterations in the main for loop. 
+ # check fields are sensible + check_fields(fields, self._dtype) + if fields and isinstance(fields, list): + raise ValueError('multi-field assignment is not supported') + # determine indices of chunks overlapping the selection sel_shape = indexer.shape # check value shape - if np.isscalar(value): + if is_scalar(value, self._dtype): pass else: if not hasattr(value, 'shape'): - raise TypeError('value must be an array-like object') + value = np.asarray(value) if value.shape != sel_shape: raise ValueError('value has wrong shape for selection; expected {}, got {}' .format(sel_shape, value.shape)) @@ -790,7 +797,7 @@ def _set_selection(self, indexer, value): for chunk_coords, chunk_selection, out_selection in indexer: # extract data to store - if np.isscalar(value): + if is_scalar(value, self._dtype): chunk_value = value else: chunk_value = value[out_selection] @@ -802,9 +809,10 @@ def _set_selection(self, indexer, value): chunk_value = chunk_value[item] # put data - self._chunk_setitem(chunk_coords, chunk_selection, chunk_value) + self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields) - def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes=None): + def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes=None, + fields=None): """Obtain part or whole of a chunk. Parameters @@ -819,6 +827,8 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop Location of region within output array to store results in. drop_axes : tuple of ints Axes to squeeze out of the chunk. 
+ fields + TODO """ @@ -838,10 +848,11 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop else: - if isinstance(out, np.ndarray) and \ - isinstance(out_selection, slice) and \ - is_total_slice(chunk_selection, self._chunks) and \ - not self._filters: + if (isinstance(out, np.ndarray) and + not fields and + isinstance(out_selection, slice) and + is_total_slice(chunk_selection, self._chunks) and + not self._filters): dest = out[out_selection] contiguous = ((self._order == 'C' and dest.flags.c_contiguous) or @@ -864,13 +875,17 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop # decode chunk chunk = self._decode_chunk(cdata) - # set data in output array + # select data from chunk + if fields: + chunk = chunk[fields] tmp = chunk[chunk_selection] if drop_axes: tmp = np.squeeze(tmp, axis=drop_axes) + + # store selected data in output out[out_selection] = tmp - def _chunk_setitem(self, chunk_coords, chunk_selection, value): + def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk. 
Parameters @@ -886,25 +901,25 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value): # synchronization if self._synchronizer is None: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) else: # synchronize on the chunk ckey = self._chunk_key(chunk_coords) with self._synchronizer[ckey]: - self._chunk_setitem_nosync(chunk_coords, chunk_selection, value) + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) - def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value): + def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): # obtain key for chunk storage ckey = self._chunk_key(chunk_coords) - if is_total_slice(chunk_selection, self._chunks): + if is_total_slice(chunk_selection, self._chunks) and not fields: # totally replace chunk # optimization: we are completely replacing the chunk, so no need # to access the existing chunk data - if np.isscalar(value): + if is_scalar(value, self._dtype): # setup array filled with value chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) @@ -955,7 +970,12 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value): chunk = chunk.copy(order='K') # modify - chunk[chunk_selection] = value + if fields: + # N.B., currently multi-field assignment is not supported in numpy, so this only + # works for a single field + chunk[fields][chunk_selection] = value + else: + chunk[chunk_selection] = value # encode chunk cdata = self._encode_chunk(chunk) diff --git a/zarr/indexing.py b/zarr/indexing.py index 8712c4318c..2c342bd42c 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -110,8 +110,8 @@ class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): # check type - if not isinstance(dim_sel, slice): - raise ValueError('selection must be a slice') + if not is_contiguous_slice(dim_sel): + raise 
ValueError('selection must be a contiguous slice') # normalize dim_sel = normalize_slice_selection(dim_sel, dim_len) @@ -192,7 +192,9 @@ def replace_ellipsis(selection, shape): def ensure_tuple(v): - if not isinstance(v, tuple): + if v is None: + v = () + elif not isinstance(v, tuple): v = (v,) return v @@ -222,6 +224,10 @@ def check_selection_length(selection, shape): raise IndexError('not enough indices for array') +def is_contiguous_slice(s): + return isinstance(s, slice) and (s.step is None or s.step == 1) + + # noinspection PyProtectedMember class BasicIndexer(object): @@ -238,14 +244,15 @@ def __init__(self, selection, array): dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if isinstance(dim_sel, int): + if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - elif isinstance(dim_sel, slice): + elif is_contiguous_slice(dim_sel): dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - raise IndexError('bad selection type') + raise IndexError('bad selection type; expected integer or contiguous slice, ' + 'got {!r}'.format(dim_sel)) dim_indexers.append(dim_indexer) @@ -401,7 +408,7 @@ def ix_(*selection): # replace slice and int as these are not supported by numpy ix_() selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) - else [dim_sel] if isinstance(dim_sel, int) + else [dim_sel] if is_integer(dim_sel) else dim_sel for dim_sel in selection] @@ -412,7 +419,7 @@ def ix_(*selection): def oindex(a, selection): """Implementation of orthogonal indexing with slices and ints.""" - drop_axes = tuple([i for i, s in enumerate(selection) if isinstance(s, int)]) + drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) selection = ix_(*selection) result = a[selection] if drop_axes: @@ -510,10 +517,16 @@ def __init__(self, array): self.array = array def __getitem__(self, selection): - return self.array.get_orthogonal_selection(selection) + 
fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_orthogonal_selection(selection, fields=fields) def __setitem__(self, selection, value): - return self.array.set_orthogonal_selection(selection, value) + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_orthogonal_selection(selection, value, fields=fields) def is_coordinate_selection(selection, array): @@ -656,21 +669,58 @@ def __init__(self, array): self.array = array def __getitem__(self, selection): + fields, selection = pop_fields(selection) selection = ensure_tuple(selection) selection = replace_lists(selection) if is_coordinate_selection(selection, self.array): - return self.array.get_coordinate_selection(selection) + return self.array.get_coordinate_selection(selection, fields=fields) elif is_mask_selection(selection, self.array): - return self.array.get_mask_selection(selection) + return self.array.get_mask_selection(selection, fields=fields) else: raise IndexError('unsupported selection') def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) selection = ensure_tuple(selection) selection = replace_lists(selection) if is_coordinate_selection(selection, self.array): - return self.array.set_coordinate_selection(selection, value) + return self.array.set_coordinate_selection(selection, value, fields=fields) elif is_mask_selection(selection, self.array): - return self.array.set_mask_selection(selection, value) + return self.array.set_mask_selection(selection, value, fields=fields) else: raise IndexError('unsupported selection') + + +def check_fields(fields, dtype): + if fields: + if dtype.names is None: + raise IndexError('array does not have any fields') + try: + if isinstance(fields, str): + # single field selection + out_dtype = dtype[fields] + else: + # multiple field selection + out_dtype = 
np.dtype([(f, dtype[f]) for f in fields]) + except KeyError: + # TODO better error message + raise IndexError('bad field selection') + else: + return out_dtype + else: + return dtype + + +def pop_fields(selection): + if isinstance(selection, str): + fields = selection + selection = () + elif not isinstance(selection, tuple): + fields = None + # leave selection as-is + else: + fields = [f for f in selection if isinstance(f, str)] + fields = fields[0] if len(fields) == 1 else fields + selection = tuple(s for s in selection if not isinstance(s, str)) + selection = selection[0] if len(selection) == 1 else selection + return fields, selection diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index f41c66df43..e88fb08742 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -88,9 +88,9 @@ def _test_get_orthogonal_selection_1d_common(a, z, ix): assert_array_equal(expect, actual) actual = z.oindex[ix] assert_array_equal(expect, actual) - # for 1d arrays, also available via __getitem__ - actual = z[ix] - assert_array_equal(expect, actual) + # # for 1d arrays, also available via __getitem__ + # actual = z[ix] + # assert_array_equal(expect, actual) # noinspection PyStatementEffect @@ -179,9 +179,9 @@ def test_get_orthogonal_selection_1d_slice_with_step(): assert_array_equal(expect, actual) actual = z.oindex[selection] assert_array_equal(expect, actual) - # for 1d arrays also available via __getitem__ - actual = z[selection] - assert_array_equal(expect, actual) + # # for 1d arrays also available via __getitem__ + # actual = z[selection] + # assert_array_equal(expect, actual) def _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1): @@ -349,10 +349,10 @@ def _test_set_orthogonal_selection_1d_common(v, a, z, ix): z[:] = 0 z.set_orthogonal_selection(ix, v[ix]) assert_array_equal(a, z[:]) - # also available via __getitem__ for 1d arrays - z[:] = 0 - z[ix] = v[ix] - assert_array_equal(a, z[:]) + # # also available via __setitem__ for 
1d arrays + # z[:] = 0 + # z[ix] = v[ix] + # assert_array_equal(a, z[:]) def test_set_orthogonal_selection_1d_bool(): @@ -715,9 +715,9 @@ def test_get_mask_selection_1d(): assert_array_equal(expect, actual) actual = z.vindex[ix] assert_array_equal(expect, actual) - # for 1d arrays, also available via __getitem__ - actual = z[ix] - assert_array_equal(expect, actual) + # # for 1d arrays, also available via __getitem__ + # actual = z[ix] + # assert_array_equal(expect, actual) # test errors with assert_raises(IndexError): @@ -775,10 +775,10 @@ def test_set_mask_selection_1d(): z[:] = 0 z.vindex[ix] = v[ix] assert_array_equal(a, z[:]) - # for 1d arrays, also available via __setitem__ - z[:] = 0 - z[ix] = v[ix] - assert_array_equal(a, z[:]) + # # for 1d arrays, also available via __setitem__ + # z[:] = 0 + # z[ix] = v[ix] + # assert_array_equal(a, z[:]) def test_set_mask_selection_2d(): @@ -876,115 +876,182 @@ def test_get_selection_out(): def test_get_selections_with_fields(): - a = [[('aaa', 1, 4.2), - ('bbb', 2, 8.4), - ('ccc', 3, 12.6)]] + a = [('aaa', 1, 4.2), + ('bbb', 2, 8.4), + ('ccc', 3, 12.6)] a = np.array(a, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype) z[:] = a - # single field - fields = 'foo' + fields_fixture = [ + 'foo', + ['foo'], + ['foo', 'bar'], + ['foo', 'baz'], + ['bar', 'baz'], + ['foo', 'bar', 'baz'], + ['bar', 'foo'], + ['baz', 'bar', 'foo'], + ] - # no selection - expect = a[fields] - actual = z.get_basic_selection(fields=fields) - assert_array_equal(expect, actual) - # alternative API - actual = z[fields] - assert_array_equal(expect, actual) + for fields in fields_fixture: - # basic selection with slice - expect = a[0:2][fields] - actual = z.get_basic_selection(slice(0, 2), fields=fields) - assert_array_equal(expect, actual) - # alternative API - actual = z[0:2, fields] - assert_array_equal(expect, actual) + # total selection + expect = a[fields] + actual = 
z.get_basic_selection(Ellipsis, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[fields[0], fields[1]] + assert_array_equal(expect, actual) + if isinstance(fields, str): + actual = z[..., fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[..., fields[0], fields[1]] + assert_array_equal(expect, actual) - # orthogonal selection - ix = [0, 2] - expect = a[ix][fields] - actual = z.get_orthogonal_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative APIs - actual = z[ix, fields] - assert_array_equal(expect, actual) - actual = z.oindex[ix, fields] - assert_array_equal(expect, actual) + # basic selection with slice + expect = a[fields][0:2] + actual = z.get_basic_selection(slice(0, 2), fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[0:2, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[0:2, fields[0], fields[1]] + assert_array_equal(expect, actual) - # coordinate selection - ix = [0, 2] - expect = a[ix][fields] - actual = z.get_coordinate_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative APIs - actual = z[ix, fields] - assert_array_equal(expect, actual) - actual = z.vindex[ix, fields] - assert_array_equal(expect, actual) + # basic selection with single item + expect = a[fields][1] + actual = z.get_basic_selection(1, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[1, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[1, fields[0], fields[1]] + assert_array_equal(expect, actual) - # mask selection - ix = [True, False, True] - expect = a[ix][fields] - actual = z.get_mask_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # 
alternative APIs - actual = z[ix, fields] - assert_array_equal(expect, actual) - actual = z.vindex[ix, fields] - assert_array_equal(expect, actual) + # orthogonal selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_orthogonal_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.oindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.oindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) - # multiple field - fields = ['foo', 'bar'] + # coordinate selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_coordinate_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) - # no selection - expect = a[fields] - actual = z.get_basic_selection(fields=fields) - assert_array_equal(expect, actual) - # alternative API - actual = z[fields] - assert_array_equal(expect, actual) - actual = z[tuple(fields)] - assert_array_equal(expect, actual) + # mask selection + ix = [True, False, True] + expect = a[fields][ix] + actual = z.get_mask_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) - # basic selection with slice - expect = a[0:2][fields] - actual = z.get_basic_selection(slice(0, 2), fields=fields) - assert_array_equal(expect, actual) - # alternative API - actual = z[0:2, fields[0], fields[1]] - assert_array_equal(expect, actual) - # orthogonal selection - ix = [0, 2] - expect = a[ix][fields] - actual = z.get_orthogonal_selection(ix, fields=fields) - 
assert_array_equal(expect, actual) - # alternative APIs - actual = z[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) - actual = z.oindex[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) +def test_set_selections_with_fields(): - # coordinate selection - ix = [0, 2] - expect = a[ix][fields] - actual = z.get_coordinate_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative APIs - actual = z[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) - actual = z.vindex[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) + v = [('aaa', 1, 4.2), + ('bbb', 2, 8.4), + ('ccc', 3, 12.6)] + v = np.array(v, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + a = np.empty_like(v) + z = zarr.empty_like(v, chunks=2) + + fields_fixture = [ + 'foo', + # ['foo'], + # ['foo', 'bar'], + # ['foo', 'baz'], + # ['bar', 'baz'], + # ['foo', 'bar', 'baz'], + # ['bar', 'foo'], + # ['baz', 'bar', 'foo'], + ] - # mask selection - ix = [True, False, True] - expect = a[ix][fields] - actual = z.get_mask_selection(ix, fields=fields) - assert_array_equal(expect, actual) - # alternative APIs - actual = z[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) - actual = z.vindex[ix, fields[0], fields[1]] - assert_array_equal(expect, actual) + for fields in fields_fixture: + + # currently multi-field assignment is not supported in numpy, so we won't support it either + if isinstance(fields, list): + with assert_raises(ValueError): + z.set_basic_selection(Ellipsis, v[fields], fields=fields) + with assert_raises(ValueError): + z.set_orthogonal_selection([0, 2], v[fields], fields=fields) + with assert_raises(ValueError): + z.set_coordinate_selection([0, 2], v[fields], fields=fields) + with assert_raises(ValueError): + z.set_mask_selection([True, False, True], v[fields], fields=fields) + + else: + + # total selection + a[:] = ('', 0, 0) + z[:] = ('', 0, 0) + assert_array_equal(a, z[:]) + if isinstance(fields, str): + 
a[fields] = v[fields] + else: + for f in fields: + a[f] = v[f] + z.set_basic_selection(Ellipsis, v[fields], fields=fields) + assert_array_equal(a, z[:]) + + # basic selection with slice + a[:] = ('', 0, 0) + z[:] = ('', 0, 0) + a[0:2][fields] = v[0:2][fields] + z.set_basic_selection(slice(0, 2), v[0:2][fields], fields=fields) + assert_array_equal(a, z[:]) + + # orthogonal selection + a[:] = ('', 0, 0) + z[:] = ('', 0, 0) + ix = [0, 2] + a[fields][ix] = v[fields][ix] + z.set_orthogonal_selection(ix, v[fields][ix], fields=fields) + assert_array_equal(a, z[:]) + + # coordinate selection + a[:] = ('', 0, 0) + z[:] = ('', 0, 0) + ix = [0, 2] + a[fields][ix] = v[fields][ix] + z.set_coordinate_selection(ix, v[fields][ix], fields=fields) + assert_array_equal(a, z[:]) + + # mask selection + a[:] = ('', 0, 0) + z[:] = ('', 0, 0) + ix = [True, False, True] + a[fields][ix] = v[fields][ix] + z.set_mask_selection(ix, v[fields][ix], fields=fields) + assert_array_equal(a, z[:]) From e0aeb9b5c8444e40b313a1c0882b91cf0a78f38c Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 7 Nov 2017 10:01:03 +0000 Subject: [PATCH 38/67] nd coordinate indexing --- zarr/core.py | 11 ++++++++- zarr/indexing.py | 37 +++++++++++++++++------------ zarr/tests/test_indexing.py | 47 ++++++++++++++++++++++++++++++------- 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index fb9fa3cad5..8b8428a85a 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -555,7 +555,16 @@ def get_coordinate_selection(self, selection, out=None, fields=None): # setup indexer indexer = CoordinateIndexer(selection, self) - return self._get_selection(indexer=indexer, out=out, fields=fields) + # handle output - need to flatten + if out is not None: + out = out.reshape(-1) + + out = self._get_selection(indexer=indexer, out=out, fields=fields) + + # restore shape + out = out.reshape(indexer.sel_shape) + + return out def get_mask_selection(self, selection, out=None, fields=None): """TODO""" 
diff --git a/zarr/indexing.py b/zarr/indexing.py index 2c342bd42c..b39a3cfe78 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -529,21 +529,21 @@ def __setitem__(self, selection, value): return self.array.set_orthogonal_selection(selection, value, fields=fields) +# noinspection PyProtectedMember def is_coordinate_selection(selection, array): return ( (len(selection) == len(array._shape)) and - all( - [is_integer(dim_sel) or is_integer_array(dim_sel) - for dim_sel in selection] - ) + all([is_integer(dim_sel) or is_integer_array(dim_sel) + for dim_sel in selection]) ) +# noinspection PyProtectedMember def is_mask_selection(selection, array): return ( len(selection) == 1 and is_bool_array(selection[0]) and - selection[0].shape == array.shape + selection[0].shape == array._shape ) @@ -569,16 +569,9 @@ def __init__(self, selection, array): # TODO refactor error messages for consistency raise IndexError('invalid coordinate selection') - # attempt to broadcast selection - this will raise error if array dimensions don't match - selection = np.broadcast_arrays(*selection) - - # normalization + # handle wraparound, boundscheck for dim_sel, dim_len in zip(selection, array.shape): - # check number of dimensions, only support indexing with 1d array - if len(dim_sel.shape) > 1: - raise IndexError('selection must be 1-dimensional integer array') - # handle wraparound loc_neg = dim_sel < 0 if np.any(loc_neg): @@ -589,11 +582,24 @@ def __init__(self, selection, array): if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): raise IndexError('index out of bounds') - # compute flattened chunk index for each point in the selection + # compute chunk index for each point in the selection chunks_multi_index = tuple( dim_sel // dim_chunk_len for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) ) + + # broadcast selection - this will raise error if array dimensions don't match + selection = np.broadcast_arrays(*selection) + chunks_multi_index = 
np.broadcast_arrays(*chunks_multi_index) + + # remember shape of selection, because we will flatten indices for processing + self.sel_shape = selection[0].shape if selection[0].shape else (1,) + + # flatten selection + selection = [dim_sel.reshape(-1) for dim_sel in selection] + chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] + + # ravel chunk indices chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) @@ -612,9 +618,10 @@ def __init__(self, selection, array): # precompute number of selected items for each chunk self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + # locate the chunks we need to process self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] - # unravel + # unravel chunk indices self.chunk_mixs = np.unravel_index(self.chunk_rixs, dims=array._cdata_shape) def __iter__(self): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index e88fb08742..95b281cfb3 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -537,18 +537,32 @@ def test_get_coordinate_selection_1d(): expect = a[ix] actual = z.get_coordinate_selection(ix) assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) # test wraparound ix = [0, 3, 10, -23, -12, -1] expect = a[ix] actual = z.get_coordinate_selection(ix) assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) # test out of order ix = [3, 105, 23, 127] # not monotonically increasing expect = a[ix] actual = z.get_coordinate_selection(ix) assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) + + # test multi-dimensional selection + ix = np.array([[2, 4], [6, 8]]) + expect = a[ix] + actual = z.get_coordinate_selection(ix) + assert_array_equal(expect, actual) + actual = z.vindex[ix] + assert_array_equal(expect, actual) # 
test errors with assert_raises(IndexError): @@ -558,13 +572,10 @@ def test_get_coordinate_selection_1d(): ix = [-(a.shape[0] + 1)] # out of bounds z.get_coordinate_selection(ix) with assert_raises(IndexError): - ix = [[2, 4], [6, 8]] # too many dimensions - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = slice(5, 15) + ix = slice(5, 15) # not supported z.get_coordinate_selection(ix) with assert_raises(IndexError): - ix = Ellipsis + ix = Ellipsis # not supported z.get_coordinate_selection(ix) @@ -622,6 +633,8 @@ def test_get_coordinate_selection_2d(): expect = a[ix0, ix1] actual = z.get_coordinate_selection((ix0, ix1)) assert_array_equal(expect, actual) + actual = z.vindex[ix0, ix1] + assert_array_equal(expect, actual) # not monotonically increasing (second dim) ix0 = [1, 1, 2, 2, 5] @@ -629,6 +642,19 @@ def test_get_coordinate_selection_2d(): expect = a[ix0, ix1] actual = z.get_coordinate_selection((ix0, ix1)) assert_array_equal(expect, actual) + actual = z.vindex[ix0, ix1] + assert_array_equal(expect, actual) + + # multi-dimensional selection + ix0 = np.array([[1, 1, 2], + [2, 2, 5]]) + ix1 = np.array([[1, 3, 2], + [1, 0, 0]]) + expect = a[ix0, ix1] + actual = z.get_coordinate_selection((ix0, ix1)) + assert_array_equal(expect, actual) + actual = z.vindex[ix0, ix1] + assert_array_equal(expect, actual) with assert_raises(IndexError): selection = slice(5, 15), [1, 2, 3] @@ -639,6 +665,9 @@ def test_get_coordinate_selection_2d(): with assert_raises(IndexError): selection = Ellipsis, [1, 2, 3] z.get_coordinate_selection(selection) + with assert_raises(IndexError): + selection = Ellipsis + z.get_coordinate_selection(selection) def test_set_coordinate_selection_1d_int(): @@ -657,10 +686,10 @@ def test_set_coordinate_selection_1d_int(): a[:] = 0 a[ix] = v[ix] z[:] = 0 - z.vindex[ix] = v[ix] + z.set_coordinate_selection(ix, v[ix]) assert_array_equal(a, z[:]) z[:] = 0 - z.set_coordinate_selection(ix, v[ix]) + z.vindex[ix] = v[ix] 
assert_array_equal(a, z[:]) @@ -690,10 +719,10 @@ def test_set_coordinate_selection_2d_int(): a[:] = 0 a[selection] = v[selection] z[:] = 0 - z.vindex[selection] = v[selection] + z.set_coordinate_selection(selection, v[selection]) assert_array_equal(a, z[:]) z[:] = 0 - z.set_coordinate_selection(selection, v[selection]) + z.vindex[selection] = v[selection] assert_array_equal(a, z[:]) From b4b272eeb2397ad8eeba17ed03824ea015df9c14 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 7 Nov 2017 11:32:42 +0000 Subject: [PATCH 39/67] tidy up, improve tests --- notebooks/advanced_indexing.ipynb | 2 +- requirements.txt | 1 + zarr/core.py | 2 +- zarr/indexing.py | 121 +++++++++++-------- zarr/tests/test_core.py | 5 + zarr/tests/test_indexing.py | 193 +++++++++++++++--------------- 6 files changed, 177 insertions(+), 147 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index 2005b40d8f..8f9457dca7 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -15,7 +15,7 @@ { "data": { "text/plain": [ - "'2.1.5.dev83'" + "'2.1.5.dev113'" ] }, "execution_count": 1, diff --git a/requirements.txt b/requirements.txt index 8427764e04..28a7ceece4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +nose numpy fasteners numcodecs diff --git a/zarr/core.py b/zarr/core.py index 8b8428a85a..c796fb3902 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -328,7 +328,7 @@ def nbytes_stored(self): @property def _cdata_shape(self): if self._shape == (): - return (1,) + return 1, else: return tuple(int(np.ceil(s / c)) for s, c in zip(self._shape, self._chunks)) diff --git a/zarr/indexing.py b/zarr/indexing.py index b39a3cfe78..97947c6423 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -77,32 +77,32 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def normalize_slice_selection(dim_sel, dim_len): - - # handle slice with None bound - start = 0 if 
dim_sel.start is None else dim_sel.start - stop = dim_len if dim_sel.stop is None else dim_sel.stop - step = 1 if dim_sel.step is None else dim_sel.step - - # handle wraparound - if start < 0: - start = dim_len + start - if stop < 0: - stop = dim_len + stop - - # handle out of bounds - if start < 0: - raise IndexError('start index out of bounds: %s' % dim_sel.start) - if stop < 0: - raise IndexError('stop index out of bounds: %s' % dim_sel.stop) - if start >= dim_len and dim_len > 0: - raise IndexError('start index out of bounds: %ss' % dim_sel.start) - if stop > dim_len: - stop = dim_len - if stop < start: - stop = start - - return slice(start, stop, step) +# def normalize_slice_selection(dim_sel, dim_len): +# +# # handle slice with None bound +# start = 0 if dim_sel.start is None else dim_sel.start +# stop = dim_len if dim_sel.stop is None else dim_sel.stop +# step = 1 if dim_sel.step is None else dim_sel.step +# +# # handle wraparound +# if start < 0: +# start = dim_len + start +# if stop < 0: +# stop = dim_len + stop +# +# # handle out of bounds +# if start < 0: +# raise IndexError('start index out of bounds: %s' % dim_sel.start) +# if stop < 0: +# raise IndexError('stop index out of bounds: %s' % dim_sel.stop) +# if start >= dim_len and dim_len > 0: +# raise IndexError('start index out of bounds: %ss' % dim_sel.start) +# if stop > dim_len: +# stop = dim_len +# if stop < start: +# stop = start +# +# return slice(start, stop, step) class SliceDimIndexer(object): @@ -114,40 +114,39 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): raise ValueError('selection must be a contiguous slice') # normalize - dim_sel = normalize_slice_selection(dim_sel, dim_len) + self.start, self.stop, _ = dim_sel.indices(dim_len) # store attributes - self.dim_sel = dim_sel self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len - self.nitems = dim_sel.stop - dim_sel.start + self.nitems = self.stop - self.start def __iter__(self): - dim_chunk_from = self.dim_sel.start // 
self.dim_chunk_len - dim_chunk_to = int(np.ceil(self.dim_sel.stop / self.dim_chunk_len)) + dim_chunk_from = self.start // self.dim_chunk_len + dim_chunk_to = int(np.ceil(self.stop / self.dim_chunk_len)) for dim_chunk_ix in range(dim_chunk_from, dim_chunk_to): dim_offset = dim_chunk_ix * self.dim_chunk_len - if self.dim_sel.start <= dim_offset: + if self.start <= dim_offset: # selection starts before current chunk dim_chunk_sel_start = 0 - dim_out_offset = dim_offset - self.dim_sel.start + dim_out_offset = dim_offset - self.start else: # selection starts within current chunk - dim_chunk_sel_start = self.dim_sel.start - dim_offset + dim_chunk_sel_start = self.start - dim_offset dim_out_offset = 0 - if self.dim_sel.stop > (dim_offset + self.dim_chunk_len): + if self.stop > (dim_offset + self.dim_chunk_len): # selection ends after current chunk dim_chunk_sel_stop = self.dim_chunk_len else: # selection ends within current chunk - dim_chunk_sel_stop = self.dim_sel.stop - dim_offset + dim_chunk_sel_stop = self.stop - dim_offset dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start @@ -266,7 +265,8 @@ def __iter__(self): chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) + out_selection = tuple(p.dim_out_sel for p in dim_projections + if p.dim_out_sel is not None) yield ChunkProjection(chunk_coords, chunk_selection, out_selection) @@ -398,19 +398,19 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def slice_to_range(s): - return range(s.start, s.stop, 1 if s.step is None else s.step) +def slice_to_range(s, l): + return range(*s.indices(l)) -def ix_(*selection): +def ix_(selection, shape): """Convert an orthogonal selection to a numpy advanced (fancy) selection, with support for slices and single 
ints.""" # replace slice and int as these are not supported by numpy ix_() - selection = [slice_to_range(dim_sel) if isinstance(dim_sel, slice) + selection = [slice_to_range(dim_sel, dim_len) if isinstance(dim_sel, slice) else [dim_sel] if is_integer(dim_sel) else dim_sel - for dim_sel in selection] + for dim_sel, dim_len in zip(selection, shape)] selection = np.ix_(*selection) @@ -420,13 +420,25 @@ def ix_(*selection): def oindex(a, selection): """Implementation of orthogonal indexing with slices and ints.""" drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) - selection = ix_(*selection) + selection = ix_(selection, a.shape) result = a[selection] if drop_axes: result = result.squeeze(axis=drop_axes) return result +def oindex_set(a, selection, value): + drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) + selection = ix_(selection, a.shape) + if drop_axes: + value_selection = [slice(None)] * len(a.shape) + for i in drop_axes: + value_selection[i] = np.newaxis + value = value[value_selection] + a[selection] = value + + +# noinspection PyProtectedMember class OrthogonalIndexer(object): def __init__(self, selection, array): @@ -457,11 +469,12 @@ def __init__(self, selection, array): elif isinstance(dim_sel, slice): # normalize so we can check for step - dim_sel = normalize_slice_selection(dim_sel, dim_len) + start, stop, strides = dim_sel.indices(dim_len) + # dim_sel = normalize_slice_selection(dim_sel, dim_len) # handle slice with step - if dim_sel.step != 1: - dim_sel = np.arange(dim_sel.start, dim_sel.stop, dim_sel.step) + if strides != 1: + dim_sel = np.arange(start, stop, strides) dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) else: dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -479,6 +492,7 @@ def __init__(self, selection, array): dim_indexers.append(dim_indexer) + self.array = array self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers if not 
isinstance(s, IntDimIndexer)) @@ -486,7 +500,7 @@ def __init__(self, selection, array): for dim_indexer in self.dim_indexers]) if self.is_advanced: self.drop_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) - if isinstance(dim_indexer, IntDimIndexer)]) + if isinstance(dim_indexer, IntDimIndexer)]) else: self.drop_axes = None @@ -495,18 +509,20 @@ def __iter__(self): chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple(p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None) + out_selection = tuple(p.dim_out_sel for p in dim_projections + if p.dim_out_sel is not None) # handle advanced indexing arrays orthogonally if self.is_advanced: + # numpy doesn't support orthogonal indexing directly as yet, so need to work # around via np.ix_. Also np.ix_ does not support a mixture of arrays and slices # or integers, so need to convert slices and integers into ranges. - chunk_selection = ix_(*chunk_selection) + chunk_selection = ix_(chunk_selection, self.array._chunks) # special case for non-monotonic indices - if any([not isinstance(s, (int, slice)) for s in out_selection]): - out_selection = ix_(*out_selection) + if any([not isinstance(s, (numbers.Integral, slice)) for s in out_selection]): + out_selection = ix_(out_selection, self.shape) yield ChunkProjection(chunk_coords, chunk_selection, out_selection) @@ -720,12 +736,15 @@ def check_fields(fields, dtype): def pop_fields(selection): if isinstance(selection, str): + # single field selection fields = selection selection = () elif not isinstance(selection, tuple): + # single selection item, no fields fields = None # leave selection as-is else: + # multiple items, split fields from selection items fields = [f for f in selection if isinstance(f, str)] fields = fields[0] if len(fields) == 1 else fields selection = tuple(s for s in selection if not isinstance(s, str)) diff --git 
a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 737830e04d..61965f42d7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -82,6 +82,7 @@ def test_nbytes_stored(self): except TypeError: pass + # noinspection PyStatementEffect def test_array_1d(self): a = np.arange(1050) z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) @@ -193,6 +194,7 @@ def test_array_1d_set_scalar(self): z[:] = value assert_array_equal(a, z[:]) + # noinspection PyStatementEffect def test_array_2d(self): a = np.arange(10000).reshape((1000, 10)) z = self.create_array(shape=a.shape, chunks=(100, 2), dtype=a.dtype) @@ -599,6 +601,7 @@ def test_np_ufuncs(self): assert_array_equal(np.take(a, indices, axis=1), np.take(a, zi, axis=1)) + # noinspection PyStatementEffect def test_0len_dim_1d(self): # Test behaviour for 1D array with zero-length dimension. @@ -631,6 +634,7 @@ def test_0len_dim_1d(self): with assert_raises(IndexError): z[0] = 42 + # noinspection PyStatementEffect def test_0len_dim_2d(self): # Test behavioud for 2D array with a zero-length dimension. 
@@ -667,6 +671,7 @@ def test_0len_dim_2d(self): with assert_raises(IndexError): z[:, 0] = 42 + # noinspection PyStatementEffect def test_array_0d(self): # test behaviour for array with 0 dimensions diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 95b281cfb3..32d48b331d 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -7,8 +7,7 @@ from nose.tools import assert_raises, eq_ as eq -from zarr.indexing import normalize_integer_selection, normalize_slice_selection, \ - replace_ellipsis, ix_, oindex +from zarr.indexing import normalize_integer_selection, replace_ellipsis, oindex, oindex_set import zarr @@ -24,21 +23,21 @@ def test_normalize_integer_selection(): normalize_integer_selection(-1000, 100) -def test_normalize_slice_selection(): - - eq(slice(0, 100, 1), normalize_slice_selection(slice(None), 100)) - eq(slice(0, 100, 1), normalize_slice_selection(slice(None, 100), 100)) - eq(slice(0, 100, 1), normalize_slice_selection(slice(0, None), 100)) - eq(slice(0, 100, 1), normalize_slice_selection(slice(0, 1000), 100)) - eq(slice(99, 100, 1), normalize_slice_selection(slice(-1, None), 100)) - eq(slice(98, 99, 1), normalize_slice_selection(slice(-2, -1), 100)) - eq(slice(10, 10, 1), normalize_slice_selection(slice(10, 0), 100)) - with assert_raises(IndexError): - normalize_slice_selection(slice(100, None), 100) - with assert_raises(IndexError): - normalize_slice_selection(slice(1000, 2000), 100) - with assert_raises(IndexError): - normalize_slice_selection(slice(-1000, 0), 100) +# def test_normalize_slice_selection(): +# +# eq(slice(0, 100, 1), normalize_slice_selection(slice(None), 100)) +# eq(slice(0, 100, 1), normalize_slice_selection(slice(None, 100), 100)) +# eq(slice(0, 100, 1), normalize_slice_selection(slice(0, None), 100)) +# eq(slice(0, 100, 1), normalize_slice_selection(slice(0, 1000), 100)) +# eq(slice(99, 100, 1), normalize_slice_selection(slice(-1, None), 100)) +# eq(slice(98, 99, 1), 
normalize_slice_selection(slice(-2, -1), 100)) +# eq(slice(10, 10, 1), normalize_slice_selection(slice(10, 0), 100)) +# with assert_raises(IndexError): +# normalize_slice_selection(slice(100, None), 100) +# with assert_raises(IndexError): +# normalize_slice_selection(slice(1000, 2000), 100) +# with assert_raises(IndexError): +# normalize_slice_selection(slice(-1000, 0), 100) def test_replace_ellipsis(): @@ -132,17 +131,22 @@ def test_get_orthogonal_selection_1d_int(): ix.sort() _test_get_orthogonal_selection_1d_common(a, z, ix) - # test wraparound - ix = [0, 3, 10, -23, -12, -1] - expect = a[ix] - actual = z.oindex[ix] - assert_array_equal(expect, actual) + selections = [ + # test single value + 0, + -1, + # test wraparound + [0, 3, 10, -23, -12, -1], + # explicit test not sorted + [3, 105, 23, 127], # not monotonically increasing - # explicit test not sorted - ix = [3, 105, 23, 127] # not monotonically increasing - expect = a[ix] - actual = z.oindex[ix] - assert_array_equal(expect, actual) + ] + for selection in selections: + expect = a[selection] + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) # test errors with assert_raises(IndexError): @@ -169,9 +173,14 @@ def test_get_orthogonal_selection_1d_slice_with_step(): slice(0, 1050, 10), slice(0, 1050, 100), slice(0, 1050, 1000), + slice(1050, 0, -1), + slice(1050, 0, -10), + slice(50, 150), slice(50, 150, 1), slice(50, 150, 10), - slice(50, 150, 100), + slice(150, 50, -1), + slice(150, 50, -10), + slice(-1, 0, -1), ] for selection in selections: expect = a[selection] @@ -187,6 +196,9 @@ def test_get_orthogonal_selection_1d_slice_with_step(): def _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1): selections = [ + # single value + (42, 4), + (-1, -1), # index both axes with array (ix0, ix1), # mixed indexing with array / slice @@ -255,6 +267,9 @@ def test_get_orthogonal_selection_2d_int(): def 
_test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2): selections = [ + # single value + (84, 42, 4), + (-1, -1, -1), # index all axes with array (ix0, ix1, ix2), # mixed indexing with single array / slices @@ -311,11 +326,11 @@ def test_orthogonal_indexing_edge_cases(): z = zarr.create(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) z[:] = a - expect = a[ix_([0], range(2), [0, 1, 2])].squeeze(axis=0) + expect = oindex(a, (0, slice(None), [0, 1, 2])) actual = z.oindex[0, :, [0, 1, 2]] assert_array_equal(expect, actual) - expect = a[ix_([0], range(2), [True, True, True])].squeeze(axis=0) + expect = oindex(a, (0, slice(None), [True, True, True])) actual = z.oindex[0, :, [True, True, True]] assert_array_equal(expect, actual) @@ -341,13 +356,16 @@ def test_get_orthogonal_selection_3d_int(): def _test_set_orthogonal_selection_1d_common(v, a, z, ix): + # setup expectation a[:] = 0 a[ix] = v[ix] + # long-form API z[:] = 0 - z.oindex[ix] = v[ix] + z.set_orthogonal_selection(ix, v[ix]) assert_array_equal(a, z[:]) + # short-form API z[:] = 0 - z.set_orthogonal_selection(ix, v[ix]) + z.oindex[ix] = v[ix] assert_array_equal(a, z[:]) # # also available via __setitem__ for 1d arrays # z[:] = 0 @@ -388,6 +406,9 @@ def test_set_orthogonal_selection_1d_int(): def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): selections = ( + # single value + (42, 4), + (-1, -1), # index both axes with array (ix0, ix1), # mixed indexing with array / slice or int @@ -397,13 +418,17 @@ def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): (42, ix1), ) for selection in selections: + # setup expectation a[:] = 0 - a[ix_(*selection)] = v[ix_(*selection)] + value = oindex(v, selection) + oindex_set(a, selection, value) + # long-form API z[:] = 0 - z.oindex[selection] = oindex(v, selection) + z.set_orthogonal_selection(selection, value) assert_array_equal(a, z[:]) + # short-form API z[:] = 0 - z.set_orthogonal_selection(selection, oindex(v, selection)) + 
z.oindex[selection] = value assert_array_equal(a, z[:]) @@ -443,6 +468,9 @@ def test_set_orthogonal_selection_2d_int(): def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): selections = ( + # single value + (84, 42, 4), + (-1, -1, -1), # index all axes with bool array (ix0, ix1, ix2), # mixed indexing with single bool array / slice or int @@ -461,13 +489,17 @@ def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): (ix0, ix1, 4), ) for selection in selections: + # setup expectation a[:] = 0 - a[ix_(*selection)] = v[ix_(*selection)] + value = oindex(v, selection) + oindex_set(a, selection, value) + # long-form API z[:] = 0 - z.oindex[selection] = oindex(v, selection) + z.set_orthogonal_selection(selection, value) assert_array_equal(a, z[:]) + # short-form API z[:] = 0 - z.set_orthogonal_selection(selection, oindex(v, selection)) + z.oindex[selection] = value assert_array_equal(a, z[:]) @@ -532,37 +564,25 @@ def test_get_coordinate_selection_1d(): actual = z.vindex[ix] assert_array_equal(expect, actual) - # test single item - ix = 42 - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) - - # test wraparound - ix = [0, 3, 10, -23, -12, -1] - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) - - # test out of order - ix = [3, 105, 23, 127] # not monotonically increasing - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) - - # test multi-dimensional selection - ix = np.array([[2, 4], [6, 8]]) - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) + selections = [ + # test single item + 42, + -1, + # test wraparound + [0, 3, 10, 
-23, -12, -1], + # test out of order + [3, 105, 23, 127], # not monotonically increasing + # test multi-dimensional selection + np.array([[2, 4], [6, 8]]), + ] + for selection in selections: + expect = a[selection] + # long-form API + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + # short-form API + actual = z.vindex[selection] + assert_array_equal(expect, actual) # test errors with assert_raises(IndexError): @@ -593,25 +613,9 @@ def test_get_coordinate_selection_2d(): ix0 = np.random.choice(a.shape[0], size=n, replace=True) ix1 = np.random.choice(a.shape[1], size=n, replace=True) selections = [ - # index both axes with array - (ix0, ix1), - # mixed indexing with array / int - (ix0, 4), - (42, ix1), + # single value (42, 4), - ] - - for selection in selections: - expect = a[selection] - actual = z.get_coordinate_selection(selection) - assert_array_equal(expect, actual) - actual = z.vindex[selection] - assert_array_equal(expect, actual) - - srt = np.lexsort((ix0, ix1)) - ix0 = ix0[srt] - ix1 = ix1[srt] - selections = [ + (-1, -1), # index both axes with array (ix0, ix1), # mixed indexing with array / int @@ -622,8 +626,10 @@ def test_get_coordinate_selection_2d(): for selection in selections: expect = a[selection] + # long-form API actual = z.get_coordinate_selection(selection) assert_array_equal(expect, actual) + # short-form API actual = z.vindex[selection] assert_array_equal(expect, actual) @@ -708,6 +714,8 @@ def test_set_coordinate_selection_2d_int(): ix1 = np.random.choice(a.shape[1], size=n, replace=True) selections = ( + (42, 4), + (-1, -1), # index both axes with array (ix0, ix1), # mixed indexing with array / int @@ -1042,22 +1050,19 @@ def test_set_selections_with_fields(): else: - # total selection + # setup expectation a[:] = ('', 0, 0) z[:] = ('', 0, 0) assert_array_equal(a, z[:]) - if isinstance(fields, str): - a[fields] = v[fields] - else: - for f in fields: - a[f] = v[f] + a[fields] = v[fields] + # total 
selection z.set_basic_selection(Ellipsis, v[fields], fields=fields) assert_array_equal(a, z[:]) # basic selection with slice a[:] = ('', 0, 0) z[:] = ('', 0, 0) - a[0:2][fields] = v[0:2][fields] + a[fields][0:2] = v[fields][0:2] z.set_basic_selection(slice(0, 2), v[0:2][fields], fields=fields) assert_array_equal(a, z[:]) From c9304a01e722128cb8864623312ed1e711ddab52 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 7 Nov 2017 12:13:46 +0000 Subject: [PATCH 40/67] added basic selection tests --- zarr/indexing.py | 30 +-------- zarr/tests/test_indexing.py | 129 +++++++++++++++++++++++++++++++----- 2 files changed, 113 insertions(+), 46 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 97947c6423..85d40743b4 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -77,34 +77,6 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -# def normalize_slice_selection(dim_sel, dim_len): -# -# # handle slice with None bound -# start = 0 if dim_sel.start is None else dim_sel.start -# stop = dim_len if dim_sel.stop is None else dim_sel.stop -# step = 1 if dim_sel.step is None else dim_sel.step -# -# # handle wraparound -# if start < 0: -# start = dim_len + start -# if stop < 0: -# stop = dim_len + stop -# -# # handle out of bounds -# if start < 0: -# raise IndexError('start index out of bounds: %s' % dim_sel.start) -# if stop < 0: -# raise IndexError('stop index out of bounds: %s' % dim_sel.stop) -# if start >= dim_len and dim_len > 0: -# raise IndexError('start index out of bounds: %ss' % dim_sel.start) -# if stop > dim_len: -# stop = dim_len -# if stop < start: -# stop = start -# -# return slice(start, stop, step) - - class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -119,7 +91,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # store attributes self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len - self.nitems = self.stop - self.start + self.nitems = max(0, 
self.stop - self.start) def __iter__(self): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 32d48b331d..56a94ebb1d 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -23,23 +23,6 @@ def test_normalize_integer_selection(): normalize_integer_selection(-1000, 100) -# def test_normalize_slice_selection(): -# -# eq(slice(0, 100, 1), normalize_slice_selection(slice(None), 100)) -# eq(slice(0, 100, 1), normalize_slice_selection(slice(None, 100), 100)) -# eq(slice(0, 100, 1), normalize_slice_selection(slice(0, None), 100)) -# eq(slice(0, 100, 1), normalize_slice_selection(slice(0, 1000), 100)) -# eq(slice(99, 100, 1), normalize_slice_selection(slice(-1, None), 100)) -# eq(slice(98, 99, 1), normalize_slice_selection(slice(-2, -1), 100)) -# eq(slice(10, 10, 1), normalize_slice_selection(slice(10, 0), 100)) -# with assert_raises(IndexError): -# normalize_slice_selection(slice(100, None), 100) -# with assert_raises(IndexError): -# normalize_slice_selection(slice(1000, 2000), 100) -# with assert_raises(IndexError): -# normalize_slice_selection(slice(-1000, 0), 100) - - def test_replace_ellipsis(): # 1D, single item @@ -81,6 +64,118 @@ def test_replace_ellipsis(): replace_ellipsis((slice(None), slice(None), Ellipsis), (100, 100))) +# noinspection PyStatementEffect +def test_get_basic_selection_1d(): + + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + selections = [ + # single value + 42, + -1, + # slices + slice(None), + slice(0, 1050), + slice(50, 150), + slice(0, 2000), + slice(-150, -50), + slice(-2000, 2000), + slice(0, 0), # empty result + slice(-1, 0), # empty result + # total selections + Ellipsis, + (), + (Ellipsis, slice(None)), + ] + + for selection in selections: + expect = a[selection] + # long-form API + actual = z.get_basic_selection(selection) + assert_array_equal(expect, actual) + # basic selection available via __getitem__ + actual = 
z[selection] + assert_array_equal(expect, actual) + + with assert_raises(IndexError): + z[::2] # slice with step + with assert_raises(IndexError): + z[::-1] # slice with step + with assert_raises(IndexError): + z[[0, 1]] # fancy indexing + with assert_raises(IndexError): + z[0, 0] # too many indices + with assert_raises(IndexError): + z[:, :] # too many indices + + +# noinspection PyStatementEffect +def test_get_basic_selection_2d(): + + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + selections = [ + # single row + 42, + -1, + (42, slice(None)), + (-1, slice(None)), + # single col + (slice(None), 4), + (slice(None), -1), + # row slices + slice(None), + slice(0, 1000), + slice(250, 350), + slice(0, 2000), + slice(-350, -250), + slice(0, 0), # empty result + slice(-1, 0), # empty result + slice(-2000, 0), + slice(-2000, 2000), + # 2D slices + (slice(None), slice(1, 5)), + (slice(250, 350), slice(None)), + (slice(250, 350), slice(1, 5)), + (slice(250, 350), slice(-5, -1)), + (slice(250, 350), slice(-50, 50)), + # total selections + (slice(None), slice(None)), + Ellipsis, + (), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), slice(None)), + ] + + for selection in selections: + expect = a[selection] + # long-form API + actual = z.get_basic_selection(selection) + assert_array_equal(expect, actual) + # basic selection available via __getitem__ + actual = z[selection] + assert_array_equal(expect, actual) + + with assert_raises(IndexError): + z[::2] # slice with step + with assert_raises(IndexError): + z[:, ::2] # slice with step + with assert_raises(IndexError): + z[[0, 1]] # fancy indexing + with assert_raises(IndexError): + z[:, [0, 1]] # fancy indexing + with assert_raises(IndexError): + z[0, 0, 0] # too many indices + with assert_raises(IndexError): + z[:, :, :] # too many indices + + def _test_get_orthogonal_selection_1d_common(a, z, ix): expect = a[ix] actual = 
z.get_orthogonal_selection(ix) From 3b0966a86557cbe45682bf7bfe214e3c1655a1f8 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 7 Nov 2017 13:21:00 +0000 Subject: [PATCH 41/67] benchmarking, optimisation --- notebooks/advanced_indexing.ipynb | 2345 ++++++++++++++++++++--------- zarr/core.py | 6 +- zarr/indexing.py | 38 +- zarr/tests/test_indexing.py | 109 +- 4 files changed, 1722 insertions(+), 776 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index 8f9457dca7..4dbbbd50ee 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -44,14 +44,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Indexing a 1D array with a Boolean array\n", + "### Indexing a 1D array with a Boolean (mask) array\n", "\n", - "Supported via ``__getitem__`` and ``__setitem__`` just like numpy array." + "Supported via ``get/set_mask_selection()`` and ``.vindex[]``. Also supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -60,6 +60,27 @@ "ix = [False, True, False, True, False, True, False, True, False, True]" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 3, 5, 7, 9])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get items\n", + "za.vindex[ix]" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -78,7 +99,7 @@ ], "source": [ "# get items\n", - "za[ix]" + "za.oindex[ix]" ] }, { @@ -99,7 +120,7 @@ ], "source": [ "# set items\n", - "za[ix] = a[ix] * 10\n", + "za.vindex[ix] = a[ix] * 10\n", "za[:]" ] }, @@ -111,7 +132,7 @@ { "data": { "text/plain": [ - "array([1, 3, 5, 7, 9])" + "array([ 0, 100, 2, 300, 4, 500, 6, 700, 8, 900])" ] }, "execution_count": 6, @@ -120,24 +141,46 @@ } ], "source": [ - "# indexing array can be any array-like, e.g., Zarr array\n", + "# set items\n", + "za.oindex[ix] = a[ix] * 100\n", + "za[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 3, 5, 7, 9])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if using .oindex, indexing array can be any array-like, e.g., Zarr array\n", "zix = zarr.array(ix, chunks=2)\n", "za = zarr.array(a, chunks=2)\n", - "za[zix] # will not load all zix into memory" + "za.oindex[zix] # will not load all zix into memory" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Indexing a 1D array with an integer array\n", + "### Indexing a 1D array with a 1D integer (coordinate) array\n", "\n", - "Supported via ``__getitem__`` and ``__setitem__`` just like numpy array." + "Supported via ``get/set_coordinate_selection()`` and ``.vindex[]``. 
Also supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -157,19 +200,40 @@ "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", - "za[ix]" + "za.vindex[ix]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 3, 5, 7, 9])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get items\n", + "za.oindex[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -178,14 +242,100 @@ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 9, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# set items\n", + "za.vindex[ix] = a[ix] * 10\n", + "za[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 100, 2, 300, 4, 500, 6, 700, 8, 900])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# set items\n", + "za.oindex[ix] = a[ix] * 100\n", + "za[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing a 1D array with a multi-dimensional integer (coordinate) array\n", + "\n", + "Supported via ``get/set_coordinate_selection()`` and ``.vindex[]``." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "a = np.arange(10)\n", + "za = zarr.array(a, chunks=2)\n", + "ix = np.array([[1, 3, 5], [2, 4, 6]])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 3, 5],\n", + " [2, 4, 6]])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get items\n", + "za.vindex[ix]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 10, 20, 30, 40, 50, 60, 7, 8, 9])" + ] + }, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", - "za[ix] = a[ix] * 10\n", + "za.vindex[ix] = a[ix] * 10\n", "za[:]" ] }, @@ -195,12 +345,12 @@ "source": [ "### Slicing a 1D array with step > 1\n", "\n", - "Slices with step > 1 are supported. Internally these are converted to an integer array via ``np.arange``." + "Slices with step > 1 are supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``. Internally these are converted to an integer array via ``np.arange``." 
] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -210,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -219,19 +369,40 @@ "array([1, 3, 5, 7, 9])" ] }, - "execution_count": 11, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", - "za[1::2]" + "za.oindex[1::2]" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([9, 7, 5, 3, 1])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get items with negative step\n", + "za.oindex[-1::-2]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -240,14 +411,14 @@ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", - "za[1::2] = a[1::2] * 10\n", + "za.oindex[1::2] = a[1::2] * 10\n", "za[:]" ] }, @@ -257,12 +428,12 @@ "source": [ "### Orthogonal (outer) indexing of multi-dimensional arrays\n", "\n", - "Orthogonal (a.k.a. outer) indexing is supported with either Boolean or integer arrays. This functionality is provided via the ``get/set_orthogonal_selection()`` methods. For convenience, this functionality is also available via the ``oindex[]`` property as has been proposed for numpy." + "Orthogonal (a.k.a. outer) indexing is supported with either Boolean or integer arrays, in combination with integers and slices. This functionality is provided via the ``get/set_orthogonal_selection()`` methods. For convenience, this functionality is also available via the ``.oindex[]`` property." 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -275,7 +446,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 13, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -288,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -298,7 +469,7 @@ " [ 9, 11]])" ] }, - "execution_count": 14, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -312,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -322,7 +493,7 @@ " [ 9, 11]])" ] }, - "execution_count": 15, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -334,7 +505,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -344,7 +515,7 @@ " [ 9, 11]])" ] }, - "execution_count": 16, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -358,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -368,7 +539,7 @@ " [ 9, 11]])" ] }, - "execution_count": 17, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -380,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -390,7 +561,7 @@ " [ 9, 10, 11]])" ] }, - "execution_count": 18, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +573,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -415,7 +586,7 @@ " [12, 14]])" ] }, - "execution_count": 19, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -427,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -440,7 +611,7 @@ 
" [12, 13, 14]])" ] }, - "execution_count": 20, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -457,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -470,7 +641,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 21, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -483,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -496,7 +667,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 22, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -513,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -526,7 +697,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 23, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -543,12 +714,12 @@ "source": [ "### Coordinate indexing of multi-dimensional arrays\n", "\n", - "Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. This functionality is provided via the ``get/set_coordinate_selection()`` methods. For convenience, this functionality is also available via the ``vindex[]`` property as has been proposed for numpy." + "Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. This functionality is provided via the ``get/set_coordinate_selection()`` methods. For convenience, this functionality is also available via the ``.vindex[]`` property." 
] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -561,7 +732,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 24, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -574,7 +745,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -583,7 +754,7 @@ "array([ 3, 11])" ] }, - "execution_count": 25, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -597,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -606,7 +777,7 @@ "array([ 3, 11])" ] }, - "execution_count": 26, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -618,7 +789,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -631,7 +802,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 27, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -644,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -657,7 +828,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 28, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -674,12 +845,12 @@ "source": [ "### Mask indexing of multi-dimensional arrays\n", "\n", - "Selecting arbitrary points from a multi-dimensional array by a Boolean array is supported. This functionality is provided via the ``get/set_mask_selection()`` methods. For convenience, this functionality is also available via the ``vindex[]`` property as has been proposed for numpy." + "Selecting arbitrary points from a multi-dimensional array by a Boolean array is supported. This functionality is provided via the ``get/set_mask_selection()`` methods. For convenience, this functionality is also available via the ``.vindex[]`` property." 
] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -692,7 +863,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 31, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -705,7 +876,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -714,7 +885,7 @@ "array([ 3, 11])" ] }, - "execution_count": 33, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +899,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -737,7 +908,7 @@ "array([ 3, 11])" ] }, - "execution_count": 34, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -748,7 +919,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -761,7 +932,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 35, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -773,7 +944,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -786,7 +957,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 36, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -805,7 +976,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -814,7 +985,7 @@ "800000000" ] }, - "execution_count": 37, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -826,15 +997,15 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 520 ms, sys: 44 ms, total: 564 ms\n", - "Wall time: 171 ms\n" + "CPU times: user 392 ms, sys: 84 ms, total: 476 ms\n", + "Wall time: 124 ms\n" ] }, { @@ -857,7 +1028,7 @@ "Chunks initialized : 
1024/1024" ] }, - "execution_count": 38, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -869,15 +1040,15 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 116 ms, sys: 60 ms, total: 176 ms\n", - "Wall time: 177 ms\n" + "CPU times: user 88 ms, sys: 52 ms, total: 140 ms\n", + "Wall time: 143 ms\n" ] }, { @@ -886,7 +1057,7 @@ "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" ] }, - "execution_count": 39, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -897,15 +1068,15 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 492 ms, sys: 80 ms, total: 572 ms\n", - "Wall time: 282 ms\n" + "CPU times: user 408 ms, sys: 88 ms, total: 496 ms\n", + "Wall time: 213 ms\n" ] }, { @@ -914,7 +1085,7 @@ "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" ] }, - "execution_count": 40, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -932,16 +1103,16 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9995616" + "9997476" ] }, - "execution_count": 44, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -954,24 +1125,24 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 348 ms, sys: 8 ms, total: 356 ms\n", - "Wall time: 355 ms\n" + "CPU times: user 256 ms, sys: 4 ms, total: 260 ms\n", + "Wall time: 258 ms\n" ] }, { "data": { "text/plain": [ - "array([ 25, 30, 31, ..., 99999973, 99999982, 99999986])" + "array([ 1, 11, 33, ..., 99999988, 99999989, 99999990])" ] }, - "execution_count": 45, + 
"execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -982,125 +1153,156 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 908 ms, sys: 68 ms, total: 976 ms\n", - "Wall time: 474 ms\n" + "CPU times: user 864 ms, sys: 108 ms, total: 972 ms\n", + "Wall time: 439 ms\n" ] }, { "data": { "text/plain": [ - "array([ 25, 30, 31, ..., 99999973, 99999982, 99999986])" + "array([ 1, 11, 33, ..., 99999988, 99999989, 99999990])" ] }, - "execution_count": 46, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[ix_dense_bool]" + "%time zc.oindex[ix_dense_bool]" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 808 ms, sys: 32 ms, total: 840 ms\n", + "Wall time: 564 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 1, 11, 33, ..., 99999988, 99999989, 99999990])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc.vindex[ix_dense_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 58428 function calls in 0.492 seconds\n", + " 67653 function calls in 0.490 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1025 0.203 0.000 0.203 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.158 0.000 0.168 0.000 core.py:964(_decode_chunk)\n", - " 1024 0.051 0.000 0.240 0.000 core.py:802(_chunk_getitem)\n", - " 1024 0.014 0.000 0.014 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1025 0.008 0.000 0.232 0.000 indexing.py:486(__iter__)\n", - " 1024 0.006 0.000 
0.214 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.006 0.000 0.006 0.000 core.py:324()\n", + " 1025 0.201 0.000 0.201 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.158 0.000 0.167 0.000 core.py:997(_decode_chunk)\n", + " 1024 0.047 0.000 0.234 0.000 core.py:822(_chunk_getitem)\n", + " 1024 0.012 0.000 0.012 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1025 0.007 0.000 0.238 0.000 indexing.py:484(__iter__)\n", + " 1024 0.006 0.000 0.212 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.005 0.000 0.005 0.000 core.py:332()\n", " 2048 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 7180 0.004 0.000 0.009 0.000 {built-in method builtins.isinstance}\n", + " 1 0.004 0.004 0.476 0.476 core.py:576(_get_selection)\n", " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1 0.004 0.004 0.475 0.475 core.py:563(_get_selection)\n", + " 2049 0.003 0.000 0.005 0.000 abc.py:178(__instancecheck__)\n", " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.003 0.000 0.008 0.000 core.py:319(_cdata_shape)\n", - " 1024 0.002 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", - " 1025 0.002 0.000 0.003 0.000 indexing.py:296(__iter__)\n", - " 1 0.002 0.002 0.016 0.016 indexing.py:269(__init__)\n", - " 6152 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.003 0.000 0.009 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.221 0.000 indexing.py:377(ix_)\n", + " 1025 0.002 0.000 0.003 0.000 indexing.py:275(__iter__)\n", + " 1024 0.002 0.000 0.007 0.000 core.py:327(_cdata_shape)\n", + " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.002 0.000 0.217 0.000 indexing.py:398(ix_)\n", - " 1024 0.001 0.000 0.011 0.000 core.py:961(_chunk_key)\n", + " 1 0.002 0.002 0.014 0.014 indexing.py:248(__init__)\n", " 1024 0.001 0.000 0.002 0.000 
numerictypes.py:728(issubdtype)\n", + " 1024 0.001 0.000 0.011 0.000 core.py:994(_chunk_key)\n", " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:403()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", + " 1024 0.001 0.000 0.006 0.000 indexing.py:384()\n", + " 1024 0.001 0.000 0.004 0.000 indexing.py:501()\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", " 1024 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:491()\n", + " 1025 0.001 0.000 0.004 0.000 indexing.py:11(is_integer)\n", + " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 1024 0.000 0.000 0.014 0.000 numeric.py:380(count_nonzero)\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", + " 1024 0.000 0.000 0.012 0.000 numeric.py:380(count_nonzero)\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 2048 0.000 0.000 
0.000 0.000 indexing.py:488()\n", " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1 0.000 0.000 0.492 0.492 :1()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1 0.000 0.000 0.491 0.491 :1()\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.492 0.492 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.016 0.016 indexing.py:425(__init__)\n", + " 1 0.000 0.000 0.491 0.491 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.014 0.014 indexing.py:420(__init__)\n", " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.490 0.490 indexing.py:512(__getitem__)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.492 0.492 core.py:527(get_orthogonal_selection)\n", - " 1 0.000 0.000 0.492 0.492 core.py:392(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 1 0.000 0.000 0.490 0.490 core.py:531(get_orthogonal_selection)\n", " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 
indexing.py:481()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc[ix_dense_bool]', sort='time')" + "cProfile.run('zc.oindex[ix_dense_bool]', sort='time')" ] }, { @@ -1110,6 +1312,120 @@ "Method ``nonzero`` is being called internally within numpy to convert bool to int selections, no way to avoid." 
] }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 43161 function calls in 0.575 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 2 0.215 0.108 0.215 0.108 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2 0.094 0.047 0.094 0.047 indexing.py:580()\n", + " 1024 0.093 0.000 0.098 0.000 core.py:997(_decode_chunk)\n", + " 1024 0.042 0.000 0.157 0.000 core.py:822(_chunk_getitem)\n", + " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.021 0.021 0.179 0.179 indexing.py:553(__init__)\n", + " 2048 0.011 0.000 0.011 0.000 indexing.py:645()\n", + " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", + " 1025 0.008 0.000 0.021 0.000 indexing.py:624(__iter__)\n", + " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", + " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.002 0.002 0.180 0.180 core.py:576(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1026 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.002 0.000 0.003 0.000 util.py:113(is_total_slice)\n", + " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1027 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", + " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 6155 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", + " 1024 0.001 0.000 0.008 0.000 core.py:994(_chunk_key)\n", + " 2048 0.001 0.000 0.001 0.000 util.py:128()\n", + " 1 0.001 0.001 0.395 0.395 
indexing.py:655(__init__)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1028 0.000 0.000 0.001 0.000 {built-in method builtins.all}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.575 0.575 indexing.py:678(__getitem__)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1 0.000 0.000 0.575 0.575 :1()\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 0.575 0.575 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", + " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 3 0.000 0.000 0.215 0.072 fromnumeric.py:55(_wrapfunc)\n", + " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.575 0.575 
core.py:564(get_mask_selection)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", + " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", + " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", + " 2 0.000 0.000 0.215 0.108 fromnumeric.py:1487(nonzero)\n", + " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:535(is_mask_selection)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", + " 6 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", + " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc.vindex[ix_dense_bool]', sort='time')" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``.vindex[]`` is a bit slower, possibly because internally it converts to a coordinate array first." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1119,7 +1435,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1128,7 +1444,7 @@ "10000000" ] }, - "execution_count": 48, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -1142,24 +1458,24 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 60 ms, sys: 4 ms, total: 64 ms\n", - "Wall time: 64.1 ms\n" + "CPU times: user 64 ms, sys: 4 ms, total: 68 ms\n", + "Wall time: 64.4 ms\n" ] }, { "data": { "text/plain": [ - "array([ 6, 23, 34, ..., 99999974, 99999986, 99999992])" + "array([ 0, 33, 42, ..., 99999987, 99999994, 99999999])" ] }, - "execution_count": 49, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1170,52 +1486,80 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 560 ms, sys: 100 ms, total: 660 ms\n", - "Wall time: 386 ms\n" + "CPU times: user 560 ms, sys: 52 ms, total: 612 ms\n", + "Wall time: 354 ms\n" ] }, { "data": { "text/plain": [ - "array([ 6, 23, 34, ..., 99999974, 99999986, 99999992])" + "array([ 0, 33, 42, ..., 99999987, 99999994, 99999999])" ] }, - "execution_count": 50, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[ix_dense_int_sorted]" + "%time zc.oindex[ix_dense_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 108 ms, sys: 28 ms, total: 136 ms\n", - 
"Wall time: 135 ms\n" + "CPU times: user 588 ms, sys: 60 ms, total: 648 ms\n", + "Wall time: 367 ms\n" ] }, { "data": { "text/plain": [ - "array([95165047, 93422705, 3887249, ..., 41392662, 20111139, 95001327])" + "array([ 0, 33, 42, ..., 99999987, 99999994, 99999999])" ] }, - "execution_count": 51, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc.vindex[ix_dense_int_sorted]" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 124 ms, sys: 0 ns, total: 124 ms\n", + "Wall time: 123 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([23268249, 15578653, 7864, ..., 68558269, 7682216, 66838288])" + ] + }, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1226,653 +1570,1198 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.11 s, sys: 84 ms, total: 2.19 s\n", - "Wall time: 1.86 s\n" + "CPU times: user 1.94 s, sys: 68 ms, total: 2.01 s\n", + "Wall time: 1.71 s\n" ] }, { "data": { "text/plain": [ - "array([95165047, 93422705, 3887249, ..., 41392662, 20111139, 95001327])" + "array([23268249, 15578653, 7864, ..., 68558269, 7682216, 66838288])" ] }, - "execution_count": 52, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[ix_dense_int]" + "%time zc.oindex[ix_dense_int]" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.99 s, sys: 76 ms, total: 2.06 s\n", + "Wall time: 1.71 s\n" + ] + }, + { + "data": { + "text/plain": [ + "array([23268249, 15578653, 7864, ..., 68558269, 7682216, 66838288])" + ] + }, + "execution_count": 57, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc.vindex[ix_dense_int]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 55382 function calls in 0.415 seconds\n", + " 64607 function calls in 0.380 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.158 0.158 0.216 0.216 indexing.py:325(__init__)\n", - " 1024 0.085 0.000 0.089 0.000 core.py:964(_decode_chunk)\n", - " 1024 0.042 0.000 0.145 0.000 core.py:802(_chunk_getitem)\n", - " 1025 0.031 0.000 0.031 0.000 indexing.py:372(__iter__)\n", - " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.025 0.025 0.025 0.025 function_base.py:1848(diff)\n", - " 1025 0.005 0.000 0.049 0.000 indexing.py:486(__iter__)\n", - " 2048 0.004 0.000 0.004 0.000 core.py:324()\n", - " 1 0.003 0.003 0.415 0.415 core.py:527(get_orthogonal_selection)\n", - " 1024 0.003 0.000 0.007 0.000 index_tricks.py:26(ix_)\n", + " 1 0.135 0.135 0.172 0.172 indexing.py:304(__init__)\n", + " 1024 0.090 0.000 0.095 0.000 core.py:997(_decode_chunk)\n", + " 1024 0.043 0.000 0.152 0.000 core.py:822(_chunk_getitem)\n", + " 1025 0.026 0.000 0.027 0.000 indexing.py:351(__iter__)\n", + " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.011 0.011 0.011 0.011 function_base.py:1848(diff)\n", + " 1025 0.006 0.000 0.052 0.000 indexing.py:484(__iter__)\n", + " 1024 0.004 0.000 0.007 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", + " 1 0.003 0.003 0.381 0.381 core.py:531(get_orthogonal_selection)\n", + " 2048 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 2048 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 7180 
0.003 0.000 0.006 0.000 {built-in method builtins.isinstance}\n", + " 2049 0.002 0.000 0.004 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.002 0.002 0.206 0.206 core.py:576(_get_selection)\n", " 1024 0.002 0.000 0.004 0.000 arrayprint.py:381(wrapper)\n", - " 1 0.002 0.002 0.195 0.195 core.py:563(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 core.py:319(_cdata_shape)\n", + " 1024 0.002 0.000 0.014 0.000 indexing.py:377(ix_)\n", " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", - " 6152 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", + " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.001 0.000 0.009 0.000 indexing.py:398(ix_)\n", " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.007 0.000 core.py:961(_chunk_key)\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:403()\n", + " 1024 0.001 0.000 0.008 0.000 core.py:994(_chunk_key)\n", + " 1024 0.001 0.000 0.004 0.000 indexing.py:384()\n", " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", - " 1024 0.000 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:491()\n", - " 1 0.000 0.000 0.216 0.216 indexing.py:425(__init__)\n", - " 3081 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 1024 0.001 
0.000 0.003 0.000 indexing.py:501()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", " 1030 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1 0.000 0.000 0.172 0.172 indexing.py:420(__init__)\n", + " 1025 0.000 0.000 0.003 0.000 indexing.py:11(is_integer)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:489()\n", + " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", - " 1 0.000 0.000 0.415 0.415 :1()\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.381 0.381 :1()\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 
{built-in method builtins.id}\n", " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.415 0.415 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.415 0.415 core.py:392(__getitem__)\n", + " 1 0.000 0.000 0.381 0.381 {built-in method builtins.exec}\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.381 0.381 indexing.py:512(__getitem__)\n", + " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", - " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method 
builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc[ix_dense_int_sorted]', sort='time')" + "cProfile.run('zc.oindex[ix_dense_int_sorted]', sort='time')" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 69726 function calls in 1.841 seconds\n", + " 43143 function calls in 0.372 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.160 1.160 1.160 1.160 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1 0.164 0.164 1.490 1.490 indexing.py:325(__init__)\n", - " 1024 0.150 0.000 0.285 0.000 core.py:802(_chunk_getitem)\n", - " 1 0.128 0.128 0.128 0.128 {method 'take' of 'numpy.ndarray' objects}\n", - " 1024 0.113 0.000 0.120 0.000 core.py:964(_decode_chunk)\n", - " 1025 0.033 0.000 0.034 0.000 indexing.py:372(__iter__)\n", - " 1 0.024 0.024 0.024 0.024 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.012 0.012 0.012 0.012 function_base.py:1848(diff)\n", - " 1025 0.006 0.000 0.059 0.000 indexing.py:486(__iter__)\n", - " 2048 
0.006 0.000 0.012 0.000 index_tricks.py:26(ix_)\n", - " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 2048 0.004 0.000 0.004 0.000 core.py:324()\n", - " 1 0.003 0.003 1.840 1.840 core.py:527(get_orthogonal_selection)\n", - " 1024 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2 0.104 0.052 0.104 0.052 indexing.py:580()\n", + " 1024 0.090 0.000 0.095 0.000 core.py:997(_decode_chunk)\n", + " 1024 0.042 0.000 0.154 0.000 core.py:822(_chunk_getitem)\n", + " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.026 0.026 0.194 0.194 indexing.py:553(__init__)\n", + " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", + " 2048 0.011 0.000 0.011 0.000 indexing.py:645()\n", + " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", + " 1025 0.008 0.000 0.021 0.000 indexing.py:624(__iter__)\n", + " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.003 0.003 0.346 0.346 core.py:563(_get_selection)\n", + " 1 0.002 0.002 0.177 0.177 core.py:576(_get_selection)\n", + " 1027 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 2048 0.002 0.000 0.015 0.000 indexing.py:398(ix_)\n", - " 8200 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.002 0.000 0.006 0.000 core.py:319(_cdata_shape)\n", " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", - " 2048 0.001 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.001 0.000 0.002 0.000 indexing.py:403()\n", - " 1024 0.001 0.000 0.008 0.000 core.py:961(_chunk_key)\n", + " 1024 0.002 0.000 0.003 0.000 util.py:113(is_total_slice)\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1024 
0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1027 0.001 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", + " 6153 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.008 0.000 core.py:994(_chunk_key)\n", " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", - " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 2048 0.001 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1 0.001 0.001 1.491 1.491 indexing.py:425(__init__)\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:491()\n", - " 1024 0.000 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1 0.000 0.000 1.841 1.841 core.py:392(__getitem__)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", + " 2048 0.001 0.000 0.001 0.000 util.py:128()\n", + " 1 0.001 0.001 0.372 0.372 core.py:543(get_coordinate_selection)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 2054 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1028 0.000 0.000 0.001 0.000 {built-in method builtins.all}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", " 1024 
0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", - " 1 0.000 0.000 1.841 1.841 :1()\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1 0.000 0.000 0.372 0.372 :1()\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 1.841 1.841 {built-in method builtins.exec}\n", " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 4 0.000 0.000 1.288 0.322 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", + " 1 0.000 0.000 0.372 0.372 {built-in method builtins.exec}\n", + " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.372 0.372 indexing.py:678(__getitem__)\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.128 0.128 fromnumeric.py:70(take)\n", - " 6 0.000 0.000 0.000 0.000 
numeric.py:534(asanyarray)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 1.160 1.160 fromnumeric.py:826(argsort)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", + " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", + " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' 
objects}\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", + " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc[ix_dense_int]', sort='time')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When indices are not sorted, zarr needs to partially sort them so the occur in chunk order, so we only have to visit each chunk once. This sorting dominates the processing time and is unavoidable AFAIK." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### bool sparse selection" + "cProfile.run('zc.vindex[ix_dense_int_sorted]', sort='time')" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 85095 function calls in 1.770 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 1.123 1.123 1.123 1.123 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1 0.134 0.134 1.417 1.417 indexing.py:304(__init__)\n", + " 1024 0.134 0.000 0.273 0.000 core.py:822(_chunk_getitem)\n", + " 1 0.122 0.122 0.122 0.122 {method 'take' of 'numpy.ndarray' objects}\n", + " 1024 0.117 0.000 0.123 0.000 core.py:997(_decode_chunk)\n", + " 1025 0.032 0.000 0.033 0.000 indexing.py:351(__iter__)\n", + " 1 0.025 0.025 0.025 0.025 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", + " 1025 0.007 0.000 0.073 0.000 indexing.py:484(__iter__)\n", + " 2048 0.007 0.000 0.014 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.005 0.000 0.005 0.000 core.py:332()\n", + " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 10252 0.004 0.000 0.010 0.000 {built-in method builtins.isinstance}\n", + " 3073 0.004 0.000 0.006 0.000 abc.py:178(__instancecheck__)\n", + " 2048 0.004 0.000 0.027 0.000 indexing.py:377(ix_)\n", + " 1 0.003 0.003 0.350 0.350 core.py:576(_get_selection)\n", + " 1 0.003 0.003 1.770 1.770 core.py:531(get_orthogonal_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 4 0.002 0.001 0.002 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 6146 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method 
numpy.core.multiarray.frombuffer}\n", + " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 2048 0.002 0.000 0.008 0.000 indexing.py:384()\n", + " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2049 0.001 0.000 0.007 0.000 indexing.py:11(is_integer)\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 2051 0.001 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", + " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", + " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 4103 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1 0.001 0.001 1.418 1.418 indexing.py:420(__init__)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", + " 1 0.000 0.000 1.770 1.770 indexing.py:512(__getitem__)\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1 0.000 0.000 1.771 1.771 
:1()\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 4 0.000 0.000 0.002 0.001 fromnumeric.py:1886(any)\n", + " 4 0.000 0.000 1.245 0.311 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 1.771 1.771 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.002 0.001 _methods.py:37(_any)\n", + " 4 0.000 0.000 0.002 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 1 0.000 0.000 0.122 0.122 fromnumeric.py:70(take)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 1 0.000 0.000 1.123 1.123 fromnumeric.py:826(argsort)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + 
"cProfile.run('zc.oindex[ix_dense_int]', sort='time')" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 35981 function calls in 1.735 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 1.117 1.117 1.117 1.117 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1024 0.132 0.000 0.269 0.000 core.py:822(_chunk_getitem)\n", + " 2 0.121 0.061 0.121 0.061 indexing.py:604()\n", + " 1024 0.116 0.000 0.122 0.000 core.py:997(_decode_chunk)\n", + " 2 0.108 0.054 0.108 0.054 indexing.py:580()\n", + " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.026 0.026 1.437 1.437 indexing.py:553(__init__)\n", + " 1 0.024 0.024 0.024 0.024 {built-in method numpy.core.multiarray.bincount}\n", + " 2048 0.012 0.000 0.012 0.000 indexing.py:645()\n", + " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", + " 1025 0.009 0.000 0.024 0.000 indexing.py:624(__iter__)\n", + " 2054 0.004 0.000 0.004 0.000 core.py:332()\n", + " 1 0.003 0.003 0.296 0.296 core.py:576(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1027 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 4 0.002 0.001 0.002 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1027 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", + " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 1 0.001 0.001 1.734 1.734 
core.py:543(get_coordinate_selection)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2054 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1 0.000 0.000 1.734 1.734 indexing.py:678(__getitem__)\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 1.734 1.734 :1()\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 1.735 1.735 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.002 0.001 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", + " 3 0.000 0.000 1.117 0.372 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", + " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", 
+ " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 4 0.000 0.000 0.002 0.001 {method 'any' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 4 0.000 0.000 0.002 0.001 _methods.py:37(_any)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", + " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 1 0.000 0.000 1.117 1.117 fromnumeric.py:826(argsort)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", + " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", + " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc.vindex[ix_dense_int]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When 
indices are not sorted, zarr needs to partially sort them so the occur in chunk order, so we only have to visit each chunk once. This sorting dominates the processing time and is unavoidable AFAIK." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bool sparse selection" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9932" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# relatively sparse selection\n", + "ix_sparse_bool = np.random.binomial(1, 0.0001, size=c.shape[0]).astype(bool)\n", + "np.count_nonzero(ix_sparse_bool)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", + "Wall time: 20 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time c[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 408 ms, sys: 28 ms, total: 436 ms\n", + "Wall time: 182 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc.oindex[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 372 ms, sys: 36 ms, total: 408 ms\n", + "Wall time: 149 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([ 888, 9941, 15901, ..., 
99988491, 99988714, 99995248])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time zc.vindex[ix_sparse_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 67653 function calls in 0.186 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1024 0.097 0.000 0.103 0.000 core.py:997(_decode_chunk)\n", + " 1025 0.018 0.000 0.018 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1024 0.006 0.000 0.124 0.000 core.py:822(_chunk_getitem)\n", + " 1025 0.006 0.000 0.049 0.000 indexing.py:484(__iter__)\n", + " 1024 0.005 0.000 0.027 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", + " 2048 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 7180 0.003 0.000 0.007 0.000 {built-in method builtins.isinstance}\n", + " 2049 0.003 0.000 0.004 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.003 0.003 0.176 0.176 core.py:576(_get_selection)\n", + " 1025 0.002 0.000 0.003 0.000 indexing.py:275(__iter__)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1024 0.002 0.000 0.034 0.000 indexing.py:377(ix_)\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1 0.001 0.001 0.011 0.011 indexing.py:248(__init__)\n", + " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.001 0.000 0.009 0.000 
core.py:994(_chunk_key)\n", + " 1024 0.001 0.000 0.005 0.000 indexing.py:384()\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:489()\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1025 0.000 0.000 0.004 0.000 indexing.py:11(is_integer)\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", + " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 1024 0.000 0.000 0.009 0.000 numeric.py:380(count_nonzero)\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 1 0.000 0.000 0.011 0.011 indexing.py:420(__init__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", + " 1 0.000 0.000 0.187 0.187 {built-in method builtins.exec}\n", + " 1 0.000 0.000 
0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.187 0.187 indexing.py:512(__getitem__)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.187 0.187 :1()\n", + " 1 0.000 0.000 0.187 0.187 core.py:531(get_orthogonal_selection)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc.oindex[ix_sparse_bool]', sort='time')" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 43161 function calls in 0.159 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1024 0.093 0.000 0.098 0.000 core.py:997(_decode_chunk)\n", + " 2 0.020 
0.010 0.020 0.010 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1025 0.008 0.000 0.015 0.000 indexing.py:624(__iter__)\n", + " 1024 0.006 0.000 0.122 0.000 core.py:822(_chunk_getitem)\n", + " 2048 0.005 0.000 0.005 0.000 indexing.py:645()\n", + " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", + " 1 0.002 0.002 0.139 0.139 core.py:576(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1026 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.002 0.000 0.003 0.000 util.py:113(is_total_slice)\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 1027 0.001 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", + " 6155 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 2048 0.001 0.000 0.001 0.000 util.py:128()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1028 0.000 0.000 0.001 0.000 {built-in method builtins.all}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 
threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:580()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:553(__init__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.000 0.000 0.159 0.159 {built-in method builtins.exec}\n", + " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 0.159 0.159 indexing.py:678(__getitem__)\n", + " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 1 0.000 0.000 0.020 0.020 indexing.py:655(__init__)\n", + " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 3 0.000 0.000 0.020 0.007 fromnumeric.py:55(_wrapfunc)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.159 0.159 core.py:564(get_mask_selection)\n", + " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 1 0.000 0.000 0.159 0.159 :1()\n", + " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:535(is_mask_selection)\n", + " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2 0.000 0.000 0.020 0.010 
fromnumeric.py:1487(nonzero)\n", + " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", + " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", + " 6 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", + " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" + ] + } + ], + "source": [ + "cProfile.run('zc.vindex[ix_sparse_bool]', sort='time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### int sparse selection" + ] + }, + { + "cell_type": "code", + "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9985" + "10000" ] }, - "execution_count": 55, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# relatively sparse selection\n", - "ix_sparse_bool = 
np.random.binomial(1, 0.0001, size=c.shape[0]).astype(bool)\n", - "np.count_nonzero(ix_sparse_bool)" + "ix_sparse_int = np.random.choice(c.shape[0], size=c.shape[0]//10000, replace=True)\n", + "ix_sparse_int_sorted = ix_sparse_int.copy()\n", + "ix_sparse_int_sorted.sort()\n", + "len(ix_sparse_int)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", - "Wall time: 21.3 ms\n" + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 183 µs\n" ] }, { "data": { "text/plain": [ - "array([ 4039, 4499, 7512, ..., 99943621, 99959317, 99987208])" + "array([ 26607, 37803, 43822, ..., 99980305, 99994438, 99995776])" ] }, - "execution_count": 56, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time c[ix_sparse_bool]" + "%time c[ix_sparse_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 436 ms, sys: 56 ms, total: 492 ms\n", - "Wall time: 210 ms\n" + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 189 µs\n" ] }, { "data": { "text/plain": [ - "array([ 4039, 4499, 7512, ..., 99943621, 99959317, 99987208])" + "array([89501247, 55878596, 46320615, ..., 57048243, 1027560, 66644274])" ] }, - "execution_count": 57, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[ix_sparse_bool]" + "%time c[ix_sparse_int]" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 58373 function calls in 0.243 seconds\n", - "\n", - " Ordered by: internal time\n", - "\n", - " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1023 0.140 0.000 0.148 
0.000 core.py:964(_decode_chunk)\n", - " 1024 0.024 0.000 0.024 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1023 0.007 0.000 0.177 0.000 core.py:802(_chunk_getitem)\n", - " 1024 0.007 0.000 0.052 0.000 indexing.py:486(__iter__)\n", - " 1023 0.006 0.000 0.034 0.000 index_tricks.py:26(ix_)\n", - " 2046 0.006 0.000 0.006 0.000 core.py:324()\n", - " 2046 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1023 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1 0.003 0.003 0.232 0.232 core.py:563(_get_selection)\n", - " 1023 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", - " 1023 0.003 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", - " 1023 0.003 0.000 0.008 0.000 core.py:319(_cdata_shape)\n", - " 1024 0.002 0.000 0.003 0.000 indexing.py:296(__iter__)\n", - " 1023 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 6146 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", - " 1023 0.002 0.000 0.038 0.000 indexing.py:398(ix_)\n", - " 1023 0.001 0.000 0.011 0.000 core.py:961(_chunk_key)\n", - " 1023 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1 0.001 0.001 0.011 0.011 indexing.py:269(__init__)\n", - " 1023 0.001 0.000 0.008 0.000 numeric.py:1905(array_str)\n", - " 1023 0.001 0.000 0.001 0.000 indexing.py:403()\n", - " 2046 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", - " 1023 0.001 0.000 0.001 0.000 indexing.py:501()\n", - " 2046 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 1023 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1023 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 3078 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2046 0.001 0.000 0.001 0.000 indexing.py:491()\n", - " 2046 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 1023 0.001 0.000 
0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1023 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2046 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 1023 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2046 0.000 0.000 0.000 0.000 indexing.py:490()\n", - " 1023 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1023 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.010 0.000 numeric.py:380(count_nonzero)\n", - " 1023 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1023 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1023 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1023 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.011 0.011 indexing.py:425(__init__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.243 0.243 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.243 0.243 core.py:392(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.243 0.243 core.py:527(get_orthogonal_selection)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 1 0.000 0.000 0.243 0.243 :1()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 
0.000 0.000 0.000 0.000 indexing.py:478()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - "\n", - "\n" + "CPU times: user 384 ms, sys: 44 ms, total: 428 ms\n", + "Wall time: 166 ms\n" ] - } - ], - "source": [ - "cProfile.run('zc[ix_sparse_bool]', sort='time')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### int sparse selection" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ + }, { "data": { "text/plain": [ - "10000" + "array([ 26607, 37803, 43822, ..., 99980305, 99994438, 99995776])" ] }, - "execution_count": 60, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ix_sparse_int = np.random.choice(c.shape[0], size=c.shape[0]//10000, replace=True)\n", - "ix_sparse_int_sorted = ix_sparse_int.copy()\n", - "ix_sparse_int_sorted.sort()\n", - "len(ix_sparse_int)" + "%time zc.oindex[ix_sparse_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 136 µs\n" + "CPU times: user 376 ms, sys: 32 ms, total: 408 ms\n", + "Wall time: 137 ms\n" ] }, { "data": { "text/plain": [ - "array([ 7736, 25765, 27155, ..., 99982813, 99983779, 
99986450])" + "array([ 26607, 37803, 43822, ..., 99980305, 99994438, 99995776])" ] }, - "execution_count": 61, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time c[ix_sparse_int_sorted]" + "%time zc.vindex[ix_sparse_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 597 µs\n" + "CPU times: user 412 ms, sys: 20 ms, total: 432 ms\n", + "Wall time: 174 ms\n" ] }, { "data": { "text/plain": [ - "array([11023673, 52339189, 27001951, ..., 37185717, 7541357, 28437835])" + "array([89501247, 55878596, 46320615, ..., 57048243, 1027560, 66644274])" ] }, - "execution_count": 62, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time c[ix_sparse_int]" + "%time zc.oindex[ix_sparse_int]" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 412 ms, sys: 40 ms, total: 452 ms\n", - "Wall time: 171 ms\n" + "CPU times: user 360 ms, sys: 36 ms, total: 396 ms\n", + "Wall time: 134 ms\n" ] }, { "data": { "text/plain": [ - "array([ 7736, 25765, 27155, ..., 99982813, 99983779, 99986450])" + "array([89501247, 55878596, 46320615, ..., 57048243, 1027560, 66644274])" ] }, - "execution_count": 63, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[ix_sparse_int_sorted]" + "%time zc.vindex[ix_sparse_int]" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 384 ms, sys: 64 ms, total: 448 ms\n", - "Wall time: 172 ms\n" + " 85095 function calls in 0.170 seconds\n", + "\n", + " Ordered by: internal time\n", + "\n", + " 
ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1024 0.097 0.000 0.102 0.000 core.py:997(_decode_chunk)\n", + " 1025 0.006 0.000 0.044 0.000 indexing.py:484(__iter__)\n", + " 2048 0.006 0.000 0.013 0.000 index_tricks.py:26(ix_)\n", + " 1025 0.005 0.000 0.006 0.000 indexing.py:351(__iter__)\n", + " 1024 0.005 0.000 0.122 0.000 core.py:822(_chunk_getitem)\n", + " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", + " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 10252 0.004 0.000 0.009 0.000 {built-in method builtins.isinstance}\n", + " 3073 0.004 0.000 0.006 0.000 abc.py:178(__instancecheck__)\n", + " 2048 0.003 0.000 0.025 0.000 indexing.py:377(ix_)\n", + " 1 0.003 0.003 0.169 0.169 core.py:576(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 6146 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", + " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", + " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", + " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", + " 2048 0.002 0.000 0.008 0.000 indexing.py:384()\n", + " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", + " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", + " 1 0.001 0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", + " 2049 0.001 0.000 0.006 0.000 indexing.py:11(is_integer)\n", + " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", + " 2051 0.001 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", + " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", + " 1024 0.001 0.000 0.003 
0.000 indexing.py:501()\n", + " 4103 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", + " 2048 0.001 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 1 0.000 0.000 0.002 0.002 indexing.py:304(__init__)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", + " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", + " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", + " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", + " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.000 0.000 0.171 0.171 indexing.py:512(__getitem__)\n", + " 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.002 0.002 indexing.py:420(__init__)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", + " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 1 0.000 0.000 0.171 0.171 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", + " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.171 0.171 :1()\n", + " 1 0.000 0.000 0.000 0.000 
{built-in method numpy.core.multiarray.empty}\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", + " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.170 0.170 core.py:531(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", + " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", + " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + "\n", + "\n" ] - }, - { - "data": { - "text/plain": [ - "array([11023673, 52339189, 27001951, ..., 37185717, 7541357, 28437835])" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc[ix_sparse_int]" + "cProfile.run('zc.oindex[ix_sparse_int]', sort='time')" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 69726 function calls in 
0.218 seconds\n", + " 35981 function calls in 0.136 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.141 0.000 0.149 0.000 core.py:964(_decode_chunk)\n", - " 1025 0.008 0.000 0.038 0.000 indexing.py:486(__iter__)\n", - " 2048 0.008 0.000 0.015 0.000 index_tricks.py:26(ix_)\n", - " 1024 0.006 0.000 0.176 0.000 core.py:802(_chunk_getitem)\n", - " 2048 0.006 0.000 0.006 0.000 core.py:324()\n", - " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1025 0.004 0.000 0.005 0.000 indexing.py:372(__iter__)\n", - " 1 0.003 0.003 0.217 0.217 core.py:563(_get_selection)\n", - " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", - " 2048 0.003 0.000 0.020 0.000 indexing.py:398(ix_)\n", - " 1024 0.003 0.000 0.008 0.000 core.py:319(_cdata_shape)\n", - " 1024 0.003 0.000 0.010 0.000 {method 'join' of 'str' objects}\n", - " 8200 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.093 0.000 0.098 0.000 core.py:997(_decode_chunk)\n", + " 1025 0.008 0.000 0.015 0.000 indexing.py:624(__iter__)\n", + " 1024 0.005 0.000 0.117 0.000 core.py:822(_chunk_getitem)\n", + " 2048 0.004 0.000 0.004 0.000 indexing.py:645()\n", + " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", + " 1 0.002 0.002 0.135 0.135 core.py:576(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", + " 1027 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.002 0.000 0.004 0.000 numerictypes.py:728(issubdtype)\n", - " 2048 0.002 0.000 0.002 0.000 indexing.py:403()\n", - " 1024 0.001 0.000 0.011 0.000 core.py:961(_chunk_key)\n", - " 1024 0.001 0.000 0.008 0.000 numeric.py:1905(array_str)\n", 
- " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", - " 4105 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", + " 1027 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", + " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", + " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", " 1 0.001 0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:491()\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", - " 1 0.000 0.000 0.001 0.001 indexing.py:325(__init__)\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", + " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", + " 2054 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", + " 1 0.000 0.000 0.002 0.002 indexing.py:553(__init__)\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", " 1024 
0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.218 0.218 core.py:392(__getitem__)\n", - " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.001 0.001 indexing.py:425(__init__)\n", - " 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:580()\n", " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", + " 1 0.000 0.000 0.136 0.136 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:604()\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.000 0.000 0.218 0.218 {built-in method builtins.exec}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.218 0.218 core.py:527(get_orthogonal_selection)\n", + " 1 0.000 0.000 0.136 0.136 indexing.py:678(__getitem__)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 1 0.000 0.000 0.136 0.136 :1()\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", + " 3 0.000 0.000 0.001 
0.000 fromnumeric.py:55(_wrapfunc)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", + " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1 0.000 0.000 0.136 0.136 core.py:543(get_coordinate_selection)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", + " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", + " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", + " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.218 0.218 :1()\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", - " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method 
builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", + " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", + " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", + " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", + " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", + " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", + " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", + " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", "\n" @@ -1880,7 +2769,7 @@ } ], "source": [ - "cProfile.run('zc[ix_sparse_int]', sort='time')" + "cProfile.run('zc.vindex[ix_sparse_int]', sort='time')" ] }, { @@ -1899,13 +2788,13 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507490 (495.6K)
Storage ratio197.0
Chunks initialized256/256
" + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507131 (495.2K)
Storage ratio197.2
Chunks initialized256/256
" ], "text/plain": [ "Type : zarr.core.Array\n", @@ -1917,12 +2806,12 @@ "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", - "No. bytes stored : 507490 (495.6K)\n", - "Storage ratio : 197.0\n", + "No. bytes stored : 507131 (495.2K)\n", + "Storage ratio : 197.2\n", "Chunks initialized : 256/256" ] }, - "execution_count": 66, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -1934,30 +2823,30 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 920 ms, sys: 136 ms, total: 1.06 s\n", - "Wall time: 503 ms\n" + "CPU times: user 744 ms, sys: 120 ms, total: 864 ms\n", + "Wall time: 384 ms\n" ] }, { "data": { "text/plain": [ - "array([ 4039, 4499, 7512, ..., 99943621, 99959317, 99987208])" + "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" ] }, - "execution_count": 67, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[zix_sparse_bool]" + "%time zc.oindex[zix_sparse_bool]" ] }, { @@ -1969,15 +2858,15 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 68 ms, sys: 24 ms, total: 92 ms\n", - "Wall time: 90.4 ms\n" + "CPU times: user 40 ms, sys: 52 ms, total: 92 ms\n", + "Wall time: 89.5 ms\n" ] }, { @@ -1986,7 +2875,7 @@ "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 68, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1997,15 +2886,15 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.34 s, sys: 236 ms, total: 1.57 s\n", - "Wall time: 1.29 
s\n" + "CPU times: user 1.29 s, sys: 240 ms, total: 1.53 s\n", + "Wall time: 1.24 s\n" ] }, { @@ -2014,26 +2903,26 @@ "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" ] }, - "execution_count": 69, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[::2]" + "%time zc.oindex[::2]" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 548 ms, sys: 116 ms, total: 664 ms\n", - "Wall time: 400 ms\n" + "CPU times: user 588 ms, sys: 88 ms, total: 676 ms\n", + "Wall time: 411 ms\n" ] }, { @@ -2042,26 +2931,26 @@ "array([ 0, 10, 20, ..., 99999970, 99999980, 99999990])" ] }, - "execution_count": 70, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[::10]" + "%time zc.oindex[::10]" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 456 ms, sys: 40 ms, total: 496 ms\n", - "Wall time: 214 ms\n" + "CPU times: user 400 ms, sys: 28 ms, total: 428 ms\n", + "Wall time: 184 ms\n" ] }, { @@ -2070,26 +2959,26 @@ "array([ 0, 100, 200, ..., 99999700, 99999800, 99999900])" ] }, - "execution_count": 71, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%time zc[::100]" + "%time zc.oindex[::100]" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 440 ms, sys: 36 ms, total: 476 ms\n", - "Wall time: 179 ms\n" + "CPU times: user 368 ms, sys: 52 ms, total: 420 ms\n", + "Wall time: 167 ms\n" ] }, { @@ -2098,102 +2987,105 @@ "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" ] }, - "execution_count": 72, + "execution_count": 83, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ - "%time zc[::1000]" + "%time zc.oindex[::1000]" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 55382 function calls in 1.351 seconds\n", + " 64607 function calls in 1.249 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.620 0.620 0.815 0.815 indexing.py:325(__init__)\n", - " 1024 0.130 0.000 0.135 0.000 core.py:964(_decode_chunk)\n", - " 1025 0.127 0.000 0.128 0.000 indexing.py:372(__iter__)\n", - " 1024 0.123 0.000 0.275 0.000 core.py:802(_chunk_getitem)\n", - " 1 0.121 0.121 0.121 0.121 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.087 0.087 0.087 0.087 {built-in method numpy.core.multiarray.arange}\n", - " 1 0.054 0.054 0.054 0.054 function_base.py:1848(diff)\n", - " 1 0.020 0.020 1.350 1.350 core.py:527(get_orthogonal_selection)\n", - " 4 0.020 0.005 0.020 0.005 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1025 0.007 0.000 0.149 0.000 indexing.py:486(__iter__)\n", - " 2048 0.005 0.000 0.005 0.000 core.py:324()\n", + " 1 0.593 0.593 0.776 0.776 indexing.py:304(__init__)\n", + " 1 0.119 0.119 0.119 0.119 {built-in method numpy.core.multiarray.bincount}\n", + " 1024 0.115 0.000 0.247 0.000 core.py:822(_chunk_getitem)\n", + " 1024 0.110 0.000 0.116 0.000 core.py:997(_decode_chunk)\n", + " 1025 0.106 0.000 0.107 0.000 indexing.py:351(__iter__)\n", + " 1 0.074 0.074 0.074 0.074 {built-in method numpy.core.multiarray.arange}\n", + " 1 0.051 0.051 0.051 0.051 function_base.py:1848(diff)\n", + " 1 0.014 0.014 1.248 1.248 core.py:531(get_orthogonal_selection)\n", + " 4 0.013 0.003 0.013 0.003 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 1025 0.006 0.000 0.134 0.000 indexing.py:484(__iter__)\n", " 1024 0.004 0.000 0.008 0.000 index_tricks.py:26(ix_)\n", + " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", " 2048 
0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1 0.003 0.003 0.427 0.427 core.py:563(_get_selection)\n", + " 7180 0.003 0.000 0.007 0.000 {built-in method builtins.isinstance}\n", + " 1 0.003 0.003 0.383 0.383 core.py:576(_get_selection)\n", + " 2049 0.002 0.000 0.004 0.000 abc.py:178(__instancecheck__)\n", " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.006 0.000 core.py:319(_cdata_shape)\n", - " 6152 0.002 0.000 0.002 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.002 0.000 0.016 0.000 indexing.py:377(ix_)\n", + " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", + " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.001 0.000 0.011 0.000 indexing.py:398(ix_)\n", - " 1024 0.001 0.000 0.009 0.000 core.py:961(_chunk_key)\n", " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1 0.001 0.001 0.902 0.902 indexing.py:425(__init__)\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:403()\n", + " 1 0.001 0.001 0.851 0.851 indexing.py:420(__init__)\n", + " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", + " 1024 0.001 0.000 0.005 0.000 indexing.py:384()\n", + " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 1 0.001 0.001 1.350 1.350 core.py:392(__getitem__)\n", - " 1 0.001 0.001 1.351 1.351 :1()\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 1024 0.001 0.000 0.001 0.000 indexing.py:501()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:491()\n", + " 1 0.001 0.001 1.250 1.250 :1()\n", + " 1 0.001 
0.001 1.249 1.249 indexing.py:512(__getitem__)\n", + " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", + " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", + " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", " 1030 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", + " 1025 0.000 0.000 0.004 0.000 indexing.py:11(is_integer)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:489()\n", " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:490()\n", + " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", + " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", + " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 1.351 1.351 {built-in method builtins.exec}\n", - " 4 0.000 0.000 0.020 0.005 
fromnumeric.py:1886(any)\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 1.250 1.250 {built-in method builtins.exec}\n", + " 4 0.000 0.000 0.014 0.003 fromnumeric.py:1886(any)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", + " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 4 0.000 0.000 0.020 0.005 {method 'any' of 'numpy.ndarray' objects}\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:159(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", - " 4 0.000 0.000 0.020 0.005 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:80(normalize_slice_selection)\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 4 0.000 0.000 0.013 0.003 {method 'any' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 4 0.000 0.000 0.013 0.003 _methods.py:37(_any)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:164()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:478()\n", + " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", + " 1 0.000 0.000 0.000 0.000 {method 'indices' of 'slice' objects}\n", " 2 0.000 0.000 0.000 0.000 {built-in 
method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:481()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", + " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", "\n", @@ -2202,7 +3094,7 @@ } ], "source": [ - "cProfile.run('zc[::2]', sort='time')" + "cProfile.run('zc.oindex[::2]', sort='time')" ] }, { @@ -2221,7 +3113,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 85, "metadata": {}, "outputs": [ { @@ -2230,7 +3122,7 @@ "(100000000,)" ] }, - "execution_count": 74, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } @@ -2241,7 +3133,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -2250,7 +3142,7 @@ "(100000, 1000)" ] }, - "execution_count": 75, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -2262,7 +3154,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 87, "metadata": {}, "outputs": [ { @@ -2285,7 +3177,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 76, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -2304,7 +3196,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -2314,30 +3206,30 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 140 ms, sys: 8 ms, total: 148 ms\n", - "Wall time: 146 ms\n" + "CPU times: user 
116 ms, sys: 16 ms, total: 132 ms\n", + "Wall time: 129 ms\n" ] }, { "data": { "text/plain": [ - "array([[ 2, 4, 6, ..., 993, 994, 999],\n", - " [ 9002, 9004, 9006, ..., 9993, 9994, 9999],\n", - " [ 10002, 10004, 10006, ..., 10993, 10994, 10999],\n", + "array([[ 1000, 1001, 1003, ..., 1994, 1995, 1999],\n", + " [ 6000, 6001, 6003, ..., 6994, 6995, 6999],\n", + " [ 8000, 8001, 8003, ..., 8994, 8995, 8999],\n", " ..., \n", - " [99997002, 99997004, 99997006, ..., 99997993, 99997994, 99997999],\n", - " [99998002, 99998004, 99998006, ..., 99998993, 99998994, 99998999],\n", - " [99999002, 99999004, 99999006, ..., 99999993, 99999994, 99999999]])" + " [99991000, 99991001, 99991003, ..., 99991994, 99991995, 99991999],\n", + " [99997000, 99997001, 99997003, ..., 99997994, 99997995, 99997999],\n", + " [99998000, 99998001, 99998003, ..., 99998994, 99998995, 99998999]])" ] }, - "execution_count": 78, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } @@ -2348,30 +3240,30 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 748 ms, sys: 56 ms, total: 804 ms\n", - "Wall time: 409 ms\n" + "CPU times: user 780 ms, sys: 40 ms, total: 820 ms\n", + "Wall time: 387 ms\n" ] }, { "data": { "text/plain": [ - "array([[ 2, 4, 6, ..., 993, 994, 999],\n", - " [ 9002, 9004, 9006, ..., 9993, 9994, 9999],\n", - " [ 10002, 10004, 10006, ..., 10993, 10994, 10999],\n", + "array([[ 1000, 1001, 1003, ..., 1994, 1995, 1999],\n", + " [ 6000, 6001, 6003, ..., 6994, 6995, 6999],\n", + " [ 8000, 8001, 8003, ..., 8994, 8995, 8999],\n", " ..., \n", - " [99997002, 99997004, 99997006, ..., 99997993, 99997994, 99997999],\n", - " [99998002, 99998004, 99998006, ..., 99998993, 99998994, 99998999],\n", - " [99999002, 99999004, 99999006, ..., 99999993, 99999994, 99999999]])" + " [99991000, 99991001, 99991003, ..., 99991994, 99991995, 99991999],\n", + " [99997000, 
99997001, 99997003, ..., 99997994, 99997995, 99997999],\n", + " [99998000, 99998001, 99998003, ..., 99998994, 99998995, 99998999]])" ] }, - "execution_count": 79, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -2389,7 +3281,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -2399,30 +3291,30 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 196 ms, sys: 24 ms, total: 220 ms\n", - "Wall time: 219 ms\n" + "CPU times: user 156 ms, sys: 28 ms, total: 184 ms\n", + "Wall time: 184 ms\n" ] }, { "data": { "text/plain": [ - "array([[90796980, 90796608, 90796172, ..., 90796527, 90796979, 90796445],\n", - " [50263980, 50263608, 50263172, ..., 50263527, 50263979, 50263445],\n", - " [47678980, 47678608, 47678172, ..., 47678527, 47678979, 47678445],\n", + "array([[38408139, 38408374, 38408509, ..., 38408966, 38408223, 38408367],\n", + " [29895139, 29895374, 29895509, ..., 29895966, 29895223, 29895367],\n", + " [79133139, 79133374, 79133509, ..., 79133966, 79133223, 79133367],\n", " ..., \n", - " [34172980, 34172608, 34172172, ..., 34172527, 34172979, 34172445],\n", - " [56793980, 56793608, 56793172, ..., 56793527, 56793979, 56793445],\n", - " [12456980, 12456608, 12456172, ..., 12456527, 12456979, 12456445]])" + " [95689139, 95689374, 95689509, ..., 95689966, 95689223, 95689367],\n", + " [47381139, 47381374, 47381509, ..., 47381966, 47381223, 47381367],\n", + " [20741139, 20741374, 20741509, ..., 20741966, 20741223, 20741367]])" ] }, - "execution_count": 81, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -2433,30 +3325,30 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.11 s, sys: 
68 ms, total: 1.18 s\n", - "Wall time: 604 ms\n" + "CPU times: user 1.08 s, sys: 120 ms, total: 1.2 s\n", + "Wall time: 586 ms\n" ] }, { "data": { "text/plain": [ - "array([[90796980, 90796608, 90796172, ..., 90796527, 90796979, 90796445],\n", - " [50263980, 50263608, 50263172, ..., 50263527, 50263979, 50263445],\n", - " [47678980, 47678608, 47678172, ..., 47678527, 47678979, 47678445],\n", + "array([[38408139, 38408374, 38408509, ..., 38408966, 38408223, 38408367],\n", + " [29895139, 29895374, 29895509, ..., 29895966, 29895223, 29895367],\n", + " [79133139, 79133374, 79133509, ..., 79133966, 79133223, 79133367],\n", " ..., \n", - " [34172980, 34172608, 34172172, ..., 34172527, 34172979, 34172445],\n", - " [56793980, 56793608, 56793172, ..., 56793527, 56793979, 56793445],\n", - " [12456980, 12456608, 12456172, ..., 12456527, 12456979, 12456445]])" + " [95689139, 95689374, 95689509, ..., 95689966, 95689223, 95689367],\n", + " [47381139, 47381374, 47381509, ..., 47381966, 47381223, 47381367],\n", + " [20741139, 20741374, 20741509, ..., 20741966, 20741223, 20741367]])" ] }, - "execution_count": 82, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } @@ -2474,7 +3366,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 94, "metadata": {}, "outputs": [ { @@ -2483,7 +3375,7 @@ "10000000" ] }, - "execution_count": 83, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -2525,24 +3417,24 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.62 s, sys: 116 ms, total: 2.73 s\n", - "Wall time: 2.29 s\n" + "CPU times: user 2.43 s, sys: 148 ms, total: 2.58 s\n", + "Wall time: 2.09 s\n" ] }, { "data": { "text/plain": [ - "array([ 6452573, 65841096, 70323990, ..., 44175624, 34778721, 67807976])" + "array([55010547, 87536917, 88871707, ..., 73879431, 32878018, 25168834])" ] }, - 
"execution_count": 85, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -2553,95 +3445,102 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 48293 function calls in 2.312 seconds\n", + " 48332 function calls in 2.050 seconds\n", "\n", " Ordered by: internal time\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.161 1.161 1.161 1.161 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 3 0.275 0.092 0.275 0.092 indexing.py:590()\n", - " 3 0.223 0.074 0.223 0.074 indexing.py:581()\n", - " 1024 0.174 0.000 0.358 0.000 core.py:802(_chunk_getitem)\n", - " 1 0.167 0.167 1.914 1.914 indexing.py:547(__init__)\n", - " 1024 0.155 0.000 0.164 0.000 core.py:964(_decode_chunk)\n", - " 1 0.044 0.044 0.044 0.044 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.039 0.039 0.039 0.039 {built-in method numpy.core.multiarray.bincount}\n", - " 3072 0.022 0.000 0.022 0.000 indexing.py:625()\n", - " 1025 0.009 0.000 0.034 0.000 indexing.py:607(__iter__)\n", - " 6 0.005 0.001 0.005 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 3081 0.005 0.000 0.005 0.000 core.py:324()\n", - " 1024 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1 1.107 1.107 1.107 1.107 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 3 0.255 0.085 0.255 0.085 indexing.py:604()\n", + " 3 0.193 0.064 0.193 0.064 indexing.py:580()\n", + " 1024 0.164 0.000 0.328 0.000 core.py:822(_chunk_getitem)\n", + " 1024 0.137 0.000 0.144 0.000 core.py:997(_decode_chunk)\n", + " 1 0.045 0.045 0.045 0.045 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.044 0.044 1.683 1.683 indexing.py:553(__init__)\n", + " 1 0.024 0.024 0.024 0.024 {built-in method numpy.core.multiarray.bincount}\n", + " 3072 0.021 0.000 0.021 0.000 indexing.py:645()\n", + " 1 0.010 0.010 0.010 0.010 
function_base.py:1848(diff)\n", + " 1025 0.010 0.000 0.034 0.000 indexing.py:624(__iter__)\n", + " 7 0.005 0.001 0.005 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", + " 3081 0.005 0.000 0.005 0.000 core.py:332()\n", " 2048 0.003 0.000 0.007 0.000 arrayprint.py:381(wrapper)\n", + " 1 0.003 0.003 0.365 0.365 core.py:576(_get_selection)\n", + " 1029 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", + " 1024 0.003 0.000 0.011 0.000 {method 'join' of 'str' objects}\n", " 1024 0.003 0.000 0.003 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1 0.003 0.003 0.396 0.396 core.py:563(_get_selection)\n", - " 1024 0.003 0.000 0.012 0.000 {method 'join' of 'str' objects}\n", " 2048 0.002 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", - " 1027 0.002 0.000 0.006 0.000 core.py:319(_cdata_shape)\n", + " 1027 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", + " 3090 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", " 2048 0.001 0.000 0.009 0.000 numeric.py:1905(array_str)\n", - " 3084 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1 0.001 0.001 2.312 2.312 core.py:539(get_coordinate_selection)\n", - " 1024 0.001 0.000 0.013 0.000 core.py:961(_chunk_key)\n", - " 3072 0.001 0.000 0.001 0.000 indexing.py:621()\n", - " 3072 0.001 0.000 0.001 0.000 indexing.py:612()\n", - " 1 0.001 0.001 2.312 2.312 indexing.py:658(__getitem__)\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", + " 1024 0.001 0.000 0.012 0.000 core.py:994(_chunk_key)\n", + " 3072 0.001 0.000 0.001 0.000 indexing.py:641()\n", + " 3072 0.001 0.000 0.001 0.000 indexing.py:629()\n", + " 1 0.001 0.001 2.049 2.049 core.py:543(get_coordinate_selection)\n", + " 1 0.001 0.001 2.049 2.049 indexing.py:678(__getitem__)\n", " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", + " 1024 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", + " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", 
" 3072 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x5642b95ef480}\n", " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2055 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:205(chunk_store)\n", + " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1 0.000 0.000 2.312 2.312 :1()\n", + " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", + " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", + " 1 0.000 0.000 2.050 2.050 :1()\n", " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 6 0.000 0.000 0.005 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 2.312 2.312 {built-in method builtins.exec}\n", + " 7 0.000 0.000 0.005 0.001 fromnumeric.py:1886(any)\n", + " 1 0.000 0.000 2.050 2.050 {built-in method builtins.exec}\n", + " 3 0.000 0.000 1.107 0.369 fromnumeric.py:55(_wrapfunc)\n", " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 3 0.000 0.000 1.161 0.387 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 core.py:334(_nchunks)\n", - " 6 0.000 0.000 0.005 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 8 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 12 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", + " 8 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", + " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 7 0.000 0.000 0.005 0.001 {method 'any' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 
0.000 {built-in method numpy.core.multiarray.empty}\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 6 0.000 0.000 0.005 0.001 _methods.py:37(_any)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", + " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", + " 7 0.000 0.000 0.005 0.001 _methods.py:37(_any)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", " 12 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:519(is_coordinate_selection)\n", - " 1 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 1 0.000 0.000 1.161 1.161 fromnumeric.py:826(argsort)\n", - " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 core.py:338(nchunks)\n", + " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", " 6 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", + " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", + " 6 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", + " 1 0.000 0.000 1.107 1.107 fromnumeric.py:826(argsort)\n", + " 4 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", + " 4 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", + " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 6 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 3 0.000 
0.000 0.000 0.000 {built-in method builtins.all}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:537(replace_lists)\n", + " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", + " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 6 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:194(ensure_tuple)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:523()\n", - " 6 0.000 0.000 0.000 0.000 indexing.py:539()\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:551()\n", - " 1 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:734()\n", + " 3 0.000 0.000 0.000 0.000 indexing.py:557()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:732()\n", " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 core.py:213(shape)\n", - " 1 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 3 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 1 0.000 0.000 0.000 0.000 core.py:364(vindex)\n", - " 2 0.000 0.000 0.000 0.000 core.py:153(_refresh_metadata)\n", + " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", + " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", + " 6 0.000 0.000 0.000 0.000 indexing.py:545()\n", + " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", + " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", "\n", "\n" ] diff --git a/zarr/core.py b/zarr/core.py index c796fb3902..e5ca2d553c 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -724,6 +724,10 @@ def set_coordinate_selection(self, selection, value, fields=None): # setup indexer indexer = CoordinateIndexer(selection, self) + # handle value - need to flatten + if hasattr(value, 'shape') and 
len(value.shape) > 1: + value = value.reshape(-1) + self._set_selection(indexer, value, fields=fields) def set_mask_selection(self, selection, value, fields=None): @@ -758,7 +762,7 @@ def _set_basic_selection_zd(self, selection, value, fields=None): # check value if arr.shape != (): - raise ValueError('bad value; expected scalar, found %r' % value) + raise ValueError('expected scalar or 0-dimensional array, found %r' % value) # obtain key for chunk storage ckey = self._chunk_key((0,)) diff --git a/zarr/indexing.py b/zarr/indexing.py index 85d40743b4..51d1afad5d 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -222,8 +222,8 @@ def __init__(self, selection, array): dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - raise IndexError('bad selection type; expected integer or contiguous slice, ' - 'got {!r}'.format(dim_sel)) + raise IndexError('unsupported selection type; expected integer or contiguous ' + 'slice, got {!r}'.format(dim_sel)) dim_indexers.append(dim_indexer) @@ -375,15 +375,18 @@ def slice_to_range(s, l): def ix_(selection, shape): - """Convert an orthogonal selection to a numpy advanced (fancy) selection, with support for - slices and single ints.""" + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ + but with support for slices and single ints.""" - # replace slice and int as these are not supported by numpy ix_() + selection = ensure_tuple(selection) + + # replace slice and int as these are not supported by numpy.ix_ selection = [slice_to_range(dim_sel, dim_len) if isinstance(dim_sel, slice) else [dim_sel] if is_integer(dim_sel) else dim_sel for dim_sel, dim_len in zip(selection, shape)] + # now get numpy to convert to a coordinate selection selection = np.ix_(*selection) return selection @@ -391,6 +394,7 @@ def ix_(selection, shape): def oindex(a, selection): """Implementation of orthogonal indexing with slices and ints.""" + selection = ensure_tuple(selection) drop_axes = tuple([i for 
i, s in enumerate(selection) if is_integer(s)]) selection = ix_(selection, a.shape) result = a[selection] @@ -402,7 +406,7 @@ def oindex(a, selection): def oindex_set(a, selection, value): drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) selection = ix_(selection, a.shape) - if drop_axes: + if not np.isscalar(value) and drop_axes: value_selection = [slice(None)] * len(a.shape) for i in drop_axes: value_selection[i] = np.newaxis @@ -460,7 +464,8 @@ def __init__(self, selection, array): dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - raise IndexError('bad selection type') + # TODO improve and refactor error messages + raise IndexError('unsupported selection type') dim_indexers.append(dim_indexer) @@ -592,9 +597,13 @@ def __init__(self, selection, array): dims=array._cdata_shape) # group points by chunk - sel_sort = np.argsort(chunks_raveled_indices) - chunks_raveled_indices = chunks_raveled_indices[sel_sort] - selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + if np.any(np.diff(chunks_raveled_indices) < 0): + # optimisation, only sort if needed + sel_sort = np.argsort(chunks_raveled_indices) + # chunks_raveled_indices = chunks_raveled_indices[sel_sort] + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + else: + sel_sort = None # store atrributes self.selection = selection @@ -623,7 +632,10 @@ def __iter__(self): else: start = self.chunk_nitems_cumsum[chunk_rix - 1] stop = self.chunk_nitems_cumsum[chunk_rix] - out_selection = self.sel_sort[start:stop] + if self.sel_sort is None: + out_selection = slice(start, stop) + else: + out_selection = self.sel_sort[start:stop] chunk_offsets = tuple( dim_chunk_ix * dim_chunk_len @@ -697,9 +709,9 @@ def check_fields(fields, dtype): else: # multiple field selection out_dtype = np.dtype([(f, dtype[f]) for f in fields]) - except KeyError: + except KeyError as e: # TODO better error message - raise IndexError('bad field selection') + raise IndexError('field 
not found: {!s}'.format(e)) else: return out_dtype else: diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 56a94ebb1d..5a9d8aee34 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -451,21 +451,18 @@ def test_get_orthogonal_selection_3d_int(): def _test_set_orthogonal_selection_1d_common(v, a, z, ix): - # setup expectation - a[:] = 0 - a[ix] = v[ix] - # long-form API - z[:] = 0 - z.set_orthogonal_selection(ix, v[ix]) - assert_array_equal(a, z[:]) - # short-form API - z[:] = 0 - z.oindex[ix] = v[ix] - assert_array_equal(a, z[:]) - # # also available via __setitem__ for 1d arrays - # z[:] = 0 - # z[ix] = v[ix] - # assert_array_equal(a, z[:]) + for value in 42, oindex(v, ix): + # setup expectation + a[:] = 0 + a[ix] = value + # long-form API + z[:] = 0 + z.set_orthogonal_selection(ix, value) + assert_array_equal(a, z[:]) + # short-form API + z[:] = 0 + z.oindex[ix] = value + assert_array_equal(a, z[:]) def test_set_orthogonal_selection_1d_bool(): @@ -513,18 +510,18 @@ def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): (42, ix1), ) for selection in selections: - # setup expectation - a[:] = 0 - value = oindex(v, selection) - oindex_set(a, selection, value) - # long-form API - z[:] = 0 - z.set_orthogonal_selection(selection, value) - assert_array_equal(a, z[:]) - # short-form API - z[:] = 0 - z.oindex[selection] = value - assert_array_equal(a, z[:]) + for value in 42, oindex(v, selection): + # setup expectation + a[:] = 0 + oindex_set(a, selection, value) + # long-form API + z[:] = 0 + z.set_orthogonal_selection(selection, value) + assert_array_equal(a, z[:]) + # short-form API + z[:] = 0 + z.oindex[selection] = value + assert_array_equal(a, z[:]) def test_set_orthogonal_selection_2d_bool(): @@ -584,18 +581,18 @@ def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): (ix0, ix1, 4), ) for selection in selections: - # setup expectation - a[:] = 0 - value = oindex(v, selection) - 
oindex_set(a, selection, value) - # long-form API - z[:] = 0 - z.set_orthogonal_selection(selection, value) - assert_array_equal(a, z[:]) - # short-form API - z[:] = 0 - z.oindex[selection] = value - assert_array_equal(a, z[:]) + for value in 42, oindex(v, selection): + # setup expectation + a[:] = 0 + oindex_set(a, selection, value) + # long-form API + z[:] = 0 + z.set_orthogonal_selection(selection, value) + assert_array_equal(a, z[:]) + # short-form API + z[:] = 0 + z.oindex[selection] = value + assert_array_equal(a, z[:]) def test_set_orthogonal_selection_3d_bool(): @@ -793,6 +790,21 @@ def test_set_coordinate_selection_1d_int(): z.vindex[ix] = v[ix] assert_array_equal(a, z[:]) + # multi-dimensional selection + ix = np.array([[2, 4], [6, 8]]) + for value in 42, v[ix]: + # setup expectation + a[:] = 0 + a[ix] = value + # test long-form API + z[:] = 0 + z.set_coordinate_selection(ix, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.vindex[ix] = value + assert_array_equal(a, z[:]) + def test_set_coordinate_selection_2d_int(): @@ -828,6 +840,25 @@ def test_set_coordinate_selection_2d_int(): z.vindex[selection] = v[selection] assert_array_equal(a, z[:]) + # multi-dimensional selection + ix0 = np.array([[1, 2, 3], + [4, 5, 6]]) + ix1 = np.array([[1, 3, 2], + [2, 0, 5]]) + + for value in 42, v[ix0, ix1]: + # setup expectation + a[:] = 0 + a[ix0, ix1] = value + # test long-form API + z[:] = 0 + z.set_coordinate_selection((ix0, ix1), value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.vindex[ix0, ix1] = value + assert_array_equal(a, z[:]) + # noinspection PyStatementEffect def test_get_mask_selection_1d(): From 92a0b5c1d2370fb254c28ae9a1b281ae3af36a01 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 7 Nov 2017 13:42:46 +0000 Subject: [PATCH 42/67] add fields selection examples --- notebooks/advanced_indexing.ipynb | 302 +++++++++++++++++++++++++++++- zarr/core.py | 12 +- 2 files changed, 307 insertions(+), 7 
deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index 4dbbbd50ee..e79e22ac34 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -15,7 +15,7 @@ { "data": { "text/plain": [ - "'2.1.5.dev113'" + "'2.1.5.dev118+dirty'" ] }, "execution_count": 1, @@ -967,6 +967,306 @@ "za[:]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Selecting fields from arrays with a structured dtype\n", + "\n", + "All ``get/set_selection_...()`` methods support a ``fields`` argument which allows retrieving/replacing data for a specific field or fields. Also h5py-like API is supported where fields can be provided within ``__getitem__``, ``.oindex[]`` and ``.vindex[]``." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([(b'aaa', 1, 4.2), (b'bbb', 2, 8.4), (b'ccc', 3, 12.6)],\n", + " dtype=[('foo', 'S3'), ('bar', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices" + ] + } + ], + "source": [ + "a['foo', 'baz']" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([(b'aaa', 4.2), (b'bbb', 8.4), (b'ccc', 12.6)],\n", + " dtype=[('foo', 'S3'), ('baz', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + 
"\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 471\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 472\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 474\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 485\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 526\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 527\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m raise IndexError('unsupported selection type; expected integer or contiguous '\n\u001b[0;32m--> 226\u001b[0;31m 'slice, got {!r}'.format(dim_sel))\n\u001b[0m\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: unsupported selection type; expected integer or contiguous slice, got ['foo', 'baz']" + ] + } + ], + "source": [ + "za[['foo', 'baz']]" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/zarr/core.py b/zarr/core.py index e5ca2d553c..a0a8695251 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,16 +8,16 @@ import numpy as np -from zarr.util import is_total_slice, human_readable_size, normalize_resize_args, \ - normalize_storage_path, normalize_shape, normalize_chunks, InfoReporter +from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, + normalize_storage_path, normalize_shape, normalize_chunks, InfoReporter) from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes from zarr.errors import PermissionError, err_read_only, err_array_not_found from zarr.compat import reduce from zarr.codecs import AsType, get_codec -from zarr.indexing import OIndex, OrthogonalIndexer, BasicIndexer, VIndex, 
CoordinateIndexer, \ - MaskIndexer, check_fields, pop_fields, ensure_tuple +from zarr.indexing import (OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer, + MaskIndexer, check_fields, pop_fields, ensure_tuple) def is_scalar(value, dtype): @@ -1130,8 +1130,8 @@ def bytestr(n): return items def __getstate__(self): - return self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, \ - self._cache_metadata + return (self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, + self._cache_metadata) def __setstate__(self, state): self.__init__(*state) From 8655a7788dfbe3a7d498d9ddbebc3e2609f815d0 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 8 Nov 2017 01:57:16 +0000 Subject: [PATCH 43/67] improve test coverage --- zarr/core.py | 125 +++++++++++++++++++----------- zarr/indexing.py | 17 ++++ zarr/tests/test_core.py | 54 +++++++++++++ zarr/tests/test_indexing.py | 150 ++++++++++++++++++++++++++++-------- zarr/util.py | 9 +++ 5 files changed, 276 insertions(+), 79 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index a0a8695251..681da4391e 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -9,7 +9,8 @@ from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, - normalize_storage_path, normalize_shape, normalize_chunks, InfoReporter) + normalize_storage_path, normalize_shape, normalize_chunks, InfoReporter, + check_array_shape) from zarr.storage import array_meta_key, attrs_key, listdir, getsize from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes @@ -17,17 +18,11 @@ from zarr.compat import reduce from zarr.codecs import AsType, get_codec from zarr.indexing import (OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer, - MaskIndexer, check_fields, pop_fields, ensure_tuple) - - -def is_scalar(value, dtype): - if np.isscalar(value): - return True - if isinstance(value, tuple) and dtype.names and len(value) == 
len(dtype.names): - return True - return False + MaskIndexer, check_fields, pop_fields, ensure_tuple, is_scalar, + is_contiguous_selection) +# noinspection PyUnresolvedReferences class Array(object): """Instantiate an array from an initialized store. @@ -76,6 +71,8 @@ class Array(object): nchunks_initialized is_view info + vindex + oindex Methods ------- @@ -85,6 +82,14 @@ class Array(object): append view astype + get_basic_selection + set_basic_selection + get_mask_selection + set_mask_selection + get_orthogonal_selection + set_orthogonal_selection + get_coordinate_selection + set_coordinate_selection """ @@ -483,6 +488,9 @@ def get_basic_selection(self, selection, out=None, fields=None): if not self._cache_metadata: self._load_metadata() + # check args + check_fields(fields, self._dtype) + # handle zero-dimensional arrays if self._shape == (): return self._get_basic_selection_zd(selection=selection, out=out, fields=fields) @@ -504,25 +512,23 @@ def _get_basic_selection_zd(self, selection, out=None, fields=None): except KeyError: # chunk not initialized + chunk = np.zeros((), dtype=self._dtype) if self._fill_value is not None: - chunk = np.empty((), dtype=self._dtype) chunk.fill(self._fill_value) - else: - chunk = np.zeros((), dtype=self._dtype) else: chunk = self._decode_chunk(cdata) + # handle fields + if fields: + chunk = chunk[fields] + # handle selection of the scalar value via empty tuple if out is None: out = chunk[selection] else: out[selection] = chunk[selection] - # handle fields - if fields: - out = out[fields] - return out def _get_basic_selection_nd(self, selection, out=None, fields=None): @@ -540,6 +546,9 @@ def get_orthogonal_selection(self, selection, out=None, fields=None): if not self._cache_metadata: self._load_metadata() + # check args + check_fields(fields, self._dtype) + # setup indexer indexer = OrthogonalIndexer(selection, self) @@ -552,6 +561,9 @@ def get_coordinate_selection(self, selection, out=None, fields=None): if not 
self._cache_metadata: self._load_metadata() + # check args + check_fields(fields, self._dtype) + # setup indexer indexer = CoordinateIndexer(selection, self) @@ -573,6 +585,9 @@ def get_mask_selection(self, selection, out=None, fields=None): if not self._cache_metadata: self._load_metadata() + # check args + check_fields(fields, self._dtype) + # setup indexer indexer = MaskIndexer(selection, self) @@ -597,11 +612,7 @@ def _get_selection(self, indexer, out=None, fields=None): if out is None: out = np.empty(out_shape, dtype=out_dtype, order=self._order) else: - # validate 'out' parameter - if not hasattr(out, 'shape'): - raise TypeError('out must be an array-like object') - if out.shape != out_shape: - raise ValueError('out has wrong shape for selection') + check_array_shape('out', out, out_shape) # iterate over chunks for chunk_coords, chunk_selection, out_selection in indexer: @@ -725,6 +736,8 @@ def set_coordinate_selection(self, selection, value, fields=None): indexer = CoordinateIndexer(selection, self) # handle value - need to flatten + if not is_scalar(value, self._dtype): + value = np.asanyarray(value) if hasattr(value, 'shape') and len(value.shape) > 1: value = value.reshape(-1) @@ -749,26 +762,42 @@ def set_mask_selection(self, selection, value, fields=None): def _set_basic_selection_zd(self, selection, value, fields=None): # special case __setitem__ for zero-dimensional array - if fields: - raise IndexError('fields not supported for 0d array') - - # check item is valid + # check selection is valid selection = ensure_tuple(selection) - if selection not in ((), (Ellipsis,)): + if selection not in ((), (...,)): raise IndexError('too many indices for array') - # setup data to store - arr = np.asarray(value, dtype=self._dtype) - - # check value - if arr.shape != (): - raise ValueError('expected scalar or 0-dimensional array, found %r' % value) + # check fields + check_fields(fields, self._dtype) + if fields and isinstance(fields, list): + raise 
ValueError('multi-field assignment is not supported') - # obtain key for chunk storage + # obtain key for chunk ckey = self._chunk_key((0,)) + # setup chunk + try: + # obtain compressed data for chunk + cdata = self.chunk_store[ckey] + + except KeyError: + # chunk not initialized + chunk = np.zeros((), dtype=self._dtype) + if self._fill_value is not None: + chunk.fill(self._fill_value) + + else: + # decode chunk + chunk = self._decode_chunk(cdata).copy() + + # set value + if fields: + chunk[fields][selection] = value + else: + chunk[selection] = value + # encode and store - cdata = self._encode_chunk(arr) + cdata = self._encode_chunk(chunk) self.chunk_store[ckey] = cdata def _set_basic_selection_nd(self, selection, value, fields=None): @@ -801,10 +830,8 @@ def _set_selection(self, indexer, value, fields=None): pass else: if not hasattr(value, 'shape'): - value = np.asarray(value) - if value.shape != sel_shape: - raise ValueError('value has wrong shape for selection; expected {}, got {}' - .format(sel_shape, value.shape)) + value = np.asanyarray(value) + check_array_shape('value', value, sel_shape) # iterate over chunks in range for chunk_coords, chunk_selection, out_selection in indexer: @@ -847,14 +874,14 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop assert len(chunk_coords) == len(self._cdata_shape) - try: + # obtain key for chunk + ckey = self._chunk_key(chunk_coords) + try: # obtain compressed data for chunk - ckey = self._chunk_key(chunk_coords) cdata = self.chunk_store[ckey] except KeyError: - # chunk not initialized if self._fill_value is not None: out[out_selection] = self._fill_value @@ -863,15 +890,19 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop if (isinstance(out, np.ndarray) and not fields and - isinstance(out_selection, slice) and + is_contiguous_selection(out_selection) and is_total_slice(chunk_selection, self._chunks) and not self._filters): dest = out[out_selection] - 
contiguous = ((self._order == 'C' and dest.flags.c_contiguous) or - (self._order == 'F' and dest.flags.f_contiguous)) + write_direct = ( + dest.flags.writeable and ( + (self._order == 'C' and dest.flags.c_contiguous) or + (self._order == 'F' and dest.flags.f_contiguous) + ) + ) - if contiguous: + if write_direct: # optimization: we want the whole chunk, and the destination is # contiguous, so we can decompress directly from the chunk @@ -1256,7 +1287,7 @@ def append(self, data, axis=0): def _append_nosync(self, data, axis=0): # ensure data is array-like - if not hasattr(data, 'shape') or not hasattr(data, 'dtype'): + if not hasattr(data, 'shape'): data = np.asanyarray(data) # ensure shapes are compatible for non-append dimensions diff --git a/zarr/indexing.py b/zarr/indexing.py index 51d1afad5d..677f9bb972 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -20,6 +20,14 @@ def is_bool_array(x): return hasattr(x, 'dtype') and x.dtype == bool +def is_scalar(value, dtype): + if np.isscalar(value): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False + + def normalize_integer_selection(dim_sel, dim_len): # normalize type to int @@ -199,6 +207,14 @@ def is_contiguous_slice(s): return isinstance(s, slice) and (s.step is None or s.step == 1) +def is_contiguous_selection(selection): + selection = ensure_tuple(selection) + return all([ + (is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) + for s in selection + ]) + + # noinspection PyProtectedMember class BasicIndexer(object): @@ -407,6 +423,7 @@ def oindex_set(a, selection, value): drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) selection = ix_(selection, a.shape) if not np.isscalar(value) and drop_axes: + value = np.asanyarray(value) value_selection = [slice(None)] * len(a.shape) for i in drop_axes: value_selection[i] = np.newaxis diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 
61965f42d7..36b9605596 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -194,6 +194,48 @@ def test_array_1d_set_scalar(self): z[:] = value assert_array_equal(a, z[:]) + def test_array_1d_selections(self): + # light test here, full tests in test_indexing + + # setup + a = np.arange(1050) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + # get + assert_array_equal(a[50:150], z.get_orthogonal_selection(slice(50, 150))) + assert_array_equal(a[50:150], z.oindex[50: 150]) + ix = [99, 100, 101] + bix = np.zeros_like(a, dtype=bool) + bix[ix] = True + assert_array_equal(a[ix], z.get_orthogonal_selection(ix)) + assert_array_equal(a[ix], z.oindex[ix]) + assert_array_equal(a[ix], z.get_coordinate_selection(ix)) + assert_array_equal(a[ix], z.vindex[ix]) + assert_array_equal(a[bix], z.get_mask_selection(bix)) + assert_array_equal(a[bix], z.oindex[bix]) + assert_array_equal(a[bix], z.vindex[bix]) + + # set + z.set_orthogonal_selection(slice(50, 150), 1) + assert_array_equal(1, z[50:150]) + z.oindex[50:150] = 2 + assert_array_equal(2, z[50:150]) + z.set_orthogonal_selection(ix, 3) + assert_array_equal(3, z.get_coordinate_selection(ix)) + z.oindex[ix] = 4 + assert_array_equal(4, z.oindex[ix]) + z.set_coordinate_selection(ix, 5) + assert_array_equal(5, z.get_coordinate_selection(ix)) + z.vindex[ix] = 6 + assert_array_equal(6, z.vindex[ix]) + z.set_mask_selection(bix, 7) + assert_array_equal(7, z.get_mask_selection(bix)) + z.vindex[bix] = 8 + assert_array_equal(8, z.vindex[bix]) + z.oindex[bix] = 9 + assert_array_equal(9, z.oindex[bix]) + # noinspection PyStatementEffect def test_array_2d(self): a = np.arange(10000).reshape((1000, 10)) @@ -558,6 +600,18 @@ def test_read_only(self): z.resize(2000) with assert_raises(PermissionError): z.append(np.arange(1000)) + with assert_raises(PermissionError): + z.set_basic_selection(..., 42) + with assert_raises(PermissionError): + z.set_orthogonal_selection([0, 1, 2], 42) + with 
assert_raises(PermissionError): + z.oindex[[0, 1, 2]] = 42 + with assert_raises(PermissionError): + z.set_coordinate_selection([0, 1, 2], 42) + with assert_raises(PermissionError): + z.vindex[[0, 1, 2]] = 42 + with assert_raises(PermissionError): + z.set_mask_selection(np.ones(z.shape, dtype=bool), 42) def test_pickle(self): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 5a9d8aee34..6b2e34a107 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -29,12 +29,12 @@ def test_replace_ellipsis(): eq((0,), replace_ellipsis(0, (100,))) # 1D - eq((slice(None),), replace_ellipsis(Ellipsis, (100,))) + eq((slice(None),), replace_ellipsis(..., (100,))) eq((slice(None),), replace_ellipsis(slice(None), (100,))) eq((slice(None, 100),), replace_ellipsis(slice(None, 100), (100,))) eq((slice(0, None),), replace_ellipsis(slice(0, None), (100,))) - eq((slice(None),), replace_ellipsis((slice(None), Ellipsis), (100,))) - eq((slice(None),), replace_ellipsis((Ellipsis, slice(None)), (100,))) + eq((slice(None),), replace_ellipsis((slice(None), ...), (100,))) + eq((slice(None),), replace_ellipsis((..., slice(None)), (100,))) # 2D, single item eq((0, 0), replace_ellipsis((0, 0), (100, 100))) @@ -47,21 +47,60 @@ def test_replace_ellipsis(): # 2D slice eq((slice(None), slice(None)), - replace_ellipsis(Ellipsis, (100, 100))) + replace_ellipsis(..., (100, 100))) eq((slice(None), slice(None)), replace_ellipsis(slice(None), (100, 100))) eq((slice(None), slice(None)), replace_ellipsis((slice(None), slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((Ellipsis, slice(None)), (100, 100))) + replace_ellipsis((..., slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((slice(None), Ellipsis), (100, 100))) + replace_ellipsis((slice(None), ...), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((slice(None), Ellipsis, slice(None)), (100, 100))) + replace_ellipsis((slice(None), ..., 
slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((Ellipsis, slice(None), slice(None)), (100, 100))) + replace_ellipsis((..., slice(None), slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((slice(None), slice(None), Ellipsis), (100, 100))) + replace_ellipsis((slice(None), slice(None), ...), (100, 100))) + + +def test_get_basic_selection_0d(): + + # setup + a = np.array(42) + z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) + z[...] = a + + assert_array_equal(a, z.get_basic_selection(...)) + assert_array_equal(a, z[...]) + eq(42, z.get_basic_selection(())) + eq(42, z[()]) + + # test out param + b = np.zeros_like(a) + z.get_basic_selection(..., out=b) + assert_array_equal(a, b) + + # test structured array + value = (b'aaa', 1, 4.2) + a = np.array(value, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) + z[()] = value + assert_array_equal(a, z.get_basic_selection(...)) + assert_array_equal(a, z[...]) + eq(a[()], z.get_basic_selection(())) + eq(a[()], z[()]) + eq(b'aaa', z.get_basic_selection((), fields='foo')) + eq(b'aaa', z['foo']) + eq(a[['foo', 'bar']], z.get_basic_selection((), fields=['foo', 'bar'])) + eq(a[['foo', 'bar']], z['foo', 'bar']) + # test out param + b = np.zeros_like(a) + z.get_basic_selection(..., out=b) + assert_array_equal(a, b) + c = np.zeros_like(a[['foo', 'bar']]) + z.get_basic_selection(..., out=c, fields=['foo', 'bar']) + assert_array_equal(a[['foo', 'bar']], c) # noinspection PyStatementEffect @@ -86,9 +125,9 @@ def test_get_basic_selection_1d(): slice(0, 0), # empty result slice(-1, 0), # empty result # total selections - Ellipsis, + ..., (), - (Ellipsis, slice(None)), + (..., slice(None)), ] for selection in selections: @@ -147,10 +186,10 @@ def test_get_basic_selection_2d(): (slice(250, 350), slice(-50, 50)), # total selections (slice(None), slice(None)), - Ellipsis, + ..., (), - (Ellipsis, slice(None)), 
- (Ellipsis, slice(None), slice(None)), + (..., slice(None)), + (..., slice(None), slice(None)), ] for selection in selections: @@ -176,6 +215,53 @@ def test_get_basic_selection_2d(): z[:, :, :] # too many indices +def test_set_basic_selection_0d(): + + # setup + v = np.array(42) + a = np.zeros_like(v) + z = zarr.zeros_like(v) + assert_array_equal(a, z) + + # tests + z.set_basic_selection(..., v) + assert_array_equal(v, z) + z[...] = 0 + assert_array_equal(a, z) + z[...] = v + assert_array_equal(v, z) + + # test structured array + value = (b'aaa', 1, 4.2) + v = np.array(value, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + a = np.zeros_like(v) + z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) + + # tests + z.set_basic_selection(..., v) + assert_array_equal(v, z) + z.set_basic_selection(..., a) + assert_array_equal(a, z) + z[...] = v + assert_array_equal(v, z) + z[...] = a + assert_array_equal(a, z) + # with fields + z.set_basic_selection(..., v['foo'], fields='foo') + eq(v['foo'], z['foo']) + eq(a['bar'], z['bar']) + eq(a['baz'], z['baz']) + z['bar'] = v['bar'] + eq(v['foo'], z['foo']) + eq(v['bar'], z['bar']) + eq(a['baz'], z['baz']) + # multiple field assignment not supported + with assert_raises(ValueError): + z.set_basic_selection(..., v[['foo', 'bar']], fields=['foo', 'bar']) + with assert_raises(ValueError): + z[..., 'foo', 'bar'] = v[['foo', 'bar']] + + def _test_get_orthogonal_selection_1d_common(a, z, ix): expect = a[ix] actual = z.get_orthogonal_selection(ix) @@ -451,7 +537,7 @@ def test_get_orthogonal_selection_3d_int(): def _test_set_orthogonal_selection_1d_common(v, a, z, ix): - for value in 42, oindex(v, ix): + for value in 42, oindex(v, ix), oindex(v, ix).tolist(): # setup expectation a[:] = 0 a[ix] = value @@ -510,7 +596,7 @@ def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): (42, ix1), ) for selection in selections: - for value in 42, oindex(v, selection): + for value in 42, oindex(v, selection), oindex(v, 
selection).tolist(): # setup expectation a[:] = 0 oindex_set(a, selection, value) @@ -581,7 +667,7 @@ def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): (ix0, ix1, 4), ) for selection in selections: - for value in 42, oindex(v, selection): + for value in 42, oindex(v, selection), oindex(v, selection).tolist(): # setup expectation a[:] = 0 oindex_set(a, selection, value) @@ -687,7 +773,7 @@ def test_get_coordinate_selection_1d(): ix = slice(5, 15) # not supported z.get_coordinate_selection(ix) with assert_raises(IndexError): - ix = Ellipsis # not supported + ix = ... # not supported z.get_coordinate_selection(ix) @@ -761,10 +847,10 @@ def test_get_coordinate_selection_2d(): selection = [1, 2, 3], slice(5, 15) z.get_coordinate_selection(selection) with assert_raises(IndexError): - selection = Ellipsis, [1, 2, 3] + selection = ..., [1, 2, 3] z.get_coordinate_selection(selection) with assert_raises(IndexError): - selection = Ellipsis + selection = ... z.get_coordinate_selection(selection) @@ -792,7 +878,7 @@ def test_set_coordinate_selection_1d_int(): # multi-dimensional selection ix = np.array([[2, 4], [6, 8]]) - for value in 42, v[ix]: + for value in 42, v[ix], v[ix].tolist(): # setup expectation a[:] = 0 a[ix] = value @@ -846,7 +932,7 @@ def test_set_coordinate_selection_2d_int(): ix1 = np.array([[1, 3, 2], [2, 0, 5]]) - for value in 42, v[ix0, ix1]: + for value in 42, v[ix0, ix1], v[ix0, ix1].tolist(): # setup expectation a[:] = 0 a[ix0, ix1] = value @@ -1061,7 +1147,7 @@ def test_get_selections_with_fields(): # total selection expect = a[fields] - actual = z.get_basic_selection(Ellipsis, fields=fields) + actual = z.get_basic_selection(..., fields=fields) assert_array_equal(expect, actual) # alternative API if isinstance(fields, str): @@ -1152,13 +1238,13 @@ def test_set_selections_with_fields(): fields_fixture = [ 'foo', - # ['foo'], - # ['foo', 'bar'], - # ['foo', 'baz'], - # ['bar', 'baz'], - # ['foo', 'bar', 'baz'], - # ['bar', 'foo'], - # 
['baz', 'bar', 'foo'], + ['foo'], + ['foo', 'bar'], + ['foo', 'baz'], + ['bar', 'baz'], + ['foo', 'bar', 'baz'], + ['bar', 'foo'], + ['baz', 'bar', 'foo'], ] for fields in fields_fixture: @@ -1166,7 +1252,7 @@ def test_set_selections_with_fields(): # currently multi-field assignment is not supported in numpy, so we won't support it either if isinstance(fields, list): with assert_raises(ValueError): - z.set_basic_selection(Ellipsis, v[fields], fields=fields) + z.set_basic_selection(..., v[fields], fields=fields) with assert_raises(ValueError): z.set_orthogonal_selection([0, 2], v[fields], fields=fields) with assert_raises(ValueError): @@ -1182,7 +1268,7 @@ def test_set_selections_with_fields(): assert_array_equal(a, z[:]) a[fields] = v[fields] # total selection - z.set_basic_selection(Ellipsis, v[fields], fields=fields) + z.set_basic_selection(..., v[fields], fields=fields) assert_array_equal(a, z[:]) # basic selection with slice diff --git a/zarr/util.py b/zarr/util.py index a68f723f3f..5a274f0467 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -272,3 +272,12 @@ def __repr__(self): def _repr_html_(self): items = self.obj.info_items() return info_html_report(items) + + +def check_array_shape(param, array, shape): + if not hasattr(array, 'shape'): + raise TypeError('parameter {!r}: expected an array-like object, got {!r}' + .format(param, type(array))) + if array.shape != shape: + raise ValueError('parameter {!r}: expected array with shape {!r}, got {!r}' + .format(param, shape, array.shape)) From 7a57458d3d235f6a40a0b30330f4087845733cce Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 8 Nov 2017 11:38:59 +0000 Subject: [PATCH 44/67] optimise slice with step --- notebooks/advanced_indexing.ipynb | 1935 ++++++----------------------- zarr/indexing.py | 136 +- 2 files changed, 481 insertions(+), 1590 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index e79e22ac34..55fcd633ea 100644 --- 
a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -343,9 +343,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Slicing a 1D array with step > 1\n", + "### Slicing a 1D array with step <> 1\n", "\n", - "Slices with step > 1 are supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``. Internally these are converted to an integer array via ``np.arange``." + "Slices with step <> 1 are supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``. Internally these are converted to an integer array via ``np.arange``." ] }, { @@ -978,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -988,7 +988,7 @@ " dtype=[('foo', 'S3'), ('bar', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices" ] } @@ -1202,7 +1202,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1212,7 +1212,7 @@ " dtype=[('foo', 'S3'), ('baz', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - 
"\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 471\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 472\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 474\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 485\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 525\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 526\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 527\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m raise IndexError('unsupported selection type; expected integer or contiguous '\n\u001b[0;32m--> 226\u001b[0;31m 'slice, got {!r}'.format(dim_sel))\n\u001b[0m\u001b[1;32m 227\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 475\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 477\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 478\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 493\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 494\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m raise IndexError('unsupported selection type; expected integer or contiguous '\n\u001b[0;32m--> 242\u001b[0;31m 'slice, got {!r}'.format(dim_sel))\n\u001b[0m\u001b[1;32m 243\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: unsupported selection type; expected integer or contiguous slice, got ['foo', 'baz']" ] } @@ -1276,7 +1276,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -1285,7 +1285,7 @@ "800000000" ] }, - "execution_count": 41, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1297,15 +1297,15 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 392 ms, sys: 84 ms, total: 476 ms\n", - "Wall time: 124 ms\n" + "CPU times: user 428 ms, sys: 56 ms, total: 484 ms\n", + "Wall time: 128 ms\n" ] }, { @@ -1328,7 +1328,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 42, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1340,58 +1340,36 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 88 ms, sys: 52 ms, total: 140 ms\n", - "Wall time: 143 ms\n" + "121 ms ± 1.16 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c.copy()" + "%timeit c.copy()" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 408 ms, sys: 88 ms, total: 496 ms\n", - "Wall time: 213 ms\n" + "256 ms ± 7.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc[:]" + "%timeit zc[:]" ] }, { @@ -1403,16 +1381,16 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9997476" + "9995616" ] }, - "execution_count": 45, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1425,184 +1403,133 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 256 ms, sys: 4 ms, total: 260 ms\n", - "Wall time: 258 ms\n" + "243 ms ± 5.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 1, 11, 33, ..., 99999988, 99999989, 99999990])" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c[ix_dense_bool]" + "%timeit c[ix_dense_bool]" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 864 ms, sys: 108 ms, total: 972 ms\n", - "Wall time: 439 ms\n" + "426 ms ± 3.59 ms per loop (mean ± std. 
dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 1, 11, 33, ..., 99999988, 99999989, 99999990])" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[ix_dense_bool]" + "%timeit zc.oindex[ix_dense_bool]" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 808 ms, sys: 32 ms, total: 840 ms\n", - "Wall time: 564 ms\n" + "550 ms ± 13.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, + } + ], + "source": [ + "%timeit zc.vindex[ix_dense_bool]" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "import cProfile\n", + "import pstats\n", + "\n", + "def profile(statement, sort='time', restrictions=(7,)):\n", + " with tempfile.NamedTemporaryFile() as f:\n", + " cProfile.run(statement, filename=f.name)\n", + " pstats.Stats(f.name).sort_stats(sort).print_stats(*restrictions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ { - "data": { - "text/plain": [ - "array([ 1, 11, 33, ..., 99999988, 99999989, 99999990])" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Nov 8 11:19:16 2017 /tmp/tmp4t23nh90\n", + "\n", + " 83015 function calls in 0.469 seconds\n", + "\n", + " Ordered by: internal time\n", + " List reduced from 82 to 7 due to restriction <7>\n", + "\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1025 0.196 0.000 0.196 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.144 0.000 0.153 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.043 0.000 0.223 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1024 0.009 0.000 0.009 0.000 {built-in 
method numpy.core.multiarray.count_nonzero}\n", + " 1025 0.007 0.000 0.232 0.000 ../zarr/indexing.py:544(__iter__)\n", + " 1024 0.006 0.000 0.206 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", + " 2048 0.005 0.000 0.005 0.000 ../zarr/core.py:337()\n", + "\n", + "\n" + ] } ], "source": [ - "%time zc.vindex[ix_dense_bool]" + "profile('zc.oindex[ix_dense_bool]')" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 67653 function calls in 0.490 seconds\n", + "Wed Nov 8 11:14:55 2017 /tmp/tmpc6yiwbhy\n", + "\n", + " 83015 function calls in 0.486 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 82 to 5 due to restriction <5>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1025 0.201 0.000 0.201 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.158 0.000 0.167 0.000 core.py:997(_decode_chunk)\n", - " 1024 0.047 0.000 0.234 0.000 core.py:822(_chunk_getitem)\n", - " 1024 0.012 0.000 0.012 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1025 0.007 0.000 0.238 0.000 indexing.py:484(__iter__)\n", - " 1024 0.006 0.000 0.212 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.005 0.000 0.005 0.000 core.py:332()\n", - " 2048 0.005 0.000 0.005 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 7180 0.004 0.000 0.009 0.000 {built-in method builtins.isinstance}\n", - " 1 0.004 0.004 0.476 0.476 core.py:576(_get_selection)\n", - " 1024 0.004 0.000 0.004 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2049 0.003 0.000 0.005 0.000 abc.py:178(__instancecheck__)\n", - " 1024 0.003 0.000 0.006 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.003 0.000 0.009 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.221 0.000 indexing.py:377(ix_)\n", - " 1025 0.002 0.000 0.003 0.000 
indexing.py:275(__iter__)\n", - " 1024 0.002 0.000 0.007 0.000 core.py:327(_cdata_shape)\n", - " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1 0.002 0.002 0.014 0.014 indexing.py:248(__init__)\n", - " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.011 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.007 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.006 0.000 indexing.py:384()\n", - " 1024 0.001 0.000 0.004 0.000 indexing.py:501()\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1025 0.001 0.000 0.004 0.000 indexing.py:11(is_integer)\n", - " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 1024 0.000 0.000 0.012 0.000 numeric.py:380(count_nonzero)\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1 0.000 0.000 0.491 0.491 :1()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 
1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.491 0.491 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.014 0.014 indexing.py:420(__init__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.490 0.490 indexing.py:512(__getitem__)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 1 0.000 0.000 0.490 0.490 core.py:531(get_orthogonal_selection)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", + " 1025 0.195 0.000 0.195 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.149 0.000 0.158 0.000 
../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.044 0.000 0.229 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1024 0.017 0.000 0.017 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1025 0.007 0.000 0.233 0.000 ../zarr/indexing.py:544(__iter__)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.oindex[ix_dense_bool]', sort='time')" + "profile('zc.oindex[ix_dense_bool]')" ] }, { @@ -1614,109 +1541,35 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 117, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 43161 function calls in 0.575 seconds\n", + "Wed Nov 8 11:19:20 2017 /tmp/tmpxd6eg1gj\n", + "\n", + " 51354 function calls in 0.592 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 87 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2 0.215 0.108 0.215 0.108 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2 0.094 0.047 0.094 0.047 indexing.py:580()\n", - " 1024 0.093 0.000 0.098 0.000 core.py:997(_decode_chunk)\n", - " 1024 0.042 0.000 0.157 0.000 core.py:822(_chunk_getitem)\n", - " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 2 0.217 0.108 0.217 0.108 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.100 0.000 0.106 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2 0.093 0.046 0.093 0.046 ../zarr/indexing.py:640()\n", + " 1024 0.044 0.000 0.171 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.021 0.021 0.179 0.179 indexing.py:553(__init__)\n", - " 2048 0.011 0.000 0.011 0.000 indexing.py:645()\n", - " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", - " 1025 0.008 0.000 0.021 0.000 indexing.py:624(__iter__)\n", - " 2054 0.003 0.000 0.003 
0.000 core.py:332()\n", - " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.002 0.002 0.180 0.180 core.py:576(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1026 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1024 0.002 0.000 0.003 0.000 util.py:113(is_total_slice)\n", - " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1027 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", - " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 6155 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.008 0.000 core.py:994(_chunk_key)\n", - " 2048 0.001 0.000 0.001 0.000 util.py:128()\n", - " 1 0.001 0.001 0.395 0.395 indexing.py:655(__init__)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1028 0.000 0.000 0.001 0.000 {built-in method builtins.all}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.575 0.575 indexing.py:678(__getitem__)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1 0.000 0.000 0.575 0.575 :1()\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 
{built-in method builtins.id}\n", - " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.575 0.575 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 3 0.000 0.000 0.215 0.072 fromnumeric.py:55(_wrapfunc)\n", - " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.575 0.575 core.py:564(get_mask_selection)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", - " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", - " 2 0.000 0.000 0.215 0.108 fromnumeric.py:1487(nonzero)\n", - " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:535(is_mask_selection)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", - " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 4 0.000 0.000 0.000 0.000 
{built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 6 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", - " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.022 0.022 0.179 0.179 ../zarr/indexing.py:613(__init__)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.vindex[ix_dense_bool]', sort='time')" + "profile('zc.vindex[ix_dense_bool]')" ] }, { @@ -1735,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -1744,7 +1597,7 @@ "10000000" ] }, - "execution_count": 51, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -1758,585 +1611,236 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 64 ms, sys: 4 ms, total: 68 ms\n", - "Wall time: 64.4 ms\n" + "60.7 ms ± 599 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 33, 42, ..., 99999987, 99999994, 99999999])" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c[ix_dense_int_sorted]" + "%timeit c[ix_dense_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 560 ms, sys: 52 ms, total: 612 ms\n", - "Wall time: 354 ms\n" + "361 ms ± 22.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 33, 42, ..., 99999987, 99999994, 99999999])" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[ix_dense_int_sorted]" + "%timeit zc.oindex[ix_dense_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 588 ms, sys: 60 ms, total: 648 ms\n", - "Wall time: 367 ms\n" + "349 ms ± 3.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 33, 42, ..., 99999987, 99999994, 99999999])" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.vindex[ix_dense_int_sorted]" + "%timeit zc.vindex[ix_dense_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 124 ms, sys: 0 ns, total: 124 ms\n", - "Wall time: 123 ms\n" + "128 ms ± 555 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([23268249, 15578653, 7864, ..., 68558269, 7682216, 66838288])" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c[ix_dense_int]" + "%timeit c[ix_dense_int]" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.94 s, sys: 68 ms, total: 2.01 s\n", - "Wall time: 1.71 s\n" + "1.72 s ± 35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([23268249, 15578653, 7864, ..., 68558269, 7682216, 66838288])" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[ix_dense_int]" + "%timeit zc.oindex[ix_dense_int]" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.99 s, sys: 76 ms, total: 2.06 s\n", - "Wall time: 1.71 s\n" + "1.69 s ± 12.3 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([23268249, 15578653, 7864, ..., 68558269, 7682216, 66838288])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.vindex[ix_dense_int]" + "%timeit zc.vindex[ix_dense_int]" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 64607 function calls in 0.380 seconds\n", + "Wed Nov 8 11:19:28 2017 /tmp/tmpk_0eq5a2\n", + "\n", + " 79967 function calls in 0.410 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.135 0.135 0.172 0.172 indexing.py:304(__init__)\n", - " 1024 0.090 0.000 0.095 0.000 core.py:997(_decode_chunk)\n", - " 1024 0.043 0.000 0.152 0.000 core.py:822(_chunk_getitem)\n", - " 1025 0.026 0.000 0.027 0.000 indexing.py:351(__iter__)\n", + " 1 0.146 0.146 0.188 0.188 ../zarr/indexing.py:342(__init__)\n", + " 1024 0.093 0.000 0.098 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.045 0.000 0.164 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1025 0.025 0.000 0.026 0.000 ../zarr/indexing.py:404(__iter__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.011 0.011 0.011 0.011 function_base.py:1848(diff)\n", - " 1025 0.006 0.000 0.052 0.000 indexing.py:484(__iter__)\n", - " 1024 0.004 0.000 0.007 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", - " 1 0.003 0.003 0.381 0.381 core.py:531(get_orthogonal_selection)\n", - " 2048 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 7180 0.003 0.000 0.006 0.000 {built-in method builtins.isinstance}\n", - " 2049 0.002 0.000 0.004 0.000 
abc.py:178(__instancecheck__)\n", - " 1 0.002 0.002 0.206 0.206 core.py:576(_get_selection)\n", - " 1024 0.002 0.000 0.004 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.002 0.000 0.014 0.000 indexing.py:377(ix_)\n", - " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", - " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", - " 1024 0.001 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.008 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.004 0.000 indexing.py:384()\n", - " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 1030 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 0.172 0.172 indexing.py:420(__init__)\n", - " 1025 0.000 0.000 0.003 0.000 indexing.py:11(is_integer)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:489()\n", - " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 
core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1 0.000 0.000 0.381 0.381 :1()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.381 0.381 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.381 0.381 indexing.py:512(__getitem__)\n", - " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 
core.py:367(oindex)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", + " 1 0.011 0.011 0.011 0.011 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/function_base.py:1848(diff)\n", + " 1025 0.006 0.000 0.052 0.000 ../zarr/indexing.py:544(__iter__)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.oindex[ix_dense_int_sorted]', sort='time')" + "profile('zc.oindex[ix_dense_int_sorted]')" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 119, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 43143 function calls in 0.372 seconds\n", + "Wed Nov 8 11:19:31 2017 /tmp/tmpzhzjc9l7\n", + "\n", + " 51336 function calls in 0.384 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 84 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2 0.104 0.052 0.104 0.052 indexing.py:580()\n", - " 1024 0.090 0.000 0.095 0.000 core.py:997(_decode_chunk)\n", - " 1024 0.042 0.000 0.154 0.000 core.py:822(_chunk_getitem)\n", - " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.026 0.026 0.194 0.194 indexing.py:553(__init__)\n", + " 2 0.107 0.054 0.107 0.054 ../zarr/indexing.py:640()\n", + " 1024 0.090 0.000 0.095 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.043 0.000 0.160 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.027 0.027 0.199 0.199 ../zarr/indexing.py:613(__init__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 2048 0.011 0.000 0.011 0.000 indexing.py:645()\n", - " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", - " 1025 0.008 0.000 0.021 0.000 
indexing.py:624(__iter__)\n", - " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", - " 4 0.003 0.001 0.003 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.002 0.002 0.177 0.177 core.py:576(_get_selection)\n", - " 1027 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.002 0.000 0.007 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.003 0.000 util.py:113(is_total_slice)\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1027 0.001 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", - " 6153 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.001 0.000 0.008 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.005 0.000 numeric.py:1905(array_str)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", - " 2048 0.001 0.000 0.001 0.000 util.py:128()\n", - " 1 0.001 0.001 0.372 0.372 core.py:543(get_coordinate_selection)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2054 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1028 0.000 0.000 0.001 0.000 {built-in method builtins.all}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.372 0.372 :1()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 
0.000 {built-in method builtins.id}\n", - " 4 0.000 0.000 0.003 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 1 0.000 0.000 0.372 0.372 {built-in method builtins.exec}\n", - " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 4 0.000 0.000 0.003 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.372 0.372 indexing.py:678(__getitem__)\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.003 0.001 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", - " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 4 0.000 0.000 0.000 0.000 
stride_tricks.py:251()\n", - " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", - " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", - " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 2048 0.011 0.000 0.011 0.000 ../zarr/indexing.py:705()\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.vindex[ix_dense_int_sorted]', sort='time')" + "profile('zc.vindex[ix_dense_int_sorted]')" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 120, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 85095 function calls in 1.770 seconds\n", + "Wed Nov 8 11:19:35 2017 /tmp/tmp52ytv9go\n", + "\n", + " 98407 function calls in 1.780 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 91 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.123 1.123 1.123 1.123 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1 0.134 0.134 1.417 1.417 indexing.py:304(__init__)\n", - " 1024 0.134 0.000 0.273 0.000 core.py:822(_chunk_getitem)\n", - " 1 0.122 0.122 0.122 0.122 {method 'take' of 'numpy.ndarray' objects}\n", - " 1024 0.117 0.000 0.123 0.000 core.py:997(_decode_chunk)\n", - " 1025 0.032 0.000 0.033 0.000 indexing.py:351(__iter__)\n", - " 1 0.025 0.025 0.025 0.025 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.010 0.010 0.010 0.010 
function_base.py:1848(diff)\n", - " 1025 0.007 0.000 0.073 0.000 indexing.py:484(__iter__)\n", - " 2048 0.007 0.000 0.014 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.005 0.000 0.005 0.000 core.py:332()\n", - " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 10252 0.004 0.000 0.010 0.000 {built-in method builtins.isinstance}\n", - " 3073 0.004 0.000 0.006 0.000 abc.py:178(__instancecheck__)\n", - " 2048 0.004 0.000 0.027 0.000 indexing.py:377(ix_)\n", - " 1 0.003 0.003 0.350 0.350 core.py:576(_get_selection)\n", - " 1 0.003 0.003 1.770 1.770 core.py:531(get_orthogonal_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 4 0.002 0.001 0.002 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 6146 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 2048 0.002 0.000 0.008 0.000 indexing.py:384()\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2049 0.001 0.000 0.007 0.000 indexing.py:11(is_integer)\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 2051 0.001 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", - " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", - " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 4103 0.001 0.000 0.001 0.000 {built-in method 
builtins.len}\n", - " 2048 0.001 0.000 0.002 0.000 numeric.py:463(asarray)\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1 0.001 0.001 1.418 1.418 indexing.py:420(__init__)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", - " 1 0.000 0.000 1.770 1.770 indexing.py:512(__getitem__)\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1 0.000 0.000 1.771 1.771 :1()\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 4 0.000 0.000 0.002 0.001 fromnumeric.py:1886(any)\n", - " 4 0.000 0.000 1.245 0.311 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 1.771 1.771 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.002 0.001 _methods.py:37(_any)\n", - " 4 0.000 0.000 0.002 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.122 0.122 fromnumeric.py:70(take)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 2 0.000 
0.000 0.000 0.000 indexing.py:474()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 1 0.000 0.000 1.123 1.123 fromnumeric.py:826(argsort)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 1.113 1.113 1.113 1.113 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1024 0.149 0.000 0.301 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1 0.129 0.129 1.404 1.404 ../zarr/indexing.py:342(__init__)\n", + " 1024 0.123 0.000 0.130 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1 0.121 0.121 0.121 0.121 {method 'take' of 'numpy.ndarray' objects}\n", + " 1025 0.031 0.000 0.032 0.000 ../zarr/indexing.py:404(__iter__)\n", + " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.oindex[ix_dense_int]', sort='time')" + "profile('zc.oindex[ix_dense_int]')" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 35981 function calls in 1.735 seconds\n", + "Wed Nov 8 11:19:41 2017 /tmp/tmpmrkck66m\n", + "\n", + " 49294 function calls in 1.738 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 86 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime 
percall filename:lineno(function)\n", - " 1 1.117 1.117 1.117 1.117 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1024 0.132 0.000 0.269 0.000 core.py:822(_chunk_getitem)\n", - " 2 0.121 0.061 0.121 0.061 indexing.py:604()\n", - " 1024 0.116 0.000 0.122 0.000 core.py:997(_decode_chunk)\n", - " 2 0.108 0.054 0.108 0.054 indexing.py:580()\n", + " 1 1.112 1.112 1.112 1.112 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1024 0.137 0.000 0.278 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2 0.121 0.061 0.121 0.061 ../zarr/indexing.py:664()\n", + " 1024 0.112 0.000 0.119 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2 0.106 0.053 0.106 0.053 ../zarr/indexing.py:640()\n", " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.026 0.026 1.437 1.437 indexing.py:553(__init__)\n", - " 1 0.024 0.024 0.024 0.024 {built-in method numpy.core.multiarray.bincount}\n", - " 2048 0.012 0.000 0.012 0.000 indexing.py:645()\n", - " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", - " 1025 0.009 0.000 0.024 0.000 indexing.py:624(__iter__)\n", - " 2054 0.004 0.000 0.004 0.000 core.py:332()\n", - " 1 0.003 0.003 0.296 0.296 core.py:576(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1027 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 4 0.002 0.001 0.002 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1027 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", - " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 1 0.001 0.001 1.734 1.734 
core.py:543(get_coordinate_selection)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2054 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1 0.000 0.000 1.734 1.734 indexing.py:678(__getitem__)\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 1.734 1.734 :1()\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 1.735 1.735 {built-in method builtins.exec}\n", - " 4 0.000 0.000 0.002 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 3 0.000 0.000 1.117 0.372 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", 
- " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 4 0.000 0.000 0.002 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 4 0.000 0.000 0.002 0.001 _methods.py:37(_any)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", - " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", - " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 1.117 1.117 fromnumeric.py:826(argsort)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", - " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", - " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.026 0.026 1.431 1.431 ../zarr/indexing.py:613(__init__)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.vindex[ix_dense_int]', sort='time')" + 
"profile('zc.vindex[ix_dense_int]')" ] }, { @@ -2355,16 +1859,16 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9932" + "9985" ] }, - "execution_count": 62, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } @@ -2377,291 +1881,119 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 123, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 20 ms, sys: 0 ns, total: 20 ms\n", - "Wall time: 20 ms\n" + "15.6 ms ± 51 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c[ix_sparse_bool]" + "%timeit c[ix_sparse_bool]" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 124, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 408 ms, sys: 28 ms, total: 436 ms\n", - "Wall time: 182 ms\n" + "153 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[ix_sparse_bool]" + "%timeit zc.oindex[ix_sparse_bool]" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 125, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 372 ms, sys: 36 ms, total: 408 ms\n", - "Wall time: 149 ms\n" + "132 ms ± 580 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.vindex[ix_sparse_bool]" + "%timeit zc.vindex[ix_sparse_bool]" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 126, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 67653 function calls in 0.186 seconds\n", + "Wed Nov 8 11:20:47 2017 /tmp/tmpsvc8enk3\n", + "\n", + " 82936 function calls in 0.221 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 82 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.097 0.000 0.103 0.000 core.py:997(_decode_chunk)\n", - " 1025 0.018 0.000 0.018 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1024 0.006 0.000 0.124 0.000 core.py:822(_chunk_getitem)\n", - " 1025 0.006 0.000 0.049 0.000 indexing.py:484(__iter__)\n", - " 1024 0.005 0.000 0.027 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", - " 2048 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 7180 0.003 0.000 0.007 0.000 {built-in method builtins.isinstance}\n", - " 2049 0.003 0.000 0.004 0.000 abc.py:178(__instancecheck__)\n", - " 1 0.003 0.003 0.176 0.176 core.py:576(_get_selection)\n", - " 1025 0.002 0.000 0.003 0.000 indexing.py:275(__iter__)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.002 0.000 0.034 0.000 indexing.py:377(ix_)\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 4098 0.002 0.000 0.002 0.000 
_weakrefset.py:70(__contains__)\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1 0.001 0.001 0.011 0.011 indexing.py:248(__init__)\n", - " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.005 0.000 indexing.py:384()\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:489()\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1025 0.000 0.000 0.004 0.000 indexing.py:11(is_integer)\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.009 0.000 numeric.py:380(count_nonzero)\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 
0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 0.011 0.011 indexing.py:420(__init__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.zeros}\n", - " 1 0.000 0.000 0.187 0.187 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.187 0.187 indexing.py:512(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.187 0.187 :1()\n", - " 1 0.000 0.000 0.187 0.187 core.py:531(get_orthogonal_selection)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1023 0.108 0.000 0.113 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.018 0.000 0.018 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.018 0.000 0.018 0.000 {built-in method 
numpy.core.multiarray.count_nonzero}\n", + " 1023 0.007 0.000 0.145 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1024 0.006 0.000 0.052 0.000 ../zarr/indexing.py:544(__iter__)\n", + " 1023 0.005 0.000 0.027 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", + " 2046 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.oindex[ix_sparse_bool]', sort='time')" + "profile('zc.oindex[ix_sparse_bool]')" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 127, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 43161 function calls in 0.159 seconds\n", + "Wed Nov 8 11:20:51 2017 /tmp/tmplmk05u0l\n", + "\n", + " 51304 function calls in 0.171 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 87 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.093 0.000 0.098 0.000 core.py:997(_decode_chunk)\n", - " 2 0.020 0.010 0.020 0.010 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1025 0.008 0.000 0.015 0.000 indexing.py:624(__iter__)\n", - " 1024 0.006 0.000 0.122 0.000 core.py:822(_chunk_getitem)\n", - " 2048 0.005 0.000 0.005 0.000 indexing.py:645()\n", - " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", - " 1 0.002 0.002 0.139 0.139 core.py:576(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1026 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1024 0.002 0.000 0.003 0.000 util.py:113(is_total_slice)\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1027 0.001 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", - " 6155 0.001 0.000 0.001 0.000 {built-in method 
builtins.isinstance}\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 2048 0.001 0.000 0.001 0.000 util.py:128()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1028 0.000 0.000 0.001 0.000 {built-in method builtins.all}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:580()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:553(__init__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.000 0.000 0.159 0.159 {built-in method builtins.exec}\n", - " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.159 0.159 indexing.py:678(__getitem__)\n", - " 3 0.000 0.000 0.000 0.000 
abc.py:178(__instancecheck__)\n", - " 1 0.000 0.000 0.020 0.020 indexing.py:655(__init__)\n", - " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 3 0.000 0.000 0.020 0.007 fromnumeric.py:55(_wrapfunc)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.159 0.159 core.py:564(get_mask_selection)\n", - " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.159 0.159 :1()\n", - " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:535(is_mask_selection)\n", - " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2 0.000 0.000 0.020 0.010 fromnumeric.py:1487(nonzero)\n", - " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", - " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", - " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:19(is_bool_array)\n", - " 6 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", - " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' 
objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1023 0.099 0.000 0.104 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2 0.019 0.010 0.019 0.010 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.008 0.000 0.015 0.000 ../zarr/indexing.py:684(__iter__)\n", + " 1023 0.007 0.000 0.134 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2046 0.005 0.000 0.005 0.000 ../zarr/indexing.py:705()\n", + " 2052 0.003 0.000 0.003 0.000 ../zarr/core.py:337()\n", + " 1 0.002 0.002 0.151 0.151 ../zarr/core.py:591(_get_selection)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.vindex[ix_sparse_bool]', sort='time')" + "profile('zc.vindex[ix_sparse_bool]')" ] }, { @@ -2673,7 +2005,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 128, "metadata": {}, "outputs": [ { @@ -2682,7 +2014,7 @@ "10000" ] }, - "execution_count": 68, + "execution_count": 128, "metadata": {}, "output_type": "execute_result" } @@ -2696,380 +2028,170 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 129, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 183 µs\n" + "18.6 µs ± 199 ns per loop (mean ± std. dev. 
of 7 runs, 100000 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 26607, 37803, 43822, ..., 99980305, 99994438, 99995776])" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c[ix_sparse_int_sorted]" + "%timeit c[ix_sparse_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 130, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", - "Wall time: 189 µs\n" + "21.6 µs ± 348 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([89501247, 55878596, 46320615, ..., 57048243, 1027560, 66644274])" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time c[ix_sparse_int]" + "%timeit c[ix_sparse_int]" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 131, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 384 ms, sys: 44 ms, total: 428 ms\n", - "Wall time: 166 ms\n" + "126 ms ± 2.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 26607, 37803, 43822, ..., 99980305, 99994438, 99995776])" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[ix_sparse_int_sorted]" + "%timeit zc.oindex[ix_sparse_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 132, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 376 ms, sys: 32 ms, total: 408 ms\n", - "Wall time: 137 ms\n" + "110 ms ± 483 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 26607, 37803, 43822, ..., 99980305, 99994438, 99995776])" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.vindex[ix_sparse_int_sorted]" + "%timeit zc.vindex[ix_sparse_int_sorted]" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 133, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 412 ms, sys: 20 ms, total: 432 ms\n", - "Wall time: 174 ms\n" + "127 ms ± 652 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([89501247, 55878596, 46320615, ..., 57048243, 1027560, 66644274])" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[ix_sparse_int]" + "%timeit zc.oindex[ix_sparse_int]" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 134, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 360 ms, sys: 36 ms, total: 396 ms\n", - "Wall time: 134 ms\n" + "106 ms ± 332 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([89501247, 55878596, 46320615, ..., 57048243, 1027560, 66644274])" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.vindex[ix_sparse_int]" + "%timeit zc.vindex[ix_sparse_int]" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 135, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 85095 function calls in 0.170 seconds\n", + "Wed Nov 8 11:22:21 2017 /tmp/tmp58yngtag\n", + "\n", + " 98407 function calls in 0.198 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 91 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.097 0.000 0.102 0.000 core.py:997(_decode_chunk)\n", - " 1025 0.006 0.000 0.044 0.000 indexing.py:484(__iter__)\n", - " 2048 0.006 0.000 0.013 0.000 index_tricks.py:26(ix_)\n", - " 1025 0.005 0.000 0.006 0.000 indexing.py:351(__iter__)\n", - " 1024 0.005 0.000 0.122 0.000 core.py:822(_chunk_getitem)\n", - " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", - " 3072 0.004 0.000 0.004 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 10252 0.004 0.000 0.009 0.000 {built-in method builtins.isinstance}\n", - " 3073 0.004 0.000 0.006 0.000 abc.py:178(__instancecheck__)\n", - " 2048 0.003 0.000 0.025 0.000 indexing.py:377(ix_)\n", - " 1 0.003 0.003 0.169 0.169 core.py:576(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 6146 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", - " 2048 0.002 0.000 0.003 0.000 numerictypes.py:728(issubdtype)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 2048 0.002 0.000 0.008 0.000 indexing.py:384()\n", - " 
1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 1 0.001 0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 2049 0.001 0.000 0.006 0.000 indexing.py:11(is_integer)\n", - " 2054 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", - " 2051 0.001 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", - " 2048 0.001 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", - " 4103 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 4096 0.001 0.000 0.001 0.000 {built-in method builtins.issubclass}\n", - " 2048 0.001 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:489()\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 1 0.000 0.000 0.002 0.002 indexing.py:304(__init__)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 2049 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.171 0.171 
indexing.py:512(__getitem__)\n", - " 1 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.002 0.002 indexing.py:420(__init__)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", - " 1 0.000 0.000 0.171 0.171 {built-in method builtins.exec}\n", - " 4 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.171 0.171 :1()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:70(take)\n", - " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.170 0.170 core.py:531(get_orthogonal_selection)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", - " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method 
builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1024 0.104 0.000 0.110 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1025 0.012 0.000 0.014 0.000 ../zarr/indexing.py:404(__iter__)\n", + " 1025 0.007 0.000 0.053 0.000 ../zarr/indexing.py:544(__iter__)\n", + " 2048 0.007 0.000 0.013 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", + " 1024 0.006 0.000 0.139 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2048 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", + " 13324 0.004 0.000 0.010 0.000 {built-in method builtins.isinstance}\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.oindex[ix_sparse_int]', sort='time')" + "profile('zc.oindex[ix_sparse_int]')" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 136, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 35981 function calls in 0.136 seconds\n", + "Wed Nov 8 11:22:27 2017 /tmp/tmpgyi7rp6s\n", + "\n", + " 49294 function calls in 0.157 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 86 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.093 0.000 0.098 0.000 core.py:997(_decode_chunk)\n", - " 1025 0.008 0.000 0.015 0.000 indexing.py:624(__iter__)\n", - " 1024 0.005 0.000 0.117 0.000 core.py:822(_chunk_getitem)\n", - " 2048 0.004 0.000 0.004 0.000 indexing.py:645()\n", - " 2054 0.003 0.000 0.003 0.000 core.py:332()\n", - " 1 0.002 0.002 0.135 0.135 core.py:576(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1027 0.002 0.000 0.002 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.002 0.000 
{built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1027 0.002 0.000 0.005 0.000 core.py:327(_cdata_shape)\n", - " 3081 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 1 0.001 0.001 0.001 0.001 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:641()\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:629()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 2054 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 1 0.000 0.000 0.002 0.002 indexing.py:553(__init__)\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:580()\n", - " 1 0.000 0.000 0.000 0.000 function_base.py:1848(diff)\n", - " 1 0.000 0.000 0.136 0.136 {built-in method builtins.exec}\n", - " 4 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:604()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.bincount}\n", - " 2 0.000 
0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 4 0.000 0.000 0.000 0.000 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 0.136 0.136 indexing.py:678(__getitem__)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 1 0.000 0.000 0.136 0.136 :1()\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 3 0.000 0.000 0.001 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 3 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 4 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 7 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1 0.000 0.000 0.136 0.136 core.py:543(get_coordinate_selection)\n", - " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", - " 4 0.000 0.000 0.000 0.000 _methods.py:37(_any)\n", - " 5 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", - " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 4 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 1 0.000 0.000 0.001 0.001 fromnumeric.py:826(argsort)\n", - " 6 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 2 0.000 0.000 0.000 0.000 
indexing.py:15(is_integer_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", - " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", - " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", - " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", - " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:557()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1024 0.103 0.000 0.109 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1025 0.009 0.000 0.016 0.000 ../zarr/indexing.py:684(__iter__)\n", + " 1024 0.006 0.000 0.137 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2048 0.005 0.000 0.005 0.000 ../zarr/indexing.py:705()\n", + " 2054 0.003 0.000 0.003 0.000 ../zarr/core.py:337()\n", + " 1 0.003 0.003 0.156 0.156 ../zarr/core.py:591(_get_selection)\n", + " 1024 0.002 0.000 0.005 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/core/arrayprint.py:381(wrapper)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.vindex[ix_sparse_int]', sort='time')" + "profile('zc.vindex[ix_sparse_int]')" ] }, { @@ -3088,13 +2210,13 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 137, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507131 (495.2K)
Storage ratio197.2
Chunks initialized256/256
" + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507490 (495.6K)
Storage ratio197.0
Chunks initialized256/256
" ], "text/plain": [ "Type : zarr.core.Array\n", @@ -3106,12 +2228,12 @@ "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", - "No. bytes stored : 507131 (495.2K)\n", - "Storage ratio : 197.2\n", + "No. bytes stored : 507490 (495.6K)\n", + "Storage ratio : 197.0\n", "Chunks initialized : 256/256" ] }, - "execution_count": 77, + "execution_count": 137, "metadata": {}, "output_type": "execute_result" } @@ -3123,30 +2245,19 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 138, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 744 ms, sys: 120 ms, total: 864 ms\n", - "Wall time: 384 ms\n" + "377 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 888, 9941, 15901, ..., 99988491, 99988714, 99995248])" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[zix_sparse_bool]" + "%timeit zc.oindex[zix_sparse_bool]" ] }, { @@ -3158,243 +2269,171 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 139, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 40 ms, sys: 52 ms, total: 92 ms\n", - "Wall time: 89.5 ms\n" + "81 ms ± 969 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time np.array(c[::2])" + "%timeit np.array(c[::2])" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 140, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.29 s, sys: 240 ms, total: 1.53 s\n", - "Wall time: 1.24 s\n" + "85.3 ms ± 5.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 0, 2, 4, ..., 99999994, 99999996, 99999998])" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zc.oindex[::2]" + "%timeit np.array(c[::-2])" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 143, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 588 ms, sys: 88 ms, total: 676 ms\n", - "Wall time: 411 ms\n" + "1.01 s ± 6.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, + } + ], + "source": [ + "%timeit zc.oindex[::2]" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ { - "data": { - "text/plain": [ - "array([ 0, 10, 20, ..., 99999970, 99999980, 99999990])" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "1.18 s ± 5.95 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] } ], "source": [ - "%time zc.oindex[::10]" + "%timeit zc.oindex[::-2]" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 145, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 400 ms, sys: 28 ms, total: 428 ms\n", - "Wall time: 184 ms\n" + "326 ms ± 3.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, + } + ], + "source": [ + "%timeit zc.oindex[::10]" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ { - "data": { - "text/plain": [ - "array([ 0, 100, 200, ..., 99999700, 99999800, 99999900])" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "361 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] } ], "source": [ - "%time zc.oindex[::100]" + "%timeit zc.oindex[::-10]" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 146, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 368 ms, sys: 52 ms, total: 420 ms\n", - "Wall time: 167 ms\n" + "152 ms ± 585 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] - }, + } + ], + "source": [ + "%timeit zc.oindex[::100]" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ { - "data": { - "text/plain": [ - "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "125 ms ± 772 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] } ], "source": [ - "%time zc.oindex[::1000]" + "%timeit zc.oindex[::1000]" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 148, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 64607 function calls in 1.249 seconds\n", + "Wed Nov 8 11:24:57 2017 /tmp/tmpodht8l8m\n", + "\n", + " 79942 function calls in 1.075 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 82 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.593 0.593 0.776 0.776 indexing.py:304(__init__)\n", - " 1 0.119 0.119 0.119 0.119 {built-in method numpy.core.multiarray.bincount}\n", - " 1024 0.115 0.000 0.247 0.000 core.py:822(_chunk_getitem)\n", - " 1024 0.110 0.000 0.116 0.000 core.py:997(_decode_chunk)\n", - " 1025 0.106 0.000 0.107 0.000 indexing.py:351(__iter__)\n", - " 1 0.074 0.074 0.074 0.074 {built-in method numpy.core.multiarray.arange}\n", - " 1 0.051 0.051 0.051 0.051 function_base.py:1848(diff)\n", - " 1 0.014 0.014 1.248 1.248 core.py:531(get_orthogonal_selection)\n", - " 4 0.013 0.003 0.013 0.003 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 1025 0.006 0.000 0.134 0.000 indexing.py:484(__iter__)\n", - " 1024 0.004 0.000 0.008 0.000 index_tricks.py:26(ix_)\n", - " 2048 0.004 0.000 0.004 0.000 core.py:332()\n", - " 2048 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 7180 0.003 0.000 0.007 0.000 {built-in method builtins.isinstance}\n", - " 1 0.003 0.003 0.383 0.383 core.py:576(_get_selection)\n", - " 2049 0.002 0.000 0.004 0.000 abc.py:178(__instancecheck__)\n", - " 1024 0.002 0.000 0.005 0.000 arrayprint.py:381(wrapper)\n", - " 1024 0.002 0.000 0.002 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", - " 1024 0.002 0.000 0.008 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.002 0.000 0.016 0.000 indexing.py:377(ix_)\n", - " 1024 0.002 0.000 0.006 0.000 
core.py:327(_cdata_shape)\n", - " 4098 0.002 0.000 0.002 0.000 _weakrefset.py:70(__contains__)\n", - " 1024 0.002 0.000 0.002 0.000 arrayprint.py:399(array2string)\n", - " 1024 0.001 0.000 0.002 0.000 numerictypes.py:728(issubdtype)\n", - " 1 0.001 0.001 0.851 0.851 indexing.py:420(__init__)\n", - " 1024 0.001 0.000 0.009 0.000 core.py:994(_chunk_key)\n", - " 1024 0.001 0.000 0.005 0.000 indexing.py:384()\n", - " 2048 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 1024 0.001 0.000 0.006 0.000 numeric.py:1905(array_str)\n", - " 1 0.001 0.001 1.250 1.250 :1()\n", - " 1 0.001 0.001 1.249 1.249 indexing.py:512(__getitem__)\n", - " 2048 0.001 0.000 0.001 0.000 indexing.py:487()\n", - " 1024 0.001 0.000 0.003 0.000 indexing.py:501()\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 3079 0.001 0.000 0.001 0.000 {built-in method builtins.len}\n", - " 1030 0.001 0.000 0.001 0.000 {built-in method numpy.core.multiarray.array}\n", - " 1025 0.000 0.000 0.004 0.000 indexing.py:11(is_integer)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:489()\n", - " 1024 0.000 0.000 0.001 0.000 :12(__new__)\n", - " 1027 0.000 0.000 0.001 0.000 indexing.py:165(ensure_tuple)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", - " 1024 0.000 0.000 0.001 0.000 numerictypes.py:660(issubclass_)\n", - " 1024 0.000 0.000 0.001 0.000 numeric.py:463(asarray)\n", - " 2048 0.000 0.000 0.000 0.000 indexing.py:488()\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 1025 0.000 0.000 0.000 0.000 {built-in method builtins.any}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - 
" 1025 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}\n", - " 1024 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1 0.000 0.000 1.250 1.250 {built-in method builtins.exec}\n", - " 4 0.000 0.000 0.014 0.003 fromnumeric.py:1886(any)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:130(replace_ellipsis)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 6 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 2 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc)\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 4 0.000 0.000 0.013 0.003 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 4 0.000 0.000 0.013 0.003 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}\n", - " 1 0.000 0.000 0.000 0.000 {method 'indices' of 'slice' objects}\n", - " 2 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:135()\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:474()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:476()\n", - " 1 0.000 0.000 0.000 0.000 core.py:367(oindex)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:479()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", + " 1 0.479 0.479 0.595 0.595 ../zarr/indexing.py:342(__init__)\n", + " 1024 0.117 0.000 0.256 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1 0.116 0.116 0.116 0.116 {built-in method numpy.core.multiarray.bincount}\n", + " 1024 0.110 0.000 0.116 
0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1025 0.109 0.000 0.110 0.000 ../zarr/indexing.py:404(__iter__)\n", + " 1 0.069 0.069 0.069 0.069 {built-in method numpy.core.multiarray.arange}\n", + " 1 0.014 0.014 1.073 1.073 ../zarr/core.py:537(get_orthogonal_selection)\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zc.oindex[::2]', sort='time')" + "profile('zc.oindex[::2]')" ] }, { @@ -3413,7 +2452,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 150, "metadata": {}, "outputs": [ { @@ -3422,7 +2461,7 @@ "(100000000,)" ] }, - "execution_count": 85, + "execution_count": 150, "metadata": {}, "output_type": "execute_result" } @@ -3433,7 +2472,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 151, "metadata": {}, "outputs": [ { @@ -3442,7 +2481,7 @@ "(100000, 1000)" ] }, - "execution_count": 86, + "execution_count": 151, "metadata": {}, "output_type": "execute_result" } @@ -3454,7 +2493,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 152, "metadata": {}, "outputs": [ { @@ -3477,7 +2516,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 87, + "execution_count": 152, "metadata": {}, "output_type": "execute_result" } @@ -3496,7 +2535,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ @@ -3506,70 +2545,36 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 154, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 116 ms, sys: 16 ms, total: 132 ms\n", - "Wall time: 129 ms\n" + "98.2 ms ± 995 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([[ 1000, 1001, 1003, ..., 1994, 1995, 1999],\n", - " [ 6000, 6001, 6003, ..., 6994, 6995, 6999],\n", - " [ 8000, 8001, 8003, ..., 8994, 8995, 8999],\n", - " ..., \n", - " [99991000, 99991001, 99991003, ..., 99991994, 99991995, 99991999],\n", - " [99997000, 99997001, 99997003, ..., 99997994, 99997995, 99997999],\n", - " [99998000, 99998001, 99998003, ..., 99998994, 99998995, 99998999]])" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time d[np.ix_(ix0, ix1)]" + "%timeit d[np.ix_(ix0, ix1)]" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 780 ms, sys: 40 ms, total: 820 ms\n", - "Wall time: 387 ms\n" + "362 ms ± 5.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([[ 1000, 1001, 1003, ..., 1994, 1995, 1999],\n", - " [ 6000, 6001, 6003, ..., 6994, 6995, 6999],\n", - " [ 8000, 8001, 8003, ..., 8994, 8995, 8999],\n", - " ..., \n", - " [99991000, 99991001, 99991003, ..., 99991994, 99991995, 99991999],\n", - " [99997000, 99997001, 99997003, ..., 99997994, 99997995, 99997999],\n", - " [99998000, 99998001, 99998003, ..., 99998994, 99998995, 99998999]])" - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zd.oindex[ix0, ix1]" + "%timeit zd.oindex[ix0, ix1]" ] }, { @@ -3581,7 +2586,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -3591,70 +2596,36 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 157, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 156 ms, sys: 28 ms, total: 184 ms\n", - "Wall time: 184 ms\n" + "176 ms ± 2.48 ms per 
loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([[38408139, 38408374, 38408509, ..., 38408966, 38408223, 38408367],\n", - " [29895139, 29895374, 29895509, ..., 29895966, 29895223, 29895367],\n", - " [79133139, 79133374, 79133509, ..., 79133966, 79133223, 79133367],\n", - " ..., \n", - " [95689139, 95689374, 95689509, ..., 95689966, 95689223, 95689367],\n", - " [47381139, 47381374, 47381509, ..., 47381966, 47381223, 47381367],\n", - " [20741139, 20741374, 20741509, ..., 20741966, 20741223, 20741367]])" - ] - }, - "execution_count": 92, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time d[np.ix_(ix0, ix1)]" + "%timeit d[np.ix_(ix0, ix1)]" ] }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 158, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.08 s, sys: 120 ms, total: 1.2 s\n", - "Wall time: 586 ms\n" + "550 ms ± 9.16 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([[38408139, 38408374, 38408509, ..., 38408966, 38408223, 38408367],\n", - " [29895139, 29895374, 29895509, ..., 29895966, 29895223, 29895367],\n", - " [79133139, 79133374, 79133509, ..., 79133966, 79133223, 79133367],\n", - " ..., \n", - " [95689139, 95689374, 95689509, ..., 95689966, 95689223, 95689367],\n", - " [47381139, 47381374, 47381509, ..., 47381966, 47381223, 47381367],\n", - " [20741139, 20741374, 20741509, ..., 20741966, 20741223, 20741367]])" - ] - }, - "execution_count": 93, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zd.oindex[ix0, ix1]" + "%timeit zd.oindex[ix0, ix1]" ] }, { @@ -3666,7 +2637,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 159, "metadata": {}, "outputs": [ { @@ -3675,7 +2646,7 @@ "10000000" ] }, - "execution_count": 94, + "execution_count": 159, "metadata": {}, "output_type": "execute_result" } @@ -3689,165 +2660,69 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 160, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 256 ms, sys: 12 ms, total: 268 ms\n", - "Wall time: 265 ms\n" + "241 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([ 6452573, 65841096, 70323990, ..., 44175624, 34778721, 67807976])" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time d[ix0, ix1]" + "%timeit d[ix0, ix1]" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 161, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.43 s, sys: 148 ms, total: 2.58 s\n", - "Wall time: 2.09 s\n" + "2.02 s ± 7.81 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] - }, - { - "data": { - "text/plain": [ - "array([55010547, 87536917, 88871707, ..., 73879431, 32878018, 25168834])" - ] - }, - "execution_count": 95, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "%time zd.vindex[ix0, ix1]" + "%timeit zd.vindex[ix0, ix1]" ] }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 162, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 48332 function calls in 2.050 seconds\n", + "Wed Nov 8 11:28:12 2017 /tmp/tmp2r3kxkv1\n", + "\n", + " 61645 function calls in 2.260 seconds\n", "\n", " Ordered by: internal time\n", + " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.107 1.107 1.107 1.107 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 3 0.255 0.085 0.255 0.085 indexing.py:604()\n", - " 3 0.193 0.064 0.193 0.064 indexing.py:580()\n", - " 1024 0.164 0.000 0.328 0.000 core.py:822(_chunk_getitem)\n", - " 1024 0.137 0.000 0.144 0.000 core.py:997(_decode_chunk)\n", - " 1 0.045 0.045 0.045 0.045 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.044 0.044 1.683 1.683 indexing.py:553(__init__)\n", - " 1 0.024 0.024 0.024 0.024 {built-in method numpy.core.multiarray.bincount}\n", - " 3072 0.021 0.000 0.021 0.000 indexing.py:645()\n", - " 1 0.010 0.010 0.010 0.010 function_base.py:1848(diff)\n", - " 1025 0.010 0.000 0.034 0.000 indexing.py:624(__iter__)\n", - " 7 0.005 0.001 0.005 0.001 {method 'reduce' of 'numpy.ufunc' objects}\n", - " 3081 0.005 0.000 0.005 0.000 core.py:332()\n", - " 2048 0.003 0.000 0.007 0.000 arrayprint.py:381(wrapper)\n", - " 1 0.003 0.003 0.365 0.365 core.py:576(_get_selection)\n", - " 1029 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", - " 1024 0.003 0.000 0.011 0.000 {method 'join' of 'str' objects}\n", - " 1024 0.003 0.000 0.003 0.000 {built-in method 
numpy.core.multiarray.frombuffer}\n", - " 2048 0.002 0.000 0.003 0.000 arrayprint.py:399(array2string)\n", - " 1027 0.002 0.000 0.006 0.000 core.py:327(_cdata_shape)\n", - " 3090 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}\n", - " 2048 0.001 0.000 0.009 0.000 numeric.py:1905(array_str)\n", - " 1024 0.001 0.000 0.012 0.000 core.py:994(_chunk_key)\n", - " 3072 0.001 0.000 0.001 0.000 indexing.py:641()\n", - " 3072 0.001 0.000 0.001 0.000 indexing.py:629()\n", - " 1 0.001 0.001 2.049 2.049 core.py:543(get_coordinate_selection)\n", - " 1 0.001 0.001 2.049 2.049 indexing.py:678(__getitem__)\n", - " 1024 0.001 0.000 0.001 0.000 threading.py:1230(current_thread)\n", - " 1024 0.001 0.000 0.001 0.000 {built-in method __new__ of type object at 0x55f19771c480}\n", - " 1024 0.001 0.000 0.001 0.000 :12(__new__)\n", - " 3072 0.001 0.000 0.001 0.000 {built-in method _thread.get_ident}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'item' of 'numpy.ndarray' objects}\n", - " 2056 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}\n", - " 2048 0.000 0.000 0.000 0.000 {method 'discard' of 'set' objects}\n", - " 1024 0.000 0.000 0.000 0.000 core.py:213(chunk_store)\n", - " 2048 0.000 0.000 0.000 0.000 {built-in method builtins.id}\n", - " 1024 0.000 0.000 0.000 0.000 threading.py:1304(main_thread)\n", - " 1 0.000 0.000 2.050 2.050 :1()\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.unravel_index}\n", - " 7 0.000 0.000 0.005 0.001 fromnumeric.py:1886(any)\n", - " 1 0.000 0.000 2.050 2.050 {built-in method builtins.exec}\n", - " 3 0.000 0.000 1.107 0.369 fromnumeric.py:55(_wrapfunc)\n", - " 1 0.000 0.000 0.000 0.000 {method 'cumsum' of 'numpy.ndarray' objects}\n", - " 12 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.array}\n", - " 8 0.000 0.000 0.000 0.000 numeric.py:534(asanyarray)\n", - " 1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' 
objects}\n", - " 7 0.000 0.000 0.005 0.001 {method 'any' of 'numpy.ndarray' objects}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:176(_broadcast_shape)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:195(broadcast_arrays)\n", - " 1 0.000 0.000 0.000 0.000 core.py:342(_nchunks)\n", - " 7 0.000 0.000 0.005 0.001 _methods.py:37(_any)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:721(pop_fields)\n", - " 12 0.000 0.000 0.000 0.000 _weakrefset.py:70(__contains__)\n", - " 1 0.000 0.000 0.000 0.000 core.py:346(nchunks)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:526(is_coordinate_selection)\n", - " 6 0.000 0.000 0.000 0.000 abc.py:178(__instancecheck__)\n", - " 3 0.000 0.000 0.000 0.000 {built-in method builtins.getattr}\n", - " 1 0.000 0.000 0.000 0.000 {built-in method _functools.reduce}\n", - " 6 0.000 0.000 0.000 0.000 indexing.py:11(is_integer)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:592()\n", - " 1 0.000 0.000 1.107 1.107 fromnumeric.py:826(argsort)\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", - " 4 0.000 0.000 0.000 0.000 indexing.py:15(is_integer_array)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:543(replace_lists)\n", - " 2 0.000 0.000 0.000 0.000 stride_tricks.py:247()\n", - " 6 0.000 0.000 0.000 0.000 stride_tricks.py:251()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:2053(cumsum)\n", - " 2 0.000 0.000 0.000 0.000 core.py:161(_refresh_metadata)\n", - " 2 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects}\n", - " 4 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:734()\n", - " 3 0.000 0.000 0.000 0.000 indexing.py:557()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:701(check_fields)\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:732()\n", - " 1 0.000 0.000 0.000 0.000 fromnumeric.py:1487(nonzero)\n", - " 1 0.000 0.000 0.000 0.000 core.py:221(shape)\n", - " 1 0.000 0.000 0.000 
0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:165(ensure_tuple)\n", - " 2 0.000 0.000 0.000 0.000 indexing.py:529()\n", - " 6 0.000 0.000 0.000 0.000 indexing.py:545()\n", - " 1 0.000 0.000 0.000 0.000 indexing.py:593()\n", - " 1 0.000 0.000 0.000 0.000 core.py:372(vindex)\n", + " 1 1.164 1.164 1.164 1.164 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 3 0.272 0.091 0.272 0.091 ../zarr/indexing.py:664()\n", + " 3 0.202 0.067 0.202 0.067 ../zarr/indexing.py:640()\n", + " 1024 0.201 0.000 0.433 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1024 0.187 0.000 0.197 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1 0.056 0.056 1.778 1.778 ../zarr/indexing.py:613(__init__)\n", + " 1 0.044 0.044 0.044 0.044 {built-in method numpy.core.multiarray.ravel_multi_index}\n", "\n", "\n" ] } ], "source": [ - "cProfile.run('zd.vindex[ix0, ix1]', sort='time')" + "profile('zd.vindex[ix0, ix1]')" ] }, { diff --git a/zarr/indexing.py b/zarr/indexing.py index 677f9bb972..2da6de2665 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -64,10 +64,6 @@ class IntDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): - # check type - if not is_integer(dim_sel): - raise ValueError('selection must be an integer') - # normalize dim_sel = normalize_integer_selection(dim_sel, dim_len) @@ -89,10 +85,6 @@ class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): - # check type - if not is_contiguous_slice(dim_sel): - raise ValueError('selection must be a contiguous slice') - # normalize self.start, self.stop, _ = dim_sel.indices(dim_len) @@ -167,13 +159,15 @@ def replace_ellipsis(selection, shape): if len(selection) < len(shape): selection += (slice(None),) * (len(shape) - len(selection)) + # check selection not too long + if len(selection) > len(shape): + raise IndexError('too many indices for array') + return selection def ensure_tuple(v): - if v is None: - v = () - elif not 
isinstance(v, tuple): + if not isinstance(v, tuple): v = (v,) return v @@ -196,13 +190,6 @@ def ensure_tuple(v): """ -def check_selection_length(selection, shape): - if len(selection) > len(shape): - raise IndexError('too many indices for array') - if len(selection) < len(shape): - raise IndexError('not enough indices for array') - - def is_contiguous_slice(s): return isinstance(s, slice) and (s.step is None or s.step == 1) @@ -220,12 +207,8 @@ class BasicIndexer(object): def __init__(self, selection, array): - # ensure tuple - selection = ensure_tuple(selection) - # handle ellipsis selection = replace_ellipsis(selection, array._shape) - check_selection_length(selection, array._shape) # setup per-dimension indexers dim_indexers = [] @@ -314,10 +297,33 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) +class Order: + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a): + diff = np.diff(a) + diff_positive = diff >= 0 + n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + class IntArrayDimIndexer(object): """Integer array selection against a single dimension.""" - def __init__(self, dim_sel, dim_len, dim_chunk_len): + def __init__(self, dim_sel, dim_len, dim_chunk_len, wraparound=True, boundscheck=True, + order=Order.UNKNOWN): # ensure array dim_sel = np.asanyarray(dim_sel) @@ -326,44 +332,54 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): if dim_sel.ndim != 1: raise IndexError('selection must be a 1d array') - # check dtype - if dim_sel.dtype.kind not in 'ui': - raise IndexError('selection must be an integer array') - # handle wraparound - loc_neg = dim_sel < 0 - if np.any(loc_neg): - dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + if 
wraparound: + loc_neg = dim_sel < 0 + if np.any(loc_neg): + dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len # handle out of bounds - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('selection contains index out of bounds') - - # handle non-monotonic indices - dim_sel_chunk = dim_sel // dim_chunk_len - if np.any(np.diff(dim_sel) < 0): - self.is_monotonic = False - # sort indices to group by chunk - self.dim_sort = np.argsort(dim_sel_chunk) - self.dim_sel = np.take(dim_sel, self.dim_sort) - - else: - self.is_monotonic = True - self.dim_sort = None - self.dim_sel = dim_sel + if boundscheck: + if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): + raise IndexError('selection contains index out of bounds') # store attributes self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) - self.nitems = len(self.dim_sel) + self.nitems = len(dim_sel) - # precompute number of selected items for each chunk + # determine which chunk is needed for each selection item # note: for dense integer selections, the division operation here is the bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + self.order = order + + if self.order == Order.INCREASING: + self.dim_sel = dim_sel + self.dim_out_sel = None + elif self.order == Order.DECREASING: + self.dim_sel = dim_sel[::-1] + # TODO I'm sure there's a way to do this without creating an arange, but can't see it + # at the moment + self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + else: + # sort indices to group by chunk + self.dim_out_sel = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_out_sel) + + # precompute number of selected items for each chunk self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + # find chunks that we need to visit 
self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + # compute offsets into the output array + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + def __iter__(self): for dim_chunk_ix in self.dim_chunk_ixs: @@ -374,10 +390,10 @@ def __iter__(self): else: start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] stop = self.chunk_nitems_cumsum[dim_chunk_ix] - if self.is_monotonic: + if self.order == Order.INCREASING: dim_out_sel = slice(start, stop) else: - dim_out_sel = self.dim_sort[start:stop] + dim_out_sel = self.dim_out_sel[start:stop] # find region in chunk dim_offset = dim_chunk_ix * self.dim_chunk_len @@ -436,21 +452,12 @@ class OrthogonalIndexer(object): def __init__(self, selection, array): - # ensure tuple - selection = ensure_tuple(selection) - # handle ellipsis selection = replace_ellipsis(selection, array._shape) # normalize list to array selection = replace_lists(selection) - # validation - check dimensionality - if len(selection) > len(array._shape): - raise IndexError('too many indices for array') - if len(selection) < len(array._shape): - raise IndexError('not enough indices for array') - # setup per-dimension indexers dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): @@ -468,7 +475,16 @@ def __init__(self, selection, array): # handle slice with step if strides != 1: dim_sel = np.arange(start, stop, strides) - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + if strides > 1: + order = Order.INCREASING + elif strides < 0: + order = Order.DECREASING + else: + # TODO better error here? 
+ raise RuntimeError('unexpected strides') + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len, + wraparound=False, boundscheck=False, + order=order) else: dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) From fad4db1ff24ad5abc066a301f929870acfdf56e5 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 8 Nov 2017 17:25:05 +0000 Subject: [PATCH 45/67] rework slices; refactor tests; increase coverage --- notebooks/advanced_indexing.ipynb | 695 +++++++++++-------------- zarr/indexing.py | 135 ++--- zarr/tests/test_indexing.py | 835 +++++++++++++++--------------- zarr/tests/test_util.py | 2 + 4 files changed, 778 insertions(+), 889 deletions(-) diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index 55fcd633ea..99fbc0d13d 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -343,9 +343,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Slicing a 1D array with step <> 1\n", + "### Slicing a 1D array with step > 1\n", "\n", - "Slices with step <> 1 are supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``. Internally these are converted to an integer array via ``np.arange``." + "Slices with step > 1 are supported via ``get/set_basic_selection()``, ``get/set_orthogonal_selection()``, ``__getitem__`` and ``.oindex[]``. Negative steps are not supported." 
] }, { @@ -376,34 +376,13 @@ ], "source": [ "# get items\n", - "za.oindex[1::2]" + "za[1::2]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([9, 7, 5, 3, 1])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get items with negative step\n", - "za.oindex[-1::-2]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, "outputs": [ { "data": { @@ -411,7 +390,7 @@ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -446,7 +425,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -459,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -469,7 +448,7 @@ " [ 9, 11]])" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -483,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -493,7 +472,7 @@ " [ 9, 11]])" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -505,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -515,7 +494,7 @@ " [ 9, 11]])" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -529,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -539,7 +518,7 @@ " [ 9, 11]])" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": 
"execute_result" } @@ -551,7 +530,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -561,7 +540,7 @@ " [ 9, 10, 11]])" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -573,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -586,7 +565,7 @@ " [12, 14]])" ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -598,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -611,7 +590,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 27, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -628,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -641,7 +620,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -654,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -667,7 +646,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -684,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -697,7 +676,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 30, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -719,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -732,7 +711,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -745,7 +724,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 
31, "metadata": {}, "outputs": [ { @@ -754,7 +733,7 @@ "array([ 3, 11])" ] }, - "execution_count": 32, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -768,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -777,7 +756,7 @@ "array([ 3, 11])" ] }, - "execution_count": 33, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -789,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -802,7 +781,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 34, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -815,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -828,7 +807,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 35, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -850,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -863,7 +842,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 36, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -876,7 +855,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -885,7 +864,7 @@ "array([ 3, 11])" ] }, - "execution_count": 37, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -899,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -908,7 +887,7 @@ "array([ 3, 11])" ] }, - "execution_count": 38, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -919,7 +898,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -932,7 +911,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 39, + 
"execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -944,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -957,7 +936,7 @@ " [12, 13, 14]])" ] }, - "execution_count": 40, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -978,7 +957,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -988,7 +967,7 @@ " dtype=[('foo', 'S3'), ('bar', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices" ] } @@ -1202,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -1212,7 +1191,7 @@ " dtype=[('foo', 'S3'), ('baz', '", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 475\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 477\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 478\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 493\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 494\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 535\u001b[0m 
\u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m raise IndexError('unsupported selection type; expected integer or contiguous '\n\u001b[0;32m--> 242\u001b[0;31m 'slice, got {!r}'.format(dim_sel))\n\u001b[0m\u001b[1;32m 243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mIndexError\u001b[0m: unsupported selection type; expected integer or contiguous slice, got ['foo', 'baz']" + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 268\u001b[0m raise IndexError('unsupported selection type; expected integer or slice, got {!r}'\n\u001b[0;32m--> 269\u001b[0;31m .format(type(dim_sel)))\n\u001b[0m\u001b[1;32m 270\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 271\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: unsupported selection 
type; expected integer or slice, got " ] } ], @@ -1276,7 +1255,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -1285,7 +1264,7 @@ "800000000" ] }, - "execution_count": 54, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1297,15 +1276,15 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 428 ms, sys: 56 ms, total: 484 ms\n", - "Wall time: 128 ms\n" + "CPU times: user 480 ms, sys: 16 ms, total: 496 ms\n", + "Wall time: 141 ms\n" ] }, { @@ -1328,7 +1307,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 55, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1340,14 +1319,14 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "121 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "121 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -1357,14 +1336,14 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "256 ms ± 7.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "254 ms ± 942 µs per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], @@ -1381,16 +1360,16 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9995616" + "9997476" ] }, - "execution_count": 63, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1403,14 +1382,14 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "243 ms ± 5.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "243 ms ± 5.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1420,14 +1399,14 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "426 ms ± 3.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "433 ms ± 6.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1437,14 +1416,14 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "550 ms ± 13.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "548 ms ± 5.2 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], @@ -1454,7 +1433,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1470,27 +1449,27 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:19:16 2017 /tmp/tmp4t23nh90\n", + "Wed Nov 8 17:17:48 2017 /tmp/tmpruua2rs_\n", "\n", - " 83015 function calls in 0.469 seconds\n", + " 98386 function calls in 0.483 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 82 to 7 due to restriction <7>\n", + " List reduced from 83 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1025 0.196 0.000 0.196 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.144 0.000 0.153 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1024 0.043 0.000 0.223 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1025 0.197 0.000 0.197 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.149 0.000 0.159 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.044 0.000 0.231 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1024 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1025 0.007 0.000 0.232 0.000 ../zarr/indexing.py:544(__iter__)\n", - " 1024 0.006 0.000 0.206 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", + " 1025 0.007 0.000 0.238 0.000 ../zarr/indexing.py:541(__iter__)\n", + " 1024 0.006 0.000 0.207 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", " 2048 0.005 0.000 0.005 0.000 ../zarr/core.py:337()\n", "\n", "\n" @@ -1501,37 +1480,6 @@ "profile('zc.oindex[ix_dense_bool]')" ] }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "Wed Nov 8 11:14:55 2017 /tmp/tmpc6yiwbhy\n", - "\n", - " 83015 function calls in 0.486 seconds\n", - "\n", - " Ordered by: internal time\n", - " List reduced from 82 to 5 due to restriction <5>\n", - "\n", - " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1025 0.195 0.000 0.195 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.149 0.000 0.158 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1024 0.044 0.000 0.229 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 1024 0.017 0.000 0.017 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1025 0.007 0.000 0.233 0.000 ../zarr/indexing.py:544(__iter__)\n", - "\n", - "\n" - ] - } - ], - "source": [ - "profile('zc.oindex[ix_dense_bool]')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1541,28 +1489,28 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:19:20 2017 /tmp/tmpxd6eg1gj\n", + "Wed Nov 8 17:18:06 2017 /tmp/tmp7_bautep\n", "\n", - " 51354 function calls in 0.592 seconds\n", + " 52382 function calls in 0.592 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 87 to 7 due to restriction <7>\n", + " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2 0.217 0.108 0.217 0.108 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.100 0.000 0.106 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 2 0.093 0.046 0.093 0.046 ../zarr/indexing.py:640()\n", - " 1024 0.044 0.000 0.171 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2 0.219 0.110 0.219 0.110 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.096 0.000 0.101 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2 0.094 0.047 0.094 0.047 ../zarr/indexing.py:630()\n", + " 1024 0.044 0.000 0.167 0.000 
../zarr/core.py:849(_chunk_getitem)\n", " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.022 0.022 0.179 0.179 ../zarr/indexing.py:613(__init__)\n", + " 1 0.021 0.021 0.181 0.181 ../zarr/indexing.py:603(__init__)\n", "\n", "\n" ] @@ -1588,7 +1536,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -1597,7 +1545,7 @@ "10000000" ] }, - "execution_count": 104, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1611,14 +1559,14 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "60.7 ms ± 599 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "62.2 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -1628,14 +1576,14 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "361 ms ± 22.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "355 ms ± 3.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1645,14 +1593,14 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "349 ms ± 3.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "351 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1662,14 +1610,14 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "128 ms ± 555 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "128 ms ± 137 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], @@ -1679,14 +1627,14 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1.72 s ± 35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "1.71 s ± 5.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1696,14 +1644,14 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1.69 s ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "1.68 s ± 3.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -1713,28 +1661,28 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:19:28 2017 /tmp/tmpk_0eq5a2\n", + "Wed Nov 8 17:19:09 2017 /tmp/tmpgmu5btr_\n", "\n", - " 79967 function calls in 0.410 seconds\n", + " 95338 function calls in 0.424 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 88 to 7 due to restriction <7>\n", + " List reduced from 89 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.146 0.146 0.188 0.188 ../zarr/indexing.py:342(__init__)\n", - " 1024 0.093 0.000 0.098 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1024 0.045 0.000 0.164 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 1025 0.025 0.000 0.026 0.000 ../zarr/indexing.py:404(__iter__)\n", + " 1 0.141 0.141 0.184 0.184 ../zarr/indexing.py:369(__init__)\n", + " 1024 0.099 0.000 0.106 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.046 0.000 0.175 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1025 0.027 0.000 0.027 0.000 ../zarr/indexing.py:424(__iter__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 1 0.011 0.011 0.011 
0.011 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/function_base.py:1848(diff)\n", - " 1025 0.006 0.000 0.052 0.000 ../zarr/indexing.py:544(__iter__)\n", + " 1 0.010 0.010 0.010 0.010 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/function_base.py:1848(diff)\n", + " 1025 0.006 0.000 0.059 0.000 ../zarr/indexing.py:541(__iter__)\n", "\n", "\n" ] @@ -1746,28 +1694,28 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:19:31 2017 /tmp/tmpzhzjc9l7\n", + "Wed Nov 8 17:19:13 2017 /tmp/tmpay1gvnx8\n", "\n", - " 51336 function calls in 0.384 seconds\n", + " 52362 function calls in 0.398 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 84 to 7 due to restriction <7>\n", + " List reduced from 85 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 2 0.107 0.054 0.107 0.054 ../zarr/indexing.py:640()\n", - " 1024 0.090 0.000 0.095 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1024 0.043 0.000 0.160 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2 0.107 0.054 0.107 0.054 ../zarr/indexing.py:630()\n", + " 1024 0.091 0.000 0.096 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.041 0.000 0.160 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1 0.040 0.040 0.213 0.213 ../zarr/indexing.py:603(__init__)\n", " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.027 0.027 0.199 0.199 ../zarr/indexing.py:613(__init__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", - " 2048 0.011 0.000 0.011 0.000 ../zarr/indexing.py:705()\n", + " 2048 0.011 0.000 0.011 0.000 ../zarr/indexing.py:695()\n", "\n", "\n" ] @@ -1779,27 +1727,27 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 73, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:19:35 2017 /tmp/tmp52ytv9go\n", + "Wed Nov 8 17:19:20 2017 /tmp/tmpngsf6zpp\n", "\n", - " 98407 function calls in 1.780 seconds\n", + " 120946 function calls in 1.793 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 91 to 7 due to restriction <7>\n", + " List reduced from 92 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.113 1.113 1.113 1.113 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1024 0.149 0.000 0.301 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 1 0.129 0.129 1.404 1.404 ../zarr/indexing.py:342(__init__)\n", - " 1024 0.123 0.000 0.130 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1 0.121 0.121 0.121 0.121 {method 'take' of 'numpy.ndarray' objects}\n", - " 1025 0.031 0.000 0.032 0.000 ../zarr/indexing.py:404(__iter__)\n", + " 1 1.128 1.128 1.128 1.128 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1024 0.139 0.000 0.285 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1 0.132 0.132 1.422 1.422 ../zarr/indexing.py:369(__init__)\n", + " 1 0.120 0.120 0.120 0.120 {method 'take' of 'numpy.ndarray' objects}\n", + " 1024 0.116 0.000 0.123 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1025 0.034 0.000 0.034 0.000 ../zarr/indexing.py:424(__iter__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", "\n", "\n" @@ -1812,28 +1760,28 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:19:41 2017 /tmp/tmpmrkck66m\n", + "Wed Nov 8 17:19:22 2017 /tmp/tmpbskhj8de\n", "\n", - " 49294 function calls in 1.738 seconds\n", + " 50320 function calls in 1.730 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 86 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall 
filename:lineno(function)\n", - " 1 1.112 1.112 1.112 1.112 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 1024 0.137 0.000 0.278 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 2 0.121 0.061 0.121 0.061 ../zarr/indexing.py:664()\n", - " 1024 0.112 0.000 0.119 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 2 0.106 0.053 0.106 0.053 ../zarr/indexing.py:640()\n", - " 1 0.028 0.028 0.028 0.028 {built-in method numpy.core.multiarray.ravel_multi_index}\n", - " 1 0.026 0.026 1.431 1.431 ../zarr/indexing.py:613(__init__)\n", + " 1 1.116 1.116 1.116 1.116 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 1024 0.133 0.000 0.275 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2 0.121 0.060 0.121 0.060 ../zarr/indexing.py:654()\n", + " 1024 0.113 0.000 0.119 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2 0.100 0.050 0.100 0.050 ../zarr/indexing.py:630()\n", + " 1 0.030 0.030 0.030 0.030 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.024 0.024 1.427 1.427 ../zarr/indexing.py:603(__init__)\n", "\n", "\n" ] @@ -1859,16 +1807,16 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9985" + "9932" ] }, - "execution_count": 122, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } @@ -1881,14 +1829,14 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "15.6 ms ± 51 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "15.7 ms ± 38.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -1898,14 +1846,14 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "153 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "156 ms ± 2.1 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], @@ -1915,14 +1863,14 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "132 ms ± 580 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "133 ms ± 2.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -1932,28 +1880,28 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:20:47 2017 /tmp/tmpsvc8enk3\n", + "Wed Nov 8 17:20:09 2017 /tmp/tmpb7nqc9ax\n", "\n", - " 82936 function calls in 0.221 seconds\n", + " 98386 function calls in 0.191 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 82 to 7 due to restriction <7>\n", + " List reduced from 83 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1023 0.108 0.000 0.113 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1024 0.018 0.000 0.018 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.018 0.000 0.018 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", - " 1023 0.007 0.000 0.145 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 1024 0.006 0.000 0.052 0.000 ../zarr/indexing.py:544(__iter__)\n", - " 1023 0.005 0.000 0.027 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", - " 2046 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", + " 1024 0.093 0.000 0.098 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1025 0.017 0.000 0.017 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1024 0.007 0.000 0.007 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", + " 1024 0.007 0.000 0.129 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1025 0.005 0.000 0.052 0.000 ../zarr/indexing.py:541(__iter__)\n", + " 1024 0.005 0.000 0.025 0.000 
/home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", + " 2048 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", "\n", "\n" ] @@ -1965,28 +1913,28 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:20:51 2017 /tmp/tmplmk05u0l\n", + "Wed Nov 8 17:20:09 2017 /tmp/tmphsko8nvh\n", "\n", - " 51304 function calls in 0.171 seconds\n", + " 52382 function calls in 0.160 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 87 to 7 due to restriction <7>\n", + " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1023 0.099 0.000 0.104 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 2 0.019 0.010 0.019 0.010 {method 'nonzero' of 'numpy.ndarray' objects}\n", - " 1024 0.008 0.000 0.015 0.000 ../zarr/indexing.py:684(__iter__)\n", - " 1023 0.007 0.000 0.134 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 2046 0.005 0.000 0.005 0.000 ../zarr/indexing.py:705()\n", - " 2052 0.003 0.000 0.003 0.000 ../zarr/core.py:337()\n", - " 1 0.002 0.002 0.151 0.151 ../zarr/core.py:591(_get_selection)\n", + " 1024 0.093 0.000 0.098 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2 0.017 0.008 0.017 0.008 {method 'nonzero' of 'numpy.ndarray' objects}\n", + " 1025 0.008 0.000 0.014 0.000 ../zarr/indexing.py:674(__iter__)\n", + " 1024 0.006 0.000 0.127 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2048 0.004 0.000 0.004 0.000 ../zarr/indexing.py:695()\n", + " 2054 0.003 0.000 0.003 0.000 ../zarr/core.py:337()\n", + " 1024 0.002 0.000 0.005 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/core/arrayprint.py:381(wrapper)\n", "\n", "\n" ] @@ -2005,7 +1953,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -2014,7 +1962,7 @@ "10000" ] 
}, - "execution_count": 128, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -2028,14 +1976,14 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "18.6 µs ± 199 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" + "18.9 µs ± 392 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], @@ -2045,14 +1993,14 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "21.6 µs ± 348 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" + "20.3 µs ± 155 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], @@ -2062,14 +2010,14 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "126 ms ± 2.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "125 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -2079,14 +2027,14 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "110 ms ± 483 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "109 ms ± 428 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -2096,14 +2044,14 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "127 ms ± 652 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "132 ms ± 489 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], @@ -2113,14 +2061,14 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "106 ms ± 332 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "108 ms ± 579 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -2130,28 +2078,28 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:22:21 2017 /tmp/tmp58yngtag\n", + "Wed Nov 8 17:21:12 2017 /tmp/tmp0b0o2quo\n", "\n", - " 98407 function calls in 0.198 seconds\n", + " 120946 function calls in 0.196 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 91 to 7 due to restriction <7>\n", + " List reduced from 92 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.104 0.000 0.110 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1025 0.012 0.000 0.014 0.000 ../zarr/indexing.py:404(__iter__)\n", - " 1025 0.007 0.000 0.053 0.000 ../zarr/indexing.py:544(__iter__)\n", - " 2048 0.007 0.000 0.013 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", - " 1024 0.006 0.000 0.139 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 2048 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", - " 13324 0.004 0.000 0.010 0.000 {built-in method builtins.isinstance}\n", + " 1024 0.105 0.000 0.111 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 2048 0.006 0.000 0.013 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", + " 1025 0.006 0.000 0.051 0.000 ../zarr/indexing.py:541(__iter__)\n", + " 1024 0.006 0.000 0.141 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2048 0.005 0.000 0.005 0.000 ../zarr/core.py:337()\n", + " 15373 0.004 0.000 0.010 0.000 {built-in 
method builtins.isinstance}\n", + " 1025 0.004 0.000 0.005 0.000 ../zarr/indexing.py:424(__iter__)\n", "\n", "\n" ] @@ -2163,28 +2111,28 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:22:27 2017 /tmp/tmpgyi7rp6s\n", + "Wed Nov 8 17:21:19 2017 /tmp/tmpdwju98kn\n", "\n", - " 49294 function calls in 0.157 seconds\n", + " 50320 function calls in 0.167 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 86 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1024 0.103 0.000 0.109 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1025 0.009 0.000 0.016 0.000 ../zarr/indexing.py:684(__iter__)\n", - " 1024 0.006 0.000 0.137 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 2048 0.005 0.000 0.005 0.000 ../zarr/indexing.py:705()\n", - " 2054 0.003 0.000 0.003 0.000 ../zarr/core.py:337()\n", - " 1 0.003 0.003 0.156 0.156 ../zarr/core.py:591(_get_selection)\n", - " 1024 0.002 0.000 0.005 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/core/arrayprint.py:381(wrapper)\n", + " 1024 0.105 0.000 0.111 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1025 0.009 0.000 0.017 0.000 ../zarr/indexing.py:674(__iter__)\n", + " 1024 0.006 0.000 0.142 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 2048 0.005 0.000 0.005 0.000 ../zarr/indexing.py:695()\n", + " 2054 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", + " 1 0.003 0.003 0.162 0.162 ../zarr/core.py:591(_get_selection)\n", + " 1027 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", "\n", "\n" ] @@ -2210,13 +2158,13 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507490 (495.6K)
Storage ratio197.0
Chunks initialized256/256
" + "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507131 (495.2K)
Storage ratio197.2
Chunks initialized256/256
" ], "text/plain": [ "Type : zarr.core.Array\n", @@ -2228,12 +2176,12 @@ "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", - "No. bytes stored : 507490 (495.6K)\n", - "Storage ratio : 197.0\n", + "No. bytes stored : 507131 (495.2K)\n", + "Storage ratio : 197.2\n", "Chunks initialized : 256/256" ] }, - "execution_count": 137, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -2245,14 +2193,14 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "377 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "387 ms ± 5.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -2269,14 +2217,14 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "81 ms ± 969 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "80.3 ms ± 377 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -2286,154 +2234,103 @@ }, { "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "85.3 ms ± 5.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" - ] - } - ], - "source": [ - "%timeit np.array(c[::-2])" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.01 s ± 6.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit zc.oindex[::2]" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.18 s ± 5.95 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit zc.oindex[::-2]" - ] - }, - { - "cell_type": "code", - "execution_count": 145, + "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "326 ms ± 3.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "168 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ - "%timeit zc.oindex[::10]" + "%timeit zc[::2]" ] }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "361 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "136 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ - "%timeit zc.oindex[::-10]" + "%timeit zc[::10]" ] }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "152 ms ± 585 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "104 ms ± 1.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ - "%timeit zc.oindex[::100]" + "%timeit zc[::100]" ] }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "125 ms ± 772 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "100 ms ± 1.47 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ - "%timeit zc.oindex[::1000]" + "%timeit zc[::1000]" ] }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:24:57 2017 /tmp/tmpodht8l8m\n", + "Wed Nov 8 17:22:44 2017 /tmp/tmpg9dxqcpg\n", "\n", - " 79942 function calls in 1.075 seconds\n", + " 49193 function calls in 0.211 seconds\n", "\n", " Ordered by: internal time\n", - " List reduced from 82 to 7 due to restriction <7>\n", + " List reduced from 55 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 0.479 0.479 0.595 0.595 ../zarr/indexing.py:342(__init__)\n", - " 1024 0.117 0.000 0.256 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 1 0.116 0.116 0.116 0.116 {built-in method numpy.core.multiarray.bincount}\n", - " 1024 0.110 0.000 0.116 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1025 0.109 0.000 0.110 0.000 ../zarr/indexing.py:404(__iter__)\n", - " 1 0.069 0.069 0.069 0.069 {built-in method numpy.core.multiarray.arange}\n", - " 1 0.014 0.014 1.073 1.073 ../zarr/core.py:537(get_orthogonal_selection)\n", + " 1024 0.104 0.000 0.110 0.000 ../zarr/core.py:1028(_decode_chunk)\n", + " 1024 0.067 0.000 0.195 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1025 0.005 0.000 0.013 0.000 ../zarr/indexing.py:278(__iter__)\n", + " 2048 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", + " 2050 0.003 0.000 0.003 0.000 ../zarr/indexing.py:90(ceildiv)\n", + " 1025 0.003 0.000 0.006 0.000 ../zarr/indexing.py:109(__iter__)\n", + " 1024 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", "\n", "\n" ] } ], "source": [ - "profile('zc.oindex[::2]')" + "profile('zc[::2]')" ] }, { @@ -2452,7 +2349,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 99, "metadata": {}, "outputs": [ { @@ -2461,7 +2358,7 @@ "(100000000,)" ] }, - "execution_count": 
150, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -2472,7 +2369,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 100, "metadata": {}, "outputs": [ { @@ -2481,7 +2378,7 @@ "(100000, 1000)" ] }, - "execution_count": 151, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -2493,7 +2390,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 101, "metadata": {}, "outputs": [ { @@ -2516,7 +2413,7 @@ "Chunks initialized : 1024/1024" ] }, - "execution_count": 152, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } @@ -2535,7 +2432,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -2545,14 +2442,14 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 103, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "98.2 ms ± 995 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "101 ms ± 577 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -2562,14 +2459,14 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 104, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "362 ms ± 5.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "373 ms ± 5.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -2586,7 +2483,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -2596,14 +2493,14 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "176 ms ± 2.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "174 ms ± 4.13 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], @@ -2613,14 +2510,14 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "550 ms ± 9.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "566 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -2637,7 +2534,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 108, "metadata": {}, "outputs": [ { @@ -2646,7 +2543,7 @@ "10000000" ] }, - "execution_count": 159, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } @@ -2660,14 +2557,14 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "241 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "243 ms ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -2677,14 +2574,14 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2.02 s ± 7.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "2.03 s ± 17 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], @@ -2694,28 +2591,28 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wed Nov 8 11:28:12 2017 /tmp/tmp2r3kxkv1\n", + "Wed Nov 8 17:24:31 2017 /tmp/tmp7c68z70p\n", "\n", - " 61645 function calls in 2.260 seconds\n", + " 62673 function calls in 2.065 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", - " 1 1.164 1.164 1.164 1.164 {method 'argsort' of 'numpy.ndarray' objects}\n", - " 3 0.272 0.091 0.272 0.091 ../zarr/indexing.py:664()\n", - " 3 0.202 0.067 0.202 0.067 ../zarr/indexing.py:640()\n", - " 1024 0.201 0.000 0.433 0.000 ../zarr/core.py:849(_chunk_getitem)\n", - " 1024 0.187 0.000 0.197 0.000 ../zarr/core.py:1028(_decode_chunk)\n", - " 1 0.056 0.056 1.778 1.778 ../zarr/indexing.py:613(__init__)\n", + " 1 1.112 1.112 1.112 1.112 {method 'argsort' of 'numpy.ndarray' objects}\n", + " 3 0.244 0.081 0.244 0.081 ../zarr/indexing.py:654()\n", + " 3 0.193 0.064 0.193 0.064 ../zarr/indexing.py:630()\n", + " 1024 0.170 0.000 0.350 0.000 ../zarr/core.py:849(_chunk_getitem)\n", + " 1024 0.142 0.000 0.151 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1 0.044 0.044 0.044 0.044 {built-in method numpy.core.multiarray.ravel_multi_index}\n", + " 1 0.043 0.043 1.676 1.676 ../zarr/indexing.py:603(__init__)\n", "\n", "\n" ] diff --git a/zarr/indexing.py b/zarr/indexing.py index 2da6de2665..5a79642c94 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -12,12 +12,18 @@ def is_integer(x): return isinstance(x, numbers.Integral) -def is_integer_array(x): - return hasattr(x, 'dtype') and x.dtype.kind in 'ui' +def is_integer_array(x, ndim=None): + t = hasattr(x, 'shape') and hasattr(x, 'dtype') and x.dtype.kind in 'ui' + if ndim is not None: + t = t and len(x.shape) == ndim + return t -def 
is_bool_array(x): - return hasattr(x, 'dtype') and x.dtype == bool +def is_bool_array(x, ndim=None): + t = hasattr(x, 'shape') and hasattr(x, 'dtype') and x.dtype == bool + if ndim is not None: + t = t and len(x.shape) == ndim + return t def is_scalar(value, dtype): @@ -81,47 +87,65 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) +def ceildiv(a, b): + return int(np.ceil(a / b)) + + class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): # normalize - self.start, self.stop, _ = dim_sel.indices(dim_len) + self.start, self.stop, self.step = dim_sel.indices(dim_len) + if self.step < 1: + raise IndexError('only slices with step >= 1 are supported') # store attributes self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len - self.nitems = max(0, self.stop - self.start) + self.nitems = max(0, ceildiv((self.stop - self.start), self.step)) + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) def __iter__(self): - dim_chunk_from = self.start // self.dim_chunk_len - dim_chunk_to = int(np.ceil(self.stop / self.dim_chunk_len)) + # figure out the range of chunks we need to visit + dim_chunk_ix_from = self.start // self.dim_chunk_len + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) - for dim_chunk_ix in range(dim_chunk_from, dim_chunk_to): + # iterate over chunks in range + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) - if self.start <= dim_offset: + # determine chunk length, accounting for trailing chunk + dim_chunk_len = dim_limit - dim_offset + + if self.start < dim_offset: # selection starts before current chunk dim_chunk_sel_start = 0 - dim_out_offset = dim_offset - self.start + remainder = (dim_offset - self.start) % self.step + if remainder: + dim_chunk_sel_start += self.step - remainder + # compute 
number of previous items, provides offset into output array + dim_out_offset = ceildiv((dim_offset - self.start), self.step) else: # selection starts within current chunk dim_chunk_sel_start = self.start - dim_offset dim_out_offset = 0 - if self.stop > (dim_offset + self.dim_chunk_len): + if self.stop > dim_limit: # selection ends after current chunk - dim_chunk_sel_stop = self.dim_chunk_len + dim_chunk_sel_stop = dim_chunk_len else: # selection ends within current chunk dim_chunk_sel_stop = self.stop - dim_offset - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop) - dim_chunk_nitems = dim_chunk_sel_stop - dim_chunk_sel_start + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) @@ -166,6 +190,13 @@ def replace_ellipsis(selection, shape): return selection +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel + for dim_sel in selection + ) + + def ensure_tuple(v): if not isinstance(v, tuple): v = (v,) @@ -190,8 +221,16 @@ def ensure_tuple(v): """ +def is_slice(s): + return isinstance(s, slice) + + def is_contiguous_slice(s): - return isinstance(s, slice) and (s.step is None or s.step == 1) + return is_slice(s) and (s.step is None or s.step == 1) + + +def is_positive_slice(s): + return is_slice(s) and (s.step is None or s.step >= 1) def is_contiguous_selection(selection): @@ -202,6 +241,11 @@ def is_contiguous_selection(selection): ]) +def is_basic_selection(selection): + selection = ensure_tuple(selection) + return all([is_integer(s) or is_positive_slice(s) for s in selection]) + + # noinspection PyProtectedMember class BasicIndexer(object): @@ -217,12 +261,12 @@ def __init__(self, selection, array): if is_integer(dim_sel): dim_indexer = 
IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - elif is_contiguous_slice(dim_sel): + elif is_slice(dim_sel): dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - raise IndexError('unsupported selection type; expected integer or contiguous ' - 'slice, got {!r}'.format(dim_sel)) + raise IndexError('unsupported selection type; expected integer or slice, got {!r}' + .format(type(dim_sel))) dim_indexers.append(dim_indexer) @@ -247,7 +291,7 @@ class BoolArrayDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): # check number of dimensions - if len(dim_sel.shape) > 1: + if not is_bool_array(dim_sel, 1): raise IndexError('selection must be a 1d array') # check shape @@ -258,7 +302,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): self.dim_sel = dim_sel self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) # precompute number of selected items for each chunk self.chunk_nitems = np.zeros(self.nchunks, dtype='i8') @@ -327,9 +371,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len, wraparound=True, boundscheck # ensure array dim_sel = np.asanyarray(dim_sel) - - # check number of dimensions - if dim_sel.ndim != 1: + if not is_integer_array(dim_sel, 1): raise IndexError('selection must be a 1d array') # handle wraparound @@ -346,7 +388,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len, wraparound=True, boundscheck # store attributes self.dim_len = dim_len self.dim_chunk_len = dim_chunk_len - self.nchunks = int(np.ceil(self.dim_len / self.dim_chunk_len)) + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) self.nitems = len(dim_sel) # determine which chunk is needed for each selection item @@ -363,8 +405,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len, wraparound=True, boundscheck self.dim_out_sel = None elif self.order == Order.DECREASING: self.dim_sel = dim_sel[::-1] - # TODO I'm 
sure there's a way to do this without creating an arange, but can't see it - # at the moment + # TODO do this without creating an arange self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) else: # sort indices to group by chunk @@ -410,7 +451,8 @@ def ix_(selection, shape): """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ but with support for slices and single ints.""" - selection = ensure_tuple(selection) + # normalisation + selection = replace_ellipsis(selection, shape) # replace slice and int as these are not supported by numpy.ix_ selection = [slice_to_range(dim_sel, dim_len) if isinstance(dim_sel, slice) @@ -426,7 +468,7 @@ def ix_(selection, shape): def oindex(a, selection): """Implementation of orthogonal indexing with slices and ints.""" - selection = ensure_tuple(selection) + selection = replace_ellipsis(selection, a.shape) drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) selection = ix_(selection, a.shape) result = a[selection] @@ -436,6 +478,7 @@ def oindex(a, selection): def oindex_set(a, selection, value): + selection = replace_ellipsis(selection, a.shape) drop_axes = tuple([i for i, s in enumerate(selection) if is_integer(s)]) selection = ix_(selection, a.shape) if not np.isscalar(value) and drop_axes: @@ -468,25 +511,7 @@ def __init__(self, selection, array): elif isinstance(dim_sel, slice): - # normalize so we can check for step - start, stop, strides = dim_sel.indices(dim_len) - # dim_sel = normalize_slice_selection(dim_sel, dim_len) - - # handle slice with step - if strides != 1: - dim_sel = np.arange(start, stop, strides) - if strides > 1: - order = Order.INCREASING - elif strides < 0: - order = Order.DECREASING - else: - # TODO better error here? 
- raise RuntimeError('unexpected strides') - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len, - wraparound=False, boundscheck=False, - order=order) - else: - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_integer_array(dim_sel): @@ -498,7 +523,7 @@ def __init__(self, selection, array): else: # TODO improve and refactor error messages - raise IndexError('unsupported selection type') + raise IndexError('unsupported selection type {!r}'.format(type(dim_sel))) dim_indexers.append(dim_indexer) @@ -506,8 +531,7 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.is_advanced = any([not isinstance(dim_indexer, (IntDimIndexer, SliceDimIndexer)) - for dim_indexer in self.dim_indexers]) + self.is_advanced = not is_basic_selection(selection) if self.is_advanced: self.drop_axes = tuple([i for i, dim_indexer in enumerate(self.dim_indexers) if isinstance(dim_indexer, IntDimIndexer)]) @@ -531,7 +555,7 @@ def __iter__(self): chunk_selection = ix_(chunk_selection, self.array._chunks) # special case for non-monotonic indices - if any([not isinstance(s, (numbers.Integral, slice)) for s in out_selection]): + if not is_basic_selection(out_selection): out_selection = ix_(out_selection, self.shape) yield ChunkProjection(chunk_coords, chunk_selection, out_selection) @@ -573,13 +597,6 @@ def is_mask_selection(selection, array): ) -def replace_lists(selection): - return tuple( - np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel - for dim_sel in selection - ) - - # noinspection PyProtectedMember class CoordinateIndexer(object): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 6b2e34a107..02e9a00c63 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -7,7 +7,7 @@ from nose.tools import assert_raises, eq_ 
as eq -from zarr.indexing import normalize_integer_selection, replace_ellipsis, oindex, oindex_set +from zarr.indexing import (normalize_integer_selection, replace_ellipsis, oindex, oindex_set) import zarr @@ -103,6 +103,89 @@ def test_get_basic_selection_0d(): assert_array_equal(a[['foo', 'bar']], c) +basic_selections_1d = [ + # single value + 42, + -1, + # slices + slice(0, 1050), + slice(50, 150), + slice(0, 2000), + slice(-150, -50), + slice(-2000, 2000), + slice(0, 0), # empty result + slice(-1, 0), # empty result + # total selections + slice(None), + ..., + (), + (..., slice(None)), + # slice with step + slice(None), + slice(None, None), + slice(None, None, 1), + slice(None, None, 10), + slice(None, None, 100), + slice(None, None, 1000), + slice(None, None, 10000), + slice(0, 1050), + slice(0, 1050, 1), + slice(0, 1050, 10), + slice(0, 1050, 100), + slice(0, 1050, 1000), + slice(0, 1050, 10000), + slice(1, 31, 3), + slice(1, 31, 30), + slice(1, 31, 300), + slice(81, 121, 3), + slice(81, 121, 30), + slice(81, 121, 300), + slice(50, 150), + slice(50, 150, 1), + slice(50, 150, 10), +] + + +basic_selections_1d_bad = [ + # only positive step supported + slice(None, None, -1), + slice(None, None, -10), + slice(None, None, -100), + slice(None, None, -1000), + slice(None, None, -10000), + slice(1050, -1, -1), + slice(1050, -1, -10), + slice(1050, -1, -100), + slice(1050, -1, -1000), + slice(1050, -1, -10000), + slice(1050, 0, -1), + slice(1050, 0, -10), + slice(1050, 0, -100), + slice(1050, 0, -1000), + slice(1050, 0, -10000), + slice(150, 50, -1), + slice(150, 50, -10), + slice(31, 1, -3), + slice(121, 81, -3), + slice(-1, 0, -1), + # bad stuff + 2.3, + 'foo', + b'xxx', + None, + (0, 0), + (slice(None), slice(None)), +] + + +def _test_get_basic_selection(a, z, selection): + expect = a[selection] + actual = z.get_basic_selection(selection) + assert_array_equal(expect, actual) + actual = z[selection] + assert_array_equal(expect, actual) + + # noinspection 
PyStatementEffect def test_get_basic_selection_1d(): @@ -111,44 +194,69 @@ def test_get_basic_selection_1d(): z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) z[:] = a - selections = [ - # single value - 42, - -1, - # slices - slice(None), - slice(0, 1050), - slice(50, 150), - slice(0, 2000), - slice(-150, -50), - slice(-2000, 2000), - slice(0, 0), # empty result - slice(-1, 0), # empty result - # total selections - ..., - (), - (..., slice(None)), - ] - - for selection in selections: - expect = a[selection] - # long-form API - actual = z.get_basic_selection(selection) - assert_array_equal(expect, actual) - # basic selection available via __getitem__ - actual = z[selection] - assert_array_equal(expect, actual) + for selection in basic_selections_1d: + _test_get_basic_selection(a, z, selection) - with assert_raises(IndexError): - z[::2] # slice with step - with assert_raises(IndexError): - z[::-1] # slice with step - with assert_raises(IndexError): - z[[0, 1]] # fancy indexing - with assert_raises(IndexError): - z[0, 0] # too many indices - with assert_raises(IndexError): - z[:, :] # too many indices + bad_selections = basic_selections_1d_bad + [ + [0, 1], # fancy indexing + ] + for selection in bad_selections: + with assert_raises(IndexError): + z.get_basic_selection(selection) + with assert_raises(IndexError): + z[selection] + + +basic_selections_2d = [ + # single row + 42, + -1, + (42, slice(None)), + (-1, slice(None)), + # single col + (slice(None), 4), + (slice(None), -1), + # row slices + slice(None), + slice(0, 1000), + slice(250, 350), + slice(0, 2000), + slice(-350, -250), + slice(0, 0), # empty result + slice(-1, 0), # empty result + slice(-2000, 0), + slice(-2000, 2000), + # 2D slices + (slice(None), slice(1, 5)), + (slice(250, 350), slice(None)), + (slice(250, 350), slice(1, 5)), + (slice(250, 350), slice(-5, -1)), + (slice(250, 350), slice(-50, 50)), + (slice(250, 350, 10), slice(1, 5)), + (slice(250, 350), slice(1, 5, 2)), + (slice(250, 350, 
33), slice(1, 5, 3)), + # total selections + (slice(None), slice(None)), + ..., + (), + (..., slice(None)), + (..., slice(None), slice(None)), +] + + +basic_selections_2d_bad = [ + # bad stuff + 2.3, + 'foo', + b'xxx', + None, + (2.3, slice(None)), + # only positive step supported + slice(None, None, -1), + (slice(None, None, -1), slice(None)), + (0, 0, 0), + (slice(None), slice(None), slice(None)), +] # noinspection PyStatementEffect @@ -159,60 +267,20 @@ def test_get_basic_selection_2d(): z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) z[:] = a - selections = [ - # single row - 42, - -1, - (42, slice(None)), - (-1, slice(None)), - # single col - (slice(None), 4), - (slice(None), -1), - # row slices - slice(None), - slice(0, 1000), - slice(250, 350), - slice(0, 2000), - slice(-350, -250), - slice(0, 0), # empty result - slice(-1, 0), # empty result - slice(-2000, 0), - slice(-2000, 2000), - # 2D slices - (slice(None), slice(1, 5)), - (slice(250, 350), slice(None)), - (slice(250, 350), slice(1, 5)), - (slice(250, 350), slice(-5, -1)), - (slice(250, 350), slice(-50, 50)), - # total selections - (slice(None), slice(None)), - ..., - (), - (..., slice(None)), - (..., slice(None), slice(None)), - ] - - for selection in selections: - expect = a[selection] - # long-form API - actual = z.get_basic_selection(selection) - assert_array_equal(expect, actual) - # basic selection available via __getitem__ - actual = z[selection] - assert_array_equal(expect, actual) + for selection in basic_selections_2d: + _test_get_basic_selection(a, z, selection) - with assert_raises(IndexError): - z[::2] # slice with step - with assert_raises(IndexError): - z[:, ::2] # slice with step - with assert_raises(IndexError): - z[[0, 1]] # fancy indexing - with assert_raises(IndexError): - z[:, [0, 1]] # fancy indexing - with assert_raises(IndexError): - z[0, 0, 0] # too many indices - with assert_raises(IndexError): - z[:, :, :] # too many indices + bad_selections = 
basic_selections_2d_bad + [ + # integer arrays + [0, 1], + ([0, 1], [0, 1]), + (slice(None), [0, 1]), + ] + for selection in bad_selections: + with assert_raises(IndexError): + z.get_basic_selection(selection) + with assert_raises(IndexError): + z[selection] def test_set_basic_selection_0d(): @@ -262,15 +330,12 @@ def test_set_basic_selection_0d(): z[..., 'foo', 'bar'] = v[['foo', 'bar']] -def _test_get_orthogonal_selection_1d_common(a, z, ix): - expect = a[ix] - actual = z.get_orthogonal_selection(ix) +def _test_get_orthogonal_selection(a, z, selection): + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) assert_array_equal(expect, actual) - actual = z.oindex[ix] + actual = z.oindex[selection] assert_array_equal(expect, actual) - # # for 1d arrays, also available via __getitem__ - # actual = z[ix] - # assert_array_equal(expect, actual) # noinspection PyStatementEffect @@ -285,7 +350,7 @@ def test_get_orthogonal_selection_1d_bool(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - _test_get_orthogonal_selection_1d_common(a, z, ix) + _test_get_orthogonal_selection(a, z, ix) # test errors with assert_raises(IndexError): @@ -307,79 +372,40 @@ def test_get_orthogonal_selection_1d_int(): np.random.seed(42) # test with different degrees of sparseness for p in 2, 0.5, 0.1, 0.01: + # unordered ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - _test_get_orthogonal_selection_1d_common(a, z, ix) + _test_get_orthogonal_selection(a, z, ix) + # increasing ix.sort() - _test_get_orthogonal_selection_1d_common(a, z, ix) + _test_get_orthogonal_selection(a, z, ix) + # decreasing + ix = ix[::-1] + _test_get_orthogonal_selection(a, z, ix) - selections = [ - # test single value - 0, - -1, + selections = basic_selections_1d + [ # test wraparound [0, 3, 10, -23, -12, -1], # explicit test not sorted - [3, 105, 23, 127], # not monotonically increasing + 
[3, 105, 23, 127], ] for selection in selections: - expect = a[selection] - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - - # test errors - with assert_raises(IndexError): - ix = [a.shape[0] + 1] # out of bounds - z.oindex[ix] - with assert_raises(IndexError): - ix = [-(a.shape[0] + 1)] # out of bounds - z.oindex[ix] - with assert_raises(IndexError): - ix = [[2, 4], [6, 8]] # too many dimensions - z.oindex[ix] - - -def test_get_orthogonal_selection_1d_slice_with_step(): - - # setup - a = np.arange(1050, dtype=int) - z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) - z[:] = a + _test_get_orthogonal_selection(a, z, selection) - selections = [ - slice(0, 1050), - slice(0, 1050, 1), - slice(0, 1050, 10), - slice(0, 1050, 100), - slice(0, 1050, 1000), - slice(1050, 0, -1), - slice(1050, 0, -10), - slice(50, 150), - slice(50, 150, 1), - slice(50, 150, 10), - slice(150, 50, -1), - slice(150, 50, -10), - slice(-1, 0, -1), + bad_selections = basic_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + [[2, 4], [6, 8]], # too many dimensions ] - for selection in selections: - expect = a[selection] - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - # # for 1d arrays also available via __getitem__ - # actual = z[selection] - # assert_array_equal(expect, actual) + for selection in bad_selections: + with assert_raises(IndexError): + z.get_orthogonal_selection(selection) + with assert_raises(IndexError): + z.oindex[selection] -def _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1): - +def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): selections = [ - # single value - (42, 4), - (-1, -1), # index both axes with array (ix0, ix1), # mixed indexing with array / slice @@ -391,16 +417,12 @@ def 
_test_get_orthogonal_selection_2d_common(a, z, ix0, ix1): (ix0, 4), (42, ix1), ] - for selection in selections: - expect = oindex(a, selection) - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) + _test_get_orthogonal_selection(a, z, selection) -def test_get_orthogonal_selection_2d_bool(): +# noinspection PyStatementEffect +def test_get_orthogonal_selection_2d(): # setup a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -410,11 +432,11 @@ def test_get_orthogonal_selection_2d_bool(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: + + # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) - - # main tests - _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) # mixed int array / bool array selections = ( @@ -422,31 +444,30 @@ def test_get_orthogonal_selection_2d_bool(): (np.nonzero(ix0)[0], ix1), ) for selection in selections: - expect = oindex(a, selection) - actual = z.oindex[selection] - assert_array_equal(expect, actual) - - -def test_get_orthogonal_selection_2d_int(): - - # setup - a = np.arange(10000, dtype=int).reshape(1000, 10) - z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - z[:] = a + _test_get_orthogonal_selection(a, z, selection) - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: + # integer arrays ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) ix0.sort() ix1.sort() - _test_get_orthogonal_selection_2d_common(a, z, ix0, ix1) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 
= ix1[::-1] + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + for selection in basic_selections_2d: + _test_get_orthogonal_selection(a, z, selection) -def _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2): + for selection in basic_selections_2d_bad: + with assert_raises(IndexError): + z.get_orthogonal_selection(selection) + with assert_raises(IndexError): + z.oindex[selection] + +def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2): selections = [ # single value (84, 42, 4), @@ -478,14 +499,10 @@ def _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2): (ix0, 42, ix2), ] for selection in selections: - expect = oindex(a, selection) - actual = z.get_orthogonal_selection(selection) - assert_array_equal(expect, actual) - actual = z.oindex[selection] - assert_array_equal(expect, actual) + _test_get_orthogonal_selection(a, z, selection) -def test_get_orthogonal_selection_3d_bool(): +def test_get_orthogonal_selection_3d(): # setup a = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -495,10 +512,26 @@ def test_get_orthogonal_selection_3d_bool(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: + + # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() + ix2.sort() + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + 
_test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) def test_orthogonal_indexing_edge_cases(): @@ -516,77 +549,55 @@ def test_orthogonal_indexing_edge_cases(): assert_array_equal(expect, actual) -def test_get_orthogonal_selection_3d_int(): - - # setup - a = np.arange(100000, dtype=int).reshape(200, 50, 10) - z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - z[:] = a - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: - ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) - _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2) - ix0.sort() - ix1.sort() - ix2.sort() - _test_get_orthogonal_selection_3d_common(a, z, ix0, ix1, ix2) - - -def _test_set_orthogonal_selection_1d_common(v, a, z, ix): - for value in 42, oindex(v, ix), oindex(v, ix).tolist(): +def _test_set_orthogonal_selection(v, a, z, selection): + for value in 42, oindex(v, selection), oindex(v, selection).tolist(): + if isinstance(value, list) and value == []: + # skip these cases as cannot preserve all dimensions + continue # setup expectation a[:] = 0 - a[ix] = value + oindex_set(a, selection, value) # long-form API z[:] = 0 - z.set_orthogonal_selection(ix, value) + z.set_orthogonal_selection(selection, value) assert_array_equal(a, z[:]) # short-form API z[:] = 0 - z.oindex[ix] = value + z.oindex[selection] = value assert_array_equal(a, z[:]) -def test_set_orthogonal_selection_1d_bool(): +def test_set_orthogonal_selection_1d(): # setup v = np.arange(1050, dtype=int) a = np.empty(v.shape, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) - np.random.seed(42) # test with different degrees of sparseness + np.random.seed(42) for p in 0.5, 0.1, 0.01: - ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - 
_test_set_orthogonal_selection_1d_common(v, a, z, ix) - -def test_set_orthogonal_selection_1d_int(): - - # setup - v = np.arange(1050, dtype=int) - a = np.empty(v.shape, dtype=v.dtype) - z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + # boolean arrays + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_orthogonal_selection(v, a, z, ix) - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: + # integer arrays ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) - _test_set_orthogonal_selection_1d_common(v, a, z, ix) + _test_set_orthogonal_selection(v, a, z, ix) ix.sort() - _test_set_orthogonal_selection_1d_common(v, a, z, ix) + _test_set_orthogonal_selection(v, a, z, ix) + ix = ix[::-1] + _test_set_orthogonal_selection(v, a, z, ix) + # basic selections + for selection in basic_selections_1d: + _test_set_orthogonal_selection(v, a, z, selection) -def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): - selections = ( - # single value - (42, 4), - (-1, -1), +def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): + + selections = [ # index both axes with array (ix0, ix1), # mixed indexing with array / slice or int @@ -594,23 +605,12 @@ def _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1): (slice(250, 350), ix1), (ix0, 4), (42, ix1), - ) + ] for selection in selections: - for value in 42, oindex(v, selection), oindex(v, selection).tolist(): - # setup expectation - a[:] = 0 - oindex_set(a, selection, value) - # long-form API - z[:] = 0 - z.set_orthogonal_selection(selection, value) - assert_array_equal(a, z[:]) - # short-form API - z[:] = 0 - z.oindex[selection] = value - assert_array_equal(a, z[:]) + _test_set_orthogonal_selection(v, a, z, selection) -def test_set_orthogonal_selection_2d_bool(): +def test_set_orthogonal_selection_2d(): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) @@ -620,30 +620,28 @@ def 
test_set_orthogonal_selection_2d_bool(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: + + # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) - _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1) - + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) -def test_set_orthogonal_selection_2d_int(): - - # setup - v = np.arange(10000, dtype=int).reshape(1000, 10) - a = np.empty_like(v) - z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: + # integer arrays ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) - _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) ix0.sort() ix1.sort() - _test_set_orthogonal_selection_2d_common(v, a, z, ix0, ix1) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_set_orthogonal_selection(v, a, z, selection) -def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): +def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): selections = ( # single value @@ -667,21 +665,10 @@ def _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2): (ix0, ix1, 4), ) for selection in selections: - for value in 42, oindex(v, selection), oindex(v, selection).tolist(): - # setup expectation - a[:] = 0 - oindex_set(a, selection, value) - # long-form API - z[:] = 0 - z.set_orthogonal_selection(selection, value) - assert_array_equal(a, z[:]) - # short-form API - z[:] = 0 - z.oindex[selection] = value - assert_array_equal(a, z[:]) + _test_set_orthogonal_selection(v, a, z, selection) -def 
test_set_orthogonal_selection_3d_bool(): +def test_set_orthogonal_selection_3d(): # setup v = np.arange(100000, dtype=int).reshape(200, 50, 10) @@ -691,30 +678,49 @@ def test_set_orthogonal_selection_3d_bool(): np.random.seed(42) # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: + + # boolean arrays ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) ix1 = np.random.binomial(1, .5, size=a.shape[1]).astype(bool) ix2 = np.random.binomial(1, .5, size=a.shape[2]).astype(bool) - _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2) - - -def test_set_orthogonal_selection_3d_int(): + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) - # setup - v = np.arange(100000, dtype=int).reshape(200, 50, 10) - a = np.empty_like(v) - z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) - - np.random.seed(42) - # test with different degrees of sparseness - for p in 2, 0.5, 0.1, 0.01: + # integer arrays ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) - _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) ix0.sort() ix1.sort() ix2.sort() - _test_set_orthogonal_selection_3d_common(v, a, z, ix0, ix1, ix2) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + +def _test_get_coordinate_selection(a, z, selection): + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +coordinate_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + ..., + # bad stuff + 2.3, + 'foo', + b'xxx', + None, + (0, 0), + (slice(None), 
slice(None)), +] # noinspection PyStatementEffect @@ -730,17 +736,11 @@ def test_get_coordinate_selection_1d(): for p in 2, 0.5, 0.1, 0.01: n = int(a.size * p) ix = np.random.choice(a.shape[0], size=n, replace=True) - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, ix) ix.sort() - expect = a[ix] - actual = z.get_coordinate_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, ix) + ix = ix[::-1] + _test_get_coordinate_selection(a, z, ix) selections = [ # test single item @@ -754,27 +754,18 @@ def test_get_coordinate_selection_1d(): np.array([[2, 4], [6, 8]]), ] for selection in selections: - expect = a[selection] - # long-form API - actual = z.get_coordinate_selection(selection) - assert_array_equal(expect, actual) - # short-form API - actual = z.vindex[selection] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, selection) # test errors - with assert_raises(IndexError): - ix = [a.shape[0] + 1] # out of bounds - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = [-(a.shape[0] + 1)] # out of bounds - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = slice(5, 15) # not supported - z.get_coordinate_selection(ix) - with assert_raises(IndexError): - ix = ... 
# not supported - z.get_coordinate_selection(ix) + bad_selections = coordinate_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + ] + for selection in bad_selections: + with assert_raises(IndexError): + z.get_coordinate_selection(selection) + with assert_raises(IndexError): + z.vindex[selection] def test_get_coordinate_selection_2d(): @@ -801,44 +792,25 @@ def test_get_coordinate_selection_2d(): (42, ix1), (42, 4), ] - for selection in selections: - expect = a[selection] - # long-form API - actual = z.get_coordinate_selection(selection) - assert_array_equal(expect, actual) - # short-form API - actual = z.vindex[selection] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, selection) # not monotonically increasing (first dim) ix0 = [3, 3, 4, 2, 5] ix1 = [1, 3, 5, 7, 9] - expect = a[ix0, ix1] - actual = z.get_coordinate_selection((ix0, ix1)) - assert_array_equal(expect, actual) - actual = z.vindex[ix0, ix1] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, (ix0, ix1)) # not monotonically increasing (second dim) ix0 = [1, 1, 2, 2, 5] ix1 = [1, 3, 2, 1, 0] - expect = a[ix0, ix1] - actual = z.get_coordinate_selection((ix0, ix1)) - assert_array_equal(expect, actual) - actual = z.vindex[ix0, ix1] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, (ix0, ix1)) # multi-dimensional selection ix0 = np.array([[1, 1, 2], [2, 2, 5]]) ix1 = np.array([[1, 3, 2], [1, 0, 0]]) - expect = a[ix0, ix1] - actual = z.get_coordinate_selection((ix0, ix1)) - assert_array_equal(expect, actual) - actual = z.vindex[ix0, ix1] - assert_array_equal(expect, actual) + _test_get_coordinate_selection(a, z, (ix0, ix1)) with assert_raises(IndexError): selection = slice(5, 15), [1, 2, 3] @@ -854,7 +826,22 @@ def test_get_coordinate_selection_2d(): z.get_coordinate_selection(selection) -def test_set_coordinate_selection_1d_int(): +def _test_set_coordinate_selection(v, a, z, 
selection): + for value in 42, v[selection], v[selection].tolist(): + # setup expectation + a[:] = 0 + a[selection] = value + # test long-form API + z[:] = 0 + z.set_coordinate_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.vindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_coordinate_selection_1d(): # setup v = np.arange(1050, dtype=int) @@ -866,33 +853,20 @@ def test_set_coordinate_selection_1d_int(): for p in 2, 0.5, 0.1, 0.01: n = int(a.size * p) ix = np.random.choice(a.shape[0], size=n, replace=True) - - a[:] = 0 - a[ix] = v[ix] - z[:] = 0 - z.set_coordinate_selection(ix, v[ix]) - assert_array_equal(a, z[:]) - z[:] = 0 - z.vindex[ix] = v[ix] - assert_array_equal(a, z[:]) + _test_set_coordinate_selection(v, a, z, ix) # multi-dimensional selection ix = np.array([[2, 4], [6, 8]]) - for value in 42, v[ix], v[ix].tolist(): - # setup expectation - a[:] = 0 - a[ix] = value - # test long-form API - z[:] = 0 - z.set_coordinate_selection(ix, value) - assert_array_equal(a, z[:]) - # test short-form API - z[:] = 0 - z.vindex[ix] = value - assert_array_equal(a, z[:]) + _test_set_coordinate_selection(v, a, z, ix) + for selection in coordinate_selections_1d_bad: + with assert_raises(IndexError): + z.set_coordinate_selection(selection, 42) + with assert_raises(IndexError): + z.vindex[selection] = 42 -def test_set_coordinate_selection_2d_int(): + +def test_set_coordinate_selection_2d(): # setup v = np.arange(10000, dtype=int).reshape(1000, 10) @@ -915,35 +889,38 @@ def test_set_coordinate_selection_2d_int(): (ix0, 4), (42, ix1), ) - for selection in selections: - a[:] = 0 - a[selection] = v[selection] - z[:] = 0 - z.set_coordinate_selection(selection, v[selection]) - assert_array_equal(a, z[:]) - z[:] = 0 - z.vindex[selection] = v[selection] - assert_array_equal(a, z[:]) + _test_set_coordinate_selection(v, a, z, selection) # multi-dimensional selection ix0 = np.array([[1, 2, 3], [4, 5, 6]]) ix1 = 
np.array([[1, 3, 2], [2, 0, 5]]) + _test_set_coordinate_selection(v, a, z, (ix0, ix1)) + + +def _test_get_mask_selection(a, z, selection): + expect = a[selection] + actual = z.get_mask_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) - for value in 42, v[ix0, ix1], v[ix0, ix1].tolist(): - # setup expectation - a[:] = 0 - a[ix0, ix1] = value - # test long-form API - z[:] = 0 - z.set_coordinate_selection((ix0, ix1), value) - assert_array_equal(a, z[:]) - # test short-form API - z[:] = 0 - z.vindex[ix0, ix1] = value - assert_array_equal(a, z[:]) + +mask_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + ..., + # bad stuff + 2.3, + 'foo', + b'xxx', + None, + (0, 0), + (slice(None), slice(None)), +] # noinspection PyStatementEffect @@ -958,23 +935,19 @@ def test_get_mask_selection_1d(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) - - expect = a[ix] - actual = z.get_mask_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) - # # for 1d arrays, also available via __getitem__ - # actual = z[ix] - # assert_array_equal(expect, actual) + _test_get_mask_selection(a, z, ix) # test errors - with assert_raises(IndexError): - z.vindex[np.zeros(50, dtype=bool)] # too short - with assert_raises(IndexError): - z.vindex[np.zeros(2000, dtype=bool)] # too long - with assert_raises(IndexError): - z.vindex[[[True, False], [False, True]]] # too many dimensions + bad_selections = mask_selections_1d_bad + [ + np.zeros(50, dtype=bool), # too short + np.zeros(2000, dtype=bool), # too long + [[True, False], [False, True]], # too many dimensions + ] + for selection in bad_selections: + with assert_raises(IndexError): + z.get_mask_selection(selection) + with assert_raises(IndexError): + z.vindex[selection] # noinspection PyStatementEffect @@ -989,11 
+962,7 @@ def test_get_mask_selection_2d(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) - expect = a[ix] - actual = z.get_mask_selection(ix) - assert_array_equal(expect, actual) - actual = z.vindex[ix] - assert_array_equal(expect, actual) + _test_get_mask_selection(a, z, ix) # test errors with assert_raises(IndexError): @@ -1004,6 +973,17 @@ def test_get_mask_selection_2d(): z.vindex[[True, False]] # wrong no. dimensions +def _test_set_mask_selection(v, a, z, selection): + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z.set_mask_selection(selection, v[selection]) + assert_array_equal(a, z[:]) + z[:] = 0 + z.vindex[selection] = v[selection] + assert_array_equal(a, z[:]) + + def test_set_mask_selection_1d(): # setup @@ -1015,19 +995,13 @@ def test_set_mask_selection_1d(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_mask_selection(v, a, z, ix) - a[:] = 0 - z[:] = 0 - a[ix] = v[ix] - z.set_mask_selection(ix, v[ix]) - assert_array_equal(a, z[:]) - z[:] = 0 - z.vindex[ix] = v[ix] - assert_array_equal(a, z[:]) - # # for 1d arrays, also available via __setitem__ - # z[:] = 0 - # z[ix] = v[ix] - # assert_array_equal(a, z[:]) + for selection in mask_selections_1d_bad: + with assert_raises(IndexError): + z.set_mask_selection(selection, 42) + with assert_raises(IndexError): + z.vindex[selection] = 42 def test_set_mask_selection_2d(): @@ -1041,15 +1015,7 @@ def test_set_mask_selection_2d(): # test with different degrees of sparseness for p in 0.5, 0.1, 0.01: ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) - - a[:] = 0 - z[:] = 0 - a[ix] = v[ix] - z.set_mask_selection(ix, v[ix]) - assert_array_equal(a, z[:]) - z[:] = 0 - z.vindex[ix] = v[ix] - assert_array_equal(a, z[:]) + _test_set_mask_selection(v, a, z, ix) def test_get_selection_out(): @@ -1069,6 
+1035,9 @@ def test_get_selection_out(): z.get_basic_selection(selection, out=out) assert_array_equal(expect, out[:]) + with assert_raises(TypeError): + z.get_basic_selection(..., out=[]) + # orthogonal selections a = np.arange(10000, dtype=int).reshape(1000, 10) z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) @@ -1226,6 +1195,10 @@ def test_get_selections_with_fields(): actual = z.vindex[ix, fields[0], fields[1]] assert_array_equal(expect, actual) + # missing/bad fields + with assert_raises(IndexError): + z.get_basic_selection(..., fields=['notafield']) + def test_set_selections_with_fields(): diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index 3f41ccefa7..fee10c37cc 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -104,6 +104,8 @@ def test_guess_chunks(): (100,), (100, 100), (1000000,), + (1000000000,), + (10000000000000000,), (10000, 10000), (10000000, 1000), (1000, 10000000), From 069c670a52cfec527596ec3d5da86479f5e071f5 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 8 Nov 2017 17:42:45 +0000 Subject: [PATCH 46/67] fix syntax and pep8 --- docs/tutorial.rst | 2 +- zarr/core.py | 2 +- zarr/indexing.py | 4 +-- zarr/tests/test_core.py | 2 +- zarr/tests/test_indexing.py | 66 ++++++++++++++++++------------------- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index cb633af27a..d3137aaf52 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -618,7 +618,7 @@ simple heuristics and may be far from optimal. E.g.:: >>> z4 = zarr.zeros((10000, 10000), dtype='i4') >>> z4.chunks - (313, 313) + (313, 625) .. 
_tutorial_tips_blosc: diff --git a/zarr/core.py b/zarr/core.py index 681da4391e..1895b572cf 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -764,7 +764,7 @@ def _set_basic_selection_zd(self, selection, value, fields=None): # check selection is valid selection = ensure_tuple(selection) - if selection not in ((), (...,)): + if selection not in ((), (Ellipsis,)): raise IndexError('too many indices for array') # check fields diff --git a/zarr/indexing.py b/zarr/indexing.py index 5a79642c94..c2c134078f 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -205,8 +205,8 @@ def ensure_tuple(v): ChunkProjection = collections.namedtuple('ChunkProjection', ('chunk_coords', 'chunk_selection', 'out_selection')) -"""A mapping of items from chunk to output array. Can be used to extract items from the chunk -array for loading into an output array. Can also be used to extract items from a value array for +"""A mapping of items from chunk to output array. Can be used to extract items from the chunk +array for loading into an output array. Can also be used to extract items from a value array for setting/updating in a chunk array. 
Parameters diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 36b9605596..6134ad91ad 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -601,7 +601,7 @@ def test_read_only(self): with assert_raises(PermissionError): z.append(np.arange(1000)) with assert_raises(PermissionError): - z.set_basic_selection(..., 42) + z.set_basic_selection(Ellipsis, 42) with assert_raises(PermissionError): z.set_orthogonal_selection([0, 1, 2], 42) with assert_raises(PermissionError): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 02e9a00c63..ecf82d9e0a 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -29,12 +29,12 @@ def test_replace_ellipsis(): eq((0,), replace_ellipsis(0, (100,))) # 1D - eq((slice(None),), replace_ellipsis(..., (100,))) + eq((slice(None),), replace_ellipsis(Ellipsis, (100,))) eq((slice(None),), replace_ellipsis(slice(None), (100,))) eq((slice(None, 100),), replace_ellipsis(slice(None, 100), (100,))) eq((slice(0, None),), replace_ellipsis(slice(0, None), (100,))) - eq((slice(None),), replace_ellipsis((slice(None), ...), (100,))) - eq((slice(None),), replace_ellipsis((..., slice(None)), (100,))) + eq((slice(None),), replace_ellipsis((slice(None), Ellipsis), (100,))) + eq((slice(None),), replace_ellipsis((Ellipsis, slice(None)), (100,))) # 2D, single item eq((0, 0), replace_ellipsis((0, 0), (100, 100))) @@ -47,21 +47,21 @@ def test_replace_ellipsis(): # 2D slice eq((slice(None), slice(None)), - replace_ellipsis(..., (100, 100))) + replace_ellipsis(Ellipsis, (100, 100))) eq((slice(None), slice(None)), replace_ellipsis(slice(None), (100, 100))) eq((slice(None), slice(None)), replace_ellipsis((slice(None), slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((..., slice(None)), (100, 100))) + replace_ellipsis((Ellipsis, slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((slice(None), ...), (100, 100))) + 
replace_ellipsis((slice(None), Ellipsis), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((slice(None), ..., slice(None)), (100, 100))) + replace_ellipsis((slice(None), Ellipsis, slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((..., slice(None), slice(None)), (100, 100))) + replace_ellipsis((Ellipsis, slice(None), slice(None)), (100, 100))) eq((slice(None), slice(None)), - replace_ellipsis((slice(None), slice(None), ...), (100, 100))) + replace_ellipsis((slice(None), slice(None), Ellipsis), (100, 100))) def test_get_basic_selection_0d(): @@ -71,14 +71,14 @@ def test_get_basic_selection_0d(): z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) z[...] = a - assert_array_equal(a, z.get_basic_selection(...)) + assert_array_equal(a, z.get_basic_selection(Ellipsis)) assert_array_equal(a, z[...]) eq(42, z.get_basic_selection(())) eq(42, z[()]) # test out param b = np.zeros_like(a) - z.get_basic_selection(..., out=b) + z.get_basic_selection(Ellipsis, out=b) assert_array_equal(a, b) # test structured array @@ -86,7 +86,7 @@ def test_get_basic_selection_0d(): a = np.array(value, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) z[()] = value - assert_array_equal(a, z.get_basic_selection(...)) + assert_array_equal(a, z.get_basic_selection(Ellipsis)) assert_array_equal(a, z[...]) eq(a[()], z.get_basic_selection(())) eq(a[()], z[()]) @@ -96,10 +96,10 @@ def test_get_basic_selection_0d(): eq(a[['foo', 'bar']], z['foo', 'bar']) # test out param b = np.zeros_like(a) - z.get_basic_selection(..., out=b) + z.get_basic_selection(Ellipsis, out=b) assert_array_equal(a, b) c = np.zeros_like(a[['foo', 'bar']]) - z.get_basic_selection(..., out=c, fields=['foo', 'bar']) + z.get_basic_selection(Ellipsis, out=c, fields=['foo', 'bar']) assert_array_equal(a[['foo', 'bar']], c) @@ -117,9 +117,9 @@ def test_get_basic_selection_0d(): slice(-1, 0), # empty result # total 
selections slice(None), - ..., + Ellipsis, (), - (..., slice(None)), + (Ellipsis, slice(None)), # slice with step slice(None), slice(None, None), @@ -237,10 +237,10 @@ def test_get_basic_selection_1d(): (slice(250, 350, 33), slice(1, 5, 3)), # total selections (slice(None), slice(None)), - ..., + Ellipsis, (), - (..., slice(None)), - (..., slice(None), slice(None)), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), slice(None)), ] @@ -292,7 +292,7 @@ def test_set_basic_selection_0d(): assert_array_equal(a, z) # tests - z.set_basic_selection(..., v) + z.set_basic_selection(Ellipsis, v) assert_array_equal(v, z) z[...] = 0 assert_array_equal(a, z) @@ -306,16 +306,16 @@ def test_set_basic_selection_0d(): z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) # tests - z.set_basic_selection(..., v) + z.set_basic_selection(Ellipsis, v) assert_array_equal(v, z) - z.set_basic_selection(..., a) + z.set_basic_selection(Ellipsis, a) assert_array_equal(a, z) z[...] = v assert_array_equal(v, z) z[...] 
= a assert_array_equal(a, z) # with fields - z.set_basic_selection(..., v['foo'], fields='foo') + z.set_basic_selection(Ellipsis, v['foo'], fields='foo') eq(v['foo'], z['foo']) eq(a['bar'], z['bar']) eq(a['baz'], z['baz']) @@ -325,7 +325,7 @@ def test_set_basic_selection_0d(): eq(a['baz'], z['baz']) # multiple field assignment not supported with assert_raises(ValueError): - z.set_basic_selection(..., v[['foo', 'bar']], fields=['foo', 'bar']) + z.set_basic_selection(Ellipsis, v[['foo', 'bar']], fields=['foo', 'bar']) with assert_raises(ValueError): z[..., 'foo', 'bar'] = v[['foo', 'bar']] @@ -712,7 +712,7 @@ def _test_get_coordinate_selection(a, z, selection): # slice not supported slice(5, 15), slice(None), - ..., + Ellipsis, # bad stuff 2.3, 'foo', @@ -819,10 +819,10 @@ def test_get_coordinate_selection_2d(): selection = [1, 2, 3], slice(5, 15) z.get_coordinate_selection(selection) with assert_raises(IndexError): - selection = ..., [1, 2, 3] + selection = Ellipsis, [1, 2, 3] z.get_coordinate_selection(selection) with assert_raises(IndexError): - selection = ... 
+ selection = Ellipsis z.get_coordinate_selection(selection) @@ -912,7 +912,7 @@ def _test_get_mask_selection(a, z, selection): # slice not supported slice(5, 15), slice(None), - ..., + Ellipsis, # bad stuff 2.3, 'foo', @@ -1036,7 +1036,7 @@ def test_get_selection_out(): assert_array_equal(expect, out[:]) with assert_raises(TypeError): - z.get_basic_selection(..., out=[]) + z.get_basic_selection(Ellipsis, out=[]) # orthogonal selections a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -1116,7 +1116,7 @@ def test_get_selections_with_fields(): # total selection expect = a[fields] - actual = z.get_basic_selection(..., fields=fields) + actual = z.get_basic_selection(Ellipsis, fields=fields) assert_array_equal(expect, actual) # alternative API if isinstance(fields, str): @@ -1197,7 +1197,7 @@ def test_get_selections_with_fields(): # missing/bad fields with assert_raises(IndexError): - z.get_basic_selection(..., fields=['notafield']) + z.get_basic_selection(Ellipsis, fields=['notafield']) def test_set_selections_with_fields(): @@ -1225,7 +1225,7 @@ def test_set_selections_with_fields(): # currently multi-field assignment is not supported in numpy, so we won't support it either if isinstance(fields, list): with assert_raises(ValueError): - z.set_basic_selection(..., v[fields], fields=fields) + z.set_basic_selection(Ellipsis, v[fields], fields=fields) with assert_raises(ValueError): z.set_orthogonal_selection([0, 2], v[fields], fields=fields) with assert_raises(ValueError): @@ -1241,7 +1241,7 @@ def test_set_selections_with_fields(): assert_array_equal(a, z[:]) a[fields] = v[fields] # total selection - z.set_basic_selection(..., v[fields], fields=fields) + z.set_basic_selection(Ellipsis, v[fields], fields=fields) assert_array_equal(a, z[:]) # basic selection with slice From 7d754600478886f1040b4a90d0c9bbcfc95f4a26 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 8 Nov 2017 23:25:46 +0000 Subject: [PATCH 47/67] WIP documentation --- docs/api/codecs.rst | 35 
++++----- docs/api/core.rst | 8 +++ zarr/core.py | 168 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 180 insertions(+), 31 deletions(-) diff --git a/docs/api/codecs.rst b/docs/api/codecs.rst index e35de08b30..f35ea861b4 100644 --- a/docs/api/codecs.rst +++ b/docs/api/codecs.rst @@ -2,27 +2,22 @@ Compressors and filters (``zarr.codecs``) ========================================= .. module:: zarr.codecs -This module contains compressor and filter classes for use with Zarr. +This module contains compressor and filter classes for use with Zarr. Please note that this module +is provided for backwards compatibility with previous versions of Zarr. From Zarr version 2.2 +onwards, all codec classes have been moved to a separate package called Numcodecs_. The two +packages (Zarr and Numcodecs_) are designed to be used together. For example, a Numcodecs_ codec +class can be used as a compressor for a Zarr array:: -Other codecs can be registered dynamically with Zarr. All that is required -is to implement a class that provides the same interface as the classes listed -below, and then to add the class to the ``codec_registry``. See the source -code of this module for details. + >>> import zarr + >>> from numcodecs import Blosc + >>> z = zarr.zeros(1000000, compressor=Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE) -.. autoclass:: Codec +Codec classes can also be used as filters. See the tutorial section on :ref:`tutorial_filters` +for more information. - .. automethod:: encode - .. automethod:: decode - .. automethod:: get_config - .. automethod:: from_config +Please note that it is also relatively straightforward to define and register custom codec +classes. See the Numcodecs `codec API `_ and +`codec registry `_ documentation for more +information. -.. autoclass:: Blosc -.. autoclass:: Zlib -.. autoclass:: BZ2 -.. autoclass:: LZMA -.. autoclass:: Delta -.. autoclass:: AsType -.. autoclass:: FixedScaleOffset -.. autoclass:: Quantize -.. 
autoclass:: PackBits -.. autoclass:: Categorize +.. _Numcodecs: http://numcodecs.readthedocs.io/ diff --git a/docs/api/core.rst b/docs/api/core.rst index 4f2c5cc6bb..ada6a653ca 100644 --- a/docs/api/core.rst +++ b/docs/api/core.rst @@ -6,6 +6,14 @@ The Array class (``zarr.core``) .. automethod:: __getitem__ .. automethod:: __setitem__ + .. automethod:: get_basic_selection + .. automethod:: set_basic_selection + .. automethod:: get_mask_selection + .. automethod:: set_mask_selection + .. automethod:: get_coordinate_selection + .. automethod:: set_coordinate_selection + .. automethod:: get_orthogonal_selection + .. automethod:: set_orthogonal_selection .. automethod:: resize .. automethod:: append .. automethod:: view diff --git a/zarr/core.py b/zarr/core.py index 1895b572cf..0e8f4a7bac 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -376,12 +376,15 @@ def is_view(self): @property def oindex(self): - """TODO""" + """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and + :func:`set_orthogonal_selection` for documentation and examples.""" return self._oindex @property def vindex(self): - """TODO""" + """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, + :func:`set_coordinate_selection`, :func:`get_mask_selection` and + :func:`set_mask_selection` for documentation and examples.""" return self._vindex def __eq__(self, other): @@ -408,8 +411,13 @@ def __len__(self): raise TypeError('len() of unsized object') def __getitem__(self, selection): - """Retrieve data for some portion of the array. Most NumPy-style - slicing operations are supported. + """Retrieve data for some portion of the array. + + Parameters + ---------- + selection : int, slice or tuple of int/slice + An integer index or slice or tuple of int/slice specifying the requested region for + each dimension of the array. 
Returns ------- @@ -418,7 +426,6 @@ def __getitem__(self, selection): Examples -------- - Setup a 1-dimensional array:: >>> import zarr @@ -437,15 +444,19 @@ def __getitem__(self, selection): array([99999995, 99999996, 99999997, 99999998, 99999999], dtype=int32) >>> z[5:10] array([5, 6, 7, 8, 9], dtype=int32) + >>> z[5:10:2] + array([5, 7, 9], dtype=int32) >>> z[:] array([ 0, 1, 2, ..., 99999997, 99999998, 99999999], dtype=int32) + >>> z[::2] + array([ 0, 2, 4, ..., 99999994, 99999996, 99999998], dtype=int32) Setup a 2-dimensional array:: >>> import zarr >>> import numpy as np - >>> z = zarr.array(np.arange(100000000).reshape(10000, 10000), - ... chunks=(1000, 1000), dtype='i4') + >>> z = zarr.array(np.arange(100000000).reshape(10000, 10000), chunks=(1000, 1000), + ... dtype='i4') >>> z @@ -475,6 +486,24 @@ def __getitem__(self, selection): [99970000, 99970001, 99970002, ..., 99979997, 99979998, 99979999], [99980000, 99980001, 99980002, ..., 99989997, 99989998, 99989999], [99990000, 99990001, 99990002, ..., 99999997, 99999998, 99999999]], dtype=int32) + >>> z[::10, ::2] + array([[ 0, 2, 4, ..., 9994, 9996, 9998], + [ 100000, 100002, 100004, ..., 109994, 109996, 109998], + [ 200000, 200002, 200004, ..., 209994, 209996, 209998], + ..., + [99700000, 99700002, 99700004, ..., 99709994, 99709996, 99709998], + [99800000, 99800002, 99800004, ..., 99809994, 99809996, 99809998], + [99900000, 99900002, 99900004, ..., 99909994, 99909996, 99909998]], dtype=int32) + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, vindex, oindex, __setitem__ """ @@ -482,7 +511,67 @@ def __getitem__(self, selection): return self.get_basic_selection(selection, fields=fields) def get_basic_selection(self, selection, out=None, fields=None): - """TODO""" + """Retrieve data for some portion of the array. + + Parameters + ---------- + selection : int, slice or tuple of int/slice + An integer index or slice or tuple of int/slice specifying the requested region for + each dimension of the array. + out : ndarray + If given, load the selected data directly into this array. + fields : str or sequence of str + For arrays with a structured dtype, one or more fields can be specified to extract + data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested region. + + Examples + -------- + This method provides the implementation for indexing operations via ``__getitem__``. For + example, given a Zarr array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100000000), chunks=1000000, dtype='i4') + + ...the following operations are equivalent: + + >>> np.all(z[5] == z.get_basic_selection(5)) + True + >>> np.all(z[5:10:2] == z.get_basic_selection(slice(5, 10, 2))) + True + >>> np.all(z[...] == z.get_basic_selection(Ellipsis)) + True + + However, this method provides some additional parameters which may be useful. 
For + example, data may be loaded directly into an output array via the `out` parameter:: + + >>> out = np.zeros(10, dtype=z.dtype) + >>> z.get_basic_selection(slice(1000, 1010), out=out) + array([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009], dtype=int32) + >>> out + array([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009], dtype=int32) + + For structured arrays, data may be loaded from a subset of the data fields via the + `fields` parameter, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z.get_basic_selection(slice(0, 2), fields='foo') + array([b'aaa', b'bbb'], + dtype='|S3') + >>> z.get_basic_selection(slice(0, 2), fields=['bar', 'baz']) + array([(1, 4.2), (2, 8.4)], + dtype=[('bar', '>> import zarr @@ -683,13 +779,63 @@ def __setitem__(self, selection, value): [9998, 42, 42, ..., 42, 42, 42], [9999, 42, 42, ..., 42, 42, 42]], dtype=int32) + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, vindex, oindex, __getitem__ + """ fields, selection = pop_fields(selection) self.set_basic_selection(selection, value, fields=fields) def set_basic_selection(self, selection, value, fields=None): - """TODO""" + """Modify data for some portion of the array. + + Parameters + ---------- + selection : int, slice or tuple of int/slice + An integer index or slice or tuple of int/slice specifying the requested region for + each dimension of the array. + value : scalar or array-like + Value to be stored into the array. + fields : str or sequence of str + For arrays with a structured dtype, one or more fields can be specified to set + data for. 
+ + Examples + -------- + This method provides the implementation for indexing operations via ``__setitem__``. For + example, given a Zarr array:: + + >>> import zarr + >>> import numpy as np + >>> a = np.arange(100000000) + >>> z = zarr.zeros_like(a) + + ...the following assignment operations are equivalent: + + >>> z[:10] = 24 + >>> z[:10] + array([24, 24, 24, 24, 24, 24, 24, 24, 24, 24]) + >>> z.set_basic_selection(slice(10), 42) + >>> z[:10] + array([42, 42, 42, 42, 42, 42, 42, 42, 42, 42]) + + @@TODO doc fieldss + + See Also + -------- + get_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ + + """ # guard conditions if self._read_only: From cbee2fc8a03df6e24c6324fca3121ce224bb8c20 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 8 Nov 2017 23:40:31 +0000 Subject: [PATCH 48/67] WIP documentation --- zarr/core.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 0e8f4a7bac..d20cd38ec9 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -818,7 +818,7 @@ def set_basic_selection(self, selection, value, fields=None): >>> a = np.arange(100000000) >>> z = zarr.zeros_like(a) - ...the following assignment operations are equivalent: + ...the following assignment operations are equivalent:: >>> z[:10] = 24 >>> z[:10] @@ -827,7 +827,18 @@ def set_basic_selection(self, selection, value, fields=None): >>> z[:10] array([42, 42, 42, 42, 42, 42, 42, 42, 42, 42]) - @@TODO doc fieldss + For arrays with a structured dtype, the `fields` parameter can be used to set data for + a specific field, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... 
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z.set_basic_selection(slice(0, 2), b'zzz', fields='foo') + >>> z[:] + array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'ccc', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', ' Date: Thu, 9 Nov 2017 10:52:46 +0000 Subject: [PATCH 49/67] docstrings --- zarr/core.py | 577 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 441 insertions(+), 136 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index d20cd38ec9..ea2acc881b 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -411,13 +411,13 @@ def __len__(self): raise TypeError('len() of unsized object') def __getitem__(self, selection): - """Retrieve data for some portion of the array. + """Retrieve data for an item or region of the array. Parameters ---------- - selection : int, slice or tuple of int/slice - An integer index or slice or tuple of int/slice specifying the requested region for - each dimension of the array. + selection : tuple + An integer index or slice or tuple of int/slice objects specifying the requested + item or region for each dimension of the array. 
Returns ------- @@ -430,75 +430,95 @@ def __getitem__(self, selection): >>> import zarr >>> import numpy as np - >>> z = zarr.array(np.arange(100000000), chunks=1000000, dtype='i4') + >>> z = zarr.array(np.arange(100)) >>> z - + - Take some slices:: + Retrieve a single item:: >>> z[5] 5 + + Retrieve a region via slicing:: + >>> z[:5] - array([0, 1, 2, 3, 4], dtype=int32) + array([0, 1, 2, 3, 4]) >>> z[-5:] - array([99999995, 99999996, 99999997, 99999998, 99999999], dtype=int32) + array([95, 96, 97, 98, 99]) >>> z[5:10] - array([5, 6, 7, 8, 9], dtype=int32) + array([5, 6, 7, 8, 9]) >>> z[5:10:2] - array([5, 7, 9], dtype=int32) - >>> z[:] - array([ 0, 1, 2, ..., 99999997, 99999998, 99999999], dtype=int32) + array([5, 7, 9]) >>> z[::2] - array([ 0, 2, 4, ..., 99999994, 99999996, 99999998], dtype=int32) + array([ 0, 2, 4, ..., 94, 96, 98]) + + Load the entire array into memory:: + + >>> z[...] + array([ 0, 1, 2, ..., 97, 98, 99]) Setup a 2-dimensional array:: - >>> import zarr - >>> import numpy as np - >>> z = zarr.array(np.arange(100000000).reshape(10000, 10000), chunks=(1000, 1000), - ... 
dtype='i4') + >>> z = zarr.array(np.arange(100).reshape(10, 10)) >>> z - + - Take some slices:: + Retrieve an item:: >>> z[2, 2] - 20002 - >>> z[:2, :2] - array([[ 0, 1], - [10000, 10001]], dtype=int32) - >>> z[:2] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [10000, 10001, 10002, ..., 19997, 19998, 19999]], dtype=int32) - >>> z[:, :2] - array([[ 0, 1], - [ 10000, 10001], - [ 20000, 20001], - ..., - [99970000, 99970001], - [99980000, 99980001], - [99990000, 99990001]], dtype=int32) - >>> z[:] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [ 10000, 10001, 10002, ..., 19997, 19998, 19999], - [ 20000, 20001, 20002, ..., 29997, 29998, 29999], - ..., - [99970000, 99970001, 99970002, ..., 99979997, 99979998, 99979999], - [99980000, 99980001, 99980002, ..., 99989997, 99989998, 99989999], - [99990000, 99990001, 99990002, ..., 99999997, 99999998, 99999999]], dtype=int32) - >>> z[::10, ::2] - array([[ 0, 2, 4, ..., 9994, 9996, 9998], - [ 100000, 100002, 100004, ..., 109994, 109996, 109998], - [ 200000, 200002, 200004, ..., 209994, 209996, 209998], - ..., - [99700000, 99700002, 99700004, ..., 99709994, 99709996, 99709998], - [99800000, 99800002, 99800004, ..., 99809994, 99809996, 99809998], - [99900000, 99900002, 99900004, ..., 99909994, 99909996, 99909998]], dtype=int32) + 22 + + Retrieve a region via slicing:: + + >>> z[1:3, 1:3] + array([[11, 12], + [21, 22]]) + >>> z[1:3, :] + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) + >>> z[:, 1:3] + array([[ 1, 2], + [11, 12], + [21, 22], + [31, 32], + [41, 42], + [51, 52], + [61, 62], + [71, 72], + [81, 82], + [91, 92]]) + >>> z[0:5:2, 0:5:2] + array([[ 0, 2, 4], + [20, 22, 24], + [40, 42, 44]]) + >>> z[::2, ::2] + array([[ 0, 2, 4, 6, 8], + [20, 22, 24, 26, 28], + [40, 42, 44, 46, 48], + [60, 62, 64, 66, 68], + [80, 82, 84, 86, 88]]) + + Load the entire array into memory:: + + >>> z[...] 
+ array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], + [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], + [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], + [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], + [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) Notes ----- Slices with step > 1 are supported, but slices with negative step are not. + Currently the implementation for __getitem__ is provided by :func:`get_basic_selection`. + For advanced ("fancy") indexing, see the methods listed below. + See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, @@ -510,17 +530,17 @@ def __getitem__(self, selection): fields, selection = pop_fields(selection) return self.get_basic_selection(selection, fields=fields) - def get_basic_selection(self, selection, out=None, fields=None): - """Retrieve data for some portion of the array. + def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): + """Retrieve data for an item or region of the array. Parameters ---------- - selection : int, slice or tuple of int/slice - An integer index or slice or tuple of int/slice specifying the requested region for - each dimension of the array. - out : ndarray + selection : tuple + A tuple specifying the requested item or region for each dimension of the array. May + be any combination of int and/or slice for multidimensional arrays. + out : ndarray, optional If given, load the selected data directly into this array. - fields : str or sequence of str + fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to extract data for. 
@@ -531,45 +551,101 @@ def get_basic_selection(self, selection, out=None, fields=None): Examples -------- - This method provides the implementation for indexing operations via ``__getitem__``. For - example, given a Zarr array:: + Setup a 1-dimensional array:: >>> import zarr >>> import numpy as np - >>> z = zarr.array(np.arange(100000000), chunks=1000000, dtype='i4') + >>> z = zarr.array(np.arange(100)) + >>> z + - ...the following operations are equivalent: + Retrieve a single item:: - >>> np.all(z[5] == z.get_basic_selection(5)) - True - >>> np.all(z[5:10:2] == z.get_basic_selection(slice(5, 10, 2))) - True - >>> np.all(z[...] == z.get_basic_selection(Ellipsis)) - True + >>> z.get_basic_selection(5) + 5 - However, this method provides some additional parameters which may be useful. For - example, data may be loaded directly into an output array via the `out` parameter:: + Retrieve a region via slicing:: - >>> out = np.zeros(10, dtype=z.dtype) - >>> z.get_basic_selection(slice(1000, 1010), out=out) - array([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009], dtype=int32) - >>> out - array([1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009], dtype=int32) + >>> z.get_basic_selection(slice(5)) + array([0, 1, 2, 3, 4]) + >>> z.get_basic_selection(slice(-5, None)) + array([95, 96, 97, 98, 99]) + >>> z.get_basic_selection(slice(5, 10)) + array([5, 6, 7, 8, 9]) + >>> z.get_basic_selection(slice(5, 10, 2)) + array([5, 7, 9]) + >>> z.get_basic_selection(slice(None, None, 2)) + array([ 0, 2, 4, ..., 94, 96, 98]) - For structured arrays, data may be loaded from a subset of the data fields via the - `fields` parameter, e.g.:: + Load the entire array into memory:: - >>> a = np.array([(b'aaa', 1, 4.2), - ... (b'bbb', 2, 8.4), - ... (b'ccc', 3, 12.6)], - ... 
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - >>> z = zarr.array(a) - >>> z.get_basic_selection(slice(0, 2), fields='foo') - array([b'aaa', b'bbb'], - dtype='|S3') - >>> z.get_basic_selection(slice(0, 2), fields=['bar', 'baz']) - array([(1, 4.2), (2, 8.4)], - dtype=[('bar', '>> z.get_basic_selection() + array([ 0, 1, 2, ..., 97, 98, 99]) + + Setup a 2-dimensional array:: + + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + >>> z + + + Retrieve an item:: + + >>> z.get_basic_selection((2, 2)) + 22 + + Retrieve a region via slicing:: + + >>> z.get_basic_selection((slice(1, 3), slice(1, 3))) + array([[11, 12], + [21, 22]]) + >>> z.get_basic_selection((slice(1, 3), slice(None))) + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) + >>> z.get_basic_selection((slice(None), slice(1, 3))) + array([[ 1, 2], + [11, 12], + [21, 22], + [31, 32], + [41, 42], + [51, 52], + [61, 62], + [71, 72], + [81, 82], + [91, 92]]) + >>> z.get_basic_selection((slice(0, 5, 2), slice(0, 5, 2))) + array([[ 0, 2, 4], + [20, 22, 24], + [40, 42, 44]]) + >>> z.get_basic_selection((slice(None, None, 2), slice(None, None, 2))) + array([[ 0, 2, 4, 6, 8], + [20, 22, 24, 26, 28], + [40, 42, 44, 46, 48], + [60, 62, 64, 66, 68], + [80, 82, 84, 86, 88]]) + + Load the entire array into memory:: + + >>> z.get_basic_selection() + array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], + [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], + [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], + [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], + [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. 
+ + See Also + -------- + set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ """ @@ -629,7 +705,104 @@ def _get_basic_selection_nd(self, selection, out=None, fields=None): return self._get_selection(indexer=indexer, out=out, fields=fields) def get_orthogonal_selection(self, selection, out=None, fields=None): - """TODO""" + """Retrieve data by making a selection for each dimension of the array. For example, + if an array has 2 dimensions, allows selecting specific rows and/or columns. The + selection for each dimension can be either an integer (indexing a single item), a slice, + an array of integers, or a Boolean array where True values indicate a selection. + + Parameters + ---------- + selection : tuple + A selection for each dimension of the array. May be any combination of int, slice, + integer array or Boolean array. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to extract + data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. 
+ + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + >>> z + + + Retrieve rows and columns via any combination of int, slice, integer array and/or Boolean + array:: + + >>> z.get_orthogonal_selection(([1, 4], slice(None))) + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) + >>> z.get_orthogonal_selection((slice(None), [1, 4])) + array([[ 1, 4], + [11, 14], + [21, 24], + [31, 34], + [41, 44], + [51, 54], + [61, 64], + [71, 74], + [81, 84], + [91, 94]]) + >>> z.get_orthogonal_selection(([1, 4], [1, 4])) + array([[11, 14], + [41, 44]]) + >>> sel = np.zeros(z.shape[0], dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.get_orthogonal_selection((sel, sel)) + array([[11, 14], + [41, 44]]) + + For convenience, the orthogonal selection functionality is also available via the + `oindex` property, e.g.:: + + >>> z.oindex[[1, 4], :] + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) + >>> z.oindex[:, [1, 4]] + array([[ 1, 4], + [11, 14], + [21, 24], + [31, 34], + [41, 44], + [51, 54], + [61, 64], + [71, 74], + [81, 84], + [91, 94]]) + >>> z.oindex[[1, 4], [1, 4]] + array([[11, 14], + [41, 44]]) + >>> sel = np.zeros(z.shape[0], dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.oindex[sel, sel] + array([[11, 14], + [41, 44]]) + + Notes + ----- + Orthogonal indexing is also known as outer indexing. + + Slices with step > 1 are supported, but slices with negative step are not. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, vindex, + oindex, __getitem__, __setitem__ + + """ # refresh metadata if not self._cache_metadata: @@ -644,7 +817,65 @@ def get_orthogonal_selection(self, selection, out=None, fields=None): return self._get_selection(indexer=indexer, out=out, fields=fields) def get_coordinate_selection(self, selection, out=None, fields=None): - """TODO""" + """Retrieve a selection of individual items, by providing the indices (coordinates) for + each selected item. + + Parameters + ---------- + selection : tuple + An integer (coordinate) array for each dimension of the array. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to extract + data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + >>> z + + + Retrieve items by specifying their coordinates:: + + >>> z.get_coordinate_selection(([1, 4], [1, 4])) + array([11, 44]) + + For convenience, the coordinate selection functionality is also available via the + `vindex` property, e.g.:: + + >>> z.vindex[[1, 4], [1, 4]] + array([11, 44]) + + Notes + ----- + Coordinate indexing is also known as point selection, and is a form of vectorized or inner + indexing. + + Slices are not supported. Coordinate arrays must be provided for all dimensions of the + array. + + Coordinate arrays may be multidimensional, in which case the output array will also be + multidimensional. Coordinate arrays are broadcast against each other before being + applied. 
The shape of the output will be the same as the shape of each coordinate array + after broadcasting. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, vindex, + oindex, __getitem__, __setitem__ + + """ # refresh metadata if not self._cache_metadata: @@ -668,7 +899,63 @@ def get_coordinate_selection(self, selection, out=None, fields=None): return out def get_mask_selection(self, selection, out=None, fields=None): - """TODO""" + """Retrieve a selection of individual items, by providing a Boolean array of the same + shape as the array against which the selection is being made, where True values indicate + a selected item. + + Parameters + ---------- + selection : ndarray, bool + A Boolean array of the same shape as the array against which the selection is being + made. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to extract + data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + >>> z + + + Retrieve items by specifying a maks:: + + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1, 1] = True + >>> sel[4, 4] = True + >>> z.get_mask_selection(sel) + array([11, 44]) + + For convenience, the mask selection functionality is also available via the + `vindex` property, e.g.:: + + >>> z.vindex[sel] + array([11, 44]) + + Notes + ----- + Mask indexing is a form of vectorized or inner indexing, and is equivalent to coordinate + indexing. Internally the mask array is converted to coordinate arrays by calling + `np.nonzero`. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, set_mask_selection, get_orthogonal_selection, + set_orthogonal_selection, get_coordinate_selection, set_coordinate_selection, vindex, + oindex, __getitem__, __setitem__ + + """ # refresh metadata if not self._cache_metadata: @@ -720,7 +1007,7 @@ def __setitem__(self, selection, value): Parameters ---------- - selection : int, slice or tuple of int/slice + selection : tuple An integer index or slice or tuple of int/slice specifying the requested region for each dimension of the array. value : scalar or array-like @@ -731,58 +1018,52 @@ def __setitem__(self, selection, value): Setup a 1-dimensional array:: >>> import zarr - >>> z = zarr.zeros(100000000, chunks=1000000, dtype='i4') + >>> z = zarr.zeros(100, dtype=int) >>> z - + Set all array elements to the same scalar value:: - >>> z[:] = 42 - >>> z[:] - array([42, 42, 42, ..., 42, 42, 42], dtype=int32) + >>> z[...] = 42 + >>> z[...] + array([42, 42, 42, ..., 42, 42, 42]) Set a portion of the array:: - >>> z[:100] = np.arange(100) - >>> z[-100:] = np.arange(100)[::-1] - >>> z[:] - array([0, 1, 2, ..., 2, 1, 0], dtype=int32) + >>> z[:10] = np.arange(10) + >>> z[-10:] = np.arange(10)[::-1] + >>> z[...] + array([ 0, 1, 2, ..., 2, 1, 0]) Setup a 2-dimensional array:: - >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z = zarr.zeros((5, 5), dtype=int) >>> z - + Set all array elements to the same scalar value:: - >>> z[:] = 42 - >>> z[:] - array([[42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - ..., - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42]], dtype=int32) + >>> z[...] 
= 42 Set a portion of the array:: >>> z[0, :] = np.arange(z.shape[1]) >>> z[:, 0] = np.arange(z.shape[0]) - >>> z[:] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [ 1, 42, 42, ..., 42, 42, 42], - [ 2, 42, 42, ..., 42, 42, 42], - ..., - [9997, 42, 42, ..., 42, 42, 42], - [9998, 42, 42, ..., 42, 42, 42], - [9999, 42, 42, ..., 42, 42, 42]], dtype=int32) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 1, 42, 42, 42, 42], + [ 2, 42, 42, 42, 42], + [ 3, 42, 42, 42, 42], + [ 4, 42, 42, 42, 42]]) Notes ----- Slices with step > 1 are supported, but slices with negative step are not. + Currently the implementation for __setitem__ is provided by :func:`set_basic_selection`, + which means that only integers and slices are supported within the selection. For + advanced ("fancy") indexing, see the methods listed below. + See Also -------- get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, @@ -799,7 +1080,7 @@ def set_basic_selection(self, selection, value, fields=None): Parameters ---------- - selection : int, slice or tuple of int/slice + selection : tuple An integer index or slice or tuple of int/slice specifying the requested region for each dimension of the array. value : scalar or array-like @@ -810,22 +1091,46 @@ def set_basic_selection(self, selection, value, fields=None): Examples -------- - This method provides the implementation for indexing operations via ``__setitem__``. For - example, given a Zarr array:: + Setup a 1-dimensional array:: >>> import zarr - >>> import numpy as np - >>> a = np.arange(100000000) - >>> z = zarr.zeros_like(a) + >>> z = zarr.zeros(100, dtype=int) + >>> z + + + Set all array elements to the same scalar value:: - ...the following assignment operations are equivalent:: + >>> z.set_basic_selection(..., 42) + >>> z[...] 
+ array([42, 42, 42, ..., 42, 42, 42]) - >>> z[:10] = 24 - >>> z[:10] - array([24, 24, 24, 24, 24, 24, 24, 24, 24, 24]) - >>> z.set_basic_selection(slice(10), 42) - >>> z[:10] - array([42, 42, 42, 42, 42, 42, 42, 42, 42, 42]) + Set a portion of the array:: + + >>> z.set_basic_selection(slice(10), np.arange(10)) + >>> z.set_basic_selection(slice(-10, None), np.arange(10)[::-1]) + >>> z[...] + array([ 0, 1, 2, ..., 2, 1, 0]) + + Setup a 2-dimensional array:: + + >>> z = zarr.zeros((5, 5), dtype=int) + >>> z + + + Set all array elements to the same scalar value:: + + >>> z.set_basic_selection(..., 42) + + Set a portion of the array:: + + >>> z.set_basic_selection(0, slice(None), np.arange(z.shape[1])) + >>> z.set_basic_selection(slice(None), 0, np.arange(z.shape[0])) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 1, 42, 42, 42, 42], + [ 2, 42, 42, 42, 42], + [ 3, 42, 42, 42, 42], + [ 4, 42, 42, 42, 42]]) For arrays with a structured dtype, the `fields` parameter can be used to set data for a specific field, e.g.:: @@ -842,9 +1147,9 @@ def set_basic_selection(self, selection, value, fields=None): See Also -------- - get_basic_selection, get_mask_selection, set_mask_selection, - get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, - set_orthogonal_selection, vindex, oindex, __getitem__, __setitem__ + get_basic_selection, get_mask_selection, set_mask_selection, get_coordinate_selection, + set_coordinate_selection, get_orthogonal_selection, set_orthogonal_selection, vindex, + oindex, __getitem__, __setitem__ """ From 9115ffe7699903cab1180f3d9c34e0276a6c13c7 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 9 Nov 2017 13:59:45 +0000 Subject: [PATCH 50/67] documentation --- docs/index.rst | 43 +++---- zarr/core.py | 307 ++++++++++++++++++++++++++++++++++++++--------- zarr/indexing.py | 12 +- 3 files changed, 278 insertions(+), 84 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 5215ba272a..80c7de664d 100644 --- a/docs/index.rst 
+++ b/docs/index.rst @@ -12,27 +12,25 @@ Highlights * Create N-dimensional arrays with any NumPy dtype. * Chunk arrays along any dimension. -* Compress chunks using the fast Blosc_ meta-compressor or alternatively using zlib, BZ2 or LZMA. +* Compress and/or filter chunks using any numcodecs_ codec. * Store arrays in memory, on disk, inside a Zip file, on S3, ... * Read an array concurrently from multiple threads or processes. * Write to an array concurrently from multiple threads or processes. * Organize arrays into hierarchies via groups. -* Use filters to preprocess data and improve compression. Status ------ -Zarr is still in an early phase of development. Feedback and bug -reports are very welcome, please get in touch via the `GitHub issue -tracker `_. +Zarr is still a young project. Feedback and bug reports are very welcome, please get in touch via +the `GitHub issue tracker `_. Installation ------------ Zarr depends on NumPy. It is generally best to `install NumPy -`_ first using -whatever method is most appropriate for you operating system and -Python distribution. +`_ first using whatever method is most +appropriate for you operating system and Python distribution. Other dependencies should be +installed automatically if using one of the installation methods below. Install Zarr from PyPI:: @@ -41,26 +39,18 @@ Install Zarr from PyPI:: Alternatively, install Zarr via conda:: $ conda install -c conda-forge zarr - -Zarr includes a C extension providing integration with the Blosc_ -library. Installing via conda will install a pre-compiled binary distribution. -However, if you have a newer CPU that supports the AVX2 instruction set (e.g., -Intel Haswell, Broadwell or Skylake) then installing via pip is preferable, -because this will compile the Blosc library from source with optimisations -for AVX2. 
- + To work with Zarr source code in development, install from GitHub:: $ git clone --recursive https://github.com/alimanfoo/zarr.git $ cd zarr $ python setup.py install -To verify that Zarr has been fully installed (including the Blosc -extension) run the test suite:: +To verify that Zarr has been fully installed, run the test suite:: $ pip install nose $ python -m nose -v zarr - + Contents -------- @@ -75,13 +65,20 @@ Contents Acknowledgments --------------- -Zarr bundles the `c-blosc `_ -library and uses it as the default compressor. +The following people have contributed to the development of Zarr, by contributing code and/or +providing ideas, feedback and advice: + +* `Francesc Alted `_ +* `Stephan Hoyer `_ +* `John Kirkham `_ +* `Alistair Miles `_ +* `Matthew Rocklin `_ +* `Vincent Schut `_ Zarr is inspired by `HDF5 `_, `h5py `_ and `bcolz `_. -Development of this package is supported by the +Development of Zarr is supported by the `MRC Centre for Genomics and Global Health `_. Indices and tables @@ -91,4 +88,4 @@ Indices and tables * :ref:`modindex` * :ref:`search` -.. _Blosc: http://www.blosc.org/ +.. 
_numcodecs: http://numcodecs.readthedocs.io/ diff --git a/zarr/core.py b/zarr/core.py index ea2acc881b..91fa11633b 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -78,18 +78,18 @@ class Array(object): ------- __getitem__ __setitem__ - resize - append - view - astype get_basic_selection set_basic_selection - get_mask_selection - set_mask_selection get_orthogonal_selection set_orthogonal_selection + get_mask_selection + set_mask_selection get_coordinate_selection set_coordinate_selection + resize + append + view + astype """ @@ -431,8 +431,6 @@ def __getitem__(self, selection): >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100)) - >>> z - Retrieve a single item:: @@ -460,8 +458,6 @@ def __getitem__(self, selection): Setup a 2-dimensional array:: >>> z = zarr.array(np.arange(100).reshape(10, 10)) - >>> z - Retrieve an item:: @@ -512,12 +508,23 @@ def __getitem__(self, selection): [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) + For arrays with a structured dtype, specific fields can be retrieved, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z['foo'] + array([b'aaa', b'bbb', b'ccc'], + dtype='|S3') + Notes ----- Slices with step > 1 are supported, but slices with negative step are not. Currently the implementation for __getitem__ is provided by :func:`get_basic_selection`. - For advanced ("fancy") indexing, see the methods listed below. + For advanced ("fancy") indexing, see the methods listed under See Also. 
See Also -------- @@ -556,8 +563,6 @@ def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100)) - >>> z - Retrieve a single item:: @@ -577,16 +582,9 @@ def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): >>> z.get_basic_selection(slice(None, None, 2)) array([ 0, 2, 4, ..., 94, 96, 98]) - Load the entire array into memory:: - - >>> z.get_basic_selection() - array([ 0, 1, 2, ..., 97, 98, 99]) - Setup a 2-dimensional array:: >>> z = zarr.array(np.arange(100).reshape(10, 10)) - >>> z - Retrieve an item:: @@ -623,24 +621,25 @@ def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): [60, 62, 64, 66, 68], [80, 82, 84, 86, 88]]) - Load the entire array into memory:: + For arrays with a structured dtype, specific fields can be retrieved, e.g.:: - >>> z.get_basic_selection() - array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], - [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], - [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], - [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], - [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], - [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], - [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], - [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z.get_basic_selection(slice(2), fields='foo') + array([b'aaa', b'bbb'], + dtype='|S3') Notes ----- Slices with step > 1 are supported, but slices with negative step are not. + Currently this method provides the implementation for accessing data via the square + bracket notation (__getitem__). See :func:`__getitem__` for examples using the + alternative notation. 
+ See Also -------- set_basic_selection, get_mask_selection, set_mask_selection, @@ -733,8 +732,6 @@ def get_orthogonal_selection(self, selection, out=None, fields=None): >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) - >>> z - Retrieve rows and columns via any combination of int, slice, integer array and/or Boolean array:: @@ -842,8 +839,6 @@ def get_coordinate_selection(self, selection, out=None, fields=None): >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) - >>> z - Retrieve items by specifying their coordinates:: @@ -926,8 +921,6 @@ def get_mask_selection(self, selection, out=None, fields=None): >>> import zarr >>> import numpy as np >>> z = zarr.array(np.arange(100).reshape(10, 10)) - >>> z - Retrieve items by specifying a maks:: @@ -1003,7 +996,7 @@ def _get_selection(self, indexer, out=None, fields=None): return out[()] def __setitem__(self, selection, value): - """Modify data for some portion of the array. + """Modify data for an item or region of the array. Parameters ---------- @@ -1019,8 +1012,6 @@ def __setitem__(self, selection, value): >>> import zarr >>> z = zarr.zeros(100, dtype=int) - >>> z - Set all array elements to the same scalar value:: @@ -1038,8 +1029,6 @@ def __setitem__(self, selection, value): Setup a 2-dimensional array:: >>> z = zarr.zeros((5, 5), dtype=int) - >>> z - Set all array elements to the same scalar value:: @@ -1056,13 +1045,26 @@ def __setitem__(self, selection, value): [ 3, 42, 42, 42, 42], [ 4, 42, 42, 42, 42]]) + For arrays with a structured dtype, specific fields can be modified, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z['foo'] = b'zzz' + >>> z[...] 
+ array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'zzz', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', ' 1 are supported, but slices with negative step are not. Currently the implementation for __setitem__ is provided by :func:`set_basic_selection`, which means that only integers and slices are supported within the selection. For - advanced ("fancy") indexing, see the methods listed below. + advanced ("fancy") indexing, see the methods listed under See Also. See Also -------- @@ -1076,7 +1078,7 @@ def __setitem__(self, selection, value): self.set_basic_selection(selection, value, fields=fields) def set_basic_selection(self, selection, value, fields=None): - """Modify data for some portion of the array. + """Modify data for an item or region of the array. Parameters ---------- @@ -1085,7 +1087,7 @@ def set_basic_selection(self, selection, value, fields=None): each dimension of the array. value : scalar or array-like Value to be stored into the array. - fields : str or sequence of str + fields : str or sequence of str, optional For arrays with a structured dtype, one or more fields can be specified to set data for. @@ -1094,9 +1096,8 @@ def set_basic_selection(self, selection, value, fields=None): Setup a 1-dimensional array:: >>> import zarr + >>> import numpy as np >>> z = zarr.zeros(100, dtype=int) - >>> z - Set all array elements to the same scalar value:: @@ -1114,8 +1115,6 @@ def set_basic_selection(self, selection, value, fields=None): Setup a 2-dimensional array:: >>> z = zarr.zeros((5, 5), dtype=int) - >>> z - Set all array elements to the same scalar value:: @@ -1123,8 +1122,8 @@ def set_basic_selection(self, selection, value, fields=None): Set a portion of the array:: - >>> z.set_basic_selection(0, slice(None), np.arange(z.shape[1])) - >>> z.set_basic_selection(slice(None), 0, np.arange(z.shape[0])) + >>> z.set_basic_selection((0, slice(None)), np.arange(z.shape[1])) + >>> z.set_basic_selection((slice(None), 0), np.arange(z.shape[0])) >>> z[...] 
array([[ 0, 1, 2, 3, 4], [ 1, 42, 42, 42, 42], @@ -1145,6 +1144,12 @@ def set_basic_selection(self, selection, value, fields=None): array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'ccc', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', '>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((5, 5), dtype=int) + + Set data for a selection of rows:: + + >>> z.set_orthogonal_selection(([1, 4], slice(None)), 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [1, 1, 1, 1, 1]]) + + Set data for a selection of columns:: + + >>> z.set_orthogonal_selection((slice(None), [1, 4]), 2) + >>> z[...] + array([[0, 2, 0, 0, 2], + [1, 2, 1, 1, 2], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 2, 1, 1, 2]]) + + Set data for a selection of rows and columns:: + + >>> z.set_orthogonal_selection(([1, 4], [1, 4]), 3) + >>> z[...] + array([[0, 2, 0, 0, 2], + [1, 3, 1, 1, 3], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 3, 1, 1, 3]]) + + For convenience, this functionality is also available via the `oindex` property. E.g.:: + + >>> z.oindex[[1, 4], [1, 4]] = 4 + >>> z[...] + array([[0, 2, 0, 0, 2], + [1, 4, 1, 1, 4], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 4, 1, 1, 4]]) + + Notes + ----- + Orthogonal indexing is also known as outer indexing. + + Slices with step > 1 are supported, but slices with negative step are not. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + vindex, oindex, __getitem__, __setitem__ + + """ # guard conditions if self._read_only: @@ -1184,7 +1262,62 @@ def set_orthogonal_selection(self, selection, value, fields=None): self._set_selection(indexer, value, fields=fields) def set_coordinate_selection(self, selection, value, fields=None): - """TODO""" + """Modify a selection of individual items, by providing the indices (coordinates) for + each item to be modified. 
+ + Parameters + ---------- + selection : tuple + An integer (coordinate) array for each dimension of the array. + value : scalar or array-like + Value to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((5, 5), dtype=int) + + Set data for a selection of items:: + + >>> z.set_coordinate_selection(([1, 4], [1, 4]), 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1]]) + + For convenience, this functionality is also available via the `vindex` property. E.g.:: + + >>> z.vindex[[1, 4], [1, 4]] = 2 + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 2]]) + + Notes + ----- + Coordinate indexing is also known as point selection, and is a form of vectorized or inner + indexing. + + Slices are not supported. Coordinate arrays must be provided for all dimensions of the + array. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, vindex, + oindex, __getitem__, __setitem__ + + """ # guard conditions if self._read_only: @@ -1206,7 +1339,65 @@ def set_coordinate_selection(self, selection, value, fields=None): self._set_selection(indexer, value, fields=fields) def set_mask_selection(self, selection, value, fields=None): - """TODO""" + """Modify a selection of individual items, by providing a Boolean array of the same + shape as the array against which the selection is being made, where True values indicate + a selected item. + + Parameters + ---------- + selection : ndarray, bool + A Boolean array of the same shape as the array against which the selection is being + made. 
+ value : scalar or array-like + Value to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((5, 5), dtype=int) + + Set data for a selection of items:: + + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1, 1] = True + >>> sel[4, 4] = True + >>> z.set_mask_selection(sel, 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1]]) + + For convenience, this functionality is also available via the `vindex` property. E.g.:: + + >>> z.vindex[sel] = 2 + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 2]]) + + Notes + ----- + Mask indexing is a form of vectorized or inner indexing, and is equivalent to coordinate + indexing. Internally the mask array is converted to coordinate arrays by calling + `np.nonzero`. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, get_orthogonal_selection, + set_orthogonal_selection, get_coordinate_selection, set_coordinate_selection, vindex, + oindex, __getitem__, __setitem__ + + """ # guard conditions if self._read_only: @@ -1740,8 +1931,8 @@ def append(self, data, axis=0): (20000, 1000) >>> z.append(np.vstack([a, a]), axis=1) (20000, 2000) - >>> z - + >>> z.shape + (20000, 2000) """ return self._write_op(self._append_nosync, data, axis=axis) @@ -1925,7 +2116,7 @@ def view(self, shape=None, chunks=None, dtype=None, return a def astype(self, dtype): - """Does on the fly type conversion of the underlying data. + """Returns a view that does on the fly type conversion of the underlying data. 
Parameters ---------- diff --git a/zarr/indexing.py b/zarr/indexing.py index c2c134078f..eed768d85b 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -749,9 +749,16 @@ def __setitem__(self, selection, value): def check_fields(fields, dtype): + # early out + if fields is None: + return dtype + # check type + if not isinstance(fields, (str, list, tuple)): + raise TypeError("'fields' argument must be a string or list of strings; found {!r}" + .format(type(fields))) if fields: if dtype.names is None: - raise IndexError('array does not have any fields') + raise IndexError("invalid 'fields' argument, array does not have any fields") try: if isinstance(fields, str): # single field selection @@ -760,8 +767,7 @@ def check_fields(fields, dtype): # multiple field selection out_dtype = np.dtype([(f, dtype[f]) for f in fields]) except KeyError as e: - # TODO better error message - raise IndexError('field not found: {!s}'.format(e)) + raise IndexError("invalid 'fields' argument, field not found: {!r}".format(e)) else: return out_dtype else: From 0bac9f3c1d2d5c798e38a6840bc9a8f5acde0386 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 9 Nov 2017 17:46:21 +0000 Subject: [PATCH 51/67] review tutorial, add section on advanced indexing --- docs/tutorial.rst | 307 +++++++++++++++++++++++++++++------- zarr/creation.py | 15 +- zarr/hierarchy.py | 15 +- zarr/indexing.py | 4 +- zarr/tests/test_creation.py | 6 + zarr/tests/test_indexing.py | 2 + 6 files changed, 277 insertions(+), 72 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index d3137aaf52..80a2019dc6 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -40,10 +40,6 @@ scalar value:: >>> z[:] = 42 -Notice that the values of ``initialized`` has changed. This is because -when a Zarr array is first created, none of the chunks are initialized. -Writing data into the array will cause the necessary chunks to be initialized. 
- Regions of the array can also be written to, e.g.:: >>> import numpy as np @@ -51,7 +47,7 @@ Regions of the array can also be written to, e.g.:: >>> z[:, 0] = np.arange(10000) The contents of the array can be retrieved by slicing, which will load -the requested region into a NumPy array, e.g.:: +the requested region into memory as a NumPy array, e.g.:: >>> z[0, 0] 0 @@ -61,7 +57,7 @@ the requested region into a NumPy array, e.g.:: array([ 0, 1, 2, ..., 9997, 9998, 9999], dtype=int32) >>> z[:, 0] array([ 0, 1, 2, ..., 9997, 9998, 9999], dtype=int32) - >>> z[:] + >>> z[...] array([[ 0, 1, 2, ..., 9997, 9998, 9999], [ 1, 42, 42, ..., 42, 42, 42], [ 2, 42, 42, ..., 42, 42, 42], @@ -81,8 +77,6 @@ enabling persistence of data between sessions. For example:: >>> z1 = zarr.open_array('example.zarr', mode='w', shape=(10000, 10000), ... chunks=(1000, 1000), dtype='i4', fill_value=0) - >>> z1 - The array above will store its configuration metadata and all compressed chunk data in a directory called 'example.zarr' relative to @@ -102,11 +96,12 @@ data, e.g.:: Check that the data have been written and can be read again:: >>> z2 = zarr.open_array('example.zarr', mode='r') - >>> z2 - - >>> np.all(z1[:] == z2[:]) + >>> np.all(z1[...] == z2[...]) True +Please note that there are a number of other options for persistent array storage, see the +section on :ref:`tutorial_tips_storage` below. + .. _tutorial_resize: Resizing and appending @@ -145,44 +140,57 @@ which can be used to append data to any axis. E.g.:: Compressors ----------- -By default, Zarr uses the `Blosc `_ compression -library to compress each chunk of an array. Blosc is extremely fast -and can be configured in a variety of ways to improve the compression -ratio for different types of data. Blosc is in fact a -"meta-compressor", which means that it can used a number of different -compression algorithms internally to compress the data. 
Blosc also -provides highly optimized implementations of byte and bit shuffle -filters, which can significantly improve compression ratios for some -data. - -Different compressors can be provided via the ``compressor`` keyword argument -accepted by all array creation functions. For example:: +A number of different compressors can be used with Zarr. A separate package called Numcodecs_ is +available which provides an interface to various compressor libraries including Blosc, Zstandard, +LZ4, Zlib, BZ2 and LZMA. Different compressors can be provided via the ``compressor`` keyword +argument accepted by all array creation functions. For example:: >>> from numcodecs import Blosc - >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ... chunks=(1000, 1000), - ... compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)) + >>> compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE) + >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) + >>> z = zarr.array(data, chunks=(1000, 1000), compressor=compressor) >>> z.compressor Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, blocksize=0) -The array above will use Blosc as the primary compressor, using the -Zstandard algorithm (compression level 3) internally within Blosc, and with -the bitshuffle filter applied. +The array above will use Blosc as the primary compressor, using the Zstandard algorithm +(compression level 3) internally within Blosc, and with the bitshuffle filter applied. + +When using a compressor, it can be useful to get some diagnostics on the compression ratio. Zarr +arrays provide an ``info`` property which can be used to print some diagnostics, e.g.:: + + >>> z.info + Type : zarr.core.Array + Data type : int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Compressor : Blosc(cname='zstd', clevel=3, shuffle=BITSHUFFLE, + : blocksize=0) + Store type : builtins.dict + No.
bytes stored : 4565055 (4.4M) + Storage ratio : 87.6 + Chunks initialized : 100/100 -A list of the internal compression libraries available within Blosc can be -obtained via:: +If you don't specify a compressor, by default Zarr uses the Blosc compressor. Blosc is extremely +fast and can be configured in a variety of ways to improve the compression ratio for different +types of data. Blosc is in fact a "meta-compressor", which means that it can use a number of +different compression algorithms internally to compress the data. Blosc also provides highly +optimized implementations of byte and bit shuffle filters, which can significantly improve +compression ratios for some data. A list of the internal compression libraries available within +Blosc can be obtained via:: >>> from numcodecs import blosc >>> blosc.list_compressors() ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'] -In addition to Blosc, other compression libraries can also be -used. For example, here is an array using Zstandard compression, level 1:: +In addition to Blosc, other compression libraries can also be used. For example, here is an array +using Zstandard compression, level 1:: >>> from numcodecs import Zstd >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ... chunks=(1000, 1000), - ... compressor=Zstd(level=1)) + ... chunks=(1000, 1000), compressor=Zstd(level=1)) >>> z.compressor Zstd(level=1) @@ -235,15 +243,13 @@ flexibility for implementing and using filters in combination with different compressors, Zarr also provides a mechanism for configuring filters outside of the primary compressor. -Here is an example using the delta filter with the Blosc compressor: +Here is an example using the delta filter with the Blosc compressor:: >>> from numcodecs import Blosc, Delta >>> filters = [Delta(dtype='i4')] >>> compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE) - >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), - ...
chunks=(1000, 1000), filters=filters, compressor=compressor) - >>> z - + >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) + >>> z = zarr.array(data, chunks=(1000, 1000), filters=filters, compressor=compressor) >>> z.info Type : zarr.core.Array Data type : int32 @@ -409,6 +415,186 @@ stored in sub-directories, e.g.:: For more information on groups see the :mod:`zarr.hierarchy` API docs. +.. _tutorial_indexing: + +Advanced indexing +----------------- + +As of Zarr version 2.2, Zarr arrays support several methods for advanced or "fancy" indexing, +which enable a subset of data items to be extracted or updated in an array without loading the +entire array into memory. Note that although this functionality is similar to some of the +advanced indexing capabilities available on NumPy arrays and on h5py datasets, **the Zarr API for +advanced indexing is different from both NumPy and h5py**, so please read this section carefully. +For a complete description of the indexing API, see the documentation for the +:class:`zarr.core.Array` class. + +Indexing with coordinate arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Items from a Zarr array can be extracted by providing an integer array of coordinates. E.g.:: + + >>> z = zarr.array(np.arange(10)) + >>> z[...] + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> z.get_coordinate_selection([1, 4]) + array([1, 4]) + +Coordinate arrays can also be used to update data, e.g.:: + + >>> z.set_coordinate_selection([1, 4], [-1, -2]) + >>> z[...] + array([ 0, -1, 2, 3, -2, 5, 6, 7, 8, 9]) + +For multidimensional arrays, coordinates must be provided for each dimension, e.g.:: + + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> z.get_coordinate_selection(([0, 2], [1, 3])) + array([ 1, 13]) + >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) + >>> z[...] 
+ array([[ 0, -1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -2, 14]]) + +For convenience, coordinate indexing is also available via the ``vindex`` property, e.g.:: + + >>> z.vindex[[0, 2], [1, 3]] + array([-1, -2]) + >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] + >>> z[...] + array([[ 0, -3, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -4, 14]]) + +Indexing with a mask array +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Items can also be extracted by providing a Boolean mask array. E.g.:: + + >>> z = zarr.array(np.arange(10)) + >>> z[...] + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.get_mask_selection(sel) + array([1, 4]) + >>> z.set_mask_selection(sel, [-1, -2]) + >>> z[...] + array([ 0, -1, 2, 3, -2, 5, 6, 7, 8, 9]) + +Here is a multidimensional example:: + + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[0, 1] = True + >>> sel[2, 3] = True + >>> z.get_mask_selection(sel) + array([ 1, 13]) + >>> z.set_mask_selection(sel, [-1, -2]) + >>> z[...] + array([[ 0, -1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -2, 14]]) + +For convenience, mask indexing is also available via the ``vindex`` property, e.g.:: + + >>> z.vindex[sel] + array([-1, -2]) + >>> z.vindex[sel] = [-3, -4] + >>> z[...] + array([[ 0, -3, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -4, 14]]) + +Mask indexing is conceptually the same as coordinate indexing, and is implemented internally via +the same machinery. Both styles of indexing allow selecting arbitrary items from an array, also +known as point selection. + +Orthogonal indexing +~~~~~~~~~~~~~~~~~~~ + +Zarr arrays also support methods for orthogonal indexing, which allows selections to be made +along each dimension of an array independently. For example, this allows selecting a subset of +rows and/or columns from a 2-dimensional array. 
E.g.:: + + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows + array([[ 0, 1, 2, 3, 4], + [10, 11, 12, 13, 14]]) + >>> z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns + array([[ 1, 3], + [ 6, 8], + [11, 13]]) + >>> z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 3] + array([[ 1, 3], + [11, 13]]) + +Data can also be modified, e.g.:: + + >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) + >>> z[...] + array([[ 0, -1, 2, -2, 4], + [ 5, 6, 7, 8, 9], + [10, -3, 12, -4, 14]]) + +For convenience, the orthogonal indexing functionality is also available via the ``oindex`` +property, e.g.:: + + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z.oindex[[0, 2], :] # select first and third rows + array([[ 0, 1, 2, 3, 4], + [10, 11, 12, 13, 14]]) + >>> z.oindex[:, [1, 3]] # select second and fourth columns + array([[ 1, 3], + [ 6, 8], + [11, 13]]) + >>> z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 3] + array([[ 1, 3], + [11, 13]]) + >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] + >>> z[...] + array([[ 0, -1, 2, -2, 4], + [ 5, 6, 7, 8, 9], + [10, -3, 12, -4, 14]]) + +Any combination of integer, slice, integer array and/or Boolean array can be used for orthogonal +indexing. + +Indexing fields in structured arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All selection methods support a ``fields`` parameter which allows retrieving or replacing data +for a specific field in an array with a structured dtype. E.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ...
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z['foo'] + array([b'aaa', b'bbb', b'ccc'], + dtype='|S3') + >>> z['baz'] + array([ 4.2, 8.4, 12.6]) + >>> z.get_basic_selection(slice(0, 2), fields='bar') + array([1, 2], dtype=int32) + >>> z.get_coordinate_selection([0, 2], fields=['foo', 'baz']) + array([(b'aaa', 4.2), (b'ccc', 12.6)], + dtype=[('foo', 'S3'), ('baz', '>> foo_group = root_group.create_group('foo') >>> z = foo_group.zeros('bar', shape=1000000, chunks=100000) >>> z[:] = 42 - >>> root_group - >>> root_group.info Name : / Type : zarr.hierarchy.Group @@ -437,8 +621,6 @@ Diagnostic information about arrays and groups is available via the ``info`` pro No. groups : 1 Groups : foo - >>> foo_group - >>> foo_group.info Name : /foo Type : zarr.hierarchy.Group @@ -449,8 +631,6 @@ Diagnostic information about arrays and groups is available via the ``info`` pro No. groups : 0 Arrays : bar - >>> z - >>> z.info Name : /foo/bar Type : zarr.core.Array @@ -535,10 +715,31 @@ which compression filters (e.g., byte shuffle) have been applied. Storage alternatives ~~~~~~~~~~~~~~~~~~~~ -Zarr can use any object that implements the ``MutableMapping`` interface as -the store for a group or an array. +Zarr can use any object that implements the ``MutableMapping`` interface as the store for a group +or an array. Some storage classes are provided in the :mod:`zarr.storage` module. For example, +the :class:`zarr.storage.DirectoryStore` class provides a ``MutableMapping`` interface to a +directory on the local file system. This is used under the hood by the +:func:`zarr.creation.open_array` and :func:`zarr.hierarchy.open_group` functions. 
In other words, +the following code:: + + >>> z = zarr.open_array('example.zarr', mode='w', shape=1000000, dtype='i4', fill_value=0) + +...is just short-hand for:: + + >>> store = zarr.DirectoryStore('example.zarr') + >>> z = zarr.zeros(store=store, overwrite=True, shape=1000000, dtype='i4') + +...and the following code:: -Here is an example storing an array directly into a Zip file:: + >>> grp = zarr.open_group('example.zarr', mode='w') + +...is just a short-hand for:: + + >>> store = zarr.DirectoryStore('example.zarr') + >>> grp = zarr.group(store=store, overwrite=True) + +Any other storage class could be used in place of :class:`zarr.storage.DirectoryStore`. For +example, here is an array stored directly into a Zip file:: >>> store = zarr.ZipStore('example.zip', mode='w') >>> root_group = zarr.group(store=store) @@ -567,11 +768,9 @@ Re-open and check that data have been written:: Note that there are some restrictions on how Zip files can be used, because items within a Zip file cannot be updated in place. This means that data in the array should only be written once and write -operations should be aligned with chunk boundaries. - -Note also that the ``close()`` method must be called after writing any data to -the store, otherwise essential records will not be written to the underlying -zip file. +operations should be aligned with chunk boundaries. Note also that the ``close()`` method must be +called after writing any data to the store, otherwise essential records will not be written to +the underlying zip file. 
The Dask project has implementations of the ``MutableMapping`` interface for distributed storage systems, see the `S3Map diff --git a/zarr/creation.py b/zarr/creation.py index 0e3e3750cc..14db02f9fe 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -329,22 +329,21 @@ def array(data, **kwargs): return z -def open_array(store=None, mode='a', shape=None, chunks=None, dtype=None, - compressor='default', fill_value=0, order='C', - synchronizer=None, filters=None, cache_metadata=True, +def open_array(store, mode='a', shape=None, chunks=None, dtype=None, compressor='default', + fill_value=0, order='C', synchronizer=None, filters=None, cache_metadata=True, path=None, **kwargs): - """Open array using mode-like semantics. + """Open an array using file-mode-like semantics. Parameters ---------- store : MutableMapping or string Store or path to directory in file system. - mode : {'r', 'r+', 'a', 'w', 'w-'} + mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means read/write (must exist); 'a' means read/write (create if doesn't exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). - shape : int or tuple of ints + shape : int or tuple of ints, optional Array shape. chunks : int or tuple of ints, optional Chunk shape. If not provided, will be guessed from `shape` and `dtype`. @@ -352,7 +351,7 @@ def open_array(store=None, mode='a', shape=None, chunks=None, dtype=None, NumPy dtype. compressor : Codec, optional Primary compressor. - fill_value : object + fill_value : object, optional Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. @@ -366,7 +365,7 @@ def open_array(store=None, mode='a', shape=None, chunks=None, dtype=None, prior to all data access and modification operations (may incur overhead depending on storage and data access pattern). path : string, optional - Array path. + Array path within store. 
Returns ------- diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 937c53d12c..2d211861e8 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -855,13 +855,12 @@ def _handle_store_arg(store): return store -def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, - path=None): +def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None): """Create a group. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system. overwrite : bool, optional If True, delete any pre-existing data in `store` at `path` before @@ -872,7 +871,7 @@ def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, synchronizer : object, optional Array synchronizer. path : string, optional - Group path. + Group path within store. Returns ------- @@ -910,14 +909,14 @@ def group(store=None, overwrite=False, chunk_store=None, synchronizer=None, synchronizer=synchronizer, path=path) -def open_group(store=None, mode='a', synchronizer=None, path=None): - """Open a group using mode-like semantics. +def open_group(store, mode='a', synchronizer=None, path=None): + """Open a group using file-mode-like semantics. Parameters ---------- store : MutableMapping or string Store or path to directory in file system. - mode : {'r', 'r+', 'a', 'w', 'w-'} + mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means read/write (must exist); 'a' means read/write (create if doesn't exist); 'w' means create (overwrite if exists); 'w-' means create @@ -925,7 +924,7 @@ def open_group(store=None, mode='a', synchronizer=None, path=None): synchronizer : object, optional Array synchronizer. path : string, optional - Group path. + Group path within store. 
Returns ------- diff --git a/zarr/indexing.py b/zarr/indexing.py index eed768d85b..31cbd12f94 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -754,8 +754,8 @@ def check_fields(fields, dtype): return dtype # check type if not isinstance(fields, (str, list, tuple)): - raise TypeError("'fields' argument must be a string or list of strings; found {!r}" - .format(type(fields))) + raise IndexError("'fields' argument must be a string or list of strings; found {!r}" + .format(type(fields))) if fields: if dtype.names is None: raise IndexError("invalid 'fields' argument, array does not have any fields") diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index bb617fff14..272d573724 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -97,6 +97,12 @@ def test_array(): assert_array_equal(a[:], z[:]) eq(a.dtype, z.dtype) + # with dtype=something else + a = np.arange(100, dtype='i4') + z = array(a, dtype='i8') + assert_array_equal(a[:], z[:]) + eq(np.dtype('i8'), z.dtype) + def test_empty(): z = empty(100, chunks=10) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index ecf82d9e0a..0912f9acab 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1198,6 +1198,8 @@ def test_get_selections_with_fields(): # missing/bad fields with assert_raises(IndexError): z.get_basic_selection(Ellipsis, fields=['notafield']) + with assert_raises(IndexError): + z.get_basic_selection(Ellipsis, fields=slice(None)) def test_set_selections_with_fields(): From af2b50513073e6aad8f7e72e16419bfeb78a1b49 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 10 Nov 2017 00:05:44 +0000 Subject: [PATCH 52/67] improve errors --- zarr/indexing.py | 90 +++++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 31cbd12f94..de0f113b69 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -34,6 +34,11 @@ def 
is_scalar(value, dtype): return False +def err_boundscheck(dim_len): + raise IndexError('index out of bounds for dimension with length {}' + .format(dim_len)) + + def normalize_integer_selection(dim_sel, dim_len): # normalize type to int @@ -45,7 +50,7 @@ def normalize_integer_selection(dim_sel, dim_len): # handle out of bounds if dim_sel >= dim_len or dim_sel < 0: - raise IndexError('index out of bounds') + err_boundscheck(dim_len) return dim_sel @@ -91,6 +96,10 @@ def ceildiv(a, b): return int(np.ceil(a / b)) +def err_negative_step(): + raise IndexError('only slices with step >= 1 are supported') + + class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -98,7 +107,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # normalize self.start, self.stop, self.step = dim_sel.indices(dim_len) if self.step < 1: - raise IndexError('only slices with step >= 1 are supported') + err_negative_step() # store attributes self.dim_len = dim_len @@ -185,7 +194,8 @@ def replace_ellipsis(selection, shape): # check selection not too long if len(selection) > len(shape): - raise IndexError('too many indices for array') + raise IndexError('too many indices for array; expected {}, got {}' + .format(len(shape), len(selection))) return selection @@ -265,8 +275,8 @@ def __init__(self, selection, array): dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - raise IndexError('unsupported selection type; expected integer or slice, got {!r}' - .format(type(dim_sel))) + raise IndexError('unsupported selection item for basic indexing; expected integer ' + 'or slice, got {!r}'.format(type(dim_sel))) dim_indexers.append(dim_indexer) @@ -292,11 +302,12 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len): # check number of dimensions if not is_bool_array(dim_sel, 1): - raise IndexError('selection must be a 1d array') + raise IndexError('Boolean arrays in an orthogonal selection must 1-dimensional only') # check shape if dim_sel.shape[0] != 
dim_len: - raise IndexError('selection has the wrong length') + raise IndexError('Boolean array has the wrong length for dimension; ' + 'expected {}, got {}'.format(dim_len, dim_sel.shape[0])) # store attributes self.dim_sel = dim_sel @@ -363,27 +374,35 @@ def check(a): return order +def wraparound_indices(x, dim_len): + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x, dim_len): + if np.any(x < 0) or np.any(x >= dim_len): + err_boundscheck(dim_len) + + class IntArrayDimIndexer(object): """Integer array selection against a single dimension.""" def __init__(self, dim_sel, dim_len, dim_chunk_len, wraparound=True, boundscheck=True, order=Order.UNKNOWN): - # ensure array + # ensure 1d array dim_sel = np.asanyarray(dim_sel) if not is_integer_array(dim_sel, 1): - raise IndexError('selection must be a 1d array') + raise IndexError('integer arrays in an orthogonal selection must be 1-dimensional only') # handle wraparound if wraparound: - loc_neg = dim_sel < 0 - if np.any(loc_neg): - dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + wraparound_indices(dim_sel, dim_len) # handle out of bounds if boundscheck: - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('selection contains index out of bounds') + boundscheck_indices(dim_sel, dim_len) # store attributes self.dim_len = dim_len @@ -405,7 +424,7 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len, wraparound=True, boundscheck self.dim_out_sel = None elif self.order == Order.DECREASING: self.dim_sel = dim_sel[::-1] - # TODO do this without creating an arange + # TODO should be possible to do this without creating an arange self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) else: # sort indices to group by chunk @@ -506,24 +525,21 @@ def __init__(self, selection, array): for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) elif 
isinstance(dim_sel, slice): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_integer_array(dim_sel): - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) elif is_bool_array(dim_sel): - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) else: - # TODO improve and refactor error messages - raise IndexError('unsupported selection type {!r}'.format(type(dim_sel))) + raise IndexError('unsupported selection item for orthogonal indexing; expected ' + 'integer, slice, integer array or Boolean array, got {!r}' + .format(type(dim_sel))) dim_indexers.append(dim_indexer) @@ -609,21 +625,17 @@ def __init__(self, selection, array): # validation if not is_coordinate_selection(selection, array): - # TODO refactor error messages for consistency - raise IndexError('invalid coordinate selection') + raise IndexError('invalid coordinate selection; expected one integer (coordinate) ' + 'array per dimension of the target array, got {!r}'.format(selection)) # handle wraparound, boundscheck for dim_sel, dim_len in zip(selection, array.shape): # handle wraparound - loc_neg = dim_sel < 0 - if np.any(loc_neg): - # TODO need to take a copy here, or OK to replace? 
- dim_sel[loc_neg] = dim_sel[loc_neg] + dim_len + wraparound_indices(dim_sel, dim_len) # handle out of bounds - if np.any(dim_sel < 0) or np.any(dim_sel >= dim_len): - raise IndexError('index out of bounds') + boundscheck_indices(dim_sel, dim_len) # compute chunk index for each point in the selection chunks_multi_index = tuple( @@ -650,7 +662,6 @@ def __init__(self, selection, array): if np.any(np.diff(chunks_raveled_indices) < 0): # optimisation, only sort if needed sel_sort = np.argsort(chunks_raveled_indices) - # chunks_raveled_indices = chunks_raveled_indices[sel_sort] selection = tuple(dim_sel[sel_sort] for dim_sel in selection) else: sel_sort = None @@ -710,8 +721,9 @@ def __init__(self, selection, array): # validation if not is_mask_selection(selection, array): - # TODO refactor error messages for consistency - raise IndexError('invalid mask selection') + raise IndexError('invalid mask selection; expected one Boolean (mask)' + 'array with the same shape as the target array, got {!r}' + .format(selection)) # convert to indices selection = np.nonzero(selection[0]) @@ -720,6 +732,12 @@ def __init__(self, selection, array): super(MaskIndexer, self).__init__(selection, array) +def err_vindex_invalid_selection(selection): + raise IndexError('unsupported selection type for vectorized indexing; only coordinate ' + 'selection (tuple of integer arrays) and mask selection (single ' + 'Boolean array) are supported; got {!r}'.format(selection)) + + class VIndex(object): def __init__(self, array): @@ -734,18 +752,18 @@ def __getitem__(self, selection): elif is_mask_selection(selection, self.array): return self.array.get_mask_selection(selection, fields=fields) else: - raise IndexError('unsupported selection') + err_vindex_invalid_selection(selection) def __setitem__(self, selection, value): fields, selection = pop_fields(selection) selection = ensure_tuple(selection) selection = replace_lists(selection) if is_coordinate_selection(selection, self.array): - return 
self.array.set_coordinate_selection(selection, value, fields=fields) + self.array.set_coordinate_selection(selection, value, fields=fields) elif is_mask_selection(selection, self.array): - return self.array.set_mask_selection(selection, value, fields=fields) + self.array.set_mask_selection(selection, value, fields=fields) else: - raise IndexError('unsupported selection') + err_vindex_invalid_selection(selection) def check_fields(fields, dtype): From 600aa93eabd52ca72dfaf589b6775d951cbb9443 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 10 Nov 2017 01:03:13 +0000 Subject: [PATCH 53/67] review errors --- zarr/core.py | 20 +++++++------- zarr/errors.py | 20 ++++++++++++++ zarr/hierarchy.py | 12 ++++++--- zarr/indexing.py | 37 +++++++++++++------------- zarr/tests/test_indexing.py | 53 ++++++++++++++++++++++--------------- 5 files changed, 90 insertions(+), 52 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 91fa11633b..dbe2ad59cc 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -19,7 +19,7 @@ from zarr.codecs import AsType, get_codec from zarr.indexing import (OIndex, OrthogonalIndexer, BasicIndexer, VIndex, CoordinateIndexer, MaskIndexer, check_fields, pop_fields, ensure_tuple, is_scalar, - is_contiguous_selection) + is_contiguous_selection, err_too_many_indices, check_no_multi_fields) # noinspection PyUnresolvedReferences @@ -174,7 +174,7 @@ def _refresh_metadata_nosync(self): def _flush_metadata_nosync(self): if self._is_view: - raise PermissionError('not permitted for views') + raise PermissionError('operation not permitted for views') if self._compressor: compressor_config = self._compressor.get_config() @@ -408,6 +408,7 @@ def __len__(self): if self.shape: return self.shape[0] else: + # 0-dimensional array, same error message as numpy raise TypeError('len() of unsized object') def __getitem__(self, selection): @@ -667,7 +668,7 @@ def _get_basic_selection_zd(self, selection, out=None, fields=None): # check selection is valid selection = 
ensure_tuple(selection) if selection not in ((), (Ellipsis,)): - raise IndexError('too many indices for array') + err_too_many_indices(selection, ()) try: # obtain encoded data for chunk @@ -1418,12 +1419,11 @@ def _set_basic_selection_zd(self, selection, value, fields=None): # check selection is valid selection = ensure_tuple(selection) if selection not in ((), (Ellipsis,)): - raise IndexError('too many indices for array') + err_too_many_indices(selection, self._shape) # check fields check_fields(fields, self._dtype) - if fields and isinstance(fields, list): - raise ValueError('multi-field assignment is not supported') + fields = check_no_multi_fields(fields) # obtain key for chunk ckey = self._chunk_key((0,)) @@ -1472,8 +1472,7 @@ def _set_selection(self, indexer, value, fields=None): # check fields are sensible check_fields(fields, self._dtype) - if fields and isinstance(fields, list): - raise ValueError('multi-field assignment is not supported') + fields = check_no_multi_fields(fields) # determine indices of chunks overlapping the selection sel_shape = indexer.shape @@ -1949,7 +1948,8 @@ def _append_nosync(self, data, axis=0): data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) if self_shape_preserved != data_shape_preserved: - raise ValueError('shapes not compatible') + raise ValueError('shape of data to append is not compatible with the array; all ' + 'dimensions must match except for the dimension being appended') # remember old shape old_shape = self._shape @@ -2079,7 +2079,7 @@ def view(self, shape=None, chunks=None, dtype=None, ... v.resize(20000) ... except PermissionError as e: ... 
print(e) - not permitted for views + operation not permitted for views """ diff --git a/zarr/errors.py b/zarr/errors.py index 82c9306ca7..8829ec3e02 100644 --- a/zarr/errors.py +++ b/zarr/errors.py @@ -50,3 +50,23 @@ def err_fspath_exists_notdir(fspath): def err_read_only(): raise PermissionError('object is read-only') + + +def err_boundscheck(dim_len): + raise IndexError('index out of bounds for dimension with length {}' + .format(dim_len)) + + +def err_negative_step(): + raise IndexError('only slices with step >= 1 are supported') + + +def err_too_many_indices(selection, shape): + raise IndexError('too many indices for array; expected {}, got {}' + .format(len(shape), len(selection))) + + +def err_vindex_invalid_selection(selection): + raise IndexError('unsupported selection type for vectorized indexing; only coordinate ' + 'selection (tuple of integer arrays) and mask selection (single ' + 'Boolean array) are supported; got {!r}'.format(selection)) diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 2d211861e8..fb19782168 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -719,6 +719,9 @@ def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, path = self._item_path(name) if contains_array(self._store, path): + + # array already exists at path, validate that it is the right shape and type + synchronizer = kwargs.get('synchronizer', self._synchronizer) cache_metadata = kwargs.get('cache_metadata', True) a = Array(self._store, path=path, read_only=self._read_only, @@ -726,14 +729,17 @@ def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, cache_metadata=cache_metadata) shape = normalize_shape(shape) if shape != a.shape: - raise TypeError('shapes do not match') + raise TypeError('shape do not match existing array; expected {}, got {}' + .format(a.shape, shape)) dtype = np.dtype(dtype) if exact: if dtype != a.dtype: - raise TypeError('dtypes do not match exactly') + raise TypeError('dtypes do not match exactly; expected 
{}, got {}' + .format(a.dtype, dtype)) else: if not np.can_cast(dtype, a.dtype): - raise TypeError('dtypes cannot be safely cast') + raise TypeError('dtypes ({}, {}) cannot be safely cast' + .format(dtype, a.dtype)) return a else: diff --git a/zarr/indexing.py b/zarr/indexing.py index de0f113b69..656efc201b 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -8,6 +8,10 @@ import numpy as np +from zarr.errors import (err_too_many_indices, err_boundscheck, err_negative_step, + err_vindex_invalid_selection) + + def is_integer(x): return isinstance(x, numbers.Integral) @@ -34,11 +38,6 @@ def is_scalar(value, dtype): return False -def err_boundscheck(dim_len): - raise IndexError('index out of bounds for dimension with length {}' - .format(dim_len)) - - def normalize_integer_selection(dim_sel, dim_len): # normalize type to int @@ -96,10 +95,6 @@ def ceildiv(a, b): return int(np.ceil(a / b)) -def err_negative_step(): - raise IndexError('only slices with step >= 1 are supported') - - class SliceDimIndexer(object): def __init__(self, dim_sel, dim_len, dim_chunk_len): @@ -160,6 +155,11 @@ def __iter__(self): yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) +def check_selection_length(selection, shape): + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + def replace_ellipsis(selection, shape): selection = ensure_tuple(selection) @@ -193,9 +193,7 @@ def replace_ellipsis(selection, shape): selection += (slice(None),) * (len(shape) - len(selection)) # check selection not too long - if len(selection) > len(shape): - raise IndexError('too many indices for array; expected {}, got {}' - .format(len(shape), len(selection))) + check_selection_length(selection, shape) return selection @@ -732,12 +730,6 @@ def __init__(self, selection, array): super(MaskIndexer, self).__init__(selection, array) -def err_vindex_invalid_selection(selection): - raise IndexError('unsupported selection type for vectorized indexing; only coordinate ' - 
'selection (tuple of integer arrays) and mask selection (single ' - 'Boolean array) are supported; got {!r}'.format(selection)) - - class VIndex(object): def __init__(self, array): @@ -792,6 +784,15 @@ def check_fields(fields, dtype): return dtype +def check_no_multi_fields(fields): + if isinstance(fields, list): + if len(fields) == 1: + return fields[0] + elif len(fields) > 1: + raise IndexError('multiple fields are not supported for this operation') + return fields + + def pop_fields(selection): if isinstance(selection, str): # single field selection diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 0912f9acab..a034a289b0 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -324,9 +324,9 @@ def test_set_basic_selection_0d(): eq(v['bar'], z['bar']) eq(a['baz'], z['baz']) # multiple field assignment not supported - with assert_raises(ValueError): + with assert_raises(IndexError): z.set_basic_selection(Ellipsis, v[['foo', 'bar']], fields=['foo', 'bar']) - with assert_raises(ValueError): + with assert_raises(IndexError): z[..., 'foo', 'bar'] = v[['foo', 'bar']] @@ -1213,6 +1213,7 @@ def test_set_selections_with_fields(): fields_fixture = [ 'foo', + [], ['foo'], ['foo', 'bar'], ['foo', 'baz'], @@ -1225,54 +1226,64 @@ def test_set_selections_with_fields(): for fields in fields_fixture: # currently multi-field assignment is not supported in numpy, so we won't support it either - if isinstance(fields, list): - with assert_raises(ValueError): - z.set_basic_selection(Ellipsis, v[fields], fields=fields) - with assert_raises(ValueError): - z.set_orthogonal_selection([0, 2], v[fields], fields=fields) - with assert_raises(ValueError): - z.set_coordinate_selection([0, 2], v[fields], fields=fields) - with assert_raises(ValueError): - z.set_mask_selection([True, False, True], v[fields], fields=fields) + if isinstance(fields, list) and len(fields) > 1: + with assert_raises(IndexError): + z.set_basic_selection(Ellipsis, v, 
fields=fields) + with assert_raises(IndexError): + z.set_orthogonal_selection([0, 2], v, fields=fields) + with assert_raises(IndexError): + z.set_coordinate_selection([0, 2], v, fields=fields) + with assert_raises(IndexError): + z.set_mask_selection([True, False, True], v, fields=fields) else: + if isinstance(fields, list) and len(fields) == 1: + # work around numpy does not support multi-field assignment even if there is only + # one field + key = fields[0] + elif isinstance(fields, list) and len(fields) == 0: + # work around numpy ambiguity about what is a field selection + key = Ellipsis + else: + key = fields + # setup expectation a[:] = ('', 0, 0) z[:] = ('', 0, 0) assert_array_equal(a, z[:]) - a[fields] = v[fields] + a[key] = v[key] # total selection - z.set_basic_selection(Ellipsis, v[fields], fields=fields) + z.set_basic_selection(Ellipsis, v[key], fields=fields) assert_array_equal(a, z[:]) # basic selection with slice a[:] = ('', 0, 0) z[:] = ('', 0, 0) - a[fields][0:2] = v[fields][0:2] - z.set_basic_selection(slice(0, 2), v[0:2][fields], fields=fields) + a[key][0:2] = v[key][0:2] + z.set_basic_selection(slice(0, 2), v[key][0:2], fields=fields) assert_array_equal(a, z[:]) # orthogonal selection a[:] = ('', 0, 0) z[:] = ('', 0, 0) ix = [0, 2] - a[fields][ix] = v[fields][ix] - z.set_orthogonal_selection(ix, v[fields][ix], fields=fields) + a[key][ix] = v[key][ix] + z.set_orthogonal_selection(ix, v[key][ix], fields=fields) assert_array_equal(a, z[:]) # coordinate selection a[:] = ('', 0, 0) z[:] = ('', 0, 0) ix = [0, 2] - a[fields][ix] = v[fields][ix] - z.set_coordinate_selection(ix, v[fields][ix], fields=fields) + a[key][ix] = v[key][ix] + z.set_coordinate_selection(ix, v[key][ix], fields=fields) assert_array_equal(a, z[:]) # mask selection a[:] = ('', 0, 0) z[:] = ('', 0, 0) ix = [True, False, True] - a[fields][ix] = v[fields][ix] - z.set_mask_selection(ix, v[fields][ix], fields=fields) + a[key][ix] = v[key][ix] + z.set_mask_selection(ix, v[key][ix], 
fields=fields) assert_array_equal(a, z[:]) From 4e19759d0044f591606c27cd2a17defe7b91980f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 00:30:22 +0000 Subject: [PATCH 54/67] rebase; resolve issues with structured arrays --- docs/spec/v2.rst | 126 +++++++++++++++--------------- docs/tutorial.rst | 10 +-- notebooks/advanced_indexing.ipynb | 75 ++++++++---------- zarr/core.py | 7 +- zarr/creation.py | 4 - zarr/meta.py | 4 +- zarr/storage.py | 12 +++ zarr/tests/test_core.py | 28 +++++++ zarr/tests/test_creation.py | 6 +- zarr/tests/test_indexing.py | 2 +- zarr/tests/test_meta.py | 4 +- 11 files changed, 157 insertions(+), 121 deletions(-) diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst index 00a9bcc495..88df4f9439 100644 --- a/docs/spec/v2.rst +++ b/docs/spec/v2.rst @@ -3,31 +3,31 @@ Zarr storage specification version 2 ==================================== -This document provides a technical specification of the protocol and format -used for storing Zarr arrays. The key words "MUST", "MUST NOT", "REQUIRED", -"SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and -"OPTIONAL" in this document are to be interpreted as described in `RFC 2119 +This document provides a technical specification of the protocol and format +used for storing Zarr arrays. The key words "MUST", "MUST NOT", "REQUIRED", +"SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and +"OPTIONAL" in this document are to be interpreted as described in `RFC 2119 `_. Status ------ -This specification is the latest version. See :ref:`spec` for previous +This specification is the latest version. See :ref:`spec` for previous versions. 
Storage ------- -A Zarr array can be stored in any storage system that provides a key/value -interface, where a key is an ASCII string and a value is an arbitrary sequence -of bytes, and the supported operations are read (get the sequence of bytes -associated with a given key), write (set the sequence of bytes associated with +A Zarr array can be stored in any storage system that provides a key/value +interface, where a key is an ASCII string and a value is an arbitrary sequence +of bytes, and the supported operations are read (get the sequence of bytes +associated with a given key), write (set the sequence of bytes associated with a given key) and delete (remove a key/value pair). -For example, a directory in a file system can provide this interface, where -keys are file names, values are file contents, and files can be read, written -or deleted via the operating system. Equally, an S3 bucket can provide this -interface, where keys are resource names, values are resource contents, and +For example, a directory in a file system can provide this interface, where +keys are file names, values are file contents, and files can be read, written +or deleted via the operating system. Equally, an S3 bucket can provide this +interface, where keys are resource names, values are resource contents, and resources can be read, written or deleted via HTTP. Below an "array store" refers to any system implementing this interface. @@ -38,11 +38,11 @@ Arrays Metadata ~~~~~~~~ -Each array requires essential configuration metadata to be stored, enabling -correct interpretation of the stored data. This metadata is encoded using JSON +Each array requires essential configuration metadata to be stored, enabling +correct interpretation of the stored data. This metadata is encoded using JSON and stored as the value of the ".zarray" key within an array store. -The metadata resource is a JSON object. The following keys MUST be present +The metadata resource is a JSON object. 
The following keys MUST be present within the object: zarr_format @@ -57,8 +57,8 @@ dtype A string or list defining a valid data type for the array. See also the subsection below on data type encoding. compressor - A JSON object identifying the primary compression codec and providing - configuration parameters, or ``null`` if no compressor is to be used. + A JSON object identifying the primary compression codec and providing + configuration parameters, or ``null`` if no compressor is to be used. The object MUST contain an ``"id"`` key identifying the codec to be used. fill_value A scalar value providing the default value to use for uninitialized @@ -74,10 +74,10 @@ filters Other keys MUST NOT be present within the metadata object. -For example, the JSON object below defines a 2-dimensional array of 64-bit -little-endian floating point numbers with 10000 rows and 10000 columns, divided -into chunks of 1000 rows and 1000 columns (so there will be 100 chunks in total -arranged in a 10 by 10 grid). Within each chunk the data are laid out in C +For example, the JSON object below defines a 2-dimensional array of 64-bit +little-endian floating point numbers with 10000 rows and 10000 columns, divided +into chunks of 1000 rows and 1000 columns (so there will be 100 chunks in total +arranged in a 10 by 10 grid). Within each chunk the data are laid out in C contiguous order. Each chunk is encoded using a delta filter and compressed using the Blosc compression library prior to storage:: @@ -109,8 +109,8 @@ Data type encoding ~~~~~~~~~~~~~~~~~~ Simple data types are encoded within the array metadata as a string, -following the `NumPy array protocol type string (typestr) format -`_. The format +following the `NumPy array protocol type string (typestr) format +`_. The format consists of 3 parts: * One character describing the byteorder of the data (``"<"``: little-endian; @@ -127,9 +127,9 @@ The byte order MUST be specified. 
E.g., ``"i4"``, ``"|b1"`` and ``"|S12"`` are valid data type encodings. Structured data types (i.e., with multiple named fields) are encoded as a list -of two-element lists, following `NumPy array protocol type descriptions (descr) -`_. For -example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]`` defines a +of two-element lists, following `NumPy array protocol type descriptions (descr) +`_. For +example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]`` defines a data type composed of three single-byte unsigned integers labelled "r", "g" and "b". @@ -147,37 +147,41 @@ Positive Infinity ``"Infinity"`` Negative Infinity ``"-Infinity"`` ================= =============== +If an array has a fixed length byte string data type (e.g., ``"|S12"``), or a +structured data type, and if the fill value is not null, then the fill value +MUST be encoded as an ASCII string using the standard Base64 alphabet. + Chunks ~~~~~~ -Each chunk of the array is compressed by passing the raw bytes for the chunk -through the primary compression library to obtain a new sequence of bytes -comprising the compressed chunk data. No header is added to the compressed -bytes or any other modification made. The internal structure of the compressed -bytes will depend on which primary compressor was used. For example, the `Blosc -compressor `_ -produces a sequence of bytes that begins with a 16-byte header followed by +Each chunk of the array is compressed by passing the raw bytes for the chunk +through the primary compression library to obtain a new sequence of bytes +comprising the compressed chunk data. No header is added to the compressed +bytes or any other modification made. The internal structure of the compressed +bytes will depend on which primary compressor was used. For example, the `Blosc +compressor `_ +produces a sequence of bytes that begins with a 16-byte header followed by compressed data. 
-The compressed sequence of bytes for each chunk is stored under a key formed -from the index of the chunk within the grid of chunks representing the array. -To form a string key for a chunk, the indices are converted to strings and +The compressed sequence of bytes for each chunk is stored under a key formed +from the index of the chunk within the grid of chunks representing the array. +To form a string key for a chunk, the indices are converted to strings and concatenated with the period character (".") separating each index. For -example, given an array with shape (10000, 10000) and chunk shape (1000, 1000) -there will be 100 chunks laid out in a 10 by 10 grid. The chunk with indices -(0, 0) provides data for rows 0-1000 and columns 0-1000 and is stored under the +example, given an array with shape (10000, 10000) and chunk shape (1000, 1000) +there will be 100 chunks laid out in a 10 by 10 grid. The chunk with indices +(0, 0) provides data for rows 0-1000 and columns 0-1000 and is stored under the key "0.0"; the chunk with indices (2, 4) provides data for rows 2000-3000 and columns 4000-5000 and is stored under the key "2.4"; etc. -There is no need for all chunks to be present within an array store. If a chunk -is not present then it is considered to be in an uninitialized state. An -unitialized chunk MUST be treated as if it was uniformly filled with the value +There is no need for all chunks to be present within an array store. If a chunk +is not present then it is considered to be in an uninitialized state. An +unitialized chunk MUST be treated as if it was uniformly filled with the value of the "fill_value" field in the array metadata. If the "fill_value" field is ``null`` then the contents of the chunk are undefined. -Note that all chunks in an array have the same shape. If the length of any -array dimension is not exactly divisible by the length of the corresponding -chunk dimension then some chunks will overhang the edge of the array. 
The +Note that all chunks in an array have the same shape. If the length of any +array dimension is not exactly divisible by the length of the corresponding +chunk dimension then some chunks will overhang the edge of the array. The contents of any chunk region falling outside the array are undefined. Filters @@ -196,15 +200,15 @@ Hierarchies Logical storage paths ~~~~~~~~~~~~~~~~~~~~~ -Multiple arrays can be stored in the same array store by associating each array -with a different logical path. A logical path is simply an ASCII string. The -logical path is used to form a prefix for keys used by the array. For example, +Multiple arrays can be stored in the same array store by associating each array +with a different logical path. A logical path is simply an ASCII string. The +logical path is used to form a prefix for keys used by the array. For example, if an array is stored at logical path "foo/bar" then the array metadata will be stored under the key "foo/bar/.zarray", the user-defined attributes will be stored under the key "foo/bar/.zattrs", and the chunks will be stored under keys like "foo/bar/0.0", "foo/bar/0.1", etc. -To ensure consistent behaviour across different storage systems, logical paths +To ensure consistent behaviour across different storage systems, logical paths MUST be normalized as follows: * Replace all backward slash characters ("\\") with forward slash characters @@ -221,11 +225,11 @@ After normalization, if splitting a logical path by the "/" character results in any path segment equal to the string "." or the string ".." then an error MUST be raised. -N.B., how the underlying array store processes requests to store values under +N.B., how the underlying array store processes requests to store values under keys containing the "/" character is entirely up to the store implementation -and is not constrained by this specification. 
E.g., an array store could simply -treat all keys as opaque ASCII strings; equally, an array store could map -logical paths onto some kind of hierarchical storage (e.g., directories on a +and is not constrained by this specification. E.g., an array store could simply +treat all keys as opaque ASCII strings; equally, an array store could map +logical paths onto some kind of hierarchical storage (e.g., directories on a file system). Groups @@ -233,12 +237,12 @@ Groups Arrays can be organized into groups which can also contain other groups. A group is created by storing group metadata under the ".zgroup" key under some -logical path. E.g., a group exists at the root of an array store if the +logical path. E.g., a group exists at the root of an array store if the ".zgroup" key exists in the store, and a group exists at logical path "foo/bar" if the "foo/bar/.zgroup" key exists in the store. -If the user requests a group to be created under some logical path, then groups -MUST also be created at all ancestor paths. E.g., if the user requests group +If the user requests a group to be created under some logical path, then groups +MUST also be created at all ancestor paths. E.g., if the user requests group creation at path "foo/bar" then groups MUST be created at path "foo" and the root of the store, if they don't already exist. @@ -256,7 +260,7 @@ zarr_format Other keys MUST NOT be present within the metadata object. -The members of a group are arrays and groups stored under logical paths that +The members of a group are arrays and groups stored under logical paths that are direct children of the parent group's logical path. 
E.g., if groups exist under the logical paths "foo" and "foo/bar" and an array exists at logical path "foo/baz" then the members of the group at path "foo" are the group at path @@ -265,8 +269,8 @@ under the logical paths "foo" and "foo/bar" and an array exists at logical path Attributes ---------- -An array or group can be associated with custom attributes, which are simple -key/value items with application-specific meaning. Custom attributes are +An array or group can be associated with custom attributes, which are simple +key/value items with application-specific meaning. Custom attributes are encoded as a JSON object and stored under the ".zattrs" key within an array store. @@ -377,7 +381,7 @@ Modify the array attributes:: Storing multiple arrays in a hierarchy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Below is an example of storing multiple Zarr arrays organized into a group +Below is an example of storing multiple Zarr arrays organized into a group hierarchy, using a directory on the local file system as storage. This storage implementation maps logical paths onto directory paths on the file system, however this is an implementation choice and is not required. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 80a2019dc6..8b8d2cbe83 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -76,7 +76,7 @@ stored in memory. Zarr arrays can also be stored on a file system, enabling persistence of data between sessions. For example:: >>> z1 = zarr.open_array('example.zarr', mode='w', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4', fill_value=0) + ... chunks=(1000, 1000), dtype='i4') The array above will store its configuration metadata and all compressed chunk data in a directory called 'example.zarr' relative to @@ -382,8 +382,7 @@ and :func:`zarr.hierarchy.Group.require_dataset` methods, e.g.:: >>> z = bar_group.create_dataset('quux', shape=(10000, 10000), ... chunks=(1000, 1000), dtype='i4', - ... fill_value=0, compression='gzip', - ... 
compression_opts=1) + ... compression='gzip', compression_opts=1) >>> z @@ -408,8 +407,7 @@ stored in sub-directories, e.g.:: >>> persistent_group >>> z = persistent_group.create_dataset('foo/bar/baz', shape=(10000, 10000), - ... chunks=(1000, 1000), dtype='i4', - ... fill_value=0) + ... chunks=(1000, 1000), dtype='i4') >>> z @@ -722,7 +720,7 @@ directory on the local file system. This is used under the hood by the :func:`zarr.creation.open_array` and :func:`zarr.hierarchy.open_group` functions. In other words, the following code:: - >>> z = zarr.open_array('example.zarr', mode='w', shape=1000000, dtype='i4', fill_value=0) + >>> z = zarr.open_array('example.zarr', mode='w', shape=1000000, dtype='i4') ...is just short-hand for:: diff --git a/notebooks/advanced_indexing.ipynb b/notebooks/advanced_indexing.ipynb index 99fbc0d13d..eba6b5880b 100644 --- a/notebooks/advanced_indexing.ipynb +++ b/notebooks/advanced_indexing.ipynb @@ -15,7 +15,7 @@ { "data": { "text/plain": [ - "'2.1.5.dev118+dirty'" + "'2.1.5.dev144'" ] }, "execution_count": 1, @@ -957,7 +957,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -967,7 +967,7 @@ " dtype=[('foo', 'S3'), ('bar', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices" ] } @@ -1181,7 +1181,7 @@ }, { "cell_type": "code", - 
"execution_count": 50, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -1191,7 +1191,7 @@ " dtype=[('foo', 'S3'), ('baz', '", + "evalue": "unsupported selection item for basic indexing; expected integer or slice, got ", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 475\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 477\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 478\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 493\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 494\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, 
selection, out, fields)\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 268\u001b[0m raise IndexError('unsupported selection type; expected integer or slice, got {!r}'\n\u001b[0;32m--> 269\u001b[0;31m .format(type(dim_sel)))\n\u001b[0m\u001b[1;32m 270\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 271\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mIndexError\u001b[0m: unsupported selection type; expected integer or slice, got " + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 538\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 539\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 540\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 541\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mEllipsis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 661\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 663\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 664\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 703\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 704\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 705\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m raise IndexError('unsupported selection item for basic indexing; expected integer '\n\u001b[0;32m--> 277\u001b[0;31m 'or slice, got {!r}'.format(type(dim_sel)))\n\u001b[0m\u001b[1;32m 278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: unsupported selection item for basic indexing; expected integer or slice, got " ] } ], @@ -2333,13 +2333,6 @@ "profile('zc[::2]')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here there are various setup operations that need to be done on the integer array, can't see way to avoid ATM." 
- ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/zarr/core.py b/zarr/core.py index dbe2ad59cc..774487db28 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -148,7 +148,10 @@ def _load_metadata_nosync(self): self._shape = meta['shape'] self._chunks = meta['chunks'] self._dtype = meta['dtype'] - self._fill_value = meta['fill_value'] + fill_value = meta['fill_value'] + if fill_value is not None: + fill_value = np.array(fill_value, self._dtype)[()] + self._fill_value = fill_value self._order = meta['order'] # setup compressor @@ -358,7 +361,6 @@ def nchunks(self): @property def nchunks_initialized(self): """The number of chunks that have been initialized with some data.""" - # TODO fix bug here, need to only count chunks # key pattern for chunk keys prog = re.compile(r'\.'.join([r'\d+'] * min(1, self.ndim))) @@ -1058,7 +1060,6 @@ def __setitem__(self, selection, value): array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'zzz', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', ' 1 are supported, but slices with negative step are not. 
diff --git a/zarr/creation.py b/zarr/creation.py index 14db02f9fe..4dfc20c4c6 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -103,10 +103,6 @@ def create(shape, chunks=None, dtype=None, compressor='default', # API compatibility with h5py compressor, fill_value = _handle_kwargs(compressor, fill_value, kwargs) - # ensure fill_value of correct type - if fill_value is not None: - fill_value = np.array(fill_value, dtype=dtype)[()] - # initialize array metadata init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, diff --git a/zarr/meta.py b/zarr/meta.py index 59fe2d22d5..d35e0f87ed 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -124,7 +124,7 @@ def decode_fill_value(v, dtype): return np.NINF else: return np.array(v, dtype=dtype)[()] - elif dtype.kind == 'S': + elif dtype.kind in 'SV': try: return base64.standard_b64decode(v) except Exception: @@ -152,7 +152,7 @@ def encode_fill_value(v, dtype): return int(v) elif dtype.kind == 'b': return bool(v) - elif dtype.kind == 'S': + elif dtype.kind in 'SV': v = base64.standard_b64encode(v) if not PY2: v = str(v, 'ascii') diff --git a/zarr/storage.py b/zarr/storage.py index 939e4ef85a..494d536f0e 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -286,6 +286,18 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa chunks = normalize_chunks(chunks, shape, dtype.itemsize) order = normalize_order(order) + # ensure fill_value of correct type + if fill_value == 0 and dtype.kind == 'V': + # special case because 0 used as default, but cannot be used for structured arrays + fill_value = b'' + elif fill_value is not None: + try: + fill_value = np.array(fill_value, dtype=dtype)[()] + except Exception as e: + # re-raise with our own error message to be helpful + raise ValueError('fill_value {!r} is not valid for dtype {}; nested exception: {}' + .format(fill_value, dtype, e)) + # compressor prep if shape == (): # no point in 
compressing a 0-dimensional array, only a single value diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 6134ad91ad..b9fb86c10c 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -782,6 +782,30 @@ def test_nchunks_initialized(self): z[:] = 42 eq(10, z.nchunks_initialized) + def test_structured_array(self): + + # setup some data + a = np.array([(b'aaa', 1, 4.2), + (b'bbb', 2, 8.4), + (b'ccc', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + for fill_value in None, b'', (b'zzz', 0, 0.0): + if fill_value is not None: + fill_value = np.array(fill_value, dtype=a.dtype)[()] + z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) + eq(3, len(z)) + eq(fill_value, z.fill_value) + z[...] = a + eq(a[0], z[0]) + assert_array_equal(a, z[...]) + assert_array_equal(a['foo'], z['foo']) + assert_array_equal(a['bar'], z['bar']) + assert_array_equal(a['baz'], z['baz']) + + with assert_raises(ValueError): + # dodgy fill value + self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=42) + class TestArrayWithPath(TestArray): @@ -967,6 +991,10 @@ def test_astype(self): expected = data.astype(astype) assert_array_equal(expected, z2) + def test_structured_array(self): + # don't implement this one, cannot do delta on structured array + pass + # custom store, does not support getsize() class CustomMapping(object): diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 272d573724..4f40b81233 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -376,7 +376,7 @@ def test_create(): with assert_raises(ValueError): create(100, chunks=10, compressor='zlib') - # compatibility + # h5py compatibility z = create(100, compression='zlib', compression_opts=9) eq('zlib', z.compressor.codec_id) @@ -387,7 +387,11 @@ def test_create(): # errors with assert_raises(ValueError): + # bad compression argument create(100, compression=1) + with 
assert_raises(ValueError): + # bad fill value + create(100, dtype='i4', fill_value='foo') def test_compression_args(): diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index a034a289b0..c73ed4b530 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1098,7 +1098,7 @@ def test_get_selections_with_fields(): ('bbb', 2, 8.4), ('ccc', 3, 12.6)] a = np.array(a, dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype) + z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=None) z[:] = a fields_fixture = [ diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index d1f1814cf2..beb59bbbc5 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -74,7 +74,7 @@ def test_encode_decode_array_2(): chunks=(10, 10), dtype=np.dtype([('a', 'i4'), ('b', 'S10')]), compressor=compressor.get_config(), - fill_value=42, + fill_value=b'', order='F', filters=[df.get_config()] ) @@ -89,7 +89,7 @@ def test_encode_decode_array_2(): "blocksize": 0 }, "dtype": [["a", " Date: Sat, 11 Nov 2017 22:50:04 +0000 Subject: [PATCH 55/67] spike appveyor --- zarr/tests/test_creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 4f40b81233..e51fea30cb 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -136,7 +136,7 @@ def test_full(): # "NaN" z = full(100, chunks=10, fill_value='NaN', dtype='U3') - assert np.all(z[:] == 'NaN') + assert np.all(z[:] == 'NaN'), (z[0], type(z[0])) def test_open_array(): From c4597305b5dccfe1b63fbf4d5b49e63260ed1ebd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 22:55:07 +0000 Subject: [PATCH 56/67] spike appveyor 2 --- zarr/tests/test_creation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index e51fea30cb..9e730b33fd 100644 
--- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -136,7 +136,10 @@ def test_full(): # "NaN" z = full(100, chunks=10, fill_value='NaN', dtype='U3') - assert np.all(z[:] == 'NaN'), (z[0], type(z[0])) + eq(np.array('NaN', dtype='U3')[()], z[0]) + eq('NaN', z[0]) + t = z[:] == 'NaN' + assert np.all(t), (np.count_nonzero(t), t.size) def test_open_array(): From 7347c4dd31152a5df09985c95e0e21d2e2bf80db Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 23:00:53 +0000 Subject: [PATCH 57/67] spike appveyor 3 --- zarr/tests/test_creation.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 9e730b33fd..7dceee5be8 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -136,8 +136,14 @@ def test_full(): # "NaN" z = full(100, chunks=10, fill_value='NaN', dtype='U3') - eq(np.array('NaN', dtype='U3')[()], z[0]) + v = np.array('NaN', dtype='U3')[()] + eq(v, z[0]) eq('NaN', z[0]) + a = z[...] 
+ print(a.dtype, repr(a[0]), type(a[0]), a[0] == 'NaN', a[0:2] == 'NaN', a[0:2] == v) + eq('NaN', z[...][0]) + t = z[:] == u'NaN' + assert np.all(t), (np.count_nonzero(t), t.size) t = z[:] == 'NaN' assert np.all(t), (np.count_nonzero(t), t.size) From e776977b92a679346bd0f0ed78ea9cfa49bb4693 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 23:11:57 +0000 Subject: [PATCH 58/67] spike appveyor 4 --- zarr/tests/test_creation.py | 23 +++++++++++++++-------- zarr/tests/test_indexing.py | 4 ++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 7dceee5be8..6b55749e3e 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -134,17 +134,24 @@ def test_full(): z = full(100, chunks=10, fill_value=np.nan, dtype='f8') assert np.all(np.isnan(z[:])) - # "NaN" - z = full(100, chunks=10, fill_value='NaN', dtype='U3') - v = np.array('NaN', dtype='U3')[()] + # "NaN" byte string + v = b'NaN' + z = full(100, chunks=10, fill_value=v, dtype='S3') eq(v, z[0]) - eq('NaN', z[0]) a = z[...] - print(a.dtype, repr(a[0]), type(a[0]), a[0] == 'NaN', a[0:2] == 'NaN', a[0:2] == v) - eq('NaN', z[...][0]) - t = z[:] == u'NaN' + print(a.dtype, a[0], a[0:2], repr(a[0]), type(a[0]), a[0] == v, a[0:2] == v) + eq(v, a[0]) + t = z[...] == v assert np.all(t), (np.count_nonzero(t), t.size) - t = z[:] == 'NaN' + + # "NaN" unicode string + v = 'NaN' + z = full(100, chunks=10, fill_value=v, dtype='U3') + eq(v, z[0]) + a = z[...] + print(a.dtype, a[0], a[0:2], repr(a[0]), type(a[0]), a[0] == v, a[0:2] == v) + eq(v, a[0]) + t = z[...] 
== v assert np.all(t), (np.count_nonzero(t), t.size) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index c73ed4b530..6400d5d62b 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -690,10 +690,14 @@ def test_set_orthogonal_selection_3d(): ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * .5), replace=True) ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * .5), replace=True) _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted increasing ix0.sort() ix1.sort() ix2.sort() _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted decreasing ix0 = ix0[::-1] ix1 = ix1[::-1] ix2 = ix2[::-1] From d1d8bbe709b9b42d136f5a40dcab1c68a0f78f09 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 23:16:08 +0000 Subject: [PATCH 59/67] spike appveyor 5 --- zarr/tests/test_creation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 6b55749e3e..77dd3a9a64 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -137,6 +137,7 @@ def test_full(): # "NaN" byte string v = b'NaN' z = full(100, chunks=10, fill_value=v, dtype='S3') + print(z.store['.zarray']) eq(v, z[0]) a = z[...] print(a.dtype, a[0], a[0:2], repr(a[0]), type(a[0]), a[0] == v, a[0:2] == v) @@ -147,6 +148,7 @@ def test_full(): # "NaN" unicode string v = 'NaN' z = full(100, chunks=10, fill_value=v, dtype='U3') + print(z.store['.zarray']) eq(v, z[0]) a = z[...] 
print(a.dtype, a[0], a[0:2], repr(a[0]), type(a[0]), a[0] == v, a[0:2] == v) From 5283bd9397a5301e60c8fb52e53f9f69a2d66930 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 23:29:32 +0000 Subject: [PATCH 60/67] spike appveyor 6 --- zarr/core.py | 2 ++ zarr/meta.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/zarr/core.py b/zarr/core.py index 774487db28..26659b15a8 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -149,8 +149,10 @@ def _load_metadata_nosync(self): self._chunks = meta['chunks'] self._dtype = meta['dtype'] fill_value = meta['fill_value'] + print('Array._load_metadata_nosync, from meta', fill_value) if fill_value is not None: fill_value = np.array(fill_value, self._dtype)[()] + print('Array._load_metadata_nosync, after pass through array', fill_value) self._fill_value = fill_value self._order = meta['order'] diff --git a/zarr/meta.py b/zarr/meta.py index d35e0f87ed..50664f57fd 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -112,6 +112,7 @@ def encode_group_metadata(meta=None): def decode_fill_value(v, dtype): + print('decode_fill_value', v, dtype, dtype.kind) # early out if v is None: return v @@ -131,11 +132,15 @@ def decode_fill_value(v, dtype): # be lenient, allow for other values that may have been used before base64 encoding # and may work as fill values, e.g., the number 0 return v + elif dtype.kind == 'U': + print('decoding unicode fill value') + return v else: return v def encode_fill_value(v, dtype): + print('encode_fill_value', v, dtype, dtype.kind) # early out if v is None: return v @@ -157,5 +162,8 @@ def encode_fill_value(v, dtype): if not PY2: v = str(v, 'ascii') return v + elif dtype.kind == 'U': + print('encoding unicode fill value') + return v else: return v From c1d2c5f415596804eb2c02e9018c27618c2ba012 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 11 Nov 2017 23:39:09 +0000 Subject: [PATCH 61/67] spike appveyor 7 --- zarr/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/zarr/core.py b/zarr/core.py index 26659b15a8..f29ef45c81 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -149,10 +149,10 @@ def _load_metadata_nosync(self): self._chunks = meta['chunks'] self._dtype = meta['dtype'] fill_value = meta['fill_value'] - print('Array._load_metadata_nosync, from meta', fill_value) + print('Array._load_metadata_nosync, from meta', repr(fill_value)) if fill_value is not None: fill_value = np.array(fill_value, self._dtype)[()] - print('Array._load_metadata_nosync, after pass through array', fill_value) + print('Array._load_metadata_nosync, after pass through array', repr(fill_value)) self._fill_value = fill_value self._order = meta['order'] From fbb29a90d6479bc114108e6a5c5e13ad363f769f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 13 Nov 2017 00:26:53 +0000 Subject: [PATCH 62/67] fix for windows unicode issue --- windows_conda_dev.txt | 11 +++++++++++ zarr/core.py | 7 +------ zarr/meta.py | 11 +++++------ zarr/storage.py | 15 ++------------- zarr/tests/test_core.py | 9 +++++---- zarr/tests/test_creation.py | 7 ++----- zarr/tests/test_filters.py | 7 ------- zarr/tests/test_meta.py | 9 ++++++--- zarr/util.py | 31 ++++++++++++++++++++++++++++++- 9 files changed, 62 insertions(+), 45 deletions(-) create mode 100644 windows_conda_dev.txt diff --git a/windows_conda_dev.txt b/windows_conda_dev.txt new file mode 100644 index 0000000000..85b43d4255 --- /dev/null +++ b/windows_conda_dev.txt @@ -0,0 +1,11 @@ +coverage +coveralls +fasteners +flake8 +monotonic +msgpack-python +nose +numcodecs +numpy +setuptools_scm +twine diff --git a/zarr/core.py b/zarr/core.py index f29ef45c81..69872537ec 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -148,12 +148,7 @@ def _load_metadata_nosync(self): self._shape = meta['shape'] self._chunks = meta['chunks'] self._dtype = meta['dtype'] - fill_value = meta['fill_value'] - print('Array._load_metadata_nosync, from meta', repr(fill_value)) - if fill_value is not None: - fill_value = np.array(fill_value, 
self._dtype)[()] - print('Array._load_metadata_nosync, after pass through array', repr(fill_value)) - self._fill_value = fill_value + self._fill_value = meta['fill_value'] self._order = meta['order'] # setup compressor diff --git a/zarr/meta.py b/zarr/meta.py index 50664f57fd..62852247ec 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -112,7 +112,6 @@ def encode_group_metadata(meta=None): def decode_fill_value(v, dtype): - print('decode_fill_value', v, dtype, dtype.kind) # early out if v is None: return v @@ -127,20 +126,21 @@ def decode_fill_value(v, dtype): return np.array(v, dtype=dtype)[()] elif dtype.kind in 'SV': try: - return base64.standard_b64decode(v) + v = base64.standard_b64decode(v) + v = np.array(v, dtype=dtype)[()] + return v except Exception: # be lenient, allow for other values that may have been used before base64 encoding # and may work as fill values, e.g., the number 0 return v elif dtype.kind == 'U': - print('decoding unicode fill value') + # leave as-is return v else: - return v + return np.array(v, dtype=dtype)[()] def encode_fill_value(v, dtype): - print('encode_fill_value', v, dtype, dtype.kind) # early out if v is None: return v @@ -163,7 +163,6 @@ def encode_fill_value(v, dtype): v = str(v, 'ascii') return v elif dtype.kind == 'U': - print('encoding unicode fill value') return v else: return v diff --git a/zarr/storage.py b/zarr/storage.py index 494d536f0e..13cc62a95a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -19,7 +19,7 @@ from zarr.util import normalize_shape, normalize_chunks, normalize_order, \ - normalize_storage_path, buffer_size + normalize_storage_path, buffer_size, normalize_fill_value from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.compat import PY2, binary_type from numcodecs.registry import codec_registry @@ -285,18 +285,7 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa dtype = np.dtype(dtype) chunks = normalize_chunks(chunks, shape, dtype.itemsize) 
order = normalize_order(order) - - # ensure fill_value of correct type - if fill_value == 0 and dtype.kind == 'V': - # special case because 0 used as default, but cannot be used for structured arrays - fill_value = b'' - elif fill_value is not None: - try: - fill_value = np.array(fill_value, dtype=dtype)[()] - except Exception as e: - # re-raise with our own error message to be helpful - raise ValueError('fill_value {!r} is not valid for dtype {}; nested exception: {}' - .format(fill_value, dtype, e)) + fill_value = normalize_fill_value(fill_value) # compressor prep if shape == (): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index b9fb86c10c..03811e1ab7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -790,11 +790,13 @@ def test_structured_array(self): (b'ccc', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) for fill_value in None, b'', (b'zzz', 0, 0.0): - if fill_value is not None: - fill_value = np.array(fill_value, dtype=a.dtype)[()] z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) eq(3, len(z)) - eq(fill_value, z.fill_value) + if fill_value is not None: + np_fill_value = np.array(fill_value, dtype=a.dtype)[()] + eq(np_fill_value, z.fill_value) + eq(np_fill_value, z[0]) + eq(np_fill_value, z[-1]) z[...] = a eq(a[0], z[0]) assert_array_equal(a, z[...]) @@ -950,7 +952,6 @@ def create_array(read_only=False, **kwargs): Delta(dtype=dtype), FixedScaleOffset(dtype=dtype, scale=1, offset=0), ] - # print(dtype, filters[1].astype) kwargs.setdefault('filters', filters) compressor = Zlib(1) kwargs.setdefault('compressor', compressor) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 77dd3a9a64..5233044c97 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -137,10 +137,8 @@ def test_full(): # "NaN" byte string v = b'NaN' z = full(100, chunks=10, fill_value=v, dtype='S3') - print(z.store['.zarray']) eq(v, z[0]) a = z[...] 
- print(a.dtype, a[0], a[0:2], repr(a[0]), type(a[0]), a[0] == v, a[0:2] == v) eq(v, a[0]) t = z[...] == v assert np.all(t), (np.count_nonzero(t), t.size) @@ -148,13 +146,12 @@ def test_full(): # "NaN" unicode string v = 'NaN' z = full(100, chunks=10, fill_value=v, dtype='U3') - print(z.store['.zarray']) eq(v, z[0]) a = z[...] - print(a.dtype, a[0], a[0:2], repr(a[0]), type(a[0]), a[0] == v, a[0:2] == v) + eq(z.dtype, a.dtype) eq(v, a[0]) t = z[...] == v - assert np.all(t), (np.count_nonzero(t), t.size) + assert np.all(t) def test_open_array(): diff --git a/zarr/tests/test_filters.py b/zarr/tests/test_filters.py index f9c9d04434..101f90d1d3 100644 --- a/zarr/tests/test_filters.py +++ b/zarr/tests/test_filters.py @@ -35,7 +35,6 @@ def test_array_with_delta_filter(): data = np.arange(100, dtype=dtype) for compressor in compressors: - # print(repr(compressor)) a = array(data, chunks=10, compressor=compressor, filters=filters) @@ -66,7 +65,6 @@ def test_array_with_astype_filter(): data = np.arange(shape, dtype=decode_dtype) for compressor in compressors: - # print(repr(compressor)) a = array(data, chunks=chunks, compressor=compressor, filters=filters) @@ -96,7 +94,6 @@ def test_array_with_scaleoffset_filter(): data = np.linspace(1000, 1001, 34, dtype='f8') for compressor in compressors: - # print(repr(compressor)) a = array(data, chunks=5, compressor=compressor, filters=filters) @@ -125,7 +122,6 @@ def test_array_with_quantize_filter(): data = np.linspace(0, 1, 34, dtype=dtype) for compressor in compressors: - # print(repr(compressor)) a = array(data, chunks=5, compressor=compressor, filters=filters) @@ -152,7 +148,6 @@ def test_array_with_packbits_filter(): data = np.random.randint(0, 2, size=100, dtype=bool) for compressor in compressors: - # print(repr(compressor)) a = array(data, chunks=5, compressor=compressor, filters=filters) @@ -179,7 +174,6 @@ def test_array_with_categorize_filter(): filters = [flt] for compressor in compressors: - # print(repr(compressor)) a 
= array(data, chunks=5, compressor=compressor, filters=filters) @@ -203,7 +197,6 @@ def test_compressor_as_filter(): if compressor is None: # skip continue - # print(repr(compressor)) # setup filters dtype = 'i8' diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index beb59bbbc5..3269760b1d 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -110,7 +110,8 @@ def test_encode_decode_array_2(): eq(meta['dtype'], meta_dec['dtype']) eq(meta['compressor'], meta_dec['compressor']) eq(meta['order'], meta_dec['order']) - eq(meta['fill_value'], meta_dec['fill_value']) + np_fill_value = np.array(meta['fill_value'], dtype=meta['dtype'])[()] + eq(np_fill_value, meta_dec['fill_value']) eq([df.get_config()], meta_dec['filters']) @@ -157,6 +158,7 @@ def test_encode_decode_fill_values_nan(): def test_encode_decode_fill_values_bytes(): + dtype = np.dtype('S10') fills = b'foo', bytes(10) for v in fills: @@ -168,7 +170,7 @@ def test_encode_decode_fill_values_bytes(): meta = dict( shape=(100,), chunks=(10,), - dtype=np.dtype('S10'), + dtype=dtype, compressor=Zlib(1).get_config(), fill_value=v, filters=None, @@ -193,7 +195,8 @@ def test_encode_decode_fill_values_bytes(): # test decoding meta_dec = decode_array_metadata(meta_enc) actual = meta_dec['fill_value'] - eq(v, actual) + np_v = np.array(v, dtype=dtype)[()] + eq(np_v, actual) def test_decode_array_unsupported_format(): diff --git a/zarr/util.py b/zarr/util.py index 5a274f0467..e95ed7f7dc 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -8,7 +8,7 @@ import numpy as np -from zarr.compat import PY2, reduce +from zarr.compat import PY2, reduce, text_type def normalize_shape(shape): @@ -177,6 +177,35 @@ def normalize_order(order): return order +def normalize_fill_value(fill_value, dtype): + + if fill_value is None: + # no fill value + pass + + elif fill_value == 0 and dtype.kind == 'V': + # special case because 0 used as default, but cannot be used for structured arrays + fill_value = b'' + + elif 
dtype.kind == 'U': + # special case unicode because of encoding issues on Windows if pass through numpy... + # UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position ...: code point not + # in range(0x110000) + if not isinstance(fill_value, text_type): + raise ValueError('fill_value {!r} is not valid for dtype{}; must be a unicode string' + .format(fill_value, dtype)) + # otherwise leave as-is + + else: + try: + fill_value = np.array(fill_value, dtype=dtype)[()] + except Exception as e: + # re-raise with our own error message to be helpful + raise ValueError('fill_value {!r} is not valid for dtype {}; nested exception: {}' + .format(fill_value, dtype, e)) + return fill_value + + def normalize_storage_path(path): # handle bytes From 23f7e96ad5b94e623552e4601ba561c805e5addf Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 13 Nov 2017 00:33:37 +0000 Subject: [PATCH 63/67] doh --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 13cc62a95a..302dc44530 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -285,7 +285,7 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa dtype = np.dtype(dtype) chunks = normalize_chunks(chunks, shape, dtype.itemsize) order = normalize_order(order) - fill_value = normalize_fill_value(fill_value) + fill_value = normalize_fill_value(fill_value, dtype) # compressor prep if shape == (): From 8b414149aad34926171fdfd88b4d53dcef85c028 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 13 Nov 2017 00:36:29 +0000 Subject: [PATCH 64/67] comments --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index f29ef45c81..b17c3e45df 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -142,7 +142,7 @@ def _load_metadata_nosync(self): err_array_not_found(self._path) else: - # decode and store metadata + # decode and store metadata as instance members meta = 
decode_array_metadata(meta_bytes) self._meta = meta self._shape = meta['shape'] From 19de33360113104d876226fa2bfcbab4ba759286 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 13 Nov 2017 01:02:08 +0000 Subject: [PATCH 65/67] py2 compat --- zarr/tests/test_creation.py | 16 +++++++++++----- zarr/util.py | 17 ++++++++++------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 5233044c97..f870861164 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -19,6 +19,7 @@ from zarr.hierarchy import open_group from zarr.errors import PermissionError from zarr.codecs import Zlib +from zarr.compat import PY2 # something bcolz-like @@ -134,17 +135,17 @@ def test_full(): z = full(100, chunks=10, fill_value=np.nan, dtype='f8') assert np.all(np.isnan(z[:])) - # "NaN" byte string - v = b'NaN' + # byte string + v = b'xxx' z = full(100, chunks=10, fill_value=v, dtype='S3') eq(v, z[0]) a = z[...] eq(v, a[0]) t = z[...] == v - assert np.all(t), (np.count_nonzero(t), t.size) + assert np.all(t) - # "NaN" unicode string - v = 'NaN' + # unicode string + v = u'xxx' z = full(100, chunks=10, fill_value=v, dtype='U3') eq(v, z[0]) a = z[...] @@ -153,6 +154,11 @@ def test_full(): t = z[...] == v assert np.all(t) + # dodgy fill value + if not PY2: + with assert_raises(ValueError): + full(100, chunks=10, fill_value=b'NaN', dtype='U3') + def test_open_array(): diff --git a/zarr/util.py b/zarr/util.py index e95ed7f7dc..53c3479387 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -8,7 +8,7 @@ import numpy as np -from zarr.compat import PY2, reduce, text_type +from zarr.compat import PY2, reduce, text_type, binary_type def normalize_shape(shape): @@ -188,13 +188,16 @@ def normalize_fill_value(fill_value, dtype): fill_value = b'' elif dtype.kind == 'U': - # special case unicode because of encoding issues on Windows if pass through numpy... 
- # UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position ...: code point not - # in range(0x110000) - if not isinstance(fill_value, text_type): - raise ValueError('fill_value {!r} is not valid for dtype{}; must be a unicode string' + # special case unicode because of encoding issues on Windows if passed through numpy + # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 + + if PY2 and isinstance(fill_value, binary_type): # pragma: py3 no cover + # this is OK on PY2, can be written as JSON + pass + + elif not isinstance(fill_value, text_type): + raise ValueError('fill_value {!r} is not valid for dtype {}; must be a unicode string' .format(fill_value, dtype)) - # otherwise leave as-is else: try: From 64db65c1ad03f72764cc67542baddafdcee8e811 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 13 Nov 2017 01:22:27 +0000 Subject: [PATCH 66/67] cover py27 --- zarr/tests/test_creation.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index f870861164..63e3e4d9c6 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -135,29 +135,37 @@ def test_full(): z = full(100, chunks=10, fill_value=np.nan, dtype='f8') assert np.all(np.isnan(z[:])) - # byte string + # byte string dtype v = b'xxx' z = full(100, chunks=10, fill_value=v, dtype='S3') eq(v, z[0]) a = z[...] + eq(z.dtype, a.dtype) eq(v, a[0]) - t = z[...] == v - assert np.all(t) + assert np.all(a == v) - # unicode string + # unicode string dtype v = u'xxx' z = full(100, chunks=10, fill_value=v, dtype='U3') eq(v, z[0]) a = z[...] eq(z.dtype, a.dtype) eq(v, a[0]) - t = z[...] == v - assert np.all(t) + assert np.all(a == v) - # dodgy fill value - if not PY2: + # bytes fill value / unicode dtype + v = b'xxx' + if PY2: + # allow this on PY2 + z = full(100, chunks=10, fill_value=v, dtype='U3') + a = z[...] 
+ eq(z.dtype, a.dtype) + eq(v, a[0]) + assert np.all(a == v) + else: + # be strict on PY3 with assert_raises(ValueError): - full(100, chunks=10, fill_value=b'NaN', dtype='U3') + full(100, chunks=10, fill_value=v, dtype='U3') def test_open_array(): From d08189c351ff6a193277ba9e94fc355307aea7ee Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Mon, 13 Nov 2017 01:24:13 +0000 Subject: [PATCH 67/67] pragma --- zarr/tests/test_creation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 63e3e4d9c6..e159019921 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -155,14 +155,14 @@ def test_full(): # bytes fill value / unicode dtype v = b'xxx' - if PY2: + if PY2: # pragma: py3 no cover # allow this on PY2 z = full(100, chunks=10, fill_value=v, dtype='U3') a = z[...] eq(z.dtype, a.dtype) eq(v, a[0]) assert np.all(a == v) - else: + else: # pragma: py2 no cover # be strict on PY3 with assert_raises(ValueError): full(100, chunks=10, fill_value=v, dtype='U3')