From 331d2a8cb6ba89447afbadcea044b6444f3be4ed Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Tue, 11 Aug 2020 14:43:54 -0600 Subject: [PATCH 01/12] adds partial_decompress capabilites --- zarr/core.py | 17 +++++++++-- zarr/indexing.py | 59 +++++++++++++++++++++++++++++++++++++ zarr/tests/test_core.py | 2 ++ zarr/tests/test_indexing.py | 28 +++++++++++++++++- 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 6edcbb475f..06ed47ecdd 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1626,12 +1626,18 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, return # decode chunk + print(is_contiguous_selection(chunk_selection)) + print(self.chunks) + print(chunk_selection) + if self._compressor.codec_id == 'blosc': + pass chunk = self._decode_chunk(cdata) - + # select data from chunk if fields: chunk = chunk[fields] tmp = chunk[chunk_selection] + print(tmp) if drop_axes: tmp = np.squeeze(tmp, axis=drop_axes) @@ -1731,11 +1737,16 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non def _chunk_key(self, chunk_coords): return self._key_prefix + '.'.join(map(str, chunk_coords)) - def _decode_chunk(self, cdata): + def _decode_chunk(self, cdata, start=None, nitems=None): # decompress if self._compressor: - chunk = self._compressor.decode(cdata) + # only decode requested items + if (all([x is not None for x in [start, nitems]]) + and self._compressor.codec_id == 'blosc'): + chunk = self._compressor.decode_partial(cdata, start, nitems) + else: + chunk = self._compressor.decode(cdata) else: chunk = cdata diff --git a/zarr/indexing.py b/zarr/indexing.py index f1b1f89aa7..c1a07a7663 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -822,3 +822,62 @@ def pop_fields(selection): selection = tuple(s for s in selection if not isinstance(s, str)) selection = selection[0] if len(selection) == 1 else selection return fields, selection + + +def selection_size(selection, arr): + if len(selection) > len(arr.shape): + raise ValueError(f'dimensions in selection cant be greater than dimensions or array: {len(selection)} > {len(arr.shape)}') + selection_shape = [] + for i, size in arr.shape: + selection_slice = selection[i] if i < len(selection) else None + if selection_slice: + selection_slice_size = len(range(*selection_slice.indices(len(arr)))) + selection_shape.append(selection_slice_size) + else: + selection_shape.append(size) + return tuple(selection_shape) + + +class PartialChunkIterator(object): + + def __init__(self, selection, arr): + self.arr = arr + self.selection = list(selection) + + for i, dim_shape in enumerate(self.arr.shape[slice(None, None, -1)]): + index = len(self.arr.shape) - (i+1) + if index <= len(selection)-1: + slice_nitems = len(range(*selection[index].indices(len(self.arr)))) + if slice_nitems == dim_shape: + self.selection.pop() + else: + break + + out_slices = [] + chunk_loc_slices = [] + + last_dim_slice = None if self.selection[-1].step > 1 else self.selection.pop() + for sl in self.selection: + dim_out_slices = [] + dim_chunk_loc_slices = [] + for i, x in enumerate(range(*sl.indices(len(self.arr)))): + dim_out_slices.append(slice(i, i+1, 1)) + dim_chunk_loc_slices.append(slice(x, x+1, 1)) + out_slices.append(dim_out_slices) + chunk_loc_slices.append(dim_chunk_loc_slices) + if last_dim_slice: + out_slices.append( + [slice(0, last_dim_slice.stop - last_dim_slice.start, 1)]) + chunk_loc_slices.append([last_dim_slice]) + + self.out_slices = itertools.product(*out_slices) + 
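# Illustrative sketch (not part of the patch): the per-dimension lists built
# above hold one unit-length slice per selected index, plus a single slice for
# the trailing contiguous run, and itertools.product then enumerates every
# combination, i.e. one entry per contiguous run within the chunk. For example:
#
#   import itertools
#   rows = [slice(5, 6, 1), slice(6, 7, 1)]   # unit slices for dimension 0
#   cols = [slice(2, 4, 1)]                   # trailing contiguous run
#   list(itertools.product(rows, cols))
#   # -> [(slice(5, 6, 1), slice(2, 4, 1)), (slice(6, 7, 1), slice(2, 4, 1))]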
self.chunk_loc_slices = itertools.product(*chunk_loc_slices) + + def __iter__(self): + for out_selection, chunk_selection in zip(self.out_slices, self.chunk_loc_slices): + start = 0 + for i, sl in enumerate(chunk_selection): + start += sl.start * np.prod(self.arr.shape[i+1:]) + nitems = (chunk_selection[-1].stop - chunk_selection[-1].start) * np.prod(self.arr.shape[len(chunk_selection):]) + yield start, nitems, out_selection + diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index c05ff4a660..56d4741d08 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -184,6 +184,7 @@ def test_array_1d(self): # noinspection PyTypeChecker assert_array_equal(a, z[slice(None)]) assert_array_equal(a[:10], z[:10]) + assert False assert_array_equal(a[10:20], z[10:20]) assert_array_equal(a[-10:], z[-10:]) assert_array_equal(a[:10, ...], z[:10, ...]) @@ -374,6 +375,7 @@ def test_array_2d(self): # slicing across chunk boundaries assert_array_equal(a[:110], z[:110]) assert_array_equal(a[190:310], z[190:310]) + assert False assert_array_equal(a[-110:], z[-110:]) assert_array_equal(a[:110, :], z[:110, :]) assert_array_equal(a[190:310, :], z[190:310, :]) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index a68223a5f0..0e30c8dbb4 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -5,7 +5,7 @@ import zarr from zarr.indexing import (normalize_integer_selection, oindex, oindex_set, - replace_ellipsis) + replace_ellipsis, PartialChunkIterator) def test_normalize_integer_selection(): @@ -1289,3 +1289,29 @@ def test_set_selections_with_fields(): a[key][ix] = v[key][ix] z.set_mask_selection(ix, v[key][ix], fields=fields) assert_array_equal(a, z[:]) + + +@pytest.mark.parametrize('selection, expected', [ + ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)), + [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), + (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), + (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]), + ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + [(5200.0, 5.0, (slice(0, 1, 1), slice(0, 1, 1), slice(0, 5, 1))), + (5300.0, 5.0, (slice(0, 1, 1), slice(1, 2, 1), slice(0, 5, 1))), + (6200.0, 5.0, (slice(1, 2, 1), slice(0, 1, 1), slice(0, 5, 1))), + (6300.0, 5.0, (slice(1, 2, 1), slice(1, 2, 1), slice(0, 5, 1))), + (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))), + (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]), + ((slice(5, 8, 1), slice(2, 4, 1)), + [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), + (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), + (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]) +]) +def test_PartialChunkIterator(selection, expected): + arr = np.arange(2, 100002).reshape((100, 10, 100)) + print(selection) + PCI = PartialChunkIterator(selection, arr) + results = list(PCI) + assert(results == expected) + From 0a81a0325afbc5f4b3b39855d44821ab7661d31d Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Tue, 11 Aug 2020 15:48:01 -0600 Subject: [PATCH 02/12] Update zarr/indexing.py Co-authored-by: Matthias Bussonnier --- zarr/indexing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index c1a07a7663..056c74b3e8 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -844,7 +844,7 @@ def __init__(self, selection, arr): self.arr = arr self.selection = list(selection) - for i, dim_shape in enumerate(self.arr.shape[slice(None, None, -1)]): + for i, dim_shape in enumerate(self.arr.shape[::-1]): index = len(self.arr.shape) - (i+1) if index <= 
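# Worked example of the start/nitems arithmetic in __iter__ above, using the
# values from the test cases added in this patch: for an array of shape
# (100, 10, 100) and the selection (slice(5, 8, 1), slice(2, 4, 1)), each
# yielded run covers two full rows of the last axis, so
#
#   nitems = (4 - 2) * 100             # = 200 items per contiguous run
#   start  = 5 * (10 * 100) + 2 * 100  # = 5200 for the first run
#
# and the remaining runs start at items 6200 and 7200, matching the expected
# tuples in test_PartialChunkIterator.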
len(selection)-1: slice_nitems = len(range(*selection[index].indices(len(self.arr)))) @@ -880,4 +880,3 @@ def __iter__(self): start += sl.start * np.prod(self.arr.shape[i+1:]) nitems = (chunk_selection[-1].stop - chunk_selection[-1].start) * np.prod(self.arr.shape[len(chunk_selection):]) yield start, nitems, out_selection - From 50797dee5980ff1b716b5752a77f063dbe8ec64c Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Tue, 11 Aug 2020 15:48:09 -0600 Subject: [PATCH 03/12] Update zarr/indexing.py Co-authored-by: Matthias Bussonnier --- zarr/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 056c74b3e8..b355390482 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -838,7 +838,7 @@ def selection_size(selection, arr): return tuple(selection_shape) -class PartialChunkIterator(object): +class PartialChunkIterator: def __init__(self, selection, arr): self.arr = arr From a4e1cf686586713508e7f1ebdc46b9e5004c36df Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Wed, 12 Aug 2020 10:41:39 -0600 Subject: [PATCH 04/12] improve tests --- zarr/tests/test_indexing.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 0e30c8dbb4..f2463cd315 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1291,12 +1291,14 @@ def test_set_selections_with_fields(): assert_array_equal(a, z[:]) -@pytest.mark.parametrize('selection, expected', [ +@pytest.mark.parametrize('selection, arr, expected', [ ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)), + np.arange(2, 100_002).reshape((100, 10, 100)), [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]), ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.arange(2, 100_002).reshape((100, 10, 100)), [(5200.0, 5.0, (slice(0, 1, 1), slice(0, 1, 1), slice(0, 5, 1))), (5300.0, 5.0, (slice(0, 1, 1), slice(1, 2, 1), slice(0, 5, 1))), (6200.0, 5.0, (slice(1, 2, 1), slice(0, 1, 1), slice(0, 5, 1))), @@ -1304,14 +1306,24 @@ def test_set_selections_with_fields(): (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))), (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]), ((slice(5, 8, 1), slice(2, 4, 1)), + np.arange(2, 100_002).reshape((100, 10, 100)), [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), - (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]) + (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]), + pytest.param((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.arange(2, 100002).reshape((10, 1, 10000)), + None, + # marks=[pytest.mark.xfail(reason='slice 2 is out of range')] + ), + pytest.param((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.arange(2, 100_002).reshape((10, 10_000)), + None, + # marks=[pytest.mark.xfail(reason='slice 2 is out of range')] + ), ]) -def test_PartialChunkIterator(selection, expected): - arr = np.arange(2, 100002).reshape((100, 10, 100)) +def test_PartialChunkIterator(selection, arr, expected): print(selection) - PCI = PartialChunkIterator(selection, arr) + PCI = PartialChunkIterator(selection, arr.shape) results = list(PCI) assert(results == expected) From 2333fd0c7e01f8d52e703a87ef392b0c883f9996 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Wed, 19 Aug 2020 19:50:48 -0400 Subject: [PATCH 05/12] partial chunk read working --- zarr/core.py | 36 ++++++++++----- zarr/errors.py | 4 
++ zarr/indexing.py | 90 +++++++++++++++++++++++++------------ zarr/tests/test_core.py | 3 +- zarr/tests/test_indexing.py | 24 +++++++++- 5 files changed, 115 insertions(+), 42 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 06ed47ecdd..4ce01b7264 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -12,9 +12,10 @@ from zarr.attrs import Attributes from zarr.codecs import AsType, get_codec -from zarr.errors import err_array_not_found, err_read_only +from zarr.errors import ArrayIndexError, err_array_not_found, err_read_only from zarr.indexing import (BasicIndexer, CoordinateIndexer, MaskIndexer, - OIndex, OrthogonalIndexer, VIndex, check_fields, + OIndex, OrthogonalIndexer, VIndex, PartialChunkIterator, + check_fields, check_no_multi_fields, ensure_tuple, err_too_many_indices, is_contiguous_selection, is_scalar, pop_fields) @@ -1626,18 +1627,33 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, return # decode chunk - print(is_contiguous_selection(chunk_selection)) - print(self.chunks) - print(chunk_selection) - if self._compressor.codec_id == 'blosc': + try: + if self._compressor and self._compressor.codec_id == 'blosc' and not fields and self.dtype != object: + index_selection = PartialChunkIterator(chunk_selection, self.chunks) + if len(index_selection) < 10: + for start, nitems, partial_out_selection in index_selection: + expected_shape = [ + len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) + if i < len(partial_out_selection) else dim + for i, dim in enumerate(self.chunks)] + chunk_partial = self._decode_chunk( + cdata, start=start, nitems=nitems, + expected_shape=expected_shape) + # if isinstance(out_selection, slice) or len(out_selection) < len(partial_out_selection): + # out[out_selection] = chunk_partial + if out[out_selection].size == chunk_partial.size: + out[out_selection] = chunk_partial + else: + out[out_selection][partial_out_selection] = chunk_partial + return + except ArrayIndexError: pass chunk = self._decode_chunk(cdata) - + # select data from chunk if fields: chunk = chunk[fields] tmp = chunk[chunk_selection] - print(tmp) if drop_axes: tmp = np.squeeze(tmp, axis=drop_axes) @@ -1737,7 +1753,7 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non def _chunk_key(self, chunk_coords): return self._key_prefix + '.'.join(map(str, chunk_coords)) - def _decode_chunk(self, cdata, start=None, nitems=None): + def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): # decompress if self._compressor: @@ -1771,7 +1787,7 @@ def _decode_chunk(self, cdata, start=None, nitems=None): # ensure correct chunk shape chunk = chunk.reshape(-1, order='A') - chunk = chunk.reshape(self._chunks, order=self._order) + chunk = chunk.reshape(expected_shape or self._chunks, order=self._order) return chunk diff --git a/zarr/errors.py b/zarr/errors.py index fa3cff04d5..acd588faa1 100644 --- a/zarr/errors.py +++ b/zarr/errors.py @@ -9,6 +9,10 @@ class CopyError(RuntimeError): pass +class ArrayIndexError(IndexError): + pass + + def err_contains_group(path): raise ValueError('path %r contains a group' % path) diff --git a/zarr/indexing.py b/zarr/indexing.py index c1a07a7663..38af666354 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -6,7 +6,7 @@ import numpy as np -from zarr.errors import (err_boundscheck, err_negative_step, +from zarr.errors import (ArrayIndexError, err_boundscheck, err_negative_step, err_too_many_indices, err_vindex_invalid_selection) @@ -824,43 +824,73 @@ def pop_fields(selection): 
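# Sketch of the shapes involved in the partial-read branch above (illustrative
# only): for a chunk of shape (100, 10, 100) and a partial_out_selection of
# (slice(5, 6, 1), slice(2, 4, 1)), the list comprehension yields
# expected_shape == [1, 2, 100], i.e. 200 items, which matches the nitems
# requested from _decode_chunk, so the partially decompressed buffer can be
# reshaped to that shape and assigned in a single step.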
return fields, selection -def selection_size(selection, arr): - if len(selection) > len(arr.shape): - raise ValueError(f'dimensions in selection cant be greater than dimensions or array: {len(selection)} > {len(arr.shape)}') - selection_shape = [] - for i, size in arr.shape: - selection_slice = selection[i] if i < len(selection) else None - if selection_slice: - selection_slice_size = len(range(*selection_slice.indices(len(arr)))) - selection_shape.append(selection_slice_size) +def int_to_slice(dim_selection): + return slice(dim_selection, dim_selection+1, 1) + +def make_slice_selection(selection): + ls = [] + for dim_selection in selection: + if is_integer(dim_selection): + ls.append(int_to_slice(dim_selection)) + elif isinstance(dim_selection, np.ndarray): + if len(dim_selection) == 1: + ls.append(int_to_slice(dim_selection[0])) + else: + raise ArrayIndexError() else: - selection_shape.append(size) - return tuple(selection_shape) + ls.append(dim_selection) + return ls class PartialChunkIterator(object): - - def __init__(self, selection, arr): - self.arr = arr - self.selection = list(selection) - - for i, dim_shape in enumerate(self.arr.shape[slice(None, None, -1)]): - index = len(self.arr.shape) - (i+1) - if index <= len(selection)-1: - slice_nitems = len(range(*selection[index].indices(len(self.arr)))) - if slice_nitems == dim_shape: + """Iterator tp retrieve the specific coordinates of requested data + from within a compressed chunk. + + Parameters + ----------- + selection : tuple + tuple of slice objects to take from the chunk + arr_shape : shape of chunk to select data from + + Attributes + ----------- + arr_shape + selection + """ + + def __init__(self, selection, arr_shape): + self.selection = make_slice_selection(selection) + self.arr_shape = arr_shape + + # number of selection dimensions can't be greater than the number of chunk dimensions + if len(self.selection) > len(self.arr_shape): + raise ValueError('Selection has more dimensions then the array:\n' + 'selection dimensions = {len(self.selection)\n' + 'array dimensions = {len(self.arr_shape)}') + + # any selection can not be out of the range of the chunk and + self.selection_shape = np.empty(self.arr_shape)[self.selection].shape + if any( + [selection_dim < 0 or selection_dim > arr_dim for selection_dim, arr_dim + in zip(self.selection_shape, self.arr_shape)]): + raise IndexError('a selection index is out of range for the dimension') + + for i, dim_size in enumerate(self.arr_shape[::-1]): + index = len(self.arr_shape) - (i+1) + if index <= len(self.selection)-1: + slice_size = self.selection_shape[index] + if slice_size == dim_size and index > 0: self.selection.pop() else: break out_slices = [] chunk_loc_slices = [] - last_dim_slice = None if self.selection[-1].step > 1 else self.selection.pop() - for sl in self.selection: + for i, sl in enumerate(self.selection): dim_out_slices = [] dim_chunk_loc_slices = [] - for i, x in enumerate(range(*sl.indices(len(self.arr)))): + for i, x in enumerate(slice_to_range(sl, arr_shape[i])): dim_out_slices.append(slice(i, i+1, 1)) dim_chunk_loc_slices.append(slice(x, x+1, 1)) out_slices.append(dim_out_slices) @@ -871,13 +901,17 @@ def __init__(self, selection, arr): chunk_loc_slices.append([last_dim_slice]) self.out_slices = itertools.product(*out_slices) - self.chunk_loc_slices = itertools.product(*chunk_loc_slices) + self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) + + def __len__(self): + return len(self.chunk_loc_slices) def __iter__(self): for out_selection, 
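# Illustrative behaviour of the normalisation helpers added above: integer and
# single-element array indices are promoted to unit slices so the iterator only
# ever deals with slices, e.g.
#
#   make_slice_selection((0, 3, slice(0, 5, 1)))
#   # -> [slice(0, 1, 1), slice(3, 4, 1), slice(0, 5, 1)]
#
# while a fancy index selecting more than one element raises ArrayIndexError,
# which core.py catches in order to fall back to decoding the whole chunk.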
chunk_selection in zip(self.out_slices, self.chunk_loc_slices): start = 0 for i, sl in enumerate(chunk_selection): - start += sl.start * np.prod(self.arr.shape[i+1:]) - nitems = (chunk_selection[-1].stop - chunk_selection[-1].start) * np.prod(self.arr.shape[len(chunk_selection):]) + start += sl.start * np.prod(self.arr_shape[i+1:]) + nitems = (chunk_selection[-1].stop - chunk_selection[-1].start) * np.prod(self.arr_shape[len(chunk_selection):]) yield start, nitems, out_selection + diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 56d4741d08..83da44caf9 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -184,7 +184,6 @@ def test_array_1d(self): # noinspection PyTypeChecker assert_array_equal(a, z[slice(None)]) assert_array_equal(a[:10], z[:10]) - assert False assert_array_equal(a[10:20], z[10:20]) assert_array_equal(a[-10:], z[-10:]) assert_array_equal(a[:10, ...], z[:10, ...]) @@ -279,6 +278,7 @@ def test_array_1d_selections(self): ix = [99, 100, 101] bix = np.zeros_like(a, dtype=bool) bix[ix] = True + print(a[ix]) assert_array_equal(a[ix], z.get_orthogonal_selection(ix)) assert_array_equal(a[ix], z.oindex[ix]) assert_array_equal(a[ix], z.get_coordinate_selection(ix)) @@ -375,7 +375,6 @@ def test_array_2d(self): # slicing across chunk boundaries assert_array_equal(a[:110], z[:110]) assert_array_equal(a[190:310], z[190:310]) - assert False assert_array_equal(a[-110:], z[-110:]) assert_array_equal(a[:110, :], z[:110, :]) assert_array_equal(a[190:310, :], z[190:310, :]) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index f2463cd315..08399603b1 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1305,20 +1305,40 @@ def test_set_selections_with_fields(): (6300.0, 5.0, (slice(1, 2, 1), slice(1, 2, 1), slice(0, 5, 1))), (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))), (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]), + ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.asfortranarray(np.arange(2, 100_002).reshape((100, 10, 100))), + [(5200.0, 5.0, (slice(0, 1, 1), slice(0, 1, 1), slice(0, 5, 1))), + (5300.0, 5.0, (slice(0, 1, 1), slice(1, 2, 1), slice(0, 5, 1))), + (6200.0, 5.0, (slice(1, 2, 1), slice(0, 1, 1), slice(0, 5, 1))), + (6300.0, 5.0, (slice(1, 2, 1), slice(1, 2, 1), slice(0, 5, 1))), + (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))), + (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]), ((slice(5, 8, 1), slice(2, 4, 1)), np.arange(2, 100_002).reshape((100, 10, 100)), [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]), + ((slice(0, 10, 1),), + np.arange(0, 10).reshape((10)), + [(0, 10, (slice(0, 10, 1),))]), + ((0,), + np.arange(0, 100).reshape((10, 10)), + [(0, 10, (slice(0, 1, 1),))]), + ((0,0,), + np.arange(0, 100).reshape((10, 10)), + [(0, 1, (slice(0, 1, 1), slice(0, 1, 1)))]), + ((0,), + np.arange(0, 10).reshape((10)), + [(0, 1, (slice(0, 1, 1),))]), pytest.param((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), np.arange(2, 100002).reshape((10, 1, 10000)), None, - # marks=[pytest.mark.xfail(reason='slice 2 is out of range')] + marks=[pytest.mark.xfail(reason='slice 2 is out of range')] ), pytest.param((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), np.arange(2, 100_002).reshape((10, 10_000)), None, - # marks=[pytest.mark.xfail(reason='slice 2 is out of range')] + marks=[pytest.mark.xfail(reason='slice 2 is out of range')] ), ]) def 
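# Note on the parametrised cases above (illustrative): the iterator is built
# from arr.shape alone, so the C-ordered and Fortran-ordered arrays are
# expected to produce identical tuples (both start at item 5200.0 with 5.0
# items per run for the (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1))
# selection), while the xfail cases cover selections that do not fit the
# supplied shape (an out-of-range slice, and more selection dimensions than
# the array has) and should raise instead of yielding offsets.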
test_PartialChunkIterator(selection, arr, expected): From 4f00a5f3faf63131d9012de69567357045aa5b41 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Wed, 19 Aug 2020 21:43:34 -0400 Subject: [PATCH 06/12] improves indexer and partial chunk --- zarr/core.py | 31 ++++++++++++++----------------- zarr/indexing.py | 20 +++++--------------- zarr/tests/test_indexing.py | 36 ++++++++++++++++++------------------ 3 files changed, 37 insertions(+), 50 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 4ce01b7264..2fbfada6a1 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1628,24 +1628,21 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, # decode chunk try: - if self._compressor and self._compressor.codec_id == 'blosc' and not fields and self.dtype != object: + if self._compressor and self._compressor.codec_id == 'blosc' \ + and not fields and self.dtype != object: + tmp = np.empty(self._chunks, dtype=self.dtype) index_selection = PartialChunkIterator(chunk_selection, self.chunks) - if len(index_selection) < 10: - for start, nitems, partial_out_selection in index_selection: - expected_shape = [ - len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) - if i < len(partial_out_selection) else dim - for i, dim in enumerate(self.chunks)] - chunk_partial = self._decode_chunk( - cdata, start=start, nitems=nitems, - expected_shape=expected_shape) - # if isinstance(out_selection, slice) or len(out_selection) < len(partial_out_selection): - # out[out_selection] = chunk_partial - if out[out_selection].size == chunk_partial.size: - out[out_selection] = chunk_partial - else: - out[out_selection][partial_out_selection] = chunk_partial - return + for start, nitems, partial_out_selection in index_selection: + expected_shape = [ + len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) + if i < len(partial_out_selection) else dim + for i, dim in enumerate(self.chunks)] + chunk_partial = self._decode_chunk( + cdata, start=start, nitems=nitems, + expected_shape=expected_shape) + tmp[partial_out_selection] = chunk_partial + out[out_selection] = tmp[chunk_selection] + return except ArrayIndexError: pass chunk = self._decode_chunk(cdata) diff --git a/zarr/indexing.py b/zarr/indexing.py index 588d8bf835..9c1fecc3f1 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -868,11 +868,10 @@ def __init__(self, selection, arr_shape): 'selection dimensions = {len(self.selection)\n' 'array dimensions = {len(self.arr_shape)}') - # any selection can not be out of the range of the chunk and + # any selection can not be out of the range of the chunk self.selection_shape = np.empty(self.arr_shape)[self.selection].shape - if any( - [selection_dim < 0 or selection_dim > arr_dim for selection_dim, arr_dim - in zip(self.selection_shape, self.arr_shape)]): + if any([selection_dim < 0 or selection_dim > arr_dim for selection_dim, arr_dim + in zip(self.selection_shape, self.arr_shape)]): raise IndexError('a selection index is out of range for the dimension') for i, dim_size in enumerate(self.arr_shape[::-1]): @@ -884,32 +883,23 @@ def __init__(self, selection, arr_shape): else: break - out_slices = [] chunk_loc_slices = [] last_dim_slice = None if self.selection[-1].step > 1 else self.selection.pop() for i, sl in enumerate(self.selection): - dim_out_slices = [] dim_chunk_loc_slices = [] for i, x in enumerate(slice_to_range(sl, arr_shape[i])): - dim_out_slices.append(slice(i, i+1, 1)) dim_chunk_loc_slices.append(slice(x, x+1, 1)) - out_slices.append(dim_out_slices) 
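# Descriptive sketch of the simplification in this commit (not part of the
# diff): with out_slices removed, the yielded slices carry absolute
# chunk-local coordinates, so the caller in core.py can decompress each run
# into a scratch chunk and apply the original selection once, roughly:
#
#   tmp = np.empty(chunk_shape, dtype=dtype)
#   for start, nitems, sel in PartialChunkIterator(chunk_selection, chunk_shape):
#       tmp[sel] = decode_partial_and_reshape(cdata, start, nitems)
#   out[out_selection] = tmp[chunk_selection]
#
# decode_partial_and_reshape here is shorthand for the _decode_chunk call with
# start, nitems and expected_shape shown in the core.py hunk above.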
chunk_loc_slices.append(dim_chunk_loc_slices) if last_dim_slice: - out_slices.append( - [slice(0, last_dim_slice.stop - last_dim_slice.start, 1)]) chunk_loc_slices.append([last_dim_slice]) - self.out_slices = itertools.product(*out_slices) self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) - def __len__(self): - return len(self.chunk_loc_slices) def __iter__(self): - for out_selection, chunk_selection in zip(self.out_slices, self.chunk_loc_slices): + for chunk_selection in self.chunk_loc_slices: start = 0 for i, sl in enumerate(chunk_selection): start += sl.start * np.prod(self.arr_shape[i+1:]) nitems = (chunk_selection[-1].stop - chunk_selection[-1].start) * np.prod(self.arr_shape[len(chunk_selection):]) - yield start, nitems, out_selection + yield start, nitems, chunk_selection diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 08399603b1..648210164b 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1294,30 +1294,30 @@ def test_set_selections_with_fields(): @pytest.mark.parametrize('selection, arr, expected', [ ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)), np.arange(2, 100_002).reshape((100, 10, 100)), - [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), - (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), - (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]), + [(5200, 200, (slice(5, 6, 1), slice(2, 4, 1))), + (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))), + (7200, 200, (slice(7, 8, 1), slice(2, 4, 1)))]), ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), np.arange(2, 100_002).reshape((100, 10, 100)), - [(5200.0, 5.0, (slice(0, 1, 1), slice(0, 1, 1), slice(0, 5, 1))), - (5300.0, 5.0, (slice(0, 1, 1), slice(1, 2, 1), slice(0, 5, 1))), - (6200.0, 5.0, (slice(1, 2, 1), slice(0, 1, 1), slice(0, 5, 1))), - (6300.0, 5.0, (slice(1, 2, 1), slice(1, 2, 1), slice(0, 5, 1))), - (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))), - (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]), + [(5200.0, 5.0, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))), + (5300.0, 5.0, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))), + (6200.0, 5.0, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))), + (6300.0, 5.0, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))), + (7200.0, 5.0, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))), + (7300.0, 5.0, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1)))]), ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), np.asfortranarray(np.arange(2, 100_002).reshape((100, 10, 100))), - [(5200.0, 5.0, (slice(0, 1, 1), slice(0, 1, 1), slice(0, 5, 1))), - (5300.0, 5.0, (slice(0, 1, 1), slice(1, 2, 1), slice(0, 5, 1))), - (6200.0, 5.0, (slice(1, 2, 1), slice(0, 1, 1), slice(0, 5, 1))), - (6300.0, 5.0, (slice(1, 2, 1), slice(1, 2, 1), slice(0, 5, 1))), - (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))), - (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]), + [(5200.0, 5.0, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))), + (5300.0, 5.0, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))), + (6200.0, 5.0, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))), + (6300.0, 5.0, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))), + (7200.0, 5.0, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))), + (7300.0, 5.0, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1)))]), ((slice(5, 8, 1), slice(2, 4, 1)), np.arange(2, 100_002).reshape((100, 10, 100)), - [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))), - (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))), - (7200, 200, (slice(2, 3, 1), 
slice(0, 2, 1)))]), + [(5200, 200, (slice(5, 6, 1), slice(2, 4, 1))), + (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))), + (7200, 200, (slice(7, 8, 1), slice(2, 4, 1)))]), ((slice(0, 10, 1),), np.arange(0, 10).reshape((10)), [(0, 10, (slice(0, 10, 1),))]), From e5f7e58598ed3cf1135215c9d5bbc03de9bec869 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Thu, 20 Aug 2020 11:14:40 -0400 Subject: [PATCH 07/12] makes nitems a constant --- zarr/indexing.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 9c1fecc3f1..34517f5a02 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -892,14 +892,13 @@ def __init__(self, selection, arr_shape): chunk_loc_slices.append(dim_chunk_loc_slices) if last_dim_slice: chunk_loc_slices.append([last_dim_slice]) - self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) - def __iter__(self): + chunk1 = self.chunk_loc_slices[0] + nitems = (chunk1[-1].stop - chunk1[-1].start) * np.prod(self.arr_shape[len(chunk1):]) for chunk_selection in self.chunk_loc_slices: start = 0 for i, sl in enumerate(chunk_selection): start += sl.start * np.prod(self.arr_shape[i+1:]) - nitems = (chunk_selection[-1].stop - chunk_selection[-1].start) * np.prod(self.arr_shape[len(chunk_selection):]) yield start, nitems, chunk_selection From 559b04128f9a63f8ff9e857a714cd9bddc8b6d67 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Thu, 20 Aug 2020 11:17:59 -0400 Subject: [PATCH 08/12] makes start and nitems ints --- zarr/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index 34517f5a02..a0f4edfffc 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -901,4 +901,4 @@ def __iter__(self): start = 0 for i, sl in enumerate(chunk_selection): start += sl.start * np.prod(self.arr_shape[i+1:]) - yield start, nitems, chunk_selection + yield int(start), int(nitems), chunk_selection From 81ec2f0df881bb6b25dd53e328ef64c4b29b87f5 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Fri, 21 Aug 2020 12:05:50 -0400 Subject: [PATCH 09/12] Update zarr/core.py Co-authored-by: Matthias Bussonnier --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index 2fbfada6a1..9e674eab0a 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1756,7 +1756,7 @@ def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): if self._compressor: # only decode requested items if (all([x is not None for x in [start, nitems]]) - and self._compressor.codec_id == 'blosc'): + and self._compressor.codec_id == 'blosc') and hasattr(self._compressor, 'decode_partial'): chunk = self._compressor.decode_partial(cdata, start, nitems) else: chunk = self._compressor.decode(cdata) From 5c508d7cb9d98ae2af6b865bddcb01e672469ee1 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Fri, 23 Oct 2020 10:39:19 -0600 Subject: [PATCH 10/12] figuring out tests --- requirements_dev_minimal.txt | 2 +- tox.ini | 1 + zarr/core.py | 38 ++++++++++++++++++------------------ 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/requirements_dev_minimal.txt b/requirements_dev_minimal.txt index 61c3fe691f..e372137811 100644 --- a/requirements_dev_minimal.txt +++ b/requirements_dev_minimal.txt @@ -1,7 +1,7 @@ # library requirements asciitree==0.3.3 fasteners==0.15 -numcodecs==0.6.4 +numcodecs==0.7.2 msgpack-python==0.5.6 setuptools-scm==3.3.3 # test requirements diff --git a/tox.ini b/tox.ini index ef66d0250a..ac23f01f8d 100644 --- a/tox.ini +++ 
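# Context for the dependency change above: Blosc.decode_partial is not
# available in the previously pinned numcodecs 0.6.4, which appears to be why
# the minimal requirement moves to 0.7.2 here and why the preceding commit
# added a hasattr(self._compressor, 'decode_partial') check before calling it
# in _decode_chunk.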
b/tox.ini @@ -27,6 +27,7 @@ commands = py35,py36,py37-npylatest,py38: coverage report -m # run doctests in the tutorial and spec py38: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst + py38_test: pytest {posargs} # pep8 checks py38: flake8 zarr # print environment for debugging diff --git a/zarr/core.py b/zarr/core.py index 59dc7f88ec..53aab8768a 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1575,12 +1575,12 @@ def _process_chunk(self, out, cdata, chunk_selection, drop_axes, ) ) + assert write_direct if write_direct: # optimization: we want the whole chunk, and the destination is # contiguous, so we can decompress directly from the chunk # into the destination array - if self._compressor: self._compressor.decode(cdata, dest) else: @@ -1590,24 +1590,24 @@ def _process_chunk(self, out, cdata, chunk_selection, drop_axes, return # decode chunk - try: - if self._compressor and self._compressor.codec_id == 'blosc' \ - and not fields and self.dtype != object: - tmp = np.empty(self._chunks, dtype=self.dtype) - index_selection = PartialChunkIterator(chunk_selection, self.chunks) - for start, nitems, partial_out_selection in index_selection: - expected_shape = [ - len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) - if i < len(partial_out_selection) else dim - for i, dim in enumerate(self.chunks)] - chunk_partial = self._decode_chunk( - cdata, start=start, nitems=nitems, - expected_shape=expected_shape) - tmp[partial_out_selection] = chunk_partial - out[out_selection] = tmp[chunk_selection] - return - except ArrayIndexError: - pass + # try: + # if self._compressor and self._compressor.codec_id == 'blosc' \ + # and not fields and self.dtype != object: + # tmp = np.empty(self._chunks, dtype=self.dtype) + # index_selection = PartialChunkIterator(chunk_selection, self.chunks) + # for start, nitems, partial_out_selection in index_selection: + # expected_shape = [ + # len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) + # if i < len(partial_out_selection) else dim + # for i, dim in enumerate(self.chunks)] + # chunk_partial = self._decode_chunk( + # cdata, start=start, nitems=nitems, + # expected_shape=expected_shape) + # tmp[partial_out_selection] = chunk_partial + # out[out_selection] = tmp[chunk_selection] + # return + # except ArrayIndexError: + # pass chunk = self._decode_chunk(cdata) # select data from chunk From 07344e73b49a7c06eac734f465d7cef9dff3ed85 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Fri, 23 Oct 2020 11:27:14 -0600 Subject: [PATCH 11/12] fixed indendation causing failing tests --- zarr/core.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 53aab8768a..8fcdf8a0e0 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1575,7 +1575,6 @@ def _process_chunk(self, out, cdata, chunk_selection, drop_axes, ) ) - assert write_direct if write_direct: # optimization: we want the whole chunk, and the destination is @@ -1608,17 +1607,17 @@ def _process_chunk(self, out, cdata, chunk_selection, drop_axes, # return # except ArrayIndexError: # pass - chunk = self._decode_chunk(cdata) + chunk = self._decode_chunk(cdata) # select data from chunk - if fields: - chunk = chunk[fields] - tmp = chunk[chunk_selection] - if drop_axes: - tmp = np.squeeze(tmp, axis=drop_axes) + if fields: + chunk = chunk[fields] + tmp = chunk[chunk_selection] + if drop_axes: + tmp = np.squeeze(tmp, axis=drop_axes) # store selected data in output - out[out_selection] = tmp + out[out_selection] = 
tmp def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes=None, fields=None): From 6e11703fd0a57f0f5e366539f12bca59a7e0fd04 Mon Sep 17 00:00:00 2001 From: Andrew Fulton Date: Fri, 23 Oct 2020 11:29:45 -0600 Subject: [PATCH 12/12] add partial_decompress back in --- zarr/core.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 8fcdf8a0e0..89616ab851 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1588,25 +1588,25 @@ def _process_chunk(self, out, cdata, chunk_selection, drop_axes, np.copyto(dest, chunk) return - # decode chunk - # try: - # if self._compressor and self._compressor.codec_id == 'blosc' \ - # and not fields and self.dtype != object: - # tmp = np.empty(self._chunks, dtype=self.dtype) - # index_selection = PartialChunkIterator(chunk_selection, self.chunks) - # for start, nitems, partial_out_selection in index_selection: - # expected_shape = [ - # len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) - # if i < len(partial_out_selection) else dim - # for i, dim in enumerate(self.chunks)] - # chunk_partial = self._decode_chunk( - # cdata, start=start, nitems=nitems, - # expected_shape=expected_shape) - # tmp[partial_out_selection] = chunk_partial - # out[out_selection] = tmp[chunk_selection] - # return - # except ArrayIndexError: - # pass + # decode chunk + try: + if self._compressor and self._compressor.codec_id == 'blosc' \ + and not fields and self.dtype != object: + tmp = np.empty(self._chunks, dtype=self.dtype) + index_selection = PartialChunkIterator(chunk_selection, self.chunks) + for start, nitems, partial_out_selection in index_selection: + expected_shape = [ + len(range(*partial_out_selection[i].indices(self.chunks[0]+1))) + if i < len(partial_out_selection) else dim + for i, dim in enumerate(self.chunks)] + chunk_partial = self._decode_chunk( + cdata, start=start, nitems=nitems, + expected_shape=expected_shape) + tmp[partial_out_selection] = chunk_partial + out[out_selection] = tmp[chunk_selection] + return + except ArrayIndexError: + pass chunk = self._decode_chunk(cdata) # select data from chunk
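# End-to-end sketch of the behaviour this series enables (illustrative only;
# the array contents, chunk shape and slice below are arbitrary): for a
# Blosc-compressed array, reading a small slice decompresses only the needed
# runs of the chunk via decode_partial where numcodecs supports it, and falls
# back to a full chunk decode for object dtypes, structured fields, or
# selections PartialChunkIterator cannot normalise.

import numpy as np
import zarr
from numcodecs import Blosc

z = zarr.array(np.arange(100_000).reshape(100, 10, 100),
               chunks=(100, 10, 100), compressor=Blosc())
part = z[5:8, 2:4, :]          # may be served by partial decompression
assert part.shape == (3, 2, 100)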