-
-
Notifications
You must be signed in to change notification settings - Fork 330
adds partial_decompress capabilities #584
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
331d2a8
0a81a03
50797de
a4e1cf6
2333fd0
492a2ce
4f00a5f
e5f7e58
559b041
81ec2f0
67f7380
5c508d7
07344e7
6e11703
3b0e651
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
|
||
import numpy as np | ||
|
||
from zarr.errors import (err_boundscheck, err_negative_step, | ||
from zarr.errors import (ArrayIndexError, err_boundscheck, err_negative_step, | ||
err_too_many_indices, err_vindex_invalid_selection) | ||
|
||
|
||
|
@@ -822,3 +822,83 @@ def pop_fields(selection): | |
selection = tuple(s for s in selection if not isinstance(s, str)) | ||
selection = selection[0] if len(selection) == 1 else selection | ||
return fields, selection | ||
|
||
|
||
def int_to_slice(dim_selection):
    """Return the unit-step slice that selects only index ``dim_selection``.

    The result is ``slice(dim_selection, dim_selection + 1, 1)``.

    Note that no wrap-around is applied to negative indices: ``-1`` becomes
    ``slice(-1, 0, 1)``, which is an empty selection under normal Python
    slicing rules.
    """
    start = dim_selection
    stop = start + 1
    return slice(start, stop, 1)
|
||
def make_slice_selection(selection):
    """Return ``selection`` as a list in which single indices are slices.

    Each entry of ``selection`` is handled as follows:

    * an entry accepted by ``is_integer`` is replaced by the equivalent
      length-one slice (see ``int_to_slice``);
    * a ``numpy.ndarray`` holding exactly one element is replaced by the
      length-one slice for that element;
    * a ``numpy.ndarray`` of any other length raises ``ArrayIndexError``;
    * anything else (e.g. an existing slice) is passed through unchanged.
    """
    converted = []
    for entry in selection:
        if is_integer(entry):
            entry = int_to_slice(entry)
        elif isinstance(entry, np.ndarray):
            # Only a one-element array can be expressed as a single slice.
            if len(entry) != 1:
                raise ArrayIndexError()
            entry = int_to_slice(entry[0])
        converted.append(entry)
    return converted
|
||
|
||
class PartialChunkIterator(object):
    """Iterator to retrieve the specific coordinates of requested data
    from within a compressed chunk.

    Iterating yields ``(start, nitems, chunk_selection)`` tuples, where
    ``start`` is the offset (in items, row-major order over ``arr_shape``)
    of a contiguous run inside the chunk, ``nitems`` is the length of that
    run and ``chunk_selection`` is the tuple of slices the run corresponds to.

    Parameters
    ----------
    selection : tuple
        tuple of slice objects to take from the chunk
    arr_shape : shape of chunk to select data from

    Attributes
    ----------
    arr_shape
    selection
    selection_shape
        shape of the region picked out by ``selection`` (computed before
        trailing fully-selected dimensions are dropped from ``selection``)
    chunk_loc_slices
        list of slice tuples, one per contiguous run to read

    Raises
    ------
    ValueError
        if ``selection`` has more dimensions than ``arr_shape``.

    Notes
    -----
    Slice bounds are clamped to the dimension size, so a slice that lies
    outside a dimension produces an empty selection rather than an error at
    construction time; iterating an iterator with no runs raises
    ``IndexError``.
    """

    def __init__(self, selection, arr_shape):
        self.selection = make_slice_selection(selection)
        self.arr_shape = arr_shape

        # number of selection dimensions can't be greater than the number of
        # chunk dimensions
        if len(self.selection) > len(self.arr_shape):
            # str.format rather than an f-string: the placeholders were never
            # interpolated before, and this keeps older Python 3 working.
            raise ValueError(
                'Selection has more dimensions than the array:\n'
                'selection dimensions = {}\n'
                'array dimensions = {}'.format(len(self.selection),
                                               len(self.arr_shape)))

        # Resolve ``None`` / oversized bounds up front so the arithmetic in
        # ``__iter__`` (``stop - start``) and the step test below never see
        # ``None``.
        self.selection = [
            self._normalize_slice(sl, dim_size)
            for sl, dim_size in zip(self.selection, self.arr_shape)
        ]

        # Shape of the selected region, computed from the slices themselves
        # instead of allocating a chunk-sized array and indexing it.
        # Dimensions without a selection entry are taken whole.
        selected = tuple(
            len(range(*sl.indices(dim_size)))
            for sl, dim_size in zip(self.selection, self.arr_shape)
        )
        self.selection_shape = (
            selected + tuple(self.arr_shape[len(self.selection):])
        )

        # Drop trailing dimensions that are selected in full (never the first
        # one).  They are contiguous with the dimension before them, so
        # leaving them out makes every run longer and minimises the number of
        # partial decompressions needed.
        while len(self.selection) > 1:
            last = len(self.selection) - 1
            if self.selection_shape[last] != self.arr_shape[last]:
                break
            self.selection.pop()

        # A unit-step slice on the last remaining dimension is one contiguous
        # run and is kept whole; a strided one has to be read item by item
        # like the leading dimensions.
        if self.selection[-1].step > 1:
            last_dim_slice = None
        else:
            last_dim_slice = self.selection.pop()

        chunk_loc_slices = [
            [slice(x, x + 1, 1) for x in slice_to_range(sl, arr_shape[dim])]
            for dim, sl in enumerate(self.selection)
        ]
        if last_dim_slice is not None:
            chunk_loc_slices.append([last_dim_slice])
        self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices))

    @staticmethod
    def _normalize_slice(sl, dim_size):
        """Return ``sl`` with concrete start/stop/step for ``dim_size``."""
        if sl.step is not None and sl.step < 0:
            # Left untouched: the values from ``slice.indices`` for a
            # backward slice do not round-trip through ``slice(...)``.
            return sl
        return slice(*sl.indices(dim_size))

    def __iter__(self):
        first = self.chunk_loc_slices[0]
        shape = self.arr_shape
        # Row-major stride (in items) of every dimension that appears in a
        # run's slice tuple; computed once rather than per yielded tuple.
        # ``dtype=int`` keeps the product of an empty tail an integer 1.
        strides = [
            int(np.prod(shape[dim + 1:], dtype=int))
            for dim in range(len(first))
        ]
        # Every run has the same length: the extent of its last slice times
        # the size of all dimensions that were dropped or never selected.
        tail_size = int(np.prod(shape[len(first):], dtype=int))
        nitems = (first[-1].stop - first[-1].start) * tail_size
        for chunk_selection in self.chunk_loc_slices:
            start = sum(sl.start * stride
                        for sl, stride in zip(chunk_selection, strides))
            yield int(start), int(nitems), chunk_selection
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
|
||
import zarr | ||
from zarr.indexing import (normalize_integer_selection, oindex, oindex_set, | ||
replace_ellipsis) | ||
replace_ellipsis, PartialChunkIterator) | ||
|
||
|
||
def test_normalize_integer_selection(): | ||
|
@@ -1289,3 +1289,61 @@ def test_set_selections_with_fields(): | |
a[key][ix] = v[key][ix] | ||
z.set_mask_selection(ix, v[key][ix], fields=fields) | ||
assert_array_equal(a, z[:]) | ||
|
||
|
||
# NOTE: integer literals are written without digit separators (``100002``,
# not ``100_002``); the underscore form is a syntax error before Python 3.6.
@pytest.mark.parametrize('selection, arr, expected', [
    # last dimension selected in full: it is merged into the run, so each
    # run covers 2 * 100 items
    ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)),
     np.arange(2, 100002).reshape((100, 10, 100)),
     [(5200, 200, (slice(5, 6, 1), slice(2, 4, 1))),
      (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))),
      (7200, 200, (slice(7, 8, 1), slice(2, 4, 1)))]),
    # partial last dimension: one run of 5 items per (dim0, dim1) pair
    ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)),
     np.arange(2, 100002).reshape((100, 10, 100)),
     [(5200, 5, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))),
      (5300, 5, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))),
      (6200, 5, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))),
      (6300, 5, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))),
      (7200, 5, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))),
      (7300, 5, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1)))]),
    # same selection on a Fortran-ordered array; only ``arr.shape`` is handed
    # to the iterator, so the expected runs are unchanged
    ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)),
     np.asfortranarray(np.arange(2, 100002).reshape((100, 10, 100))),
     [(5200, 5, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))),
      (5300, 5, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))),
      (6200, 5, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))),
      (6300, 5, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))),
      (7200, 5, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))),
      (7300, 5, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1)))]),
    # fewer selection entries than array dimensions
    ((slice(5, 8, 1), slice(2, 4, 1)),
     np.arange(2, 100002).reshape((100, 10, 100)),
     [(5200, 200, (slice(5, 6, 1), slice(2, 4, 1))),
      (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))),
      (7200, 200, (slice(7, 8, 1), slice(2, 4, 1)))]),
    ((slice(0, 10, 1),),
     np.arange(0, 10).reshape((10,)),
     [(0, 10, (slice(0, 10, 1),))]),
    # integer selections
    ((0,),
     np.arange(0, 100).reshape((10, 10)),
     [(0, 10, (slice(0, 1, 1),))]),
    ((0, 0),
     np.arange(0, 100).reshape((10, 10)),
     [(0, 1, (slice(0, 1, 1), slice(0, 1, 1)))]),
    ((0,),
     np.arange(0, 10).reshape((10,)),
     [(0, 1, (slice(0, 1, 1),))]),
    # expected failures
    pytest.param((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)),
                 np.arange(2, 100002).reshape((10, 1, 10000)),
                 None,
                 marks=[pytest.mark.xfail(reason='slice 2 is out of range')]
                 ),
    pytest.param((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)),
                 np.arange(2, 100002).reshape((10, 10000)),
                 None,
                 marks=[pytest.mark.xfail(
                     reason='selection has more dimensions than the array')]
                 ),
])
def test_PartialChunkIterator(selection, arr, expected):
    """Check the (start, nitems, chunk_selection) runs for a selection."""
    PCI = PartialChunkIterator(selection, arr.shape)
    results = list(PCI)
    assert results == expected
|
Uh oh!
There was an error while loading. Please reload this page.