adds partial_decompress capabilities #584
zarr/indexing.py
@@ -822,3 +822,62 @@ def pop_fields(selection):
    selection = tuple(s for s in selection if not isinstance(s, str))
    selection = selection[0] if len(selection) == 1 else selection
    return fields, selection
def selection_size(selection, arr):
    # Return the shape of `selection` applied to `arr`; trailing dimensions
    # not covered by the selection keep their full size.
    if len(selection) > len(arr.shape):
        raise ValueError(f"dimensions in selection can't be greater than "
                         f"dimensions of array: {len(selection)} > {len(arr.shape)}")
    selection_shape = []
    for i, size in enumerate(arr.shape):
        selection_slice = selection[i] if i < len(selection) else None
        if selection_slice:
            selection_slice_size = len(range(*selection_slice.indices(len(arr))))
            selection_shape.append(selection_slice_size)
        else:
            selection_shape.append(size)
    return tuple(selection_shape)
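A quick sanity check of the helper above (editor's sketch, not part of the diff; the numpy import is assumed to be present at the top of the module):

import numpy as np

arr = np.arange(2, 100002).reshape((100, 10, 100))
# Dimensions beyond the selection fall back to their full size.
assert selection_size((slice(5, 8, 1), slice(2, 4, 1)), arr) == (3, 2, 100)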
class PartialChunkIterator(object):
    def __init__(self, selection, arr):
        self.arr = arr
        self.selection = list(selection)
        # Walk dimensions from last to first, dropping trailing slices that
        # select the whole dimension so they can be read as one contiguous run.
        for i, dim_shape in enumerate(self.arr.shape[::-1]):
            index = len(self.arr.shape) - (i + 1)
            if index <= len(selection) - 1:
                slice_nitems = len(range(*selection[index].indices(len(self.arr))))
                if slice_nitems == dim_shape:
                    self.selection.pop()
                else:
                    break
Reviewer: If I understand this block of code correctly, here we seem to be looking for dimensions where we select the whole thing. Is that correct?

Author: That's right. I do this to maximize nitems / minimize the number of partial decompressions called. This logic helps get to the 200 nitems in the test example above.
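To make the trimming concrete (editor's illustration with hypothetical standalone names, not the PR's exact code, which bounds slices by the first axis length): trailing slices that cover their whole dimension can be folded into one contiguous read.

shape = (100, 10, 100)
selection = [slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)]
# Drop trailing slices that select their entire axis.
while selection:
    axis = len(selection) - 1
    if len(range(*selection[-1].indices(shape[axis]))) != shape[axis]:
        break
    selection.pop()
# selection is now [slice(5, 8, 1), slice(2, 4, 1)]: the two selected rows of
# axis 1 each cover 100 consecutive items, so they become one 200-item read.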
        out_slices = []
        chunk_loc_slices = []
        # The innermost remaining slice is handled separately when it is
        # contiguous (step 1), so its items can be read as a single run.
        last_dim_slice = None if self.selection[-1].step > 1 else self.selection.pop()
        for sl in self.selection:
            dim_out_slices = []
            dim_chunk_loc_slices = []
            for i, x in enumerate(range(*sl.indices(len(self.arr)))):
                dim_out_slices.append(slice(i, i + 1, 1))
                dim_chunk_loc_slices.append(slice(x, x + 1, 1))
Reviewer: IIUC, here you are computing each individual 1-element-wide selection across each axis, both in the origin chunk coordinate system and in the output version, right? Would/could this be simpler if the step of the … Not a request to change; it can be an optimisation for later.

Author: That's right, except for the slice of the last dimension, if that slice has a step of 1. If that slice has a step of more than one, then it is included here, though if that's the last dimension of the chunk, then it would be really inefficient since …
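For a single axis, the inner loop pairs each selected index with its position in the output (editor's sketch, standalone names assumed):

sl, dim_len = slice(5, 8, 1), 100
pairs = [(slice(i, i + 1, 1), slice(x, x + 1, 1))
         for i, x in enumerate(range(*sl.indices(dim_len)))]
# pairs == [(slice(0, 1, 1), slice(5, 6, 1)),
#           (slice(1, 2, 1), slice(6, 7, 1)),
#           (slice(2, 3, 1), slice(7, 8, 1))]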
            out_slices.append(dim_out_slices)
            chunk_loc_slices.append(dim_chunk_loc_slices)
        if last_dim_slice:
            out_slices.append(
                [slice(0, last_dim_slice.stop - last_dim_slice.start, 1)])
            chunk_loc_slices.append([last_dim_slice])
        self.out_slices = itertools.product(*out_slices)
        self.chunk_loc_slices = itertools.product(*chunk_loc_slices)
Reviewer: And for myself, …

Author: That's right.
    def __iter__(self):
        for out_selection, chunk_selection in zip(self.out_slices, self.chunk_loc_slices):
            # Flat (C-order) offset of the first selected element within the
            # chunk, and the number of consecutive items to read from there.
            start = 0
            for i, sl in enumerate(chunk_selection):
                start += sl.start * np.prod(self.arr.shape[i + 1:])
            nitems = ((chunk_selection[-1].stop - chunk_selection[-1].start)
                      * np.prod(self.arr.shape[len(chunk_selection):]))
            yield start, nitems, out_selection
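Tracing one iteration by hand (editor's note, using the chunk shape from the tests below): for chunk_selection (slice(5, 6, 1), slice(2, 4, 1)) on a (100, 10, 100) chunk,

import numpy as np

shape = (100, 10, 100)
start = 5 * np.prod(shape[1:]) + 2 * np.prod(shape[2:])  # 5 * 1000 + 2 * 100
nitems = (4 - 2) * np.prod(shape[2:])                    # 2 * 100
assert (start, nitems) == (5200, 200)

which is exactly the first tuple the parametrized test below expects.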
zarr/tests/test_indexing.py
@@ -5,7 +5,7 @@
 import zarr
 from zarr.indexing import (normalize_integer_selection, oindex, oindex_set,
-                           replace_ellipsis)
+                           replace_ellipsis, PartialChunkIterator)
def test_normalize_integer_selection():
@@ -1289,3 +1289,29 @@ def test_set_selections_with_fields():
        a[key][ix] = v[key][ix]
        z.set_mask_selection(ix, v[key][ix], fields=fields)
        assert_array_equal(a, z[:])
@pytest.mark.parametrize('selection, expected', [
    ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)),
     [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))),
      (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))),
      (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))]),
Reviewer: So, trying to understand this: do these tuples mean we need to read 200 items at positions 5200, 6200 and 7200?

Author: Yeah, really it's taking 1 item from the first dimension and 2 items of 100 from the second dimension. So I wrote the code so that it takes 200 items, since they are consecutive in the compressed buffer anyway. The data just has to be reshaped before it is put into the chunk output array.
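That claim is easy to verify with plain numpy (editor's sketch using the same array as the test below):

import numpy as np

arr = np.arange(2, 100002).reshape((100, 10, 100))
flat = arr.ravel()[5200:5200 + 200]  # one contiguous 200-item read
assert (flat.reshape(2, 100) == arr[5, 2:4, :]).all()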
    ((slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)),
     [(5200.0, 5.0, (slice(0, 1, 1), slice(0, 1, 1), slice(0, 5, 1))),
      (5300.0, 5.0, (slice(0, 1, 1), slice(1, 2, 1), slice(0, 5, 1))),
      (6200.0, 5.0, (slice(1, 2, 1), slice(0, 1, 1), slice(0, 5, 1))),
      (6300.0, 5.0, (slice(1, 2, 1), slice(1, 2, 1), slice(0, 5, 1))),
      (7200.0, 5.0, (slice(2, 3, 1), slice(0, 1, 1), slice(0, 5, 1))),
      (7300.0, 5.0, (slice(2, 3, 1), slice(1, 2, 1), slice(0, 5, 1)))]),
Reviewer: Floats are weird; shouldn't the output contain only ints?

Author: Yeah, I think the np.prod call used to generate the nitems is making that a float. I'll update that.
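For reference (editor's note): the floats appear because np.prod(()) returns 1.0 when the selection covers every dimension of the chunk, which promotes both start and nitems to float. One possible fix in __iter__, sketched here rather than the change the author ultimately made, is an explicit cast:

# np.prod of an empty shape tuple is 1.0, so cast the result back to int.
nitems = int((chunk_selection[-1].stop - chunk_selection[-1].start)
             * np.prod(self.arr.shape[len(chunk_selection):]))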
    ((slice(5, 8, 1), slice(2, 4, 1)),
     [(5200, 200, (slice(0, 1, 1), slice(0, 2, 1))),
      (6200, 200, (slice(1, 2, 1), slice(0, 2, 1))),
      (7200, 200, (slice(2, 3, 1), slice(0, 2, 1)))])
])
def test_PartialChunkIterator(selection, expected):
    arr = np.arange(2, 100002).reshape((100, 10, 100))
    PCI = PartialChunkIterator(selection, arr)
    results = list(PCI)
    assert results == expected