Skip to content

Appending performance improvement #1014

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

22 changes: 18 additions & 4 deletions zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2428,17 +2428,31 @@ def _resize_nosync(self, *args):
for s, c in zip(new_shape, chunks))

# remove any chunks not within range
# Along each dimension in turn, delete only the chunk slices that lie inside the
# old chunk grid but outside the new one.
# 'old_cdata_shape_working_list' is a mutable copy of the old chunk-grid shape: once
# a dimension has been processed, its entry is clamped to min(old, new), so later
# dimensions do not revisit (and try to delete again) chunks already removed.
chunk_store = self.chunk_store
for cidx in itertools.product(*[range(n) for n in old_cdata_shape]):
if all(i < c for i, c in zip(cidx, new_cdata_shape)):
pass # keep the chunk
else:
old_cdata_shape_working_list = list(old_cdata_shape)
for idx_cdata, (val_old_cdata, val_new_cdata) in enumerate(
zip(old_cdata_shape_working_list, new_cdata_shape)
):
for cidx in itertools.product(
*[
range(n_new, n_old) if (idx == idx_cdata) else range(n_old)
for idx, (n_old, n_new) in enumerate(
zip(old_cdata_shape_working_list, new_cdata_shape)
)
]
):
key = self._chunk_key(cidx)
try:
del chunk_store[key]
except KeyError:
# chunk not initialized
pass
old_cdata_shape_working_list[idx_cdata] = min(val_old_cdata, val_new_cdata)

def append(self, data, axis=0):
"""Append `data` to `axis`.
Expand Down
9 changes: 9 additions & 0 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,15 @@ def test_resize_2d(self):
assert (10, 10) == z.chunks
assert_array_equal(a[:55, :1], z[:])

z.resize((1, 55))
assert (1, 55) == z.shape
assert (1, 55) == z[:].shape
assert np.dtype('i4') == z.dtype
assert np.dtype('i4') == z[:].dtype
assert (10, 10) == z.chunks
assert_array_equal(a[:1, :10], z[:, :10])
assert_array_equal(np.zeros((1, 55-10), dtype='i4'), z[:, 10:55])

# via shape setter
z.shape = (105, 105)
assert (105, 105) == z.shape
Expand Down