Skip to content

Commit d6e35a5

Browse files
authored
Omit chunks with no elements in slice selection with step (#1154)
* Omit chunks with no elements in slice selection with step. This stops chunks being read unnecessarily when a slice selection with a step is used. Previously, all chunks spanning the start-end range would be read regardless of whether they contained any selected elements. Fixes #843.
* Test that only the required chunks are accessed during basic selections. This tests that only the expected set of chunks is accessed during basic slice selection operations, for both reads from and writes to an array.
* Update release notes.
* Fix typos in release notes.
1 parent 2dcffcd commit d6e35a5

File tree

3 files changed

+88
-0
lines changed

3 files changed

+88
-0
lines changed

docs/release.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ Release notes
1414
# .. warning::
1515
# Pre-release! Use :command:`pip install --pre zarr` to evaluate this release.
1616
17+
.. _release_2.13.3:
18+
19+
2.13.3
20+
------
21+
22+
* Improve performance of slice selections with steps by omitting chunks with no relevant
23+
data.
24+
By :user:`Richard Shaw <jrs65>` :issue:`843`.
25+
1726
.. _release_2.13.2:
1827

1928
2.13.2

zarr/indexing.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,11 @@ def __iter__(self):
216216
dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step)
217217
dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start),
218218
self.step)
219+
220+
# If the selection contains no elements within this chunk, skip the chunk
221+
if dim_chunk_nitems == 0:
222+
continue
223+
219224
dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems)
220225

221226
yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel)

zarr/tests/test_indexing.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
PartialChunkIterator,
1414
)
1515

16+
from zarr.tests.util import CountingDict
17+
1618

1719
def test_normalize_integer_selection():
1820

@@ -1451,3 +1453,75 @@ def test_numpy_int_indexing():
14511453
z[:] = a
14521454
assert a[42] == z[42]
14531455
assert a[numpy.int64(42)] == z[numpy.int64(42)]
1456+
1457+
1458+
@pytest.mark.parametrize(
    "shape, chunks, ops",
    [
        # 1D: contiguous read
        ((1070,), (50,), [("__getitem__", (slice(200, 400),))]),
        # 1D: strided read
        ((1070,), (50,), [("__getitem__", (slice(200, 400, 100),))]),
        # 1D: contiguous read followed by a strided write
        (
            (1070,),
            (50,),
            [
                ("__getitem__", (slice(200, 400),)),
                ("__setitem__", (slice(200, 400, 100),)),
            ],
        ),
        # 2D: strided/contiguous read followed by a full write
        (
            (40, 50),
            (5, 8),
            [
                ("__getitem__", (slice(6, 37, 13), slice(4, 10))),
                ("__setitem__", (slice(None), slice(None))),
            ],
        ),
    ],
)
def test_accessed_chunks(shape, chunks, ops):
    """Check that basic slice selections only touch the chunks they need.

    Parameters
    ----------
    shape : tuple of int
        Shape of the array to create.
    chunks : tuple of int
        Chunk size along each dimension.
    ops : list of (str, tuple of slice)
        Operations to run in order. The string is ``"__getitem__"`` or
        ``"__setitem__"``; the tuple holds one slice per array dimension.
    """
    import itertools

    # Back the array with a counting store so every item access is recorded.
    store = CountingDict()
    z = zarr.create(shape=shape, chunks=chunks, store=store)

    for op_index, (optype, slices) in enumerate(ops):

        # For each dimension, map every selected index to its chunk index and
        # keep the distinct chunk indices.
        per_dim_chunk_ids = [
            np.unique(np.arange(dim_len, dtype=int)[dim_slice] // chunk_len)
            for dim_len, chunk_len, dim_slice in zip(shape, chunks, slices)
        ]

        # The expected chunk keys are the cartesian product of the per-dimension
        # chunk indices, joined with "." (e.g. "1.0").
        expected_keys = [
            ".".join(map(str, combo))
            for combo in itertools.product(*per_dim_chunk_ids)
        ]

        # Snapshot the access counts so only this operation's accesses are seen.
        baseline = store.counter.copy()

        # Perform the operation
        if optype == "__getitem__":
            z[slices]
        else:
            z[slices] = op_index

        # Accesses attributable to this operation alone.
        delta_counts = store.counter - baseline

        # Every expected chunk must have been accessed exactly once by this op.
        for key in expected_keys:
            assert delta_counts.pop((optype, key)) == 1

            # A partially written chunk will also have been read once. Whether
            # the chunk really was partial is not determined here; we only check
            # that any recorded read is consistent with that possibility.
            if optype == "__setitem__":
                read_key = ("__getitem__", key)
                if read_key in delta_counts:
                    assert delta_counts.pop(read_key) == 1

        # Anything left over is an access to a chunk that was not expected.
        assert len(delta_counts) == 0

0 commit comments

Comments
 (0)