Omit chunks with no elements in slice selection with step (#1154)

jrs65 · web-flow · commit d6e35a5641c6 · 2022-10-07T08:32:52.000-07:00
* Omit chunks with no elements in slice selection with step This stops chunks being read unnecessarily when a slice selection with a step was used. Previously all chunks spanning the start-end range would be used regardless of whether they contained any elements. Fixes #843. * Test that only the required chunks are accessed during basic selections This tests that only the expected set of chunks are accessed during basic slice selection operations for both reads and writes to an array. * Update release notes. * Fix typos in release notes.
diff --git a/docs/release.rst b/docs/release.rst
@@ -14,6 +14,15 @@ Release notes
     # .. warning::
     #    Pre-release! Use :command:`pip install --pre zarr` to evaluate this release.
 
+.. _release_2.13.3:
+
+2.13.3
+------
+
+* Improve performance of slice selections with steps by omitting chunks with no relevant
+  data.
+  By :user:`Richard Shaw <jrs65>` :issue:`843`.
+
 .. _release_2.13.2:
 
 2.13.2
diff --git a/zarr/indexing.py b/zarr/indexing.py
@@ -216,6 +216,11 @@ def __iter__(self):
             dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step)
             dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start),
                                        self.step)
+
+            # If there are no elements on the selection within this chunk, then skip
+            if dim_chunk_nitems == 0:
+                continue
+
             dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems)
 
             yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel)
diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py
@@ -13,6 +13,8 @@
     PartialChunkIterator,
 )
 
+from zarr.tests.util import CountingDict
+
 
 def test_normalize_integer_selection():
 
@@ -1451,3 +1453,75 @@ def test_numpy_int_indexing():
     z[:] = a
     assert a[42] == z[42]
     assert a[numpy.int64(42)] == z[numpy.int64(42)]
+
+
+@pytest.mark.parametrize(
+    "shape, chunks, ops",
+    [
+        # 1D test cases
+        ((1070,), (50,), [("__getitem__", (slice(200, 400),))]),
+        ((1070,), (50,), [("__getitem__", (slice(200, 400, 100),))]),
+        ((1070,), (50,), [
+            ("__getitem__", (slice(200, 400),)),
+            ("__setitem__", (slice(200, 400, 100),)),
+        ]),
+
+        # 2D test cases
+        ((40, 50), (5, 8), [
+            ("__getitem__", (slice(6, 37, 13), (slice(4, 10)))),
+            ("__setitem__", (slice(None), (slice(None)))),
+        ]),
+    ]
+)
+def test_accessed_chunks(shape, chunks, ops):
+    # Test that only the required chunks are accessed during basic selection operations
+    # shape: array shape
+    # chunks: chunk size
+    # ops: list of tuples with (optype, tuple of slices)
+    # optype = "__getitem__" or "__setitem__", tuple length must match number of dims
+    import itertools
+
+    # Use a counting dict as the backing store so we can track the items access
+    store = CountingDict()
+    z = zarr.create(shape=shape, chunks=chunks, store=store)
+
+    for ii, (optype, slices) in enumerate(ops):
+
+        # Resolve the slices into the accessed chunks for each dimension
+        chunks_per_dim = []
+        for N, C, sl in zip(shape, chunks, slices):
+            chunk_ind = np.arange(N, dtype=int)[sl] // C
+            chunks_per_dim.append(np.unique(chunk_ind))
+
+        # Combine and generate the cartesian product to determine the chunks keys that
+        # will be accessed
+        chunks_accessed = []
+        for comb in itertools.product(*chunks_per_dim):
+            chunks_accessed.append(".".join([str(ci) for ci in comb]))
+
+        counts_before = store.counter.copy()
+
+        # Perform the operation
+        if optype == "__getitem__":
+            z[slices]
+        else:
+            z[slices] = ii
+
+        # Get the change in counts
+        delta_counts = store.counter - counts_before
+
+        # Check that the access counts for the operation have increased by one for all
+        # the chunks we expect to be included
+        for ci in chunks_accessed:
+            assert delta_counts.pop((optype, ci)) == 1
+
+            # If the chunk was partially written to it will also have been read once. We
+            # don't determine if the chunk was actually partial here, just that the
+            # counts are consistent that this might have happened
+            if optype == "__setitem__":
+                assert (
+                    ("__getitem__", ci) not in delta_counts or
+                    delta_counts.pop(("__getitem__", ci)) == 1
+                )
+        # Check that no other chunks were accessed
+        assert len(delta_counts) == 0