
LRU cache for decoded chunks #306


Closed · wants to merge 56 commits · changes shown from 12 commits
d62febb  first attempt at chunk_cache layer (shikharsg, Aug 29, 2018)
f796ea7  ChunkCache test with MockChunkCacheArray (shikharsg, Aug 29, 2018)
32141a9  np.copy not needed when accessing a subset of a chunk (shikharsg, Oct 9, 2018)
b35139b  fixed 'Mm' dtype error for buffersize function (shikharsg, Oct 13, 2018)
3c45176  renamed ChunkCache to LRUChunkCache (Oct 13, 2018)
46dcf94  LRUChunkCache in zarr root namespace (Oct 13, 2018)
c69c751  LRUChunkCache example (Oct 13, 2018)
2cb143e  write caching of chunk should be done after encoding (Oct 15, 2018)
2fb169e  ensure cached chunk has been round tripped through encode-decode if d… (Oct 15, 2018)
31e4dfb  flake8 fixes (Oct 15, 2018)
5559c4f  read write cache for 0-d arrays (Oct 15, 2018)
2a0124a  added tutorial and api docs (Oct 15, 2018)
6fac2ad  separated store tests from mutable mapping tests in test_storage.py (shikharsg, Oct 20, 2018)
4e79d0b  fixed pickle, __delitem__ and ordered dict iteration bugs (shikharsg, Oct 20, 2018)
5fd6fc8  documenting slowdown when using write cache with object arrays (shikharsg, Oct 20, 2018)
422f9eb  factoring out mapping code from LRUStoreCache and LRUChunkCache (shikharsg, Oct 23, 2018)
44cea83  consistent variable naming in _chunk_getitem (shikharsg, Nov 11, 2018)
1b67e90  removed unnecesary code from _set_basic_selection_zd and added encode… (shikharsg, Nov 11, 2018)
9b0cc29  flake 8 fixes (shikharsg, Nov 11, 2018)
715f86d  Merge remote-tracking branch 'upstream/master' into chunk_cache (shikharsg, Nov 13, 2018)
0013f95  fixed coverage (shikharsg, Nov 14, 2018)
b70c348  Merge branch 'chunk_cache' into master (shikharsg, Nov 14, 2018)
c4f2487  Merge pull request #4 from shikharsg/master (shikharsg, Nov 14, 2018)
245f661  Merge branch 'master' into chunk_cache (shikharsg, Nov 15, 2018)
a2a05fb  Merge branch 'master' into chunk_cache (shikharsg, Jan 8, 2019)
b8b9056  Merge branch 'chunk_cache' into chunk_cache_mapping_refactor (shikharsg, Jan 9, 2019)
04f0367  Merge pull request #3 from shikharsg/chunk_cache_mapping_refactor (shikharsg, Jan 9, 2019)
f19d43e  bug fix (shikharsg, Jan 27, 2019)
52a43bf  Merge branch 'master' into chunk_cache (shikharsg, Jan 27, 2019)
697d46e  python 2 and 3 compatibility (shikharsg, Jan 27, 2019)
1e727c7  Merge branch 'master' into chunk_cache (shikharsg, Feb 10, 2019)
377ece7  coverage fix and __init__.py LRUChunkCache import (shikharsg, Feb 10, 2019)
3473adb  merged chunk_cache with master (shikharsg, Mar 4, 2019)
df84c89  flake8 fix (shikharsg, Mar 4, 2019)
88fe66d  Merge branch 'master' into chunk_cache (Mar 30, 2019)
a816014  Implemented https://github.com/zarr-developers/zarr/pull/306/files#r2… (Apr 11, 2019)
8cc083b  cache tests refactor (May 3, 2019)
23fcdea  fixed minor tests mistak (May 3, 2019)
309cc48  Merge branch 'master' into chunk_cache (May 3, 2019)
635ec87  flake8 fix (May 3, 2019)
a85d156  Merge remote-tracking branch 'upstream/master' into chunk_cache (Aug 20, 2019)
ef86184  merged with master (Oct 30, 2019)
875c24f  added chunk cache to Group (Nov 20, 2019)
dcd4ee7  merge with master (Nov 20, 2019)
4a1baa9  added chunk_cache to all relevant function (Nov 20, 2019)
e6540e1  Merge branch 'master' into chunk_cache (Dec 12, 2019)
9f9d176  merge with master (Sep 30, 2020)
6571382  fixed failing doctest (Sep 30, 2020)
8c8a69f  Merge remote-tracking branch 'origin/master' into pr-306 (joshmoore, Feb 19, 2021)
e0e5254  fixed setitem caching order (Feb 20, 2021)
992b48a  Merge branch 'master' into chunk_cache (jakirkham, Feb 21, 2021)
38ee622  refactor (Feb 21, 2021)
8b6a699  Merge branch 'master' into chunk_cache (Apr 5, 2021)
ba5c0ed  Merge 'origin/master' into pr-306 (joshmoore, Aug 27, 2021)
7cdce5f  Remove use of unittest (joshmoore, Aug 27, 2021)
06c899b  Merge branch 'master' into chunk_cache (joshmoore, Sep 23, 2021)
4 changes: 4 additions & 0 deletions docs/api/storage.rst
@@ -27,6 +27,10 @@ Storage (``zarr.storage``)
.. automethod:: invalidate_values
.. automethod:: invalidate_keys

.. autoclass:: LRUChunkCache

.. automethod:: invalidate

.. autofunction:: init_array
.. autofunction:: init_group
.. autofunction:: contains_array
24 changes: 24 additions & 0 deletions docs/tutorial.rst
@@ -797,6 +797,30 @@ layer over a remote store. E.g.::
b'Hello from the cloud!'
0.0009490990014455747

The above :class:`zarr.storage.LRUStoreCache` wraps any Zarr storage class, and stores
encoded chunks. So every time cache is accessed, the chunk has to be decoded. For cases
Review comment (Member), suggested change:

-  encoded chunks. So every time cache is accessed, the chunk has to be decoded. For cases
+  encoded chunks. Every time the cache is accessed, the chunk must be decoded. For cases

where decoding is computationally expensive, Zarr also provides a
:class:`zarr.storage.LRUChunkCache` which can store decoded chunks, e.g.::

>>> import zarr
>>> from numcodecs import LZMA
>>> import numpy as np
>>> store = zarr.DictStore()
>>> z = zarr.array(np.random.randn(1000000).reshape(1000,1000), chunks=(100,100),
... store=store, compressor=LZMA())
>>> from timeit import timeit
>>> # data access without cache
... timeit('z[:]', number=1, globals=globals()) # doctest: +SKIP
0.6703157789888792
>>> z_with_cache = zarr.Array(store=store, chunk_cache=zarr.LRUChunkCache(max_size=None))
>>> # first data access about the same as without cache
... timeit('z_with_cache[:]', number=1, globals=globals()) # doctest: +SKIP
0.681269913999131
>>> # second time accesses the decoded chunks in the cache
... timeit('z_with_cache[:]', number=1, globals=globals()) # doctest: +SKIP
0.007617925992235541


If you are still experiencing poor performance with distributed/cloud storage, please
raise an issue on the GitHub issue tracker with any profiling data you can provide, as
there may be opportunities to optimise further either within Zarr or within the mapping
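The two caches are complementary and can be layered: ``LRUStoreCache`` keeps encoded bytes close to the array, while ``LRUChunkCache`` skips the codec on repeat reads. A minimal sketch of combining them, assuming the API introduced in this diff::

    import zarr
    import numpy as np

    store = zarr.DictStore()
    z = zarr.array(np.arange(10000).reshape(100, 100), chunks=(10, 10),
                   store=store)

    cached_store = zarr.LRUStoreCache(store, max_size=2**28)  # encoded chunks
    chunk_cache = zarr.LRUChunkCache(max_size=2**28)          # decoded chunks
    z_cached = zarr.Array(store=cached_store, chunk_cache=chunk_cache)

    z_cached[:]  # first read fills both caches
    z_cached[:]  # second read is served from the decoded-chunk cache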
3 changes: 2 additions & 1 deletion zarr/__init__.py
@@ -7,7 +7,8 @@
from zarr.creation import (empty, zeros, ones, full, array, empty_like, zeros_like,
ones_like, full_like, open_array, open_like, create)
from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore,
NestedDirectoryStore, DBMStore, LMDBStore, LRUStoreCache)
NestedDirectoryStore, DBMStore, LMDBStore, LRUStoreCache,
LRUChunkCache)
from zarr.hierarchy import group, open_group, Group
from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
from zarr.codecs import *
119 changes: 92 additions & 27 deletions zarr/core.py
@@ -51,6 +51,10 @@ class Array(object):
If True (default), user attributes will be cached for attribute read
operations. If False, user attributes are reloaded from the store prior
to all attribute read operations.
chunk_cache : MutableMapping, optional
Mapping to store decoded chunks for caching. Can be used in repeated
chunk access scenarios when decoding of data is computationally
expensive.

Attributes
----------
@@ -103,7 +107,8 @@ class Array(object):
"""

def __init__(self, store, path=None, read_only=False, chunk_store=None,
synchronizer=None, cache_metadata=True, cache_attrs=True):
synchronizer=None, cache_metadata=True, cache_attrs=True,
chunk_cache=None):
# N.B., expect at this point store is fully initialized with all
# configuration metadata fully specified and normalized

@@ -118,6 +123,7 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None,
self._synchronizer = synchronizer
self._cache_metadata = cache_metadata
self._is_view = False
self._chunk_cache = chunk_cache

# initialize metadata
self._load_metadata()
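Because ``chunk_cache`` is typed only as a ``MutableMapping``, anything with dict semantics can stand in for ``LRUChunkCache``. A quick sketch (a plain dict gives an unbounded cache; illustration only, not part of this diff)::

    import zarr
    import numpy as np

    store = zarr.DictStore()
    zarr.array(np.zeros((4, 4)), chunks=(2, 2), store=store)

    z = zarr.Array(store, chunk_cache={})  # decoded chunks accumulate in the dict
    _ = z[:]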
@@ -692,19 +698,36 @@ def _get_basic_selection_zd(self, selection, out=None, fields=None):
if selection not in ((), (Ellipsis,)):
err_too_many_indices(selection, ())

try:
# obtain encoded data for chunk
ckey = self._chunk_key((0,))
cdata = self.chunk_store[ckey]
# obtain key for chunk
ckey = self._chunk_key((0,))

except KeyError:
# chunk not initialized
chunk = np.zeros((), dtype=self._dtype)
if self._fill_value is not None:
chunk.fill(self._fill_value)
# setup variable to hold decoded chunk
chunk = None

else:
chunk = self._decode_chunk(cdata)
# check for cached chunk
if self._chunk_cache is not None:
try:
chunk = self._chunk_cache[ckey]
except KeyError:
pass

if chunk is None:
try:
# obtain encoded data for chunk
cdata = self.chunk_store[ckey]

except KeyError:
# chunk not initialized
chunk = np.zeros((), dtype=self._dtype)
if self._fill_value is not None:
chunk.fill(self._fill_value)

else:
chunk = self._decode_chunk(cdata)

# cache decoded chunk
if self._chunk_cache is not None:
self._chunk_cache[ckey] = chunk

# handle fields
if fields:
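The control flow above is a standard read-through cache. Distilled into a sketch (names simplified; not the actual helper in this PR)::

    def read_through(ckey, chunk_cache, chunk_store, decode, fill_chunk):
        if chunk_cache is not None:
            try:
                return chunk_cache[ckey]       # hit: chunk already decoded
            except KeyError:
                pass
        try:
            chunk = decode(chunk_store[ckey])  # miss: fetch and decode
        except KeyError:
            chunk = fill_chunk()               # chunk not initialized
        if chunk_cache is not None:
            chunk_cache[ckey] = chunk          # populate for the next access
        return chunk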
@@ -1454,20 +1477,29 @@ def _set_basic_selection_zd(self, selection, value, fields=None):
# obtain key for chunk
ckey = self._chunk_key((0,))

# setup chunk
try:
# obtain compressed data for chunk
cdata = self.chunk_store[ckey]
chunk = None

except KeyError:
# chunk not initialized
chunk = np.zeros((), dtype=self._dtype)
if self._fill_value is not None:
chunk.fill(self._fill_value)
if self._chunk_cache is not None:
try:
chunk = self._chunk_cache[ckey]
except KeyError:
pass

else:
# decode chunk
chunk = self._decode_chunk(cdata).copy()
if chunk is None:
# setup chunk
try:
# obtain compressed data for chunk
cdata = self.chunk_store[ckey]

except KeyError:
# chunk not initialized
chunk = np.zeros((), dtype=self._dtype)
if self._fill_value is not None:
chunk.fill(self._fill_value)

else:
# decode chunk
chunk = self._decode_chunk(cdata).copy()

# set value
if fields:
Expand All @@ -1478,6 +1510,8 @@ def _set_basic_selection_zd(self, selection, value, fields=None):
# encode and store
cdata = self._encode_chunk(chunk)
self.chunk_store[ckey] = cdata
if self._chunk_cache is not None:
self._chunk_cache[ckey] = chunk

def _set_basic_selection_nd(self, selection, value, fields=None):
# implementation of __setitem__ for array with at least one dimension
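On the write path the ordering matters: the chunk is encoded and persisted first, and the cache is refreshed only afterwards (see the "fixed setitem caching order" commit), so a failed store write cannot leave a stale decoded chunk behind. A small 0-d usage sketch, assuming the API in this diff::

    import zarr

    store = zarr.DictStore()
    zarr.zeros((), store=store)
    z = zarr.Array(store, chunk_cache=zarr.LRUChunkCache(max_size=None))
    z[()] = 42.0   # encodes, stores, then caches the decoded chunk
    print(z[()])   # served from the cache, no decode needed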
@@ -1562,8 +1596,21 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection,
ckey = self._chunk_key(chunk_coords)

try:

cdata = None
chunk_was_cached = False

# first try getting from cache (if one has been provided)
if self._chunk_cache is not None:
try:
cdata = self._chunk_cache[ckey]
chunk_was_cached = True
except KeyError:
pass

# obtain compressed data for chunk
cdata = self.chunk_store[ckey]
if not chunk_was_cached:
cdata = self.chunk_store[ckey]

except KeyError:
# chunk not initialized
@@ -1593,19 +1640,30 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection,
# contiguous, so we can decompress directly from the chunk
# into the destination array

if self._compressor:
if chunk_was_cached:
np.copyto(dest, cdata)
elif self._compressor:
self._compressor.decode(cdata, dest)
if self._chunk_cache is not None:
self._chunk_cache[ckey] = np.copy(dest)
else:
if isinstance(cdata, np.ndarray):
chunk = cdata.view(self._dtype)
else:
chunk = np.frombuffer(cdata, dtype=self._dtype)
chunk = chunk.reshape(self._chunks, order=self._order)
np.copyto(dest, chunk)
if self._chunk_cache is not None:
self._chunk_cache[ckey] = np.copy(chunk)
return

# decode chunk
chunk = self._decode_chunk(cdata)
if not chunk_was_cached:
chunk = self._decode_chunk(cdata)
if self._chunk_cache is not None:
self._chunk_cache[ckey] = np.copy(chunk)
else:
chunk = cdata

# select data from chunk
if fields:
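One detail worth calling out above: the decoded chunk is copied with ``np.copy`` before it enters the cache, because ``dest`` may be a view into the caller's output buffer, and caching it directly would alias user-owned memory. A sketch of the hazard::

    import numpy as np

    out = np.empty(4)
    dest = out[:]                 # a view over the caller's buffer
    bad_entry = dest              # later writes to `out` would mutate the cache
    good_entry = np.copy(dest)    # independent buffer, safe to retain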
@@ -1720,6 +1778,13 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None):
# store
self.chunk_store[ckey] = cdata

# cache the chunk
if self._chunk_cache is not None:
# ensure cached chunk has been round tripped through encode-decode if dtype=object
if self.dtype == object:
chunk = self._decode_chunk(cdata)
self._chunk_cache[ckey] = np.copy(chunk)

def _chunk_key(self, chunk_coords):
return self._key_prefix + '.'.join(map(str, chunk_coords))
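The object-dtype round trip above exists because object codecs need not hand back the exact objects that were written, so caching the user's originals could make cached reads disagree with uncached ones. An illustration using numcodecs' JSON codec (one example of a codec whose value types change on round trip)::

    import numpy as np
    from numcodecs import JSON

    codec = JSON()
    chunk = np.empty(1, dtype=object)
    chunk[0] = (1, 2)                                # a tuple goes in...
    roundtripped = codec.decode(codec.encode(chunk))
    print(type(roundtripped[0]))                     # ...a list comes back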

142 changes: 142 additions & 0 deletions zarr/storage.py
@@ -1883,3 +1883,145 @@ def __delitem__(self, key):
with self._mutex:
self._invalidate_keys()
self._invalidate_value(key)


class LRUChunkCache(MutableMapping):
"""Class that implements a least-recently-used (LRU) cache for array chunks.
Intended primarily for use with stores that can be slow to access, e.g., remote stores that
require network communication to store and retrieve data, and/or arrays where decompression
of data is computationally expensive.

Parameters
----------
max_size : int
The maximum size that the cache may grow to, in number of bytes. Provide `None`
if you would like the cache to have unlimited size.

Examples
--------
The example below uses a dict store to store the encoded array and uses LRUChunkCache to
store decoded chunks::

>>> import zarr
>>> from numcodecs import LZMA
>>> import numpy as np
>>> store = zarr.DictStore()
>>> z = zarr.array(np.random.randn(1000000).reshape(1000,1000), chunks=(100,100),
... store=store, compressor=LZMA())
>>> from timeit import timeit
>>> # data access without cache
... timeit('z[:]', number=1, globals=globals()) # doctest: +SKIP
0.6703157789888792
>>> z_with_cache = zarr.Array(store=store, chunk_cache=zarr.LRUChunkCache(max_size=None))
>>> # first data access about the same as without cache
... timeit('z_with_cache[:]', number=1, globals=globals()) # doctest: +SKIP
0.681269913999131
>>> # second time accesses the decoded chunks in the cache
... timeit('z_with_cache[:]', number=1, globals=globals()) # doctest: +SKIP
0.007617925992235541

"""

def __init__(self, max_size):
self._max_size = max_size
self._current_size = 0
self._values_cache = OrderedDict()
self._mutex = Lock()
self.hits = self.misses = 0

def __getstate__(self):
return (self._max_size, self._current_size,
self._values_cache, self.hits,
self.misses)

def __setstate__(self, state):
(self._max_size, self._current_size,
self._values_cache, self.hits,
self.misses) = state
self._mutex = Lock()

def __len__(self):
return len(self._keys())

def __iter__(self):
return self.keys()

def __contains__(self, key):
with self._mutex:
return key in self._keys()

def clear(self):
self.invalidate()

def keys(self):
with self._mutex:
return iter(self._keys())

def _keys(self):
return self._values_cache.keys()

def _pop_value(self):
# remove the first value from the cache, as this will be the least recently
# used value
_, v = self._values_cache.popitem(last=False)
return v

def _accommodate_value(self, value_size):
if self._max_size is None:
return
# ensure there is enough space in the cache for a new value
while self._current_size + value_size > self._max_size:
v = self._pop_value()
self._current_size -= buffer_size(v)

def _cache_value(self, key, value):
# cache a value
value_size = buffer_size(value)
# check size of the value against max size, as if the value itself exceeds max
# size then we are never going to cache it
if self._max_size is None or value_size <= self._max_size:
self._accommodate_value(value_size)
self._values_cache[key] = value
self._current_size += value_size

def invalidate(self):
"""Completely clear the cache."""
with self._mutex:
self._values_cache.clear()

def invalidate_values(self):
"""Clear the values cache."""
with self._mutex:
self._values_cache.clear()

def _invalidate_value(self, key):
if key in self._values_cache:
value = self._values_cache.pop(key)
self._current_size -= buffer_size(value)

def __getitem__(self, key):
try:
# try to obtain the value from the cache
with self._mutex:
value = self._values_cache[key]
# cache hit if no KeyError is raised
self.hits += 1
# treat the end as most recently used
OrderedDict_move_to_end(self._values_cache, key)

except KeyError:
# cache miss
with self._mutex:
self.misses += 1
raise

return value

def __setitem__(self, key, value):
with self._mutex:
self._invalidate_value(key)
self._cache_value(key, value)

def __delitem__(self, key):
with self._mutex:
self._invalidate_value(key)
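Taken on its own, ``LRUChunkCache`` is just a byte-bounded LRU mapping, so its eviction behaviour can be exercised directly. A sketch (assumes float64 values, i.e. 64 bytes per 8-element array)::

    import numpy as np
    import zarr

    cache = zarr.LRUChunkCache(max_size=128)   # room for two 64-byte chunks
    cache['0.0'] = np.zeros(8)
    cache['0.1'] = np.ones(8)
    cache['0.2'] = np.full(8, 2.0)             # evicts least-recently-used '0.0'
    print('0.0' in cache, '0.2' in cache)      # False True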