From 7d3444c386ed5f9ee46786b89b7cc7d3ef51f833 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 13 Oct 2016 06:24:53 -0400 Subject: [PATCH 01/12] create Base class for Array --- zarr/core.py | 155 ++++++++++++++++++++++++++------------------------- 1 file changed, 80 insertions(+), 75 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 58356b884f..0109a2cb27 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -18,7 +18,76 @@ from zarr.codecs import get_codec -class Array(object): +class Base(object): + """ ABC for Array / Frame """ + + @property + def store(self): + """A MutableMapping providing the underlying storage for the array.""" + return self._store + + @property + def path(self): + """Storage path.""" + return self._path + + @property + def name(self): + """Array name following h5py convention.""" + if self.path: + # follow h5py convention: add leading slash + name = self.path + if name[0] != '/': + name = '/' + name + return name + return None + + @property + def read_only(self): + """A boolean, True if modification operations are not permitted.""" + return self._read_only + + @property + def chunk_store(self): + """A MutableMapping providing the underlying storage for array + chunks.""" + return self._chunk_store + + @property + def chunks(self): + """A tuple of integers describing the length of each dimension of a + chunk of the array.""" + return self._chunks + + @property + def compressor(self): + """Primary compression codec.""" + return self._compressor + + @property + def filters(self): + """One or more codecs used to transform data prior to compression.""" + return self._filters + + @property + def synchronizer(self): + """Object used to synchronize write access to the array.""" + return self._synchronizer + + @property + def attrs(self): + """A MutableMapping containing user-defined attributes. Note that + attribute values must be JSON serializable.""" + return self._attrs + + @property + def ndim(self): + """Number of dimensions.""" + return len(self.shape) + + + +class Array(Base): """Instantiate an array from an initialized store. 
Parameters
@@ -170,36 +239,20 @@ def _flush_metadata_nosync(self):
         self._store[mkey] = encode_array_metadata(meta)
 
     @property
-    def store(self):
-        """A MutableMapping providing the underlying storage for the array."""
-        return self._store
-
-    @property
-    def path(self):
-        """Storage path."""
-        return self._path
-
-    @property
-    def name(self):
-        """Array name following h5py convention."""
-        if self.path:
-            # follow h5py convention: add leading slash
-            name = self.path
-            if name[0] != '/':
-                name = '/' + name
-            return name
-        return None
+    def fill_value(self):
+        """A value used for uninitialized portions of the array."""
+        return self._fill_value
 
     @property
-    def read_only(self):
-        """A boolean, True if modification operations are not permitted."""
-        return self._read_only
+    def order(self):
+        """A string indicating the order in which bytes are arranged within
+        chunks of the array."""
+        return self._order
 
     @property
-    def chunk_store(self):
-        """A MutableMapping providing the underlying storage for array
-        chunks."""
-        return self._chunk_store
+    def dtype(self):
+        """The NumPy data type."""
+        return self._dtype
 
     @property
     def shape(self):
@@ -214,54 +267,6 @@ def shape(self, value):
         self.resize(value)
 
-    @property
-    def chunks(self):
-        """A tuple of integers describing the length of each dimension of a
-        chunk of the array."""
-        return self._chunks
-
-    @property
-    def dtype(self):
-        """The NumPy data type."""
-        return self._dtype
-
-    @property
-    def compressor(self):
-        """Primary compression codec."""
-        return self._compressor
-
-    @property
-    def fill_value(self):
-        """A value used for uninitialized portions of the array."""
-        return self._fill_value
-
-    @property
-    def order(self):
-        """A string indicating the order in which bytes are arranged within
-        chunks of the array."""
-        return self._order
-
-    @property
-    def filters(self):
-        """One or more codecs used to transform data prior to compression."""
-        return self._filters
-
-    @property
-    def synchronizer(self):
-        """Object used to synchronize write access to the array."""
-        return self._synchronizer
-
-    @property
-    def attrs(self):
-        """A MutableMapping containing user-defined attributes. Note that
-        attribute values must be JSON serializable."""
-        return self._attrs
-
-    @property
-    def ndim(self):
-        """Number of dimensions."""
-        return len(self.shape)
-
     @property
     def _size(self):
         return reduce(operator.mul, self._shape)

From d1942d738a5d61ad5b0847c0a9dcc81bd5c5a1d7 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 13 Oct 2016 06:38:48 -0400
Subject: [PATCH 02/12] add metadata encoder / decoder

---
 zarr/frame.py            | 1100 ++++++++++++++++++++++++++++++++++++++
 zarr/meta.py             |   62 +++
 zarr/storage.py          |   13 +-
 zarr/tests/test_frame.py |  865 ++++++++++++++++++++++++++++++
 zarr/tests/test_meta.py  |   36 +-
 5 files changed, 2069 insertions(+), 7 deletions(-)
 create mode 100644 zarr/frame.py
 create mode 100644 zarr/tests/test_frame.py

diff --git a/zarr/frame.py b/zarr/frame.py
new file mode 100644
index 0000000000..1e5853f939
--- /dev/null
+++ b/zarr/frame.py
@@ -0,0 +1,1100 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+import operator
+import itertools
+from functools import reduce
+
+import numpy as np
+import pandas as pd
+
+from zarr.core import Array, Base
+from zarr.attrs import Attributes
+from zarr.codecs import get_codec
+from zarr.errors import PermissionError, err_array_not_found, err_read_only
+from zarr.meta import decode_array_metadata, encode_array_metadata
+from zarr.storage import array_meta_key, attrs_key, getsize, listdir
+from zarr.util import (is_total_slice, normalize_array_selection,
+                       get_chunk_range, human_readable_size,
+                       normalize_resize_args, normalize_storage_path,
+                       normalize_shape, normalize_chunks)
+
+
+class Frame(Base):
+    """Instantiate a frame from an initialized store.
+
+    Parameters
+    ----------
+    store : MutableMapping
+        Frame store, already initialized.
+    path : string, optional
+        Storage path.
+ read_only : bool, optional + True if array should be protected against modification. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + synchronizer : object, optional + Array synchronizer. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + + Attributes + ---------- + store + path + name + read_only + chunk_store + shape + chunks + dtype + compression + compression_opts + order + synchronizer + filters + attrs + size + cdata_shape + nchunks + nchunks_initialized + is_view + + Methods + ------- + __getitem__ + __setitem__ + resize + append + view + + """ # flake8: noqa + + def __init__(self, store, path=None, read_only=False, chunk_store=None, + synchronizer=None, cache_metadata=True): + # N.B., expect at this point store is fully initialized with all + # configuration metadata fully specified and normalized + + self._store = store + self._path = normalize_storage_path(path) + if self._path: + self._key_prefix = self._path + '/' + else: + self._key_prefix = '' + self._read_only = read_only + if chunk_store is None: + self._chunk_store = store + else: + self._chunk_store = chunk_store + self._synchronizer = synchronizer + self._cache_metadata = cache_metadata + self._is_view = False + + # initialize metadata + self._load_metadata() + + # initialize attributes + akey = self._key_prefix + attrs_key + self._attrs = Attributes(store, key=akey, read_only=read_only, + synchronizer=synchronizer) + + def _load_metadata(self): + """(Re)load metadata from store.""" + if self._synchronizer is None: + self._load_metadata_nosync() + else: + mkey = self._key_prefix + array_meta_key + with self._synchronizer[mkey]: + self._load_metadata_nosync() + + def _load_metadata_nosync(self): + try: + mkey = self._key_prefix + array_meta_key + meta_bytes = self._store[mkey] + except KeyError: + err_array_not_found(self._path) + else: + + # decode and store metadata + meta = decode_array_metadata(meta_bytes) + self._meta = meta + self._shape = meta['shape'] + self._chunks = meta['chunks'] + self._dtype = meta['dtype'] + self._fill_value = meta['fill_value'] + self._order = meta['order'] + + # setup compressor + config = meta['compressor'] + if config is None: + self._compressor = None + else: + self._compressor = get_codec(config) + + # setup filters + filters = meta['filters'] + if filters: + filters = [get_codec(config) for config in filters] + self._filters = filters + + def _refresh_metadata(self): + if not self._cache_metadata: + self._load_metadata() + + def _refresh_metadata_nosync(self): + if not self._cache_metadata and not self._is_view: + self._load_metadata_nosync() + + def _flush_metadata_nosync(self): + if self._is_view: + raise PermissionError('not permitted for views') + + if self._compressor: + compressor_config = self._compressor.get_config() + else: + compressor_config = None + if self._filters: + filters_config = [f.get_config() for f in self._filters] + else: + filters_config = None + meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype, + compressor=compressor_config, fill_value=self._fill_value, + order=self._order, filters=filters_config) + mkey = self._key_prefix + array_meta_key + self._store[mkey] = encode_array_metadata(meta) + + 
@property
+    def fill_value(self):
+        """A value used for uninitialized portions of the array."""
+        return self._fill_value
+
+    @property
+    def order(self):
+        """A string indicating the order in which bytes are arranged within
+        chunks of the array."""
+        return self._order
+
+    @property
+    def dtype(self):
+        """The NumPy data type."""
+        return self._dtype
+
+    @property
+    def shape(self):
+        """A tuple of integers describing the length of each dimension of
+        the array."""
+        # N.B., shape may change if array is resized, hence need to refresh
+        # metadata
+        self._refresh_metadata()
+        return self._shape
+
+    @shape.setter
+    def shape(self, value):
+        self.resize(value)
+
+    @property
+    def _size(self):
+        return reduce(operator.mul, self._shape)
+
+    @property
+    def size(self):
+        """The total number of elements in the array."""
+        # N.B., this property depends on shape, and shape may change if array
+        # is resized, hence need to refresh metadata
+        self._refresh_metadata()
+        return self._size
+
+    @property
+    def itemsize(self):
+        """The size in bytes of each item in the array."""
+        return self.dtype.itemsize
+
+    @property
+    def _nbytes(self):
+        return self._size * self.itemsize
+
+    @property
+    def nbytes(self):
+        """The total number of bytes that would be required to store the
+        array without compression."""
+        # N.B., this property depends on shape, and shape may change if array
+        # is resized, hence need to refresh metadata
+        self._refresh_metadata()
+        return self._nbytes
+
+    @property
+    def nbytes_stored(self):
+        """The total number of stored bytes of data for the array. This
+        includes storage required for configuration metadata and user
+        attributes."""
+        m = getsize(self._store, self._path)
+        if self._store == self._chunk_store:
+            return m
+        else:
+            n = getsize(self._chunk_store, self._path)
+            if m < 0 or n < 0:
+                return -1
+            else:
+                return m + n
+
+    @property
+    def _cdata_shape(self):
+        return tuple(int(np.ceil(s / c))
+                     for s, c in zip(self._shape, self._chunks))
+
+    @property
+    def cdata_shape(self):
+        """A tuple of integers describing the number of chunks along each
+        dimension of the array."""
+        self._refresh_metadata()
+        return self._cdata_shape
+
+    @property
+    def _nchunks(self):
+        return reduce(operator.mul, self._cdata_shape)
+
+    @property
+    def nchunks(self):
+        """Total number of chunks."""
+        self._refresh_metadata()
+        return self._nchunks
+
+    @property
+    def nchunks_initialized(self):
+        """The number of chunks that have been initialized with some data."""
+        return sum(1 for k in listdir(self._chunk_store, self._path)
+                   if k not in [array_meta_key, attrs_key])
+
+    # backwards compatibility
+    initialized = nchunks_initialized
+
+    @property
+    def is_view(self):
+        """A boolean, True if this array is a view on another array."""
+        return self._is_view
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, Frame) and
+            self.store == other.store and
+            self.read_only == other.read_only and
+            self.path == other.path and
+            not self._is_view
+            # N.B., no need to compare other properties, should be covered by
+            # store comparison
+        )
+
+    def __array__(self, *args):
+        a = self[:]
+        if args:
+            a = a.astype(args[0])
+        return a
+
+    def __len__(self):
+        return self.shape[0]
+
+    def __getitem__(self, item):
+        """Retrieve data for some portion of the array. Most NumPy-style
+        slicing operations are supported.
+
+        Returns
+        -------
+        out : ndarray
+            A NumPy array containing the data for the requested region.
+ + Examples + -------- + + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100000000), chunks=1000000, dtype='i4') + >>> z + Array((100000000,), int32, chunks=(1000000,), order=C) + nbytes: 381.5M; nbytes_stored: 6.4M; ratio: 59.9; initialized: 100/100 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + + Take some slices:: + + >>> z[5] + 5 + >>> z[:5] + array([0, 1, 2, 3, 4], dtype=int32) + >>> z[-5:] + array([99999995, 99999996, 99999997, 99999998, 99999999], dtype=int32) + >>> z[5:10] + array([5, 6, 7, 8, 9], dtype=int32) + >>> z[:] + array([ 0, 1, 2, ..., 99999997, 99999998, 99999999], dtype=int32) + + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100000000).reshape(10000, 10000), + ... chunks=(1000, 1000), dtype='i4') + >>> z + Array((10000, 10000), int32, chunks=(1000, 1000), order=C) + nbytes: 381.5M; nbytes_stored: 9.2M; ratio: 41.6; initialized: 100/100 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + + Take some slices:: + + >>> z[2, 2] + 20002 + >>> z[:2, :2] + array([[ 0, 1], + [10000, 10001]], dtype=int32) + >>> z[:2] + array([[ 0, 1, 2, ..., 9997, 9998, 9999], + [10000, 10001, 10002, ..., 19997, 19998, 19999]], dtype=int32) + >>> z[:, :2] + array([[ 0, 1], + [ 10000, 10001], + [ 20000, 20001], + ..., + [99970000, 99970001], + [99980000, 99980001], + [99990000, 99990001]], dtype=int32) + >>> z[:] + array([[ 0, 1, 2, ..., 9997, 9998, 9999], + [ 10000, 10001, 10002, ..., 19997, 19998, 19999], + [ 20000, 20001, 20002, ..., 29997, 29998, 29999], + ..., + [99970000, 99970001, 99970002, ..., 99979997, 99979998, 99979999], + [99980000, 99980001, 99980002, ..., 99989997, 99989998, 99989999], + [99990000, 99990001, 99990002, ..., 99999997, 99999998, 99999999]], dtype=int32) + + """ # flake8: noqa + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # normalize selection + selection = normalize_array_selection(item, self._shape) + + # determine output array shape + out_shape = tuple(s.stop - s.start for s in selection + if isinstance(s, slice)) + + # setup output array + out = np.empty(out_shape, dtype=self._dtype, order=self._order) + + # determine indices of chunks overlapping the selection + chunk_range = get_chunk_range(selection, self._chunks) + + # iterate over chunks in range + for cidx in itertools.product(*chunk_range): + + # determine chunk offset + offset = [i * c for i, c in zip(cidx, self._chunks)] + + # determine region within output array + out_selection = tuple( + slice(max(0, o - s.start), + min(o + c - s.start, s.stop - s.start)) + for s, o, c, in zip(selection, offset, self._chunks) + if isinstance(s, slice) + ) + + # determine region within chunk + chunk_selection = tuple( + slice(max(0, s.start - o), min(c, s.stop - o)) + if isinstance(s, slice) + else s - o + for s, o, c in zip(selection, offset, self._chunks) + ) + + # obtain the destination array as a view of the output array + if out_selection: + dest = out[out_selection] + else: + dest = out + + # load chunk selection into output array + self._chunk_getitem(cidx, chunk_selection, dest) + + if out.shape: + return out + else: + return out[()] + + def __setitem__(self, item, value): + """Modify data for some portion of the array. 
+ + Examples + -------- + + Setup a 1-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros(100000000, chunks=1000000, dtype='i4') + >>> z + Array((100000000,), int32, chunks=(1000000,), order=C) + nbytes: 381.5M; nbytes_stored: 301; ratio: 1328903.7; initialized: 0/100 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + + Set all array elements to the same scalar value:: + + >>> z[:] = 42 + >>> z[:] + array([42, 42, 42, ..., 42, 42, 42], dtype=int32) + + Set a portion of the array:: + + >>> z[:100] = np.arange(100) + >>> z[-100:] = np.arange(100)[::-1] + >>> z[:] + array([0, 1, 2, ..., 2, 1, 0], dtype=int32) + + Setup a 2-dimensional array:: + + >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z + Array((10000, 10000), int32, chunks=(1000, 1000), order=C) + nbytes: 381.5M; nbytes_stored: 323; ratio: 1238390.1; initialized: 0/100 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + + Set all array elements to the same scalar value:: + + >>> z[:] = 42 + >>> z[:] + array([[42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + ..., + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42], + [42, 42, 42, ..., 42, 42, 42]], dtype=int32) + + Set a portion of the array:: + + >>> z[0, :] = np.arange(z.shape[1]) + >>> z[:, 0] = np.arange(z.shape[0]) + >>> z[:] + array([[ 0, 1, 2, ..., 9997, 9998, 9999], + [ 1, 42, 42, ..., 42, 42, 42], + [ 2, 42, 42, ..., 42, 42, 42], + ..., + [9997, 42, 42, ..., 42, 42, 42], + [9998, 42, 42, ..., 42, 42, 42], + [9999, 42, 42, ..., 42, 42, 42]], dtype=int32) + + """ + + # guard conditions + if self._read_only: + err_read_only() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # normalize selection + selection = normalize_array_selection(item, self._shape) + + # check value shape + expected_shape = tuple( + s.stop - s.start for s in selection + if isinstance(s, slice) + ) + if np.isscalar(value): + pass + elif expected_shape != value.shape: + raise ValueError('value has wrong shape, expecting %s, found %s' + % (str(expected_shape), + str(value.shape))) + + # determine indices of chunks overlapping the selection + chunk_range = get_chunk_range(selection, self._chunks) + + # iterate over chunks in range + for cidx in itertools.product(*chunk_range): + + # determine chunk offset + offset = [i * c for i, c in zip(cidx, self._chunks)] + + # determine required index range within chunk + chunk_selection = tuple( + slice(max(0, s.start - o), min(c, s.stop - o)) + if isinstance(s, slice) + else s - o + for s, o, c in zip(selection, offset, self._chunks) + ) + + if np.isscalar(value): + + # put data + self._chunk_setitem(cidx, chunk_selection, value) + + else: + # assume value is array-like + + # determine index within value + value_selection = tuple( + slice(max(0, o - s.start), + min(o + c - s.start, s.stop - s.start)) + for s, o, c in zip(selection, offset, self._chunks) + if isinstance(s, slice) + ) + + # put data + self._chunk_setitem(cidx, chunk_selection, + value[value_selection]) + + def _chunk_getitem(self, cidx, item, dest): + """Obtain part or whole of a chunk. + + Parameters + ---------- + cidx : tuple of ints + Indices of the chunk. + item : tuple of slices + Location of region within the chunk. + dest : ndarray + Numpy array to store result in. 
+ + """ + + try: + + # obtain compressed data for chunk + ckey = self._chunk_key(cidx) + cdata = self._chunk_store[ckey] + + except KeyError: + + # chunk not initialized + if self._fill_value is not None: + dest.fill(self._fill_value) + + else: + + if is_total_slice(item, self._chunks) and \ + not self._filters and \ + ((self._order == 'C' and dest.flags.c_contiguous) or + (self._order == 'F' and dest.flags.f_contiguous)): + + # optimization: we want the whole chunk, and the destination is + # contiguous, so we can decompress directly from the chunk + # into the destination array + if self._compressor: + self._compressor.decode(cdata, dest) + else: + arr = np.frombuffer(cdata, dtype=self._dtype) + arr = arr.reshape(self._chunks, order=self._order) + np.copyto(dest, arr) + + else: + + # decode chunk + chunk = self._decode_chunk(cdata) + + # set data in output array + # (split into two lines for profiling) + tmp = chunk[item] + if dest.shape: + dest[:] = tmp + else: + dest[()] = tmp + + def _chunk_setitem(self, cidx, item, value): + """Replace part or whole of a chunk. + + Parameters + ---------- + cidx : tuple of ints + Indices of the chunk. + item : tuple of slices + Location of region within the chunk. + value : scalar or ndarray + Value to set. + + """ + + # synchronization + if self._synchronizer is None: + self._chunk_setitem_nosync(cidx, item, value) + else: + # synchronize on the chunk + ckey = self._chunk_key(cidx) + with self._synchronizer[ckey]: + self._chunk_setitem_nosync(cidx, item, value) + + def _chunk_setitem_nosync(self, cidx, item, value): + + # obtain key for chunk storage + ckey = self._chunk_key(cidx) + + if is_total_slice(item, self._chunks): + # totally replace chunk + + # optimization: we are completely replacing the chunk, so no need + # to access the existing chunk data + + if np.isscalar(value): + + # setup array filled with value + chunk = np.empty(self._chunks, dtype=self._dtype, + order=self._order) + chunk.fill(value) + + else: + + if not self._compressor and not self._filters: + + # https://github.com/alimanfoo/zarr/issues/79 + # Ensure a copy is taken so we don't end up storing + # a view into someone else's array. + # N.B., this assumes that filters or compressor always + # take a copy and never attempt to apply encoding in-place. 
+ chunk = np.array(value, dtype=self._dtype, + order=self._order) + + else: + # ensure array is contiguous + if self._order == 'F': + chunk = np.asfortranarray(value, dtype=self._dtype) + else: + chunk = np.ascontiguousarray(value, dtype=self._dtype) + + else: + # partially replace the contents of this chunk + + try: + + # obtain compressed data for chunk + cdata = self._chunk_store[ckey] + + except KeyError: + + # chunk not initialized + chunk = np.empty(self._chunks, dtype=self._dtype, + order=self._order) + if self._fill_value is not None: + chunk.fill(self._fill_value) + + else: + + # decode chunk + chunk = self._decode_chunk(cdata) + if not chunk.flags.writeable: + chunk = chunk.copy(order='K') + + # modify + chunk[item] = value + + # encode chunk + cdata = self._encode_chunk(chunk) + + # store + self._chunk_store[ckey] = cdata + + def _chunk_key(self, cidx): + return self._key_prefix + '.'.join(map(str, cidx)) + + def _decode_chunk(self, cdata): + + # decompress + if self._compressor: + chunk = self._compressor.decode(cdata) + else: + chunk = cdata + + # apply filters + if self._filters: + for f in self._filters[::-1]: + chunk = f.decode(chunk) + + # view as correct dtype + if isinstance(chunk, np.ndarray): + chunk = chunk.view(self._dtype) + else: + chunk = np.frombuffer(chunk, self._dtype) + + # reshape + chunk = chunk.reshape(self._chunks, order=self._order) + + return chunk + + def _encode_chunk(self, chunk): + + # apply filters + if self._filters: + for f in self._filters: + chunk = f.encode(chunk) + + # compress + if self._compressor: + cdata = self._compressor.encode(chunk) + else: + cdata = chunk + + return cdata + + def __repr__(self): + # N.B., __repr__ needs to be synchronized to ensure consistent view + # of metadata AND when retrieving nbytes_stored from filesystem storage + return self._synchronized_op(self._repr_nosync) + + def _repr_nosync(self): + + # main line + r = '%s(' % type(self).__name__ + if self.name: + r += '%s, ' % self.name + r += '%s, ' % str(self._shape) + r += '%s, ' % str(self._dtype) + r += 'chunks=%s, ' % str(self._chunks) + r += 'order=%s' % self._order + r += ')' + + # storage size info + r += '\n nbytes: %s' % human_readable_size(self._nbytes) + if self.nbytes_stored > 0: + r += '; nbytes_stored: %s' % human_readable_size( + self.nbytes_stored) + r += '; ratio: %.1f' % (self._nbytes / self.nbytes_stored) + r += '; initialized: %s/%s' % (self.nchunks_initialized, + self._nchunks) + + # filters + if self._filters: + # first line + r += '\n filters: %r' % self._filters[0] + # subsequent lines + for f in self._filters[1:]: + r += '\n %r' % f + + # compressor + if self._compressor: + r += '\n compressor: %r' % self._compressor + + # storage and synchronizer classes + r += '\n store: %s' % type(self._store).__name__ + if self._store != self._chunk_store: + r += '; chunk_store: %s' % type(self._chunk_store).__name__ + if self._synchronizer is not None: + r += '; synchronizer: %s' % type(self._synchronizer).__name__ + + return r + + def __getstate__(self): + return self._store, self._path, self._read_only, self._chunk_store, \ + self._synchronizer, self._cache_metadata + + def __setstate__(self, state): + self.__init__(*state) + + def _synchronized_op(self, f, *args, **kwargs): + + # no synchronization + if self._synchronizer is None: + self._refresh_metadata_nosync() + return f(*args, **kwargs) + + else: + # synchronize on the array + mkey = self._key_prefix + array_meta_key + with self._synchronizer[mkey]: + self._refresh_metadata_nosync() + result = 
f(*args, **kwargs) + return result + + def _write_op(self, f, *args, **kwargs): + + # guard condition + if self._read_only: + err_read_only() + + return self._synchronized_op(f, *args, **kwargs) + + def resize(self, *args): + """Change the shape of the array by growing or shrinking one or more + dimensions. + + Examples + -------- + >>> import zarr + >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) + >>> z + Array((10000, 10000), float64, chunks=(1000, 1000), order=C) + nbytes: 762.9M; nbytes_stored: 323; ratio: 2476780.2; initialized: 0/100 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + >>> z.resize(20000, 10000) + >>> z + Array((20000, 10000), float64, chunks=(1000, 1000), order=C) + nbytes: 1.5G; nbytes_stored: 323; ratio: 4953560.4; initialized: 0/200 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + >>> z.resize(30000, 1000) + >>> z + Array((30000, 1000), float64, chunks=(1000, 1000), order=C) + nbytes: 228.9M; nbytes_stored: 322; ratio: 745341.6; initialized: 0/30 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + + Notes + ----- + When resizing an array, the data are not rearranged in any way. + + If one or more dimensions are shrunk, any chunks falling outside the + new array shape will be deleted from the underlying store. + + """ # flake8: noqa + + return self._write_op(self._resize_nosync, *args) + + def _resize_nosync(self, *args): + + # normalize new shape argument + old_shape = self._shape + new_shape = normalize_resize_args(old_shape, *args) + old_cdata_shape = self._cdata_shape + + # update metadata + self._shape = new_shape + self._flush_metadata_nosync() + + # determine the new number and arrangement of chunks + chunks = self._chunks + new_cdata_shape = tuple(int(np.ceil(s / c)) + for s, c in zip(new_shape, chunks)) + + # remove any chunks not within range + for cidx in itertools.product(*[range(n) for n in old_cdata_shape]): + if all(i < c for i, c in zip(cidx, new_cdata_shape)): + pass # keep the chunk + else: + key = self._chunk_key(cidx) + try: + del self._chunk_store[key] + except KeyError: + # chunk not initialized + pass + + def append(self, data, axis=0): + """Append `data` to `axis`. + + Parameters + ---------- + data : array_like + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + new_shape : tuple + + Notes + ----- + The size of all dimensions other than `axis` must match between this + array and `data`. 
+ + Examples + -------- + >>> import numpy as np + >>> import zarr + >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) + >>> z = zarr.array(a, chunks=(1000, 100)) + >>> z + Array((10000, 1000), int32, chunks=(1000, 100), order=C) + nbytes: 38.1M; nbytes_stored: 1.9M; ratio: 20.3; initialized: 100/100 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + >>> z.append(a) + (20000, 1000) + >>> z + Array((20000, 1000), int32, chunks=(1000, 100), order=C) + nbytes: 76.3M; nbytes_stored: 3.8M; ratio: 20.3; initialized: 200/200 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + >>> z.append(np.vstack([a, a]), axis=1) + (20000, 2000) + >>> z + Array((20000, 2000), int32, chunks=(1000, 100), order=C) + nbytes: 152.6M; nbytes_stored: 7.5M; ratio: 20.3; initialized: 400/400 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict + + """ + return self._write_op(self._append_nosync, data, axis=axis) + + def _append_nosync(self, data, axis=0): + + # ensure data is array-like + if not hasattr(data, 'shape') or not hasattr(data, 'dtype'): + data = np.asanyarray(data) + + # ensure shapes are compatible for non-append dimensions + self_shape_preserved = tuple(s for i, s in enumerate(self._shape) + if i != axis) + data_shape_preserved = tuple(s for i, s in enumerate(data.shape) + if i != axis) + if self_shape_preserved != data_shape_preserved: + raise ValueError('shapes not compatible') + + # remember old shape + old_shape = self._shape + + # determine new shape + new_shape = tuple( + self._shape[i] if i != axis else self._shape[i] + data.shape[i] + for i in range(len(self._shape)) + ) + + # resize + self._resize_nosync(new_shape) + + # store data + # noinspection PyTypeChecker + append_selection = tuple( + slice(None) if i != axis else slice(old_shape[i], new_shape[i]) + for i in range(len(self._shape)) + ) + self[append_selection] = data + + return new_shape + + def view(self, shape=None, chunks=None, dtype=None, + fill_value=None, filters=None, read_only=None, + synchronizer=None): + """Return an array sharing the same data. + + Parameters + ---------- + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. + dtype : string or dtype, optional + NumPy dtype. + fill_value : object + Default value to use for uninitialized portions of the array. + filters : sequence, optional + Sequence of filters to use to encode chunk data prior to + compression. + read_only : bool, optional + True if array should be protected against modification. + synchronizer : object, optional + Array synchronizer. + + Notes + ----- + WARNING: This is an experimental feature and should be used with care. + There are plenty of ways to generate errors and/or cause data + corruption. + + Examples + -------- + + Bypass filters: + + >>> import zarr + >>> import numpy as np + >>> np.random.seed(42) + >>> labels = [b'female', b'male'] + >>> data = np.random.choice(labels, size=10000) + >>> filters = [zarr.Categorize(labels=labels, + ... dtype=data.dtype, + ... 
astype='u1')] + >>> a = zarr.array(data, chunks=1000, filters=filters) + >>> a[:] + array([b'female', b'male', b'female', ..., b'male', b'male', b'female'], + dtype='|S6') + >>> v = a.view(dtype='u1', filters=[]) + >>> v.is_view + True + >>> v[:] + array([1, 2, 1, ..., 2, 2, 1], dtype=uint8) + + Views can be used to modify data: + + >>> x = v[:] + >>> x.sort() + >>> v[:] = x + >>> v[:] + array([1, 1, 1, ..., 2, 2, 2], dtype=uint8) + >>> a[:] + array([b'female', b'female', b'female', ..., b'male', b'male', b'male'], + dtype='|S6') + + View as a different dtype with the same itemsize: + + >>> data = np.random.randint(0, 2, size=10000, dtype='u1') + >>> a = zarr.array(data, chunks=1000) + >>> a[:] + array([0, 0, 1, ..., 1, 0, 0], dtype=uint8) + >>> v = a.view(dtype=bool) + >>> v[:] + array([False, False, True, ..., True, False, False], dtype=bool) + >>> np.all(a[:].view(dtype=bool) == v[:]) + True + + An array can be viewed with a dtype with a different itemsize, however + some care is needed to adjust the shape and chunk shape so that chunk + data is interpreted correctly: + + >>> data = np.arange(10000, dtype='u2') + >>> a = zarr.array(data, chunks=1000) + >>> a[:10] + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint16) + >>> v = a.view(dtype='u1', shape=20000, chunks=2000) + >>> v[:10] + array([0, 0, 1, 0, 2, 0, 3, 0, 4, 0], dtype=uint8) + >>> np.all(a[:].view('u1') == v[:]) + True + + Change fill value for uninitialized chunks: + + >>> a = zarr.full(10000, chunks=1000, fill_value=-1, dtype='i1') + >>> a[:] + array([-1, -1, -1, ..., -1, -1, -1], dtype=int8) + >>> v = a.view(fill_value=42) + >>> v[:] + array([42, 42, 42, ..., 42, 42, 42], dtype=int8) + + Note that resizing or appending to views is not permitted: + + >>> a = zarr.empty(10000) + >>> v = a.view() + >>> try: + ... v.resize(20000) + ... except PermissionError as e: + ... 
print(e) + not permitted for views + + """ # flake8: noqa + + store = self._store + chunk_store = self._chunk_store + path = self._path + if read_only is None: + read_only = self._read_only + if synchronizer is None: + synchronizer = self._synchronizer + a = Array(store=store, path=path, chunk_store=chunk_store, + read_only=read_only, synchronizer=synchronizer, + cache_metadata=True) + a._is_view = True + + # allow override of some properties + if dtype is None: + dtype = self._dtype + else: + dtype = np.dtype(dtype) + a._dtype = dtype + if shape is None: + shape = self._shape + else: + shape = normalize_shape(shape) + a._shape = shape + if chunks is not None: + chunks = normalize_chunks(chunks, shape, dtype.itemsize) + a._chunks = chunks + if fill_value is not None: + a._fill_value = fill_value + if filters is not None: + a._filters = filters + + return a diff --git a/zarr/meta.py b/zarr/meta.py index 4831d402e9..c3f11815aa 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -13,6 +13,32 @@ ZARR_FORMAT = 2 +def decode_array_metadata(s): + if isinstance(s, binary_type): + s = text_type(s, 'ascii') + meta = json.loads(s) + zarr_format = meta.get('zarr_format', None) + if zarr_format != ZARR_FORMAT: + raise MetadataError('unsupported zarr format: %s' % zarr_format) + try: + dtype = decode_dtype(meta['dtype']) + fill_value = decode_fill_value(meta['fill_value'], dtype) + meta = dict( + zarr_format=meta['zarr_format'], + shape=tuple(meta['shape']), + chunks=tuple(meta['chunks']), + dtype=dtype, + compressor=meta['compressor'], + fill_value=fill_value, + order=meta['order'], + filters=meta['filters'], + ) + except Exception as e: + raise MetadataError('error decoding metadata: %s' % e) + else: + return meta + + def decode_array_metadata(s): if isinstance(s, binary_type): s = text_type(s, 'ascii') @@ -55,6 +81,42 @@ def encode_array_metadata(meta): return b +def encode_frame_metadata(meta): + meta = dict( + zarr_format=ZARR_FORMAT, + nrows=meta['nrows'], + ncols=meta['ncols'], + chunks=meta['chunks'], + compressor=meta['compressor'], + filters=meta['filters'], + ) + s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) + b = s.encode('ascii') + return b + + +def decode_frame_metadata(s): + if isinstance(s, binary_type): + s = text_type(s, 'ascii') + meta = json.loads(s) + zarr_format = meta.get('zarr_format', None) + if zarr_format != ZARR_FORMAT: + raise MetadataError('unsupported zarr format: %s' % zarr_format) + try: + meta = dict( + zarr_format=meta['zarr_format'], + nrows=tuple(meta['nrows']), + ncols=tuple(meta['ncols']), + chunks=tuple(meta['chunks']), + compressor=meta['compressor'], + filters=meta['filters'], + ) + except Exception as e: + raise MetadataError('error decoding metadata: %s' % e) + else: + return meta + + def encode_dtype(d): if d.fields is None: return d.str diff --git a/zarr/storage.py b/zarr/storage.py index 2452cc5f1c..f03499a4f3 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -14,7 +14,7 @@ from zarr.util import normalize_shape, normalize_chunks, normalize_order, \ normalize_storage_path, buffer_size -from zarr.meta import encode_array_metadata, encode_group_metadata +from zarr.meta import encode_array_metadata, encode_frame_metadata, encode_group_metadata from zarr.compat import PY2, binary_type from zarr.codecs import codec_registry from zarr.errors import err_contains_group, err_contains_array, \ @@ -23,6 +23,7 @@ array_meta_key = '.zarray' +frame_meta_key = '.zframe' group_meta_key = '.zgroup' attrs_key = '.zattrs' try: @@ -40,7 +41,7 @@ def 
_path_to_prefix(path): else: prefix = '' return prefix - + def contains_array(store, path=None): """Return True if the store contains an array at the given logical path.""" @@ -98,8 +99,8 @@ def listdir(store, path=None): else: # slow version, iterate through all keys return _listdir_from_keys(store, path) - - + + def getsize(store, path=None): """Compute size of stored items for a given path.""" path = normalize_storage_path(path) @@ -240,14 +241,14 @@ def init_array(store, shape, chunks=None, dtype=None, compressor='default', Notes ----- The initialisation process involves normalising all array metadata, - encoding as JSON and storing under the '.zarray' key. User attributes are + encoding as JSON and storing under the '.zarray' key. User attributes are also initialized and stored as JSON under the '.zattrs' key. """ # normalize path path = normalize_storage_path(path) - + # ensure parent group initialized _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py new file mode 100644 index 0000000000..35cfb372d5 --- /dev/null +++ b/zarr/tests/test_frame.py @@ -0,0 +1,865 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, division +import unittest +from tempfile import mkdtemp +import atexit +import shutil +import pickle +from collections import MutableMapping + + +import numpy as np +import pandas as pd +from pandas.util.testing import assert_frame_equal + +from zarr.storage import (DirectoryStore, ZipStore, + init_frame, init_group) +from zarr.core import Array +from zarr.errors import PermissionError +from zarr.compat import PY2 +from zarr.util import buffer_size +from zarr.codecs import Delta, FixedScaleOffset, Zlib,\ + Blosc, BZ2 + + +class TestFrame(unittest.TestCase): + + def test_array_init(self): + + # normal initialization + store = dict() + init_array(store, shape=100, chunks=10) + a = Array(store) + assert_is_instance(a, Array) + eq((100,), a.shape) + eq((10,), a.chunks) + eq('', a.path) + assert_is_none(a.name) + assert_is(store, a.store) + + # initialize at path + store = dict() + init_array(store, shape=100, chunks=10, path='foo/bar') + a = Array(store, path='foo/bar') + assert_is_instance(a, Array) + eq((100,), a.shape) + eq((10,), a.chunks) + eq('foo/bar', a.path) + eq('/foo/bar', a.name) + assert_is(store, a.store) + + # store not initialized + store = dict() + with assert_raises(KeyError): + Array(store) + + # group is in the way + store = dict() + init_group(store, path='baz') + with assert_raises(KeyError): + Array(store, path='baz') + + def create_array(self, read_only=False, **kwargs): + store = dict() + kwargs.setdefault('compressor', Zlib(level=1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_nbytes_stored(self): + + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + + # mess with store + try: + z.store[z._key_prefix + 'foo'] = list(range(10)) + eq(-1, z.nbytes_stored) + except TypeError: + pass + + def test_array_1d(self): + a = np.arange(1050) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + + # check properties + eq(len(a), len(z)) + eq(a.ndim, z.ndim) + eq(a.shape, z.shape) + eq(a.dtype, z.dtype) + eq((100,), z.chunks) + 
eq(a.nbytes, z.nbytes) + eq(11, z.nchunks) + eq(0, z.nchunks_initialized) + eq((11,), z.cdata_shape) + + # check empty + b = z[:] + assert_is_instance(b, np.ndarray) + eq(a.shape, b.shape) + eq(a.dtype, b.dtype) + + # check attributes + z.attrs['foo'] = 'bar' + eq('bar', z.attrs['foo']) + + # set data + z[:] = a + + # check properties + eq(a.nbytes, z.nbytes) + eq(11, z.nchunks) + eq(11, z.nchunks_initialized) + + # check slicing + assert_array_equal(a, np.array(z)) + assert_array_equal(a, z[:]) + assert_array_equal(a, z[...]) + # noinspection PyTypeChecker + assert_array_equal(a, z[slice(None)]) + assert_array_equal(a[:10], z[:10]) + assert_array_equal(a[10:20], z[10:20]) + assert_array_equal(a[-10:], z[-10:]) + # ...across chunk boundaries... + assert_array_equal(a[:110], z[:110]) + assert_array_equal(a[190:310], z[190:310]) + assert_array_equal(a[-110:], z[-110:]) + # single item + eq(a[0], z[0]) + eq(a[-1], z[-1]) + + # check partial assignment + b = np.arange(1e5, 2e5) + z[190:310] = b[190:310] + assert_array_equal(a[:190], z[:190]) + assert_array_equal(b[190:310], z[190:310]) + assert_array_equal(a[310:], z[310:]) + + def test_array_1d_fill_value(self): + for fill_value in -1, 0, 1, 10: + + a = np.arange(1050) + f = np.empty_like(a) + f.fill(fill_value) + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, + fill_value=fill_value) + z[190:310] = a[190:310] + + assert_array_equal(f[:190], z[:190]) + assert_array_equal(a[190:310], z[190:310]) + assert_array_equal(f[310:], z[310:]) + + def test_array_1d_set_scalar(self): + # setup + a = np.zeros(100) + z = self.create_array(shape=a.shape, chunks=10, dtype=a.dtype) + z[:] = a + assert_array_equal(a, z[:]) + + for value in -1, 0, 1, 10: + print(value) + a[15:35] = value + z[15:35] = value + assert_array_equal(a, z[:]) + a[:] = value + z[:] = value + assert_array_equal(a, z[:]) + + def test_array_2d(self): + a = np.arange(10000).reshape((1000, 10)) + z = self.create_array(shape=a.shape, chunks=(100, 2), dtype=a.dtype) + + # check properties + eq(len(a), len(z)) + eq(a.ndim, z.ndim) + eq(a.shape, z.shape) + eq(a.dtype, z.dtype) + eq((100, 2), z.chunks) + eq(0, z.nchunks_initialized) + eq((10, 5), z.cdata_shape) + + # set data + z[:] = a + + # check properties + eq(a.nbytes, z.nbytes) + eq(50, z.nchunks_initialized) + + # check slicing + assert_array_equal(a, np.array(z)) + assert_array_equal(a, z[:]) + assert_array_equal(a, z[...]) + # noinspection PyTypeChecker + assert_array_equal(a, z[slice(None)]) + assert_array_equal(a[:10], z[:10]) + assert_array_equal(a[10:20], z[10:20]) + assert_array_equal(a[-10:], z[-10:]) + assert_array_equal(a[:, :2], z[:, :2]) + assert_array_equal(a[:, 2:4], z[:, 2:4]) + assert_array_equal(a[:, -2:], z[:, -2:]) + assert_array_equal(a[:10, :2], z[:10, :2]) + assert_array_equal(a[10:20, 2:4], z[10:20, 2:4]) + assert_array_equal(a[-10:, -2:], z[-10:, -2:]) + # ...across chunk boundaries... 
+ assert_array_equal(a[:110], z[:110]) + assert_array_equal(a[190:310], z[190:310]) + assert_array_equal(a[-110:], z[-110:]) + assert_array_equal(a[:, :3], z[:, :3]) + assert_array_equal(a[:, 3:7], z[:, 3:7]) + assert_array_equal(a[:, -3:], z[:, -3:]) + assert_array_equal(a[:110, :3], z[:110, :3]) + assert_array_equal(a[190:310, 3:7], z[190:310, 3:7]) + assert_array_equal(a[-110:, -3:], z[-110:, -3:]) + # single item + assert_array_equal(a[0], z[0]) + assert_array_equal(a[-1], z[-1]) + eq(a[0, 0], z[0, 0]) + eq(a[-1, -1], z[-1, -1]) + + # check partial assignment + b = np.arange(10000, 20000).reshape((1000, 10)) + z[190:310, 3:7] = b[190:310, 3:7] + assert_array_equal(a[:190], z[:190]) + assert_array_equal(a[:, :3], z[:, :3]) + assert_array_equal(b[190:310, 3:7], z[190:310, 3:7]) + assert_array_equal(a[310:], z[310:]) + assert_array_equal(a[:, 7:], z[:, 7:]) + + def test_array_2d_partial(self): + z = self.create_array(shape=(1000, 10), chunks=(100, 2), dtype='i4', + fill_value=0) + + # check partial assignment, single row + c = np.arange(z.shape[1]) + z[0, :] = c + with assert_raises(ValueError): + # N.B., NumPy allows this, but we'll be strict for now + z[2:3] = c + with assert_raises(ValueError): + # N.B., NumPy allows this, but we'll be strict for now + z[-1:] = c + z[2:3] = c[None, :] + z[-1:] = c[None, :] + assert_array_equal(c, z[0, :]) + assert_array_equal(c, z[2, :]) + assert_array_equal(c, z[-1, :]) + + # check partial assignment, single column + d = np.arange(z.shape[0]) + z[:, 0] = d + with assert_raises(ValueError): + z[:, 2:3] = d + with assert_raises(ValueError): + z[:, -1:] = d + z[:, 2:3] = d[:, None] + z[:, -1:] = d[:, None] + assert_array_equal(d, z[:, 0]) + assert_array_equal(d, z[:, 2]) + assert_array_equal(d, z[:, -1]) + + # check single item assignment + z[0, 0] = -1 + z[2, 2] = -1 + z[-1, -1] = -1 + eq(-1, z[0, 0]) + eq(-1, z[2, 2]) + eq(-1, z[-1, -1]) + + def test_array_order(self): + + # 1D + a = np.arange(1050) + for order in 'C', 'F': + z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, + order=order) + eq(order, z.order) + if order == 'F': + assert_true(z[:].flags.f_contiguous) + else: + assert_true(z[:].flags.c_contiguous) + z[:] = a + assert_array_equal(a, z[:]) + + # 2D + a = np.arange(10000).reshape((100, 100)) + for order in 'C', 'F': + z = self.create_array(shape=a.shape, chunks=(10, 10), + dtype=a.dtype, order=order) + eq(order, z.order) + if order == 'F': + assert_true(z[:].flags.f_contiguous) + else: + assert_true(z[:].flags.c_contiguous) + z[:] = a + actual = z[:] + assert_array_equal(a, actual) + + def test_setitem_data_not_shared(self): + # check that data don't end up being shared with another array + # https://github.com/alimanfoo/zarr/issues/79 + z = self.create_array(shape=20, chunks=10, dtype='i4') + a = np.arange(20, dtype='i4') + z[:] = a + assert_array_equal(z[:], np.arange(20, dtype='i4')) + a[:] = 0 + assert_array_equal(z[:], np.arange(20, dtype='i4')) + + def test_resize_1d(self): + + z = self.create_array(shape=105, chunks=10, dtype='i4', + fill_value=0) + a = np.arange(105, dtype='i4') + z[:] = a + eq((105,), z.shape) + eq((105,), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10,), z.chunks) + assert_array_equal(a, z[:]) + + z.resize(205) + eq((205,), z.shape) + eq((205,), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10,), z.chunks) + assert_array_equal(a, z[:105]) + assert_array_equal(np.zeros(100, dtype='i4'), z[105:]) + + 
z.resize(55) + eq((55,), z.shape) + eq((55,), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10,), z.chunks) + assert_array_equal(a[:55], z[:]) + + # via shape setter + z.shape = (105,) + eq((105,), z.shape) + eq((105,), z[:].shape) + + def test_resize_2d(self): + + z = self.create_array(shape=(105, 105), chunks=(10, 10), dtype='i4', + fill_value=0) + a = np.arange(105*105, dtype='i4').reshape((105, 105)) + z[:] = a + eq((105, 105), z.shape) + eq((105, 105), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10, 10), z.chunks) + assert_array_equal(a, z[:]) + + z.resize((205, 205)) + eq((205, 205), z.shape) + eq((205, 205), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10, 10), z.chunks) + assert_array_equal(a, z[:105, :105]) + assert_array_equal(np.zeros((100, 205), dtype='i4'), z[105:, :]) + assert_array_equal(np.zeros((205, 100), dtype='i4'), z[:, 105:]) + + z.resize((55, 55)) + eq((55, 55), z.shape) + eq((55, 55), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10, 10), z.chunks) + assert_array_equal(a[:55, :55], z[:]) + + z.resize((55, 1)) + eq((55, 1), z.shape) + eq((55, 1), z[:].shape) + eq(np.dtype('i4'), z.dtype) + eq(np.dtype('i4'), z[:].dtype) + eq((10, 10), z.chunks) + assert_array_equal(a[:55, :1], z[:]) + + # via shape setter + z.shape = (105, 105) + eq((105, 105), z.shape) + eq((105, 105), z[:].shape) + + def test_append_1d(self): + + a = np.arange(105) + z = self.create_array(shape=a.shape, chunks=10, dtype=a.dtype) + z[:] = a + eq(a.shape, z.shape) + eq(a.dtype, z.dtype) + eq((10,), z.chunks) + assert_array_equal(a, z[:]) + + b = np.arange(105, 205) + e = np.append(a, b) + z.append(b) + eq(e.shape, z.shape) + eq(e.dtype, z.dtype) + eq((10,), z.chunks) + assert_array_equal(e, z[:]) + + # check append handles array-like + c = [1, 2, 3] + f = np.append(e, c) + z.append(c) + eq(f.shape, z.shape) + eq(f.dtype, z.dtype) + eq((10,), z.chunks) + assert_array_equal(f, z[:]) + + def test_append_2d(self): + + a = np.arange(105*105, dtype='i4').reshape((105, 105)) + z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + z[:] = a + eq(a.shape, z.shape) + eq(a.dtype, z.dtype) + eq((10, 10), z.chunks) + actual = z[:] + assert_array_equal(a, actual) + + b = np.arange(105*105, 2*105*105, dtype='i4').reshape((105, 105)) + e = np.append(a, b, axis=0) + z.append(b) + eq(e.shape, z.shape) + eq(e.dtype, z.dtype) + eq((10, 10), z.chunks) + actual = z[:] + assert_array_equal(e, actual) + + def test_append_2d_axis(self): + + a = np.arange(105*105, dtype='i4').reshape((105, 105)) + z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) + z[:] = a + eq(a.shape, z.shape) + eq(a.dtype, z.dtype) + eq((10, 10), z.chunks) + assert_array_equal(a, z[:]) + + b = np.arange(105*105, 2*105*105, dtype='i4').reshape((105, 105)) + e = np.append(a, b, axis=1) + z.append(b, axis=1) + eq(e.shape, z.shape) + eq(e.dtype, z.dtype) + eq((10, 10), z.chunks) + assert_array_equal(e, z[:]) + + def test_append_bad_shape(self): + a = np.arange(100) + z = self.create_array(shape=a.shape, chunks=10, dtype=a.dtype) + z[:] = a + b = a.reshape(10, 10) + with assert_raises(ValueError): + z.append(b) + + def test_read_only(self): + + z = self.create_array(shape=1000, chunks=100) + assert_false(z.read_only) + + z = self.create_array(shape=1000, chunks=100, read_only=True) + assert_true(z.read_only) + with assert_raises(PermissionError): + z[:] = 42 + with 
assert_raises(PermissionError): + z.resize(2000) + with assert_raises(PermissionError): + z.append(np.arange(1000)) + + def test_pickle(self): + + z = self.create_array(shape=1000, chunks=100, dtype=int) + z[:] = np.random.randint(0, 1000, 1000) + z2 = pickle.loads(pickle.dumps(z)) + eq(z.shape, z2.shape) + eq(z.chunks, z2.chunks) + eq(z.dtype, z2.dtype) + if z.compressor: + eq(z.compressor.get_config(), z2.compressor.get_config()) + eq(z.fill_value, z2.fill_value) + assert_array_equal(z[:], z2[:]) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 245; ratio: 1.6; initialized: 0/10 + compressor: Zlib(level=1) + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + def test_np_ufuncs(self): + z = self.create_array(shape=(100, 100), chunks=(10, 10)) + a = np.arange(10000).reshape(100, 100) + z[:] = a + + eq(np.sum(a), np.sum(z)) + assert_array_equal(np.sum(a, axis=0), np.sum(z, axis=0)) + eq(np.mean(a), np.mean(z)) + assert_array_equal(np.mean(a, axis=1), np.mean(z, axis=1)) + condition = np.random.randint(0, 2, size=100, dtype=bool) + assert_array_equal(np.compress(condition, a, axis=0), + np.compress(condition, z, axis=0)) + indices = np.random.choice(100, size=50, replace=True) + assert_array_equal(np.take(a, indices, axis=1), + np.take(z, indices, axis=1)) + + # use zarr array as indices or condition + zc = self.create_array(shape=condition.shape, dtype=condition.dtype, + chunks=10, filters=None) + zc[:] = condition + assert_array_equal(np.compress(condition, a, axis=0), + np.compress(zc, a, axis=0)) + zi = self.create_array(shape=indices.shape, dtype=indices.dtype, + chunks=10, filters=None) + zi[:] = indices + # this triggers __array__() call with dtype argument + assert_array_equal(np.take(a, indices, axis=1), + np.take(a, zi, axis=1)) + + +class TestArrayWithPath(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = dict() + init_array(store, path='foo/bar', **kwargs) + return Array(store, path='foo/bar', read_only=read_only) + + def test_nbytes_stored(self): + + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) + for k, v in z.store.items() + if k.startswith('foo/bar/')) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) + for k, v in z.store.items() + if k.startswith('foo/bar/')) + eq(expect_nbytes_stored, z.nbytes_stored) + + # mess with store + z.store[z._key_prefix + 'foo'] = list(range(10)) + eq(-1, z.nbytes_stored) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + # flake8: noqa + expect = """Array(/foo/bar, (100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 293; ratio: 1.4; initialized: 0/10 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithChunkStore(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = dict() + # separate chunk store + chunk_store = dict() + init_array(store, chunk_store=chunk_store, **kwargs) + return Array(store, read_only=read_only, chunk_store=chunk_store) + + def test_nbytes_stored(self): + + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for 
v in z.store.values()) + expect_nbytes_stored += sum(buffer_size(v) + for v in z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored += sum(buffer_size(v) + for v in z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + + # mess with store + z.chunk_store[z._key_prefix + 'foo'] = list(range(10)) + eq(-1, z.nbytes_stored) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + # flake8: noqa + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 293; ratio: 1.4; initialized: 0/10 + compressor: Blosc(cname='lz4', clevel=5, shuffle=1) + store: dict; chunk_store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithDirectoryStore(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = DirectoryStore(path) + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_nbytes_stored(self): + + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + # flake8: noqa + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 245; ratio: 1.6; initialized: 0/10 + compressor: Zlib(level=1) + store: DirectoryStore +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithNoCompressor(TestArray): + + def create_array(self, read_only=False, **kwargs): + store = dict() + kwargs.setdefault('compressor', None) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 201; ratio: 2.0; initialized: 0/10 + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithBZ2Compressor(TestArray): + + def create_array(self, read_only=False, **kwargs): + store = dict() + compressor = BZ2(level=1) + kwargs.setdefault('compressor', compressor) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 244; ratio: 1.6; initialized: 0/10 + compressor: BZ2(level=1) + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithBloscCompressor(TestArray): + + def create_array(self, read_only=False, **kwargs): + store = dict() + compressor = Blosc(cname='zstd', clevel=1, shuffle=1) + kwargs.setdefault('compressor', compressor) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + expect = 
"""Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 294; ratio: 1.4; initialized: 0/10 + compressor: Blosc(cname='zstd', clevel=1, shuffle=1) + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +if not PY2: + + from zarr.codecs import LZMA + + class TestArrayWithLZMACompressor(TestArray): + + def create_array(self, read_only=False, **kwargs): + store = dict() + compressor = LZMA(preset=1) + kwargs.setdefault('compressor', compressor) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_repr(self): + z = self.create_array(shape=100, chunks=10, dtype='f4') + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 313; ratio: 1.3; initialized: 0/10 + compressor: LZMA(format=1, check=-1, preset=1, filters=None) + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithFilters(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = dict() + dtype = kwargs.get('dtype', None) + filters = [ + Delta(dtype=dtype), + FixedScaleOffset(dtype=dtype, scale=1, offset=0), + ] + kwargs.setdefault('filters', filters) + compressor = Zlib(1) + kwargs.setdefault('compressor', compressor) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + # flake8: noqa + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; nbytes_stored: 515; ratio: 0.8; initialized: 0/10 + filters: Delta(dtype=float32) + FixedScaleOffset(scale=1, offset=0, dtype=float32) + compressor: Zlib(level=1) + store: dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +# custom store, does not support getsize() +class CustomMapping(object): + + def __init__(self): + self.inner = dict() + + def keys(self): + return self.inner.keys() + + def __getitem__(self, item): + return self.inner[item] + + def __setitem__(self, item, value): + self.inner[item] = value + + def __delitem__(self, key): + del self.inner[key] + + def __contains__(self, item): + return item in self.inner + + +class TestArrayWithCustomMapping(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = CustomMapping() + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only) + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + eq(-1, z.nbytes_stored) + z[:] = 42 + eq(-1, z.nbytes_stored) + + def test_repr(self): + if not PY2: + z = self.create_array(shape=100, chunks=10, dtype='f4') + # flake8: noqa + expect = """Array((100,), float32, chunks=(10,), order=C) + nbytes: 400; initialized: 0/10 + compressor: Zlib(level=1) + store: CustomMapping +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayNoCacheMetadata(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + store = dict() + kwargs.setdefault('compressor', Zlib(level=1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=False) + + def test_cache_metadata(self): + a1 = self.create_array(shape=100, chunks=10, dtype='i1') + a2 = Array(a1.store, cache_metadata=True) + eq(a1.shape, a2.shape) + eq(a1.size, a2.size) + eq(a1.nbytes, a2.nbytes) + 
eq(a1.nchunks, a2.nchunks) + + a2.resize(200) + eq((200,), a2.shape) + eq(200, a2.size) + eq(200, a2.nbytes) + eq(20, a2.nchunks) + eq(a1.shape, a2.shape) + eq(a1.size, a2.size) + eq(a1.nbytes, a2.nbytes) + eq(a1.nchunks, a2.nchunks) + + a2.append(np.zeros(100)) + eq((300,), a2.shape) + eq(300, a2.size) + eq(300, a2.nbytes) + eq(30, a2.nchunks) + eq(a1.shape, a2.shape) + eq(a1.size, a2.size) + eq(a1.nbytes, a2.nbytes) + eq(a1.nchunks, a2.nchunks) + + a1.resize(400) + eq((400,), a1.shape) + eq(400, a1.size) + eq(400, a1.nbytes) + eq(40, a1.nchunks) + eq((300,), a2.shape) + eq(300, a2.size) + eq(300, a2.nbytes) + eq(30, a2.nchunks) diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index ad5bc1e05b..e246e5ad76 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -9,7 +9,8 @@ from zarr.compat import binary_type, text_type from zarr.meta import decode_array_metadata, encode_dtype, decode_dtype, \ - ZARR_FORMAT, decode_group_metadata, encode_array_metadata + ZARR_FORMAT, decode_group_metadata, encode_array_metadata, \ + encode_frame_metadata, decode_frame_metadata from zarr.errors import MetadataError from zarr.codecs import Delta, Zlib, Blosc @@ -189,6 +190,39 @@ def test_encode_decode_dtype(): eq(np.dtype(dt), d) +def test_encode_decode_frame_1(): + + meta = dict( + nrows=(10,), + ncols=(10,), + chunks=(10,), + compressor=Zlib(1).get_config(), + filters=None, + ) + + meta_json = '''{ + "chunks": [10], + "compressor": {"id": "zlib", "level": 1}, + "filters": null, + "ncols": [10], + "nrows": [10], + "zarr_format": %s + }''' % ZARR_FORMAT + + # test encoding + meta_enc = encode_frame_metadata(meta) + assert_json_eq(meta_json, meta_enc) + + # test decoding + meta_dec = decode_frame_metadata(meta_enc) + eq(ZARR_FORMAT, meta_dec['zarr_format']) + eq(meta['nrows'], meta_dec['nrows']) + eq(meta['ncols'], meta_dec['ncols']) + eq(meta['chunks'], meta_dec['chunks']) + eq(meta['compressor'], meta_dec['compressor']) + assert_is_none(meta_dec['filters']) + + def test_decode_group(): # typical From 15682b132e9bfc021d9857f9e61f6ee1e69048ba Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 13 Oct 2016 06:46:07 -0400 Subject: [PATCH 03/12] fix up meta / errors --- zarr/errors.py | 4 +++ zarr/meta.py | 4 --- zarr/storage.py | 71 +++++++++++++++++++++++++++++++++++++++++ zarr/tests/test_meta.py | 6 ---- 4 files changed, 75 insertions(+), 10 deletions(-) diff --git a/zarr/errors.py b/zarr/errors.py index f1baf429e6..76609508fd 100644 --- a/zarr/errors.py +++ b/zarr/errors.py @@ -27,6 +27,10 @@ def err_contains_array(path): raise KeyError('path %r contains an array' % path) +def err_contains_frame(path): + raise KeyError('path %r contains a frame' % path) + + def err_array_not_found(path): raise KeyError('array not found at path %r' % path) diff --git a/zarr/meta.py b/zarr/meta.py index c3f11815aa..fbe83e6bd9 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -84,8 +84,6 @@ def encode_array_metadata(meta): def encode_frame_metadata(meta): meta = dict( zarr_format=ZARR_FORMAT, - nrows=meta['nrows'], - ncols=meta['ncols'], chunks=meta['chunks'], compressor=meta['compressor'], filters=meta['filters'], @@ -105,8 +103,6 @@ def decode_frame_metadata(s): try: meta = dict( zarr_format=meta['zarr_format'], - nrows=tuple(meta['nrows']), - ncols=tuple(meta['ncols']), chunks=tuple(meta['chunks']), compressor=meta['compressor'], filters=meta['filters'], diff --git a/zarr/storage.py b/zarr/storage.py index f03499a4f3..57f83a78bf 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -18,6 +18,7 @@ 
from zarr.compat import PY2, binary_type from zarr.codecs import codec_registry from zarr.errors import err_contains_group, err_contains_array, \ + err_contains_frame, \ err_path_not_found, err_bad_compressor, err_fspath_exists_notdir, \ err_read_only @@ -51,6 +52,14 @@ def contains_array(store, path=None): return key in store +def contains_frame(store, path=None): + """Return True if the store contains a frame at the given logical path.""" + path = normalize_storage_path(path) + prefix = _path_to_prefix(path) + key = prefix + frame_meta_key + return key in store + + def contains_group(store, path=None): """Return True if the store contains a group at the given logical path.""" path = normalize_storage_path(path) @@ -135,6 +144,9 @@ def _require_parent_group(path, store, chunk_store, overwrite): if contains_array(store, p): _init_group_metadata(store, path=p, chunk_store=chunk_store, overwrite=overwrite) + elif contains_frame(store, p): + _init_frame_metadata(store, path=p, chunk_store=chunk_store, + overwrite=overwrite) elif not contains_group(store, p): _init_group_metadata(store, path=p, chunk_store=chunk_store) @@ -272,6 +284,8 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, rmdir(chunk_store, path) elif contains_array(store, path): err_contains_array(path) + elif contains_frame(store, path): + err_contains_frame(path) elif contains_group(store, path): err_contains_group(path) @@ -317,6 +331,61 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, init_store = init_array +def init_frame(store, overwrite=False, path=None, chunk_store=None): + """initialize a frame store. + + Parameters + ---------- + store : MutableMapping + A mapping that supports string keys and byte sequence values. + overwrite : bool, optional + If True, erase all data in `store` prior to initialisation. + path : string, optional + Path under which array is stored. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + + """ + + # normalize path + path = normalize_storage_path(path) + + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + # initialise metadata + _init_frame_metadata(store=store, overwrite=overwrite, path=path, + chunk_store=chunk_store) + + +def _init_frame_metadata(store, overwrite=False, path=None, chunk_store=None): + + # guard conditions + if overwrite: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None and chunk_store != store: + rmdir(chunk_store, path) + elif contains_array(store, path): + err_contains_array(path) + elif contains_frame(store, path): + err_contains_frame(path) + elif contains_group(store, path): + err_contains_group(path) + + # initialize metadata + # N.B., currently no metadata properties are needed, however there may + # be in future + meta = dict() + key = _path_to_prefix(path) + group_meta_key + store[key] = encode_frame_metadata(meta) + + # initialize attributes + key = _path_to_prefix(path) + attrs_key + store[key] = json.dumps(dict()).encode('ascii') + def init_group(store, overwrite=False, path=None, chunk_store=None): """initialize a group store. 
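# --- editor's note: illustrative sketch, not part of the patch series -------
# After _init_frame_metadata runs, exactly two keys exist under the frame's
# storage prefix: the encoded frame metadata and an empty attributes
# document. A minimal sketch of the resulting layout, using a plain dict as
# the store; the literal key names '.zframe' and '.zattrs' are assumptions
# (the patch refers to them only as frame_meta_key and attrs_key):

import json

store = {}
# stand-in for store[key] = encode_frame_metadata(meta)
store['.zframe'] = json.dumps({'zarr_format': 2}).encode('ascii')
# empty user attributes, exactly as written by the code above
store['.zattrs'] = json.dumps(dict()).encode('ascii')
# -----------------------------------------------------------------------------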
@@ -356,6 +425,8 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None): rmdir(chunk_store, path) elif contains_array(store, path): err_contains_array(path) + elif contains_frame(store, path): + err_contains_frame(path) elif contains_group(store, path): err_contains_group(path) diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index e246e5ad76..cdf83ee1d8 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -193,8 +193,6 @@ def test_encode_decode_dtype(): def test_encode_decode_frame_1(): meta = dict( - nrows=(10,), - ncols=(10,), chunks=(10,), compressor=Zlib(1).get_config(), filters=None, @@ -204,8 +202,6 @@ def test_encode_decode_frame_1(): "chunks": [10], "compressor": {"id": "zlib", "level": 1}, "filters": null, - "ncols": [10], - "nrows": [10], "zarr_format": %s }''' % ZARR_FORMAT @@ -216,8 +212,6 @@ def test_encode_decode_frame_1(): # test decoding meta_dec = decode_frame_metadata(meta_enc) eq(ZARR_FORMAT, meta_dec['zarr_format']) - eq(meta['nrows'], meta_dec['nrows']) - eq(meta['ncols'], meta_dec['ncols']) eq(meta['chunks'], meta_dec['chunks']) eq(meta['compressor'], meta_dec['compressor']) assert_is_none(meta_dec['filters']) From 834d7c8ff0df755be75e735d80bc2bc07895c691 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 14 Oct 2016 06:34:43 -0400 Subject: [PATCH 04/12] update hierarchy a bit --- zarr/frame.py | 1 - zarr/hierarchy.py | 61 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/zarr/frame.py b/zarr/frame.py index 1e5853f939..654003dcca 100644 --- a/zarr/frame.py +++ b/zarr/frame.py @@ -4,7 +4,6 @@ import itertools import numpy as np -import pandas as pd from zarr.core import Base diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 31cb062c91..a03003114d 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -8,8 +8,10 @@ from zarr.attrs import Attributes from zarr.core import Array -from zarr.storage import contains_array, contains_group, init_group, \ - DictStore, DirectoryStore, group_meta_key, attrs_key, listdir, rmdir +from zarr.frame import Frame +from zarr.storage import contains_array, contains_group, contains_frame, \ + init_group, DictStore, DirectoryStore, group_meta_key, attrs_key, \ + listdir, rmdir from zarr.creation import array, create, empty, zeros, ones, full, \ empty_like, zeros_like, ones_like, full_like from zarr.util import normalize_storage_path, normalize_shape @@ -30,7 +32,7 @@ class Group(MutableMapping): read_only : bool, optional True if group should be protected against modification. chunk_store : MutableMapping, optional - Separate storage for chunks. If not provided, `store` will be used + Separate storage for chunks. If not provided, `store` will be used for storage of both chunks and metadata. synchronizer : object, optional Array synchronizer. 
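# --- editor's note: illustrative sketch, not part of the patch series -------
# This patch extends the pattern used throughout zarr.hierarchy: a node's
# type is decided purely by which metadata key exists under its storage
# prefix, and the Group changes in the hunks that follow dispatch to Array,
# Frame, or Group on that same check. A minimal model of the dispatch; the
# literal key strings are assumptions (the code names them array_meta_key,
# frame_meta_key, and group_meta_key):

def node_kind(store, prefix=''):
    """Classify the node at `prefix` by probing for metadata keys."""
    for meta_key, kind in (('.zarray', 'array'),
                           ('.zframe', 'frame'),
                           ('.zgroup', 'group')):
        if prefix + meta_key in store:
            return kind
    return None
# -----------------------------------------------------------------------------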
@@ -137,7 +139,7 @@ def read_only(self): @property def chunk_store(self): - """A MutableMapping providing the underlying storage for array + """A MutableMapping providing the underlying storage for array chunks.""" return self._chunk_store @@ -296,6 +298,10 @@ def __getitem__(self, item): return Array(self._store, read_only=self._read_only, path=path, chunk_store=self._chunk_store, synchronizer=self._synchronizer) + elif contains_frame(self._store, path): + return Frame(self._store, read_only=self._read_only, path=path, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) elif contains_group(self._store, path): return Group(self._store, read_only=self._read_only, path=path, chunk_store=self._chunk_store, @@ -304,7 +310,13 @@ def __getitem__(self, item): raise KeyError(item) def __setitem__(self, item, value): - self.array(item, value, overwrite=True) + + # TODO / use duck-like introspection + import pandas as pd + if isinstance(value, pd.DataFrame): + self.frame(item, value, overwrite=True) + else: + self.array(item, value, overwrite=True) def __delitem__(self, item): return self._write_op(self._delitem_nosync, item) @@ -411,6 +423,34 @@ def arrays(self): chunk_store=self._chunk_store, synchronizer=self._synchronizer) + def frame_keys(self): + """Return an iterator over member names for frames only. + + Examples + -------- + >>> import zarr + + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_frame(self._store, path): + yield key + + def frames(self): + """Return an iterator over (name, value) pairs for frames only. + + Examples + -------- + >>> import zarr + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_frame(self._store, path): + yield key, Frame(self._store, path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + def _write_op(self, f, *args, **kwargs): # guard condition @@ -700,6 +740,17 @@ def _array_nosync(self, name, data, **kwargs): return array(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + def frame(self, name, data, **kwargs): + """Create a frame. Keyword arguments as per + :func:`zarr.creation.frame`.""" + return self._write_op(self._frame_nosync, name, data, **kwargs) + + def _frame_nosync(self, name, data, **kwargs): + path = self._item_path(name) + kwargs.setdefault('synchronizer', self._synchronizer) + return frame(data, store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + def empty_like(self, name, data, **kwargs): """Create an array. 
Keyword arguments as per :func:`zarr.creation.empty_like`.""" From 3e1b8ed622e9bbdb651952b4bca74e2997644718 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 14 Oct 2016 16:40:50 -0400 Subject: [PATCH 05/12] add in nrows/dtypes to frame metadata --- zarr/meta.py | 4 ++++ zarr/storage.py | 24 ++++++++++++++++++++---- zarr/tests/test_frame.py | 8 +++++--- zarr/tests/test_hierarchy.py | 4 +++- zarr/tests/test_meta.py | 8 +++++++- 5 files changed, 39 insertions(+), 9 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index fbe83e6bd9..2b39385ac2 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -84,6 +84,8 @@ def encode_array_metadata(meta): def encode_frame_metadata(meta): meta = dict( zarr_format=ZARR_FORMAT, + nrows=meta['nrows'], + dtypes=[encode_dtype(d) for d in meta['dtypes']], chunks=meta['chunks'], compressor=meta['compressor'], filters=meta['filters'], @@ -103,6 +105,8 @@ def decode_frame_metadata(s): try: meta = dict( zarr_format=meta['zarr_format'], + nrows=meta['nrows'], + dtypes=meta['dtypes'], chunks=tuple(meta['chunks']), compressor=meta['compressor'], filters=meta['filters'], diff --git a/zarr/storage.py b/zarr/storage.py index 57f83a78bf..fe3ade43dd 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -331,13 +331,20 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, init_store = init_array -def init_frame(store, overwrite=False, path=None, chunk_store=None): +def init_frame(store, nrows, dtypes, chunks=None, overwrite=False, path=None, + chunk_store=None): """initialize a frame store. Parameters ---------- store : MutableMapping A mapping that supports string keys and byte sequence values. + nrows : int + Frame number of rows + dtypes : list + list of dtypes + chunks : int or tuple of ints, optional + Chunk shape. If not provided, will be guessed from `shape` and `dtype`. overwrite : bool, optional If True, erase all data in `store` prior to initialisation. 
    path : string, optional
@@ -356,11 +363,12 @@ def init_frame(store, overwrite=False, path=None, chunk_store=None):
         overwrite=overwrite)
 
     # initialise metadata
-    _init_frame_metadata(store=store, overwrite=overwrite, path=path,
+    _init_frame_metadata(store=store, nrows=nrows, dtypes=dtypes, chunks=chunks,
+                         overwrite=overwrite, path=path,
                          chunk_store=chunk_store)
 
 
-def _init_frame_metadata(store, overwrite=False, path=None, chunk_store=None):
+def _init_frame_metadata(store, nrows, dtypes, overwrite=False, path=None, chunk_store=None):
 
     # guard conditions
     if overwrite:
@@ -375,10 +383,18 @@ def _init_frame_metadata(store, overwrite=False, path=None, chunk_store=None):
     elif contains_group(store, path):
         err_contains_group(path)
 
+    # normalize metadata
+    if not isinstance(dtypes, (list, tuple)):
+        raise ValueError("dtypes must be a list-like")
+    dtypes = tuple([ np.dtype(d) for d in dtypes ])
+
     # initialize metadata
     # N.B., currently no metadata properties are needed, however there may
     # be in future
-    meta = dict()
+    meta = dict(nrows=nrows, dtypes=dtypes, chunks=chunks,
+                compressor=compressor_config,
+                filters=filters_config)
+
     key = _path_to_prefix(path) + group_meta_key
     store[key] = encode_frame_metadata(meta)
 
     # initialize attributes
diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py
index 35cfb372d5..ed71ed4436 100644
--- a/zarr/tests/test_frame.py
+++ b/zarr/tests/test_frame.py
@@ -13,8 +13,9 @@
 from pandas.util.testing import assert_frame_equal
 
 from zarr.storage import (DirectoryStore, ZipStore,
-                          init_frame, init_group)
+                          init_array, init_frame, init_group)
 from zarr.core import Array
+from zarr.frame import Frame
 from zarr.errors import PermissionError
 from zarr.compat import PY2
 from zarr.util import buffer_size
@@ -22,13 +23,14 @@
     Blosc, BZ2
 
 
-class TestFrame(unittest.TestCase):
+class TestArray(unittest.TestCase):
 
     def test_array_init(self):
 
         # normal initialization
         store = dict()
-        init_array(store, shape=100, chunks=10)
+        init_frame(store, chunks=10)
         a = Array(store)
         assert_is_instance(a, Array)
         eq((100,), a.shape)
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index fca7093c0b..b407b1547e 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -12,11 +12,13 @@
     assert_is_instance, assert_false, assert_is_none
 import numpy as np
 from numpy.testing import assert_array_equal
-
+import pandas as pd
+from pandas.util.testing import assert_frame_equal
 
 from zarr.storage import DictStore, DirectoryStore, ZipStore, init_group, \
     init_array, attrs_key, array_meta_key, group_meta_key
 from zarr.core import Array
+from zarr.frame import Frame
 from zarr.hierarchy import Group, group, open_group
 from zarr.attrs import Attributes
 from zarr.errors import PermissionError
diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py
index cdf83ee1d8..e2be8a6258 100644
--- a/zarr/tests/test_meta.py
+++ b/zarr/tests/test_meta.py
@@ -194,6 +194,8 @@ def test_encode_decode_frame_1():
 
     meta = dict(
         chunks=(10,),
+        nrows=100,
+        dtypes=[np.dtype('f8'), np.dtype('i8'), np.dtype('O')],
        compressor=Zlib(1).get_config(),
        filters=None,
    )
@@ -201,9 +203,11 @@ def test_encode_decode_frame_1():
     meta_json = '''{
         "chunks": [10],
         "compressor": {"id": "zlib", "level": 1},
+        "dtypes": ["<f8", "<i8", "|O"],
         "filters": null,
+        "nrows": 100,
         "zarr_format": %s
     }''' % ZARR_FORMAT

From: Jeff Reback
Date: Sat, 15 Oct 2016 14:15:14 -0400
Subject: [PATCH 06/12] add columns meta data
---
 zarr/errors.py           |   4 ++
 zarr/frame.py            | 102 +++++++++++++++------------------
 zarr/meta.py             |   2 +
 zarr/storage.py          |  39 +++++++++++++--
 zarr/tests/test_frame.py |  18 ++++---
zarr/tests/test_meta.py | 3 ++ 6 files changed, 93 insertions(+), 75 deletions(-) diff --git a/zarr/errors.py b/zarr/errors.py index 76609508fd..eaf148f4a9 100644 --- a/zarr/errors.py +++ b/zarr/errors.py @@ -35,6 +35,10 @@ def err_array_not_found(path): raise KeyError('array not found at path %r' % path) +def err_frame_not_found(path): + raise KeyError('frame not found at path %r' % path) + + def err_group_not_found(path): raise KeyError('group not found at path %r' % path) diff --git a/zarr/frame.py b/zarr/frame.py index 654003dcca..cce630ee0e 100644 --- a/zarr/frame.py +++ b/zarr/frame.py @@ -5,6 +5,12 @@ import numpy as np from zarr.core import Base +from zarr.util import normalize_storage_path +from zarr.storage import frame_meta_key, attrs_key, listdir, getsize +from zarr.meta import decode_frame_metadata, encode_frame_metadata +from zarr.attrs import Attributes +from zarr.errors import PermissionError, err_read_only, err_frame_not_found +from zarr.codecs import get_codec class Frame(Base): @@ -79,7 +85,6 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, self._chunk_store = chunk_store self._synchronizer = synchronizer self._cache_metadata = cache_metadata - self._is_view = False # initialize metadata self._load_metadata() @@ -94,26 +99,24 @@ def _load_metadata(self): if self._synchronizer is None: self._load_metadata_nosync() else: - mkey = self._key_prefix + array_meta_key + mkey = self._key_prefix + frame_meta_key with self._synchronizer[mkey]: self._load_metadata_nosync() def _load_metadata_nosync(self): try: - mkey = self._key_prefix + array_meta_key + mkey = self._key_prefix + frame_meta_key meta_bytes = self._store[mkey] except KeyError: - err_array_not_found(self._path) + err_frame_not_found(self._path) else: # decode and store metadata - meta = decode_array_metadata(meta_bytes) + meta = decode_frame_metadata(meta_bytes) self._meta = meta - self._shape = meta['shape'] + self._nrows = meta['nrows'] self._chunks = meta['chunks'] - self._dtype = meta['dtype'] - self._fill_value = meta['fill_value'] - self._order = meta['order'] + self._dtypes = meta['dtypes'] # setup compressor config = meta['compressor'] @@ -133,12 +136,10 @@ def _refresh_metadata(self): self._load_metadata() def _refresh_metadata_nosync(self): - if not self._cache_metadata and not self._is_view: + if not self._cache_metadata: self._load_metadata_nosync() def _flush_metadata_nosync(self): - if self._is_view: - raise PermissionError('not permitted for views') if self._compressor: compressor_config = self._compressor.get_config() @@ -148,41 +149,39 @@ def _flush_metadata_nosync(self): filters_config = [f.get_config() for f in self._filters] else: filters_config = None - meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype, - compressor=compressor_config, fill_value=self._fill_value, - order=self._order, filters=filters_config) - mkey = self._key_prefix + array_meta_key - self._store[mkey] = encode_array_metadata(meta) + meta = dict(nrows=self._nrows, chunks=self._chunks, dtypes=self._dtypes, + compressor=compressor_config, filters=filters_config) + mkey = self._key_prefix + frame_meta_key + self._store[mkey] = encode_frame_metadata(meta) + + @property + def nrows(self): + """ our number of rows """ + return self._nrows + + @property + def dtypes(self): + """ a list of our dtypes """ + return self._dtypes @property - def fill_value(self): - """A value used for uninitialized portions of the array.""" - return self._fill_value + def _ncols(self): + return len(self.dtypes) 
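# --- editor's note: illustrative sketch, not part of the patch series -------
# With nrows, columns, and dtypes all persisted, a Frame's geometry is fully
# derived rather than stored: ncols comes from the dtypes list, shape from
# (nrows, ncols), and the per-row byte count from the dtype itemsizes. A
# worked example using the values from the tests in this series:

import numpy as np

nrows = 100
columns = ['float', 'int']
dtypes = [np.dtype('f8'), np.dtype('i8')]

ncols = len(dtypes)                           # 2
shape = (nrows, ncols)                        # (100, 2)
row_nbytes = sum(d.itemsize for d in dtypes)  # 16 bytes per logical row
# -----------------------------------------------------------------------------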
@property - def order(self): - """A string indicating the order in which bytes are arranged within - chunks of the array.""" - return self._order + def ncols(self): + return self._ncols @property - def dtype(self): - """The NumPy data type.""" - return self._dtype + def _shape(self): + return (self._nrows, self._ncols) @property def shape(self): """A tuple of integers describing the length of each dimension of the array.""" - # N.B., shape may change if array is resized, hence need to refresh - # metadata - self._refresh_metadata() return self._shape - @shape.setter - def shape(self, value): - self.resize(value) - @property def _size(self): return reduce(operator.mul, self._shape) @@ -190,15 +189,12 @@ def _size(self): @property def size(self): """The total number of elements in the array.""" - # N.B., this property depends on shape, and shape may change if array - # is resized, hence need to refresh metadata - self._refresh_metadata() return self._size @property def itemsize(self): """The size in bytes of each item in the array.""" - return self.dtype.itemsize + return sum(dtype.itemsize for dtype in self.dtypes) @property def _nbytes(self): @@ -254,35 +250,18 @@ def nchunks(self): def nchunks_initialized(self): """The number of chunks that have been initialized with some data.""" return sum(1 for k in listdir(self._chunk_store, self._path) - if k not in [array_meta_key, attrs_key]) - - # backwards compability - initialized = nchunks_initialized - - @property - def is_view(self): - """A boolean, True if this array is a view on another array.""" - return self._is_view + if k not in [frame_meta_key, attrs_key]) def __eq__(self, other): return ( - isinstance(other, Array) and + isinstance(other, Frame) and self.store == other.store and self.read_only == other.read_only and - self.path == other.path and - not self._is_view - # N.B., no need to compare other properties, should be covered by - # store comparison + self.path == other.path ) - def __array__(self, *args): - a = self[:] - if args: - a = a.astype(args[0]) - return a - def __len__(self): - return self.shape[0] + return self.nrows def __getitem__(self, item): """Retrieve data for some portion of the array. 
Most NumPy-style @@ -741,9 +720,8 @@ def _repr_nosync(self): if self.name: r += '%s, ' % self.name r += '%s, ' % str(self._shape) - r += '%s, ' % str(self._dtype) + r += '%s, ' % str(self._dtypes) r += 'chunks=%s, ' % str(self._chunks) - r += 'order=%s' % self._order r += ')' # storage size info @@ -792,7 +770,7 @@ def _synchronized_op(self, f, *args, **kwargs): else: # synchronize on the array - mkey = self._key_prefix + array_meta_key + mkey = self._key_prefix + frame_meta_key with self._synchronizer[mkey]: self._refresh_metadata_nosync() result = f(*args, **kwargs) diff --git a/zarr/meta.py b/zarr/meta.py index 2b39385ac2..c431392ed7 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -85,6 +85,7 @@ def encode_frame_metadata(meta): meta = dict( zarr_format=ZARR_FORMAT, nrows=meta['nrows'], + columns=meta['columns'], dtypes=[encode_dtype(d) for d in meta['dtypes']], chunks=meta['chunks'], compressor=meta['compressor'], @@ -106,6 +107,7 @@ def decode_frame_metadata(s): meta = dict( zarr_format=meta['zarr_format'], nrows=meta['nrows'], + columns=meta['columns'], dtypes=meta['dtypes'], chunks=tuple(meta['chunks']), compressor=meta['compressor'], diff --git a/zarr/storage.py b/zarr/storage.py index fe3ade43dd..8149d06b51 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -331,7 +331,7 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, init_store = init_array -def init_frame(store, nrows, dtypes, chunks=None, overwrite=False, path=None, +def init_frame(store, nrows, columns, dtypes, chunks=None, overwrite=False, path=None, chunk_store=None): """initialize a frame store. @@ -341,6 +341,8 @@ def init_frame(store, nrows, dtypes, chunks=None, overwrite=False, path=None, A mapping that supports string keys and byte sequence values. nrows : int Frame number of rows + columns : list + list of string names of columns dtypes : list list of dtypes chunks : int or tuple of ints, optional @@ -363,12 +365,15 @@ def init_frame(store, nrows, dtypes, chunks=None, overwrite=False, path=None, overwrite=overwrite) # initialise metadata - _init_frame_metadata(store=store, nrows=nrows, dtypes=dtypes, chunks=chunks, + _init_frame_metadata(store=store, nrows=nrows, columns=columns, + dtypes=dtypes, chunks=chunks, overwrite=overwrite, path=path, chunk_store=chunk_store) -def _init_frame_metadata(store, nrows, dtypes, overwrite=False, path=None, chunk_store=None): +def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, + compressor='default', overwrite=False, + path=None, chunk_store=None, filters=None): # guard conditions if overwrite: @@ -388,14 +393,38 @@ def _init_frame_metadata(store, nrows, dtypes, overwrite=False, path=None, chunk raise ValueError("dtypes must be a list-like") dtypes = tuple([ np.dtype(d) for d in dtypes ]) + # chunks are based on the rows; treat each rows as singular + chunks = normalize_chunks(chunks, (nrows, 1), sum([dtype.itemsize for dtype in dtypes])) + + # obtain compressor config + if compressor == 'none': + # compatibility + compressor = None + elif compressor == 'default': + compressor = default_compressor + if compressor: + try: + compressor_config = compressor.get_config() + except AttributeError: + err_bad_compressor(compressor) + else: + compressor_config = None + + # obtain filters config + if filters: + filters_config = [f.get_config() for f in filters] + else: + filters_config = None + # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - meta = dict(nrows=nrows, dtypes=dtypes, chunks=chunks, 
+    meta = dict(nrows=nrows, columns=columns,
+                dtypes=dtypes, chunks=chunks,
                 compressor=compressor_config,
                 filters=filters_config)
 
-    key = _path_to_prefix(path) + group_meta_key
+    key = _path_to_prefix(path) + frame_meta_key
     store[key] = encode_frame_metadata(meta)
 
     # initialize attributes
diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py
index ed71ed4436..d2765502ed 100644
--- a/zarr/tests/test_frame.py
+++ b/zarr/tests/test_frame.py
@@ -30,14 +32,14 @@ def test_array_init(self):
 
         # normal initialization
         store = dict()
-        init_frame(store, chunks=10)
-        a = Array(store)
-        assert_is_instance(a, Array)
-        eq((100,), a.shape)
-        eq((10,), a.chunks)
-        eq('', a.path)
-        assert_is_none(a.name)
-        assert_is(store, a.store)
+        init_frame(store, nrows=10, dtypes=[np.float64, np.int64])
+        fr = Frame(store)
+        assert_is_instance(fr, Frame)
+        eq((10,2), fr.shape)
+        eq((10,), fr.chunks)
+        eq('', fr.path)
+        assert_is_none(fr.name)
+        assert_is(store, fr.store)
diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py
index e2be8a6258..c6c26a07e3 100644
--- a/zarr/tests/test_meta.py
+++ b/zarr/tests/test_meta.py
@@ -195,6 +195,7 @@ def test_encode_decode_frame_1():
     meta = dict(
         chunks=(10,),
         nrows=100,
+        columns=['float', 'int'],
         dtypes=[np.dtype('f8'), np.dtype('i8'), np.dtype('O')],
         compressor=Zlib(1).get_config(),
         filters=None,
@@ -203,6 +204,7 @@ def test_encode_decode_frame_1():
     meta_json = '''{
         "chunks": [10],
+        "columns": ["float", "int"],
         "compressor": {"id": "zlib", "level": 1},
         "dtypes": ["<f8", "<i8", "|O"],
         "filters": null,
         "nrows": 100,
         "zarr_format": %s
     }''' % ZARR_FORMAT

From: Jeff Reback
Date: Sat, 15 Oct 2016 15:36:07 -0400
Subject: [PATCH 07/12] add in _arrays
---
 zarr/core.py             | 196 +++++++------
 zarr/frame.py            | 612 +++------------------------------------
 zarr/meta.py             |   2 +-
 zarr/storage.py          |  26 +-
 zarr/tests/test_frame.py |   8 +-
 5 files changed, 167 insertions(+), 677 deletions(-)
diff --git a/zarr/core.py b/zarr/core.py
index 0109a2cb27..56307b5b65 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -20,6 +20,25 @@
 class Base(object):
     """ ABC for Array / Frame """
 
+    _meta_key = None
+    _is_view = False
+
+    def _load_metadata(self):
+        """(Re)load metadata from store."""
+        if self._synchronizer is None:
+            self._load_metadata_nosync()
+        else:
+            mkey = self._key_prefix + self._meta_key
+            with self._synchronizer[mkey]:
+                self._load_metadata_nosync()
+
+    def _refresh_metadata(self):
+        if not self._cache_metadata:
+            self._load_metadata()
+
+    def _refresh_metadata_nosync(self):
+        if not self._cache_metadata and not self._is_view:
+            self._load_metadata_nosync()
 
     @property
     def store(self):
@@ -59,6 +78,15 @@ def chunks(self):
         chunk of the array."""
         return self._chunks
 
+    @property
+    def shape(self):
+        """A tuple of integers describing the length of each dimension of
+        the array."""
+        # N.B., shape may change if array is resized, hence need to refresh
+        # metadata
+        self._refresh_metadata()
+        return self._shape
+
     @property
     def compressor(self):
         """Primary compression codec."""
@@ -80,11 +108,86 @@ def attrs(self):
         attribute values must be JSON serializable."""
         return self._attrs
 
+    @property
+    def _size(self):
+        return reduce(operator.mul, self._shape)
+
+    @property
+    def size(self):
+        """The total number of elements
in the array.""" + # N.B., this property depends on shape, and shape may change if array + # is resized, hence need to refresh metadata + self._refresh_metadata() + return self._size + @property def ndim(self): """Number of dimensions.""" return len(self.shape) + @property + def nbytes(self): + """The total number of bytes that would be required to store the + array without compression.""" + # N.B., this property depends on shape, and shape may change if array + # is resized, hence need to refresh metadata + self._refresh_metadata() + return self._nbytes + + @property + def nbytes_stored(self): + """The total number of stored bytes of data for the array. This + includes storage required for configuration metadata and user + attributes.""" + m = getsize(self._store, self._path) + if self._store == self._chunk_store: + return m + else: + n = getsize(self._chunk_store, self._path) + if m < 0 or n < 0: + return -1 + else: + return m + n + + @property + def _cdata_shape(self): + return tuple(int(np.ceil(s / c)) + for s, c in zip(self._shape, self._chunks)) + + @property + def cdata_shape(self): + """A tuple of integers describing the number of chunks along each + dimension of the array.""" + self._refresh_metadata() + return self._cdata_shape + + @property + def _nchunks(self): + return reduce(operator.mul, self._cdata_shape) + + @property + def nchunks(self): + """Total number of chunks.""" + self._refresh_metadata() + return self._nchunks + + @property + def nchunks_initialized(self): + """The number of chunks that have been initialized with some data.""" + return sum(1 for k in listdir(self._chunk_store, self._path) + if k not in [self._meta_key, attrs_key]) + + # backwards compability + initialized = nchunks_initialized + + def __len__(self): + return self.shape[0] + + def __repr__(self): + # N.B., __repr__ needs to be synchronized to ensure consistent view + # of metadata AND when retrieving nbytes_stored from filesystem storage + return self._synchronized_op(self._repr_nosync) + class Array(Base): @@ -144,6 +247,7 @@ class Array(Base): view """ # flake8: noqa + _meta_key = array_meta_key def __init__(self, store, path=None, read_only=False, chunk_store=None, synchronizer=None, cache_metadata=True): @@ -173,15 +277,6 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, self._attrs = Attributes(store, key=akey, read_only=read_only, synchronizer=synchronizer) - def _load_metadata(self): - """(Re)load metadata from store.""" - if self._synchronizer is None: - self._load_metadata_nosync() - else: - mkey = self._key_prefix + array_meta_key - with self._synchronizer[mkey]: - self._load_metadata_nosync() - def _load_metadata_nosync(self): try: mkey = self._key_prefix + array_meta_key @@ -212,14 +307,6 @@ def _load_metadata_nosync(self): filters = [get_codec(config) for config in filters] self._filters = filters - def _refresh_metadata(self): - if not self._cache_metadata: - self._load_metadata() - - def _refresh_metadata_nosync(self): - if not self._cache_metadata and not self._is_view: - self._load_metadata_nosync() - def _flush_metadata_nosync(self): if self._is_view: raise PermissionError('not permitted for views') @@ -267,18 +354,6 @@ def shape(self): def shape(self, value): self.resize(value) - @property - def _size(self): - return reduce(operator.mul, self._shape) - - @property - def size(self): - """The total number of elements in the array.""" - # N.B., this property depends on shape, and shape may change if array - # is resized, hence need to refresh metadata - 
self._refresh_metadata() - return self._size - @property def itemsize(self): """The size in bytes of each item in the array.""" @@ -288,61 +363,6 @@ def itemsize(self): def _nbytes(self): return self._size * self.itemsize - @property - def nbytes(self): - """The total number of bytes that would be required to store the - array without compression.""" - # N.B., this property depends on shape, and shape may change if array - # is resized, hence need to refresh metadata - self._refresh_metadata() - return self._nbytes - - @property - def nbytes_stored(self): - """The total number of stored bytes of data for the array. This - includes storage required for configuration metadata and user - attributes.""" - m = getsize(self._store, self._path) - if self._store == self._chunk_store: - return m - else: - n = getsize(self._chunk_store, self._path) - if m < 0 or n < 0: - return -1 - else: - return m + n - - @property - def _cdata_shape(self): - return tuple(int(np.ceil(s / c)) - for s, c in zip(self._shape, self._chunks)) - - @property - def cdata_shape(self): - """A tuple of integers describing the number of chunks along each - dimension of the array.""" - self._refresh_metadata() - return self._cdata_shape - - @property - def _nchunks(self): - return reduce(operator.mul, self._cdata_shape) - - @property - def nchunks(self): - """Total number of chunks.""" - self._refresh_metadata() - return self._nchunks - - @property - def nchunks_initialized(self): - """The number of chunks that have been initialized with some data.""" - return sum(1 for k in listdir(self._chunk_store, self._path) - if k not in [array_meta_key, attrs_key]) - - # backwards compability - initialized = nchunks_initialized - @property def is_view(self): """A boolean, True if this array is a view on another array.""" @@ -365,9 +385,6 @@ def __array__(self, *args): a = a.astype(args[0]) return a - def __len__(self): - return self.shape[0] - def __getitem__(self, item): """Retrieve data for some portion of the array. Most NumPy-style slicing operations are supported. 
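# --- editor's note: illustrative sketch, not part of the patch series -------
# The moves above are a template-method refactor: Base implements the
# metadata and synchronization plumbing once, parameterized by the class
# attribute _meta_key, and each concrete class only supplies its own key.
# In outline (the key values are assumptions, as elsewhere in these notes):

class Base(object):
    _meta_key = None                  # concrete subclasses must override

    def meta_store_key(self, key_prefix):
        # every inherited helper builds its metadata key the same way
        return key_prefix + self._meta_key

class Array(Base):
    _meta_key = '.zarray'

class Frame(Base):
    _meta_key = '.zframe'
# -----------------------------------------------------------------------------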
@@ -813,11 +830,6 @@ def _encode_chunk(self, chunk): return cdata - def __repr__(self): - # N.B., __repr__ needs to be synchronized to ensure consistent view - # of metadata AND when retrieving nbytes_stored from filesystem storage - return self._synchronized_op(self._repr_nosync) - def _repr_nosync(self): # main line diff --git a/zarr/frame.py b/zarr/frame.py index cce630ee0e..32f7ccb115 100644 --- a/zarr/frame.py +++ b/zarr/frame.py @@ -4,9 +4,12 @@ import itertools import numpy as np -from zarr.core import Base -from zarr.util import normalize_storage_path -from zarr.storage import frame_meta_key, attrs_key, listdir, getsize +from zarr.core import Base, Array +from zarr.util import is_total_slice, normalize_array_selection, \ + get_chunk_range, human_readable_size, normalize_resize_args, \ + normalize_storage_path, normalize_shape, normalize_chunks +from zarr.storage import (frame_meta_key, attrs_key, listdir, getsize, + init_array, init_group) from zarr.meta import decode_frame_metadata, encode_frame_metadata from zarr.attrs import Attributes from zarr.errors import PermissionError, err_read_only, err_frame_not_found @@ -66,6 +69,7 @@ class Frame(Base): view """ # flake8: noqa + _key_meta = frame_meta_key def __init__(self, store, path=None, read_only=False, chunk_store=None, synchronizer=None, cache_metadata=True): @@ -93,15 +97,19 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, akey = self._key_prefix + attrs_key self._attrs = Attributes(store, key=akey, read_only=read_only, synchronizer=synchronizer) - - def _load_metadata(self): - """(Re)load metadata from store.""" - if self._synchronizer is None: - self._load_metadata_nosync() - else: - mkey = self._key_prefix + frame_meta_key - with self._synchronizer[mkey]: - self._load_metadata_nosync() + # create our arrays + self._arrays = {} + for c, dtype in zip(self._columns, self._dtypes): + path = self._key_prefix + '/data/' + c + init_array(store, + (self._nrows, 1), + chunks=self._chunks, + dtype=dtype, + compressor=self._compressor, + path=path, + chunk_store=self._chunk_store, + filters=self._filters) + self._arrays[c] = Array(store, path=path, read_only=True) def _load_metadata_nosync(self): try: @@ -115,8 +123,9 @@ def _load_metadata_nosync(self): meta = decode_frame_metadata(meta_bytes) self._meta = meta self._nrows = meta['nrows'] - self._chunks = meta['chunks'] + self._columns = meta['columns'] self._dtypes = meta['dtypes'] + self._chunks = meta['chunks'] # setup compressor config = meta['compressor'] @@ -131,14 +140,6 @@ def _load_metadata_nosync(self): filters = [get_codec(config) for config in filters] self._filters = filters - def _refresh_metadata(self): - if not self._cache_metadata: - self._load_metadata() - - def _refresh_metadata_nosync(self): - if not self._cache_metadata: - self._load_metadata_nosync() - def _flush_metadata_nosync(self): if self._compressor: @@ -159,6 +160,11 @@ def nrows(self): """ our number of rows """ return self._nrows + @property + def columns(self): + """ return a list of our columns """ + return self._columns + @property def dtypes(self): """ a list of our dtypes """ @@ -172,25 +178,6 @@ def _ncols(self): def ncols(self): return self._ncols - @property - def _shape(self): - return (self._nrows, self._ncols) - - @property - def shape(self): - """A tuple of integers describing the length of each dimension of - the array.""" - return self._shape - - @property - def _size(self): - return reduce(operator.mul, self._shape) - - @property - def size(self): - """The total 
number of elements in the array.""" - return self._size - @property def itemsize(self): """The size in bytes of each item in the array.""" @@ -198,59 +185,7 @@ def itemsize(self): @property def _nbytes(self): - return self._size * self.itemsize - - @property - def nbytes(self): - """The total number of bytes that would be required to store the - array without compression.""" - # N.B., this property depends on shape, and shape may change if array - # is resized, hence need to refresh metadata - self._refresh_metadata() - return self._nbytes - - @property - def nbytes_stored(self): - """The total number of stored bytes of data for the array. This - includes storage required for configuration metadata and user - attributes.""" - m = getsize(self._store, self._path) - if self._store == self._chunk_store: - return m - else: - n = getsize(self._chunk_store, self._path) - if m < 0 or n < 0: - return -1 - else: - return m + n - - @property - def _cdata_shape(self): - return tuple(int(np.ceil(s / c)) - for s, c in zip(self._shape, self._chunks)) - - @property - def cdata_shape(self): - """A tuple of integers describing the number of chunks along each - dimension of the array.""" - self._refresh_metadata() - return self._cdata_shape - - @property - def _nchunks(self): - return reduce(operator.mul, self._cdata_shape) - - @property - def nchunks(self): - """Total number of chunks.""" - self._refresh_metadata() - return self._nchunks - - @property - def nchunks_initialized(self): - """The number of chunks that have been initialized with some data.""" - return sum(1 for k in listdir(self._chunk_store, self._path) - if k not in [frame_meta_key, attrs_key]) + return sum(arr.nbytes for arr in self._arrays) def __eq__(self, other): return ( @@ -260,90 +195,26 @@ def __eq__(self, other): self.path == other.path ) - def __len__(self): - return self.nrows - def __getitem__(self, item): - """Retrieve data for some portion of the array. Most NumPy-style - slicing operations are supported. + """Retrieve a column or columns. Always returns a DataFrame of the requires column or columns. Returns ------- - out : ndarray - A NumPy array containing the data for the requested region. + out : DataFrame Examples -------- - Setup a 1-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.array(np.arange(100000000), chunks=1000000, dtype='i4') - >>> z - Array((100000000,), int32, chunks=(1000000,), order=C) - nbytes: 381.5M; nbytes_stored: 6.4M; ratio: 59.9; initialized: 100/100 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - - Take some slices:: - - >>> z[5] - 5 - >>> z[:5] - array([0, 1, 2, 3, 4], dtype=int32) - >>> z[-5:] - array([99999995, 99999996, 99999997, 99999998, 99999999], dtype=int32) - >>> z[5:10] - array([5, 6, 7, 8, 9], dtype=int32) - >>> z[:] - array([ 0, 1, 2, ..., 99999997, 99999998, 99999999], dtype=int32) - - Setup a 2-dimensional array:: - - >>> import zarr - >>> import numpy as np - >>> z = zarr.array(np.arange(100000000).reshape(10000, 10000), - ... 
chunks=(1000, 1000), dtype='i4') - >>> z - Array((10000, 10000), int32, chunks=(1000, 1000), order=C) - nbytes: 381.5M; nbytes_stored: 9.2M; ratio: 41.6; initialized: 100/100 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - - Take some slices:: - - >>> z[2, 2] - 20002 - >>> z[:2, :2] - array([[ 0, 1], - [10000, 10001]], dtype=int32) - >>> z[:2] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [10000, 10001, 10002, ..., 19997, 19998, 19999]], dtype=int32) - >>> z[:, :2] - array([[ 0, 1], - [ 10000, 10001], - [ 20000, 20001], - ..., - [99970000, 99970001], - [99980000, 99980001], - [99990000, 99990001]], dtype=int32) - >>> z[:] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [ 10000, 10001, 10002, ..., 19997, 19998, 19999], - [ 20000, 20001, 20002, ..., 29997, 29998, 29999], - ..., - [99970000, 99970001, 99970002, ..., 99979997, 99979998, 99979999], - [99980000, 99980001, 99980002, ..., 99989997, 99989998, 99989999], - [99990000, 99990001, 99990002, ..., 99999997, 99999998, 99999999]], dtype=int32) - """ # flake8: noqa # refresh metadata if not self._cache_metadata: self._load_metadata() + + import pdb; pdb.set_trace() + + # normalize selection selection = normalize_array_selection(item, self._shape) @@ -394,129 +265,7 @@ def __getitem__(self, item): return out[()] def __setitem__(self, item, value): - """Modify data for some portion of the array. - - Examples - -------- - - Setup a 1-dimensional array:: - - >>> import zarr - >>> z = zarr.zeros(100000000, chunks=1000000, dtype='i4') - >>> z - Array((100000000,), int32, chunks=(1000000,), order=C) - nbytes: 381.5M; nbytes_stored: 301; ratio: 1328903.7; initialized: 0/100 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - - Set all array elements to the same scalar value:: - - >>> z[:] = 42 - >>> z[:] - array([42, 42, 42, ..., 42, 42, 42], dtype=int32) - - Set a portion of the array:: - - >>> z[:100] = np.arange(100) - >>> z[-100:] = np.arange(100)[::-1] - >>> z[:] - array([0, 1, 2, ..., 2, 1, 0], dtype=int32) - - Setup a 2-dimensional array:: - - >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - >>> z - Array((10000, 10000), int32, chunks=(1000, 1000), order=C) - nbytes: 381.5M; nbytes_stored: 323; ratio: 1238390.1; initialized: 0/100 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - - Set all array elements to the same scalar value:: - - >>> z[:] = 42 - >>> z[:] - array([[42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - ..., - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42], - [42, 42, 42, ..., 42, 42, 42]], dtype=int32) - - Set a portion of the array:: - - >>> z[0, :] = np.arange(z.shape[1]) - >>> z[:, 0] = np.arange(z.shape[0]) - >>> z[:] - array([[ 0, 1, 2, ..., 9997, 9998, 9999], - [ 1, 42, 42, ..., 42, 42, 42], - [ 2, 42, 42, ..., 42, 42, 42], - ..., - [9997, 42, 42, ..., 42, 42, 42], - [9998, 42, 42, ..., 42, 42, 42], - [9999, 42, 42, ..., 42, 42, 42]], dtype=int32) - - """ - - # guard conditions - if self._read_only: - err_read_only() - - # refresh metadata - if not self._cache_metadata: - self._load_metadata_nosync() - - # normalize selection - selection = normalize_array_selection(item, self._shape) - - # check value shape - expected_shape = tuple( - s.stop - s.start for s in selection - if isinstance(s, slice) - ) - if np.isscalar(value): - pass - elif expected_shape != value.shape: - raise ValueError('value has wrong shape, expecting %s, found %s' - % (str(expected_shape), - str(value.shape))) 
- - # determine indices of chunks overlapping the selection - chunk_range = get_chunk_range(selection, self._chunks) - - # iterate over chunks in range - for cidx in itertools.product(*chunk_range): - - # determine chunk offset - offset = [i * c for i, c in zip(cidx, self._chunks)] - - # determine required index range within chunk - chunk_selection = tuple( - slice(max(0, s.start - o), min(c, s.stop - o)) - if isinstance(s, slice) - else s - o - for s, o, c in zip(selection, offset, self._chunks) - ) - - if np.isscalar(value): - - # put data - self._chunk_setitem(cidx, chunk_selection, value) - - else: - # assume value is array-like - - # determine index within value - value_selection = tuple( - slice(max(0, o - s.start), - min(o + c - s.start, s.stop - s.start)) - for s, o, c in zip(selection, offset, self._chunks) - if isinstance(s, slice) - ) - - # put data - self._chunk_setitem(cidx, chunk_selection, - value[value_selection]) + raise NotImplementedError("__setitem__ is not implemented") def _chunk_getitem(self, cidx, item, dest): """Obtain part or whole of a chunk. @@ -785,293 +534,10 @@ def _write_op(self, f, *args, **kwargs): return self._synchronized_op(f, *args, **kwargs) def resize(self, *args): - """Change the shape of the array by growing or shrinking one or more - dimensions. - - Examples - -------- - >>> import zarr - >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) - >>> z - Array((10000, 10000), float64, chunks=(1000, 1000), order=C) - nbytes: 762.9M; nbytes_stored: 323; ratio: 2476780.2; initialized: 0/100 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - >>> z.resize(20000, 10000) - >>> z - Array((20000, 10000), float64, chunks=(1000, 1000), order=C) - nbytes: 1.5G; nbytes_stored: 323; ratio: 4953560.4; initialized: 0/200 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - >>> z.resize(30000, 1000) - >>> z - Array((30000, 1000), float64, chunks=(1000, 1000), order=C) - nbytes: 228.9M; nbytes_stored: 322; ratio: 745341.6; initialized: 0/30 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - - Notes - ----- - When resizing an array, the data are not rearranged in any way. - - If one or more dimensions are shrunk, any chunks falling outside the - new array shape will be deleted from the underlying store. - - """ # flake8: noqa - - return self._write_op(self._resize_nosync, *args) - - def _resize_nosync(self, *args): - - # normalize new shape argument - old_shape = self._shape - new_shape = normalize_resize_args(old_shape, *args) - old_cdata_shape = self._cdata_shape - - # update metadata - self._shape = new_shape - self._flush_metadata_nosync() - - # determine the new number and arrangement of chunks - chunks = self._chunks - new_cdata_shape = tuple(int(np.ceil(s / c)) - for s, c in zip(new_shape, chunks)) - - # remove any chunks not within range - for cidx in itertools.product(*[range(n) for n in old_cdata_shape]): - if all(i < c for i, c in zip(cidx, new_cdata_shape)): - pass # keep the chunk - else: - key = self._chunk_key(cidx) - try: - del self._chunk_store[key] - except KeyError: - # chunk not initialized - pass + raise NotImplementedError("resize is not implemented") def append(self, data, axis=0): - """Append `data` to `axis`. - - Parameters - ---------- - data : array_like - Data to be appended. - axis : int - Axis along which to append. - - Returns - ------- - new_shape : tuple - - Notes - ----- - The size of all dimensions other than `axis` must match between this - array and `data`. 
- - Examples - -------- - >>> import numpy as np - >>> import zarr - >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) - >>> z = zarr.array(a, chunks=(1000, 100)) - >>> z - Array((10000, 1000), int32, chunks=(1000, 100), order=C) - nbytes: 38.1M; nbytes_stored: 1.9M; ratio: 20.3; initialized: 100/100 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - >>> z.append(a) - (20000, 1000) - >>> z - Array((20000, 1000), int32, chunks=(1000, 100), order=C) - nbytes: 76.3M; nbytes_stored: 3.8M; ratio: 20.3; initialized: 200/200 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - >>> z.append(np.vstack([a, a]), axis=1) - (20000, 2000) - >>> z - Array((20000, 2000), int32, chunks=(1000, 100), order=C) - nbytes: 152.6M; nbytes_stored: 7.5M; ratio: 20.3; initialized: 400/400 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict - - """ - return self._write_op(self._append_nosync, data, axis=axis) - - def _append_nosync(self, data, axis=0): - - # ensure data is array-like - if not hasattr(data, 'shape') or not hasattr(data, 'dtype'): - data = np.asanyarray(data) - - # ensure shapes are compatible for non-append dimensions - self_shape_preserved = tuple(s for i, s in enumerate(self._shape) - if i != axis) - data_shape_preserved = tuple(s for i, s in enumerate(data.shape) - if i != axis) - if self_shape_preserved != data_shape_preserved: - raise ValueError('shapes not compatible') - - # remember old shape - old_shape = self._shape - - # determine new shape - new_shape = tuple( - self._shape[i] if i != axis else self._shape[i] + data.shape[i] - for i in range(len(self._shape)) - ) + raise NotImplementedError("append is not implemented") - # resize - self._resize_nosync(new_shape) - - # store data - # noinspection PyTypeChecker - append_selection = tuple( - slice(None) if i != axis else slice(old_shape[i], new_shape[i]) - for i in range(len(self._shape)) - ) - self[append_selection] = data - - return new_shape - - def view(self, shape=None, chunks=None, dtype=None, - fill_value=None, filters=None, read_only=None, - synchronizer=None): - """Return an array sharing the same data. - - Parameters - ---------- - shape : int or tuple of ints - Array shape. - chunks : int or tuple of ints, optional - Chunk shape. - dtype : string or dtype, optional - NumPy dtype. - fill_value : object - Default value to use for uninitialized portions of the array. - filters : sequence, optional - Sequence of filters to use to encode chunk data prior to - compression. - read_only : bool, optional - True if array should be protected against modification. - synchronizer : object, optional - Array synchronizer. - - Notes - ----- - WARNING: This is an experimental feature and should be used with care. - There are plenty of ways to generate errors and/or cause data - corruption. - - Examples - -------- - - Bypass filters: - - >>> import zarr - >>> import numpy as np - >>> np.random.seed(42) - >>> labels = [b'female', b'male'] - >>> data = np.random.choice(labels, size=10000) - >>> filters = [zarr.Categorize(labels=labels, - ... dtype=data.dtype, - ... 
astype='u1')] - >>> a = zarr.array(data, chunks=1000, filters=filters) - >>> a[:] - array([b'female', b'male', b'female', ..., b'male', b'male', b'female'], - dtype='|S6') - >>> v = a.view(dtype='u1', filters=[]) - >>> v.is_view - True - >>> v[:] - array([1, 2, 1, ..., 2, 2, 1], dtype=uint8) - - Views can be used to modify data: - - >>> x = v[:] - >>> x.sort() - >>> v[:] = x - >>> v[:] - array([1, 1, 1, ..., 2, 2, 2], dtype=uint8) - >>> a[:] - array([b'female', b'female', b'female', ..., b'male', b'male', b'male'], - dtype='|S6') - - View as a different dtype with the same itemsize: - - >>> data = np.random.randint(0, 2, size=10000, dtype='u1') - >>> a = zarr.array(data, chunks=1000) - >>> a[:] - array([0, 0, 1, ..., 1, 0, 0], dtype=uint8) - >>> v = a.view(dtype=bool) - >>> v[:] - array([False, False, True, ..., True, False, False], dtype=bool) - >>> np.all(a[:].view(dtype=bool) == v[:]) - True - - An array can be viewed with a dtype with a different itemsize, however - some care is needed to adjust the shape and chunk shape so that chunk - data is interpreted correctly: - - >>> data = np.arange(10000, dtype='u2') - >>> a = zarr.array(data, chunks=1000) - >>> a[:10] - array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint16) - >>> v = a.view(dtype='u1', shape=20000, chunks=2000) - >>> v[:10] - array([0, 0, 1, 0, 2, 0, 3, 0, 4, 0], dtype=uint8) - >>> np.all(a[:].view('u1') == v[:]) - True - - Change fill value for uninitialized chunks: - - >>> a = zarr.full(10000, chunks=1000, fill_value=-1, dtype='i1') - >>> a[:] - array([-1, -1, -1, ..., -1, -1, -1], dtype=int8) - >>> v = a.view(fill_value=42) - >>> v[:] - array([42, 42, 42, ..., 42, 42, 42], dtype=int8) - - Note that resizing or appending to views is not permitted: - - >>> a = zarr.empty(10000) - >>> v = a.view() - >>> try: - ... v.resize(20000) - ... except PermissionError as e: - ... 
print(e) - not permitted for views - - """ # flake8: noqa - - store = self._store - chunk_store = self._chunk_store - path = self._path - if read_only is None: - read_only = self._read_only - if synchronizer is None: - synchronizer = self._synchronizer - a = Array(store=store, path=path, chunk_store=chunk_store, - read_only=read_only, synchronizer=synchronizer, - cache_metadata=True) - a._is_view = True - - # allow override of some properties - if dtype is None: - dtype = self._dtype - else: - dtype = np.dtype(dtype) - a._dtype = dtype - if shape is None: - shape = self._shape - else: - shape = normalize_shape(shape) - a._shape = shape - if chunks is not None: - chunks = normalize_chunks(chunks, shape, dtype.itemsize) - a._chunks = chunks - if fill_value is not None: - a._fill_value = fill_value - if filters is not None: - a._filters = filters - - return a + def view(self, *args, **kwargs): + raise NotImplementedError("view is not implemented") diff --git a/zarr/meta.py b/zarr/meta.py index c431392ed7..8411f387ef 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -108,7 +108,7 @@ def decode_frame_metadata(s): zarr_format=meta['zarr_format'], nrows=meta['nrows'], columns=meta['columns'], - dtypes=meta['dtypes'], + dtypes=[decode_dtype(dtype) for dtype in meta['dtypes']], chunks=tuple(meta['chunks']), compressor=meta['compressor'], filters=meta['filters'], diff --git a/zarr/storage.py b/zarr/storage.py index 8149d06b51..538e25c36a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -144,9 +144,9 @@ def _require_parent_group(path, store, chunk_store, overwrite): if contains_array(store, p): _init_group_metadata(store, path=p, chunk_store=chunk_store, overwrite=overwrite) - elif contains_frame(store, p): - _init_frame_metadata(store, path=p, chunk_store=chunk_store, - overwrite=overwrite) + #elif contains_frame(store, p): + # _init_frame_metadata(store, path=p, chunk_store=chunk_store, + # overwrite=overwrite) elif not contains_group(store, p): _init_group_metadata(store, path=p, chunk_store=chunk_store) @@ -389,9 +389,14 @@ def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, err_contains_group(path) # normalize metadata - if not isinstance(dtypes, (list, tuple)): + from pandas.api.types import is_list_like + if not is_list_like(dtypes): raise ValueError("dtypes must be a list-like") - dtypes = tuple([ np.dtype(d) for d in dtypes ]) + dtypes = [ np.dtype(d) for d in dtypes ] + if not is_list_like(columns): + raise ValueError("columns must be a list-like") + if not len(dtypes) == len(columns): + raise ValueError("number of columns must equal number of dtypes") # chunks are based on the rows; treat each rows as singular chunks = normalize_chunks(chunks, (nrows, 1), sum([dtype.itemsize for dtype in dtypes])) @@ -419,8 +424,10 @@ def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - meta = dict(nrows=nrows, columns=columns, - dtypes=dtypes, chunks=chunks, + meta = dict(nrows=nrows, + columns=columns, + dtypes=dtypes, + chunks=chunks, compressor=compressor_config, filters=filters_config) @@ -431,6 +438,7 @@ def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, key = _path_to_prefix(path) + attrs_key store[key] = json.dumps(dict()).encode('ascii') + def init_group(store, overwrite=False, path=None, chunk_store=None): """initialize a group store. 
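# --- editor's note: illustrative sketch, not part of the patch series -------
# Frame chunking is row-based: each column is persisted as its own one-column
# array, so only the row dimension is chunked, and the chunk-size guess is
# driven by the combined byte width of one logical row. Below is a simplified
# stand-in for the normalize_chunks(chunks, (nrows, 1), sum(itemsizes)) call
# in the hunk above; the 4 MiB target is an assumption, not zarr's actual
# heuristic:

import numpy as np

def guess_row_chunks(nrows, dtypes, target_bytes=2**22):
    # bytes per logical row across all columns drives the chunk length
    row_nbytes = sum(np.dtype(d).itemsize for d in dtypes)
    return (max(1, min(nrows, target_bytes // row_nbytes)),)

guess_row_chunks(1000000, ['f8', 'i8'])  # -> (262144,)
# -----------------------------------------------------------------------------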
@@ -470,8 +478,8 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None):
             rmdir(chunk_store, path)
     elif contains_array(store, path):
         err_contains_array(path)
-    elif contains_frame(store, path):
-        err_contains_frame(path)
+    #elif contains_frame(store, path):
+    #    err_contains_frame(path)
     elif contains_group(store, path):
         err_contains_group(path)
diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py
index d2765502ed..54e2039a2d 100644
--- a/zarr/tests/test_frame.py
+++ b/zarr/tests/test_frame.py
@@ -32,11 +32,15 @@ def test_array_init(self):

         # normal initialization
         store = dict()
         import pdb; pdb.set_trace()
-        init_frame(store, nrows=10, dtypes=[np.float64, np.int64])
+        init_frame(store, nrows=100, columns=['float', 'int'], dtypes=[np.float64, np.int64])
         fr = Frame(store)
         assert_is_instance(fr, Frame)
-        eq((10,2), fr.shape)
+
+        assert repr(fr)
+        eq(["float", "int"], fr.columns)
+        eq((100,2), fr.shape)
         eq((10,), fr.chunks)
+        eq(100, fr.nrows)
         eq('', fr.path)
         assert_is_none(fr.name)
         assert_is(store, fr.store)

From e51c354d3cf6c62a21d604ac369f51be87ed8311 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 15 Oct 2016 15:50:51 -0400
Subject: [PATCH 08/12] cleanups

---
 zarr/core.py             |  82 ++--
 zarr/frame.py            |  79 +---
 zarr/storage.py          |   2 +-
 zarr/tests/test_frame.py | 784 +--------------------------------------
 4 files changed, 87 insertions(+), 860 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index 56307b5b65..c7e2746dcd 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -188,6 +188,36 @@ def __repr__(self):
         # of metadata AND when retrieving nbytes_stored from filesystem storage
         return self._synchronized_op(self._repr_nosync)

+    def __getstate__(self):
+        return self._store, self._path, self._read_only, self._chunk_store, \
+            self._synchronizer, self._cache_metadata
+
+    def __setstate__(self, state):
+        self.__init__(*state)
+
+    def _synchronized_op(self, f, *args, **kwargs):
+
+        # no synchronization
+        if self._synchronizer is None:
+            self._refresh_metadata_nosync()
+            return f(*args, **kwargs)
+
+        else:
+            # synchronize on the array
+            mkey = self._key_prefix + self._meta_key
+            with self._synchronizer[mkey]:
+                self._refresh_metadata_nosync()
+                result = f(*args, **kwargs)
+            return result
+
+    def _write_op(self, f, *args, **kwargs):
+
+        # guard condition
+        if self._read_only:
+            err_read_only()
+
+        return self._synchronized_op(f, *args, **kwargs)
+

 class Array(Base):

@@ -872,35 +902,39 @@ def _repr_nosync(self):

         return r

-    def __getstate__(self):
-        return self._store, self._path, self._read_only, self._chunk_store, \
-            self._synchronizer, self._cache_metadata
-
-    def __setstate__(self, state):
-        self.__init__(*state)
-
-    def _synchronized_op(self, f, *args, **kwargs):
+    def _repr_abbv_nosync(self):
+        # abbreviated repr

-        # no synchronization
-        if self._synchronizer is None:
-            self._refresh_metadata_nosync()
-            return f(*args, **kwargs)
+        # main line
+        r = '%s(' % type(self).__name__
+        if self.name:
+            r += '%s, ' % self.name
+        r += '%s, ' % str(self._dtype)
+        r += 'order=%s' % self._order
+        r += ')'

-        else:
-            # synchronize on the array
-            mkey = self._key_prefix + array_meta_key
-            with self._synchronizer[mkey]:
-                self._refresh_metadata_nosync()
-                result = f(*args, **kwargs)
-            return result
+        # storage size info
+        r += '\n  nbytes: %s' % human_readable_size(self._nbytes)
+        if self.nbytes_stored > 0:
+            r += '; nbytes_stored: %s' % human_readable_size(
+                self.nbytes_stored)
+            r += '; ratio: %.1f' % (self._nbytes / self.nbytes_stored)
+        r += '; initialized: %s/%s' %
(self.nchunks_initialized, + self._nchunks) - def _write_op(self, f, *args, **kwargs): + # filters + if self._filters: + # first line + r += '\n filters: %r' % self._filters[0] + # subsequent lines + for f in self._filters[1:]: + r += '\n %r' % f - # guard condition - if self._read_only: - err_read_only() + # compressor + if self._compressor: + r += '\n compressor: %r' % self._compressor - return self._synchronized_op(f, *args, **kwargs) + return r def resize(self, *args): """Change the shape of the array by growing or shrinking one or more diff --git a/zarr/frame.py b/zarr/frame.py index 32f7ccb115..d73ad1609e 100644 --- a/zarr/frame.py +++ b/zarr/frame.py @@ -160,16 +160,15 @@ def nrows(self): """ our number of rows """ return self._nrows + @property + def _shape(self): + return (self._nrows, self._ncols) + @property def columns(self): """ return a list of our columns """ return self._columns - @property - def dtypes(self): - """ a list of our dtypes """ - return self._dtypes - @property def _ncols(self): return len(self.dtypes) @@ -178,6 +177,11 @@ def _ncols(self): def ncols(self): return self._ncols + @property + def dtypes(self): + """ a list of our dtypes """ + return self._dtypes + @property def itemsize(self): """The size in bytes of each item in the array.""" @@ -457,11 +461,6 @@ def _encode_chunk(self, chunk): return cdata - def __repr__(self): - # N.B., __repr__ needs to be synchronized to ensure consistent view - # of metadata AND when retrieving nbytes_stored from filesystem storage - return self._synchronized_op(self._repr_nosync) - def _repr_nosync(self): # main line @@ -469,69 +468,23 @@ def _repr_nosync(self): if self.name: r += '%s, ' % self.name r += '%s, ' % str(self._shape) - r += '%s, ' % str(self._dtypes) r += 'chunks=%s, ' % str(self._chunks) r += ')' - # storage size info - r += '\n nbytes: %s' % human_readable_size(self._nbytes) - if self.nbytes_stored > 0: - r += '; nbytes_stored: %s' % human_readable_size( - self.nbytes_stored) - r += '; ratio: %.1f' % (self._nbytes / self.nbytes_stored) - r += '; initialized: %s/%s' % (self.nchunks_initialized, - self._nchunks) - - # filters - if self._filters: - # first line - r += '\n filters: %r' % self._filters[0] - # subsequent lines - for f in self._filters[1:]: - r += '\n %r' % f - - # compressor - if self._compressor: - r += '\n compressor: %r' % self._compressor - # storage and synchronizer classes - r += '\n store: %s' % type(self._store).__name__ + r += '\n store: %s' % type(self._store).__name__ if self._store != self._chunk_store: r += '; chunk_store: %s' % type(self._chunk_store).__name__ if self._synchronizer is not None: r += '; synchronizer: %s' % type(self._synchronizer).__name__ - return r - - def __getstate__(self): - return self._store, self._path, self._read_only, self._chunk_store, \ - self._synchronizer, self._cache_metadata - - def __setstate__(self, state): - self.__init__(*state) + # arrays + r += '\n' + for c in self._columns: + arr = self._arrays[c] + r += '\n %s' % arr._repr_abbv_nosync() - def _synchronized_op(self, f, *args, **kwargs): - - # no synchronization - if self._synchronizer is None: - self._refresh_metadata_nosync() - return f(*args, **kwargs) - - else: - # synchronize on the array - mkey = self._key_prefix + frame_meta_key - with self._synchronizer[mkey]: - self._refresh_metadata_nosync() - result = f(*args, **kwargs) - return result - - def _write_op(self, f, *args, **kwargs): - - # guard condition - if self._read_only: - err_read_only() - - return self._synchronized_op(f, *args, 
**kwargs) + return r def resize(self, *args): raise NotImplementedError("resize is not implemented") diff --git a/zarr/storage.py b/zarr/storage.py index 538e25c36a..023e4ceb5f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -399,7 +399,7 @@ def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, raise ValueError("number of columns must equal number of dtypes") # chunks are based on the rows; treat each rows as singular - chunks = normalize_chunks(chunks, (nrows, 1), sum([dtype.itemsize for dtype in dtypes])) + chunks = normalize_chunks(chunks, (nrows, len(dtypes)), sum([dtype.itemsize for dtype in dtypes])) # obtain compressor config if compressor == 'none': diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py index 54e2039a2d..43ae49c57f 100644 --- a/zarr/tests/test_frame.py +++ b/zarr/tests/test_frame.py @@ -25,13 +25,12 @@ Blosc, BZ2 -class TestArray(unittest.TestCase): +class TestFrame(unittest.TestCase): - def test_array_init(self): + def test_frame_init(self): # normal initialization store = dict() - import pdb; pdb.set_trace() init_frame(store, nrows=100, columns=['float', 'int'], dtypes=[np.float64, np.int64]) fr = Frame(store) assert_is_instance(fr, Frame) @@ -39,60 +38,26 @@ def test_array_init(self): assert repr(fr) eq(["float", "int"], fr.columns) eq((100,2), fr.shape) - eq((10,), fr.chunks) + eq((100,2), fr.chunks) eq(100, fr.nrows) eq('', fr.path) assert_is_none(fr.name) assert_is(store, fr.store) - # initialize at path - store = dict() - init_array(store, shape=100, chunks=10, path='foo/bar') - a = Array(store, path='foo/bar') - assert_is_instance(a, Array) - eq((100,), a.shape) - eq((10,), a.chunks) - eq('foo/bar', a.path) - eq('/foo/bar', a.name) - assert_is(store, a.store) - - # store not initialized - store = dict() - with assert_raises(KeyError): - Array(store) - - # group is in the way - store = dict() - init_group(store, path='baz') - with assert_raises(KeyError): - Array(store, path='baz') - - def create_array(self, read_only=False, **kwargs): + def create_frame(self, read_only=False, **kwargs): store = dict() kwargs.setdefault('compressor', Zlib(level=1)) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_nbytes_stored(self): + init_frame(store, **kwargs) + return Frame(store, read_only=read_only) - # dict as store - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) + def test_frame(self): - # mess with store - try: - z.store[z._key_prefix + 'foo'] = list(range(10)) - eq(-1, z.nbytes_stored) - except TypeError: - pass + df = pd.DataFrame({'A': [1, 2, 3], 'B': [1., 2., 3.], 'C': pd.date_range('20130101', periods=3), + 'D': ['foo', 'bar', 'baz']}, + columnslist('ABCD')) - def test_array_1d(self): - a = np.arange(1050) - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype) + import pdb; pdb.set_trace() + fr = self.create_frame(nrows=len(df), columns=df.columns, dtypes=df.dtypes.values) # check properties eq(len(a), len(z)) @@ -146,728 +111,3 @@ def test_array_1d(self): assert_array_equal(a[:190], z[:190]) assert_array_equal(b[190:310], z[190:310]) assert_array_equal(a[310:], z[310:]) - - def test_array_1d_fill_value(self): - for fill_value in -1, 0, 1, 10: - - a = np.arange(1050) - f = np.empty_like(a) - f.fill(fill_value) - z = 
self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, - fill_value=fill_value) - z[190:310] = a[190:310] - - assert_array_equal(f[:190], z[:190]) - assert_array_equal(a[190:310], z[190:310]) - assert_array_equal(f[310:], z[310:]) - - def test_array_1d_set_scalar(self): - # setup - a = np.zeros(100) - z = self.create_array(shape=a.shape, chunks=10, dtype=a.dtype) - z[:] = a - assert_array_equal(a, z[:]) - - for value in -1, 0, 1, 10: - print(value) - a[15:35] = value - z[15:35] = value - assert_array_equal(a, z[:]) - a[:] = value - z[:] = value - assert_array_equal(a, z[:]) - - def test_array_2d(self): - a = np.arange(10000).reshape((1000, 10)) - z = self.create_array(shape=a.shape, chunks=(100, 2), dtype=a.dtype) - - # check properties - eq(len(a), len(z)) - eq(a.ndim, z.ndim) - eq(a.shape, z.shape) - eq(a.dtype, z.dtype) - eq((100, 2), z.chunks) - eq(0, z.nchunks_initialized) - eq((10, 5), z.cdata_shape) - - # set data - z[:] = a - - # check properties - eq(a.nbytes, z.nbytes) - eq(50, z.nchunks_initialized) - - # check slicing - assert_array_equal(a, np.array(z)) - assert_array_equal(a, z[:]) - assert_array_equal(a, z[...]) - # noinspection PyTypeChecker - assert_array_equal(a, z[slice(None)]) - assert_array_equal(a[:10], z[:10]) - assert_array_equal(a[10:20], z[10:20]) - assert_array_equal(a[-10:], z[-10:]) - assert_array_equal(a[:, :2], z[:, :2]) - assert_array_equal(a[:, 2:4], z[:, 2:4]) - assert_array_equal(a[:, -2:], z[:, -2:]) - assert_array_equal(a[:10, :2], z[:10, :2]) - assert_array_equal(a[10:20, 2:4], z[10:20, 2:4]) - assert_array_equal(a[-10:, -2:], z[-10:, -2:]) - # ...across chunk boundaries... - assert_array_equal(a[:110], z[:110]) - assert_array_equal(a[190:310], z[190:310]) - assert_array_equal(a[-110:], z[-110:]) - assert_array_equal(a[:, :3], z[:, :3]) - assert_array_equal(a[:, 3:7], z[:, 3:7]) - assert_array_equal(a[:, -3:], z[:, -3:]) - assert_array_equal(a[:110, :3], z[:110, :3]) - assert_array_equal(a[190:310, 3:7], z[190:310, 3:7]) - assert_array_equal(a[-110:, -3:], z[-110:, -3:]) - # single item - assert_array_equal(a[0], z[0]) - assert_array_equal(a[-1], z[-1]) - eq(a[0, 0], z[0, 0]) - eq(a[-1, -1], z[-1, -1]) - - # check partial assignment - b = np.arange(10000, 20000).reshape((1000, 10)) - z[190:310, 3:7] = b[190:310, 3:7] - assert_array_equal(a[:190], z[:190]) - assert_array_equal(a[:, :3], z[:, :3]) - assert_array_equal(b[190:310, 3:7], z[190:310, 3:7]) - assert_array_equal(a[310:], z[310:]) - assert_array_equal(a[:, 7:], z[:, 7:]) - - def test_array_2d_partial(self): - z = self.create_array(shape=(1000, 10), chunks=(100, 2), dtype='i4', - fill_value=0) - - # check partial assignment, single row - c = np.arange(z.shape[1]) - z[0, :] = c - with assert_raises(ValueError): - # N.B., NumPy allows this, but we'll be strict for now - z[2:3] = c - with assert_raises(ValueError): - # N.B., NumPy allows this, but we'll be strict for now - z[-1:] = c - z[2:3] = c[None, :] - z[-1:] = c[None, :] - assert_array_equal(c, z[0, :]) - assert_array_equal(c, z[2, :]) - assert_array_equal(c, z[-1, :]) - - # check partial assignment, single column - d = np.arange(z.shape[0]) - z[:, 0] = d - with assert_raises(ValueError): - z[:, 2:3] = d - with assert_raises(ValueError): - z[:, -1:] = d - z[:, 2:3] = d[:, None] - z[:, -1:] = d[:, None] - assert_array_equal(d, z[:, 0]) - assert_array_equal(d, z[:, 2]) - assert_array_equal(d, z[:, -1]) - - # check single item assignment - z[0, 0] = -1 - z[2, 2] = -1 - z[-1, -1] = -1 - eq(-1, z[0, 0]) - eq(-1, z[2, 2]) - eq(-1, z[-1, -1]) - 
- def test_array_order(self): - - # 1D - a = np.arange(1050) - for order in 'C', 'F': - z = self.create_array(shape=a.shape, chunks=100, dtype=a.dtype, - order=order) - eq(order, z.order) - if order == 'F': - assert_true(z[:].flags.f_contiguous) - else: - assert_true(z[:].flags.c_contiguous) - z[:] = a - assert_array_equal(a, z[:]) - - # 2D - a = np.arange(10000).reshape((100, 100)) - for order in 'C', 'F': - z = self.create_array(shape=a.shape, chunks=(10, 10), - dtype=a.dtype, order=order) - eq(order, z.order) - if order == 'F': - assert_true(z[:].flags.f_contiguous) - else: - assert_true(z[:].flags.c_contiguous) - z[:] = a - actual = z[:] - assert_array_equal(a, actual) - - def test_setitem_data_not_shared(self): - # check that data don't end up being shared with another array - # https://github.com/alimanfoo/zarr/issues/79 - z = self.create_array(shape=20, chunks=10, dtype='i4') - a = np.arange(20, dtype='i4') - z[:] = a - assert_array_equal(z[:], np.arange(20, dtype='i4')) - a[:] = 0 - assert_array_equal(z[:], np.arange(20, dtype='i4')) - - def test_resize_1d(self): - - z = self.create_array(shape=105, chunks=10, dtype='i4', - fill_value=0) - a = np.arange(105, dtype='i4') - z[:] = a - eq((105,), z.shape) - eq((105,), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10,), z.chunks) - assert_array_equal(a, z[:]) - - z.resize(205) - eq((205,), z.shape) - eq((205,), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10,), z.chunks) - assert_array_equal(a, z[:105]) - assert_array_equal(np.zeros(100, dtype='i4'), z[105:]) - - z.resize(55) - eq((55,), z.shape) - eq((55,), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10,), z.chunks) - assert_array_equal(a[:55], z[:]) - - # via shape setter - z.shape = (105,) - eq((105,), z.shape) - eq((105,), z[:].shape) - - def test_resize_2d(self): - - z = self.create_array(shape=(105, 105), chunks=(10, 10), dtype='i4', - fill_value=0) - a = np.arange(105*105, dtype='i4').reshape((105, 105)) - z[:] = a - eq((105, 105), z.shape) - eq((105, 105), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10, 10), z.chunks) - assert_array_equal(a, z[:]) - - z.resize((205, 205)) - eq((205, 205), z.shape) - eq((205, 205), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10, 10), z.chunks) - assert_array_equal(a, z[:105, :105]) - assert_array_equal(np.zeros((100, 205), dtype='i4'), z[105:, :]) - assert_array_equal(np.zeros((205, 100), dtype='i4'), z[:, 105:]) - - z.resize((55, 55)) - eq((55, 55), z.shape) - eq((55, 55), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10, 10), z.chunks) - assert_array_equal(a[:55, :55], z[:]) - - z.resize((55, 1)) - eq((55, 1), z.shape) - eq((55, 1), z[:].shape) - eq(np.dtype('i4'), z.dtype) - eq(np.dtype('i4'), z[:].dtype) - eq((10, 10), z.chunks) - assert_array_equal(a[:55, :1], z[:]) - - # via shape setter - z.shape = (105, 105) - eq((105, 105), z.shape) - eq((105, 105), z[:].shape) - - def test_append_1d(self): - - a = np.arange(105) - z = self.create_array(shape=a.shape, chunks=10, dtype=a.dtype) - z[:] = a - eq(a.shape, z.shape) - eq(a.dtype, z.dtype) - eq((10,), z.chunks) - assert_array_equal(a, z[:]) - - b = np.arange(105, 205) - e = np.append(a, b) - z.append(b) - eq(e.shape, z.shape) - eq(e.dtype, z.dtype) - eq((10,), z.chunks) - assert_array_equal(e, z[:]) - - # check append handles array-like - c = [1, 2, 3] - f = 
np.append(e, c) - z.append(c) - eq(f.shape, z.shape) - eq(f.dtype, z.dtype) - eq((10,), z.chunks) - assert_array_equal(f, z[:]) - - def test_append_2d(self): - - a = np.arange(105*105, dtype='i4').reshape((105, 105)) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) - z[:] = a - eq(a.shape, z.shape) - eq(a.dtype, z.dtype) - eq((10, 10), z.chunks) - actual = z[:] - assert_array_equal(a, actual) - - b = np.arange(105*105, 2*105*105, dtype='i4').reshape((105, 105)) - e = np.append(a, b, axis=0) - z.append(b) - eq(e.shape, z.shape) - eq(e.dtype, z.dtype) - eq((10, 10), z.chunks) - actual = z[:] - assert_array_equal(e, actual) - - def test_append_2d_axis(self): - - a = np.arange(105*105, dtype='i4').reshape((105, 105)) - z = self.create_array(shape=a.shape, chunks=(10, 10), dtype=a.dtype) - z[:] = a - eq(a.shape, z.shape) - eq(a.dtype, z.dtype) - eq((10, 10), z.chunks) - assert_array_equal(a, z[:]) - - b = np.arange(105*105, 2*105*105, dtype='i4').reshape((105, 105)) - e = np.append(a, b, axis=1) - z.append(b, axis=1) - eq(e.shape, z.shape) - eq(e.dtype, z.dtype) - eq((10, 10), z.chunks) - assert_array_equal(e, z[:]) - - def test_append_bad_shape(self): - a = np.arange(100) - z = self.create_array(shape=a.shape, chunks=10, dtype=a.dtype) - z[:] = a - b = a.reshape(10, 10) - with assert_raises(ValueError): - z.append(b) - - def test_read_only(self): - - z = self.create_array(shape=1000, chunks=100) - assert_false(z.read_only) - - z = self.create_array(shape=1000, chunks=100, read_only=True) - assert_true(z.read_only) - with assert_raises(PermissionError): - z[:] = 42 - with assert_raises(PermissionError): - z.resize(2000) - with assert_raises(PermissionError): - z.append(np.arange(1000)) - - def test_pickle(self): - - z = self.create_array(shape=1000, chunks=100, dtype=int) - z[:] = np.random.randint(0, 1000, 1000) - z2 = pickle.loads(pickle.dumps(z)) - eq(z.shape, z2.shape) - eq(z.chunks, z2.chunks) - eq(z.dtype, z2.dtype) - if z.compressor: - eq(z.compressor.get_config(), z2.compressor.get_config()) - eq(z.fill_value, z2.fill_value) - assert_array_equal(z[:], z2[:]) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 245; ratio: 1.6; initialized: 0/10 - compressor: Zlib(level=1) - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - def test_np_ufuncs(self): - z = self.create_array(shape=(100, 100), chunks=(10, 10)) - a = np.arange(10000).reshape(100, 100) - z[:] = a - - eq(np.sum(a), np.sum(z)) - assert_array_equal(np.sum(a, axis=0), np.sum(z, axis=0)) - eq(np.mean(a), np.mean(z)) - assert_array_equal(np.mean(a, axis=1), np.mean(z, axis=1)) - condition = np.random.randint(0, 2, size=100, dtype=bool) - assert_array_equal(np.compress(condition, a, axis=0), - np.compress(condition, z, axis=0)) - indices = np.random.choice(100, size=50, replace=True) - assert_array_equal(np.take(a, indices, axis=1), - np.take(z, indices, axis=1)) - - # use zarr array as indices or condition - zc = self.create_array(shape=condition.shape, dtype=condition.dtype, - chunks=10, filters=None) - zc[:] = condition - assert_array_equal(np.compress(condition, a, axis=0), - np.compress(zc, a, axis=0)) - zi = self.create_array(shape=indices.shape, dtype=indices.dtype, - chunks=10, filters=None) - zi[:] = indices - # this triggers __array__() call with dtype argument - assert_array_equal(np.take(a, indices, 
axis=1), - np.take(a, zi, axis=1)) - - -class TestArrayWithPath(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - store = dict() - init_array(store, path='foo/bar', **kwargs) - return Array(store, path='foo/bar', read_only=read_only) - - def test_nbytes_stored(self): - - # dict as store - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) - for k, v in z.store.items() - if k.startswith('foo/bar/')) - eq(expect_nbytes_stored, z.nbytes_stored) - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) - for k, v in z.store.items() - if k.startswith('foo/bar/')) - eq(expect_nbytes_stored, z.nbytes_stored) - - # mess with store - z.store[z._key_prefix + 'foo'] = list(range(10)) - eq(-1, z.nbytes_stored) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - # flake8: noqa - expect = """Array(/foo/bar, (100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 293; ratio: 1.4; initialized: 0/10 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -class TestArrayWithChunkStore(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - store = dict() - # separate chunk store - chunk_store = dict() - init_array(store, chunk_store=chunk_store, **kwargs) - return Array(store, read_only=read_only, chunk_store=chunk_store) - - def test_nbytes_stored(self): - - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - expect_nbytes_stored += sum(buffer_size(v) - for v in z.chunk_store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - expect_nbytes_stored += sum(buffer_size(v) - for v in z.chunk_store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) - - # mess with store - z.chunk_store[z._key_prefix + 'foo'] = list(range(10)) - eq(-1, z.nbytes_stored) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - # flake8: noqa - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 293; ratio: 1.4; initialized: 0/10 - compressor: Blosc(cname='lz4', clevel=5, shuffle=1) - store: dict; chunk_store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -class TestArrayWithDirectoryStore(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = DirectoryStore(path) - kwargs.setdefault('compressor', Zlib(1)) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_nbytes_stored(self): - - # dict as store - z = self.create_array(shape=1000, chunks=100) - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) - z[:] = 42 - expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) - eq(expect_nbytes_stored, z.nbytes_stored) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - # flake8: noqa - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 245; ratio: 1.6; initialized: 0/10 - compressor: Zlib(level=1) - store: DirectoryStore -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, 
l2) - - -class TestArrayWithNoCompressor(TestArray): - - def create_array(self, read_only=False, **kwargs): - store = dict() - kwargs.setdefault('compressor', None) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 201; ratio: 2.0; initialized: 0/10 - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -class TestArrayWithBZ2Compressor(TestArray): - - def create_array(self, read_only=False, **kwargs): - store = dict() - compressor = BZ2(level=1) - kwargs.setdefault('compressor', compressor) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 244; ratio: 1.6; initialized: 0/10 - compressor: BZ2(level=1) - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -class TestArrayWithBloscCompressor(TestArray): - - def create_array(self, read_only=False, **kwargs): - store = dict() - compressor = Blosc(cname='zstd', clevel=1, shuffle=1) - kwargs.setdefault('compressor', compressor) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 294; ratio: 1.4; initialized: 0/10 - compressor: Blosc(cname='zstd', clevel=1, shuffle=1) - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -if not PY2: - - from zarr.codecs import LZMA - - class TestArrayWithLZMACompressor(TestArray): - - def create_array(self, read_only=False, **kwargs): - store = dict() - compressor = LZMA(preset=1) - kwargs.setdefault('compressor', compressor) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_repr(self): - z = self.create_array(shape=100, chunks=10, dtype='f4') - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 313; ratio: 1.3; initialized: 0/10 - compressor: LZMA(format=1, check=-1, preset=1, filters=None) - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -class TestArrayWithFilters(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - store = dict() - dtype = kwargs.get('dtype', None) - filters = [ - Delta(dtype=dtype), - FixedScaleOffset(dtype=dtype, scale=1, offset=0), - ] - kwargs.setdefault('filters', filters) - compressor = Zlib(1) - kwargs.setdefault('compressor', compressor) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - # flake8: noqa - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; nbytes_stored: 515; ratio: 0.8; initialized: 0/10 - filters: Delta(dtype=float32) - FixedScaleOffset(scale=1, offset=0, dtype=float32) - compressor: Zlib(level=1) - store: dict -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -# custom store, does not support getsize() -class 
CustomMapping(object): - - def __init__(self): - self.inner = dict() - - def keys(self): - return self.inner.keys() - - def __getitem__(self, item): - return self.inner[item] - - def __setitem__(self, item, value): - self.inner[item] = value - - def __delitem__(self, key): - del self.inner[key] - - def __contains__(self, item): - return item in self.inner - - -class TestArrayWithCustomMapping(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - store = CustomMapping() - kwargs.setdefault('compressor', Zlib(1)) - init_array(store, **kwargs) - return Array(store, read_only=read_only) - - def test_nbytes_stored(self): - z = self.create_array(shape=1000, chunks=100) - eq(-1, z.nbytes_stored) - z[:] = 42 - eq(-1, z.nbytes_stored) - - def test_repr(self): - if not PY2: - z = self.create_array(shape=100, chunks=10, dtype='f4') - # flake8: noqa - expect = """Array((100,), float32, chunks=(10,), order=C) - nbytes: 400; initialized: 0/10 - compressor: Zlib(level=1) - store: CustomMapping -""" - actual = repr(z) - for l1, l2 in zip(expect.split('\n'), actual.split('\n')): - eq(l1, l2) - - -class TestArrayNoCacheMetadata(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - store = dict() - kwargs.setdefault('compressor', Zlib(level=1)) - init_array(store, **kwargs) - return Array(store, read_only=read_only, cache_metadata=False) - - def test_cache_metadata(self): - a1 = self.create_array(shape=100, chunks=10, dtype='i1') - a2 = Array(a1.store, cache_metadata=True) - eq(a1.shape, a2.shape) - eq(a1.size, a2.size) - eq(a1.nbytes, a2.nbytes) - eq(a1.nchunks, a2.nchunks) - - a2.resize(200) - eq((200,), a2.shape) - eq(200, a2.size) - eq(200, a2.nbytes) - eq(20, a2.nchunks) - eq(a1.shape, a2.shape) - eq(a1.size, a2.size) - eq(a1.nbytes, a2.nbytes) - eq(a1.nchunks, a2.nchunks) - - a2.append(np.zeros(100)) - eq((300,), a2.shape) - eq(300, a2.size) - eq(300, a2.nbytes) - eq(30, a2.nchunks) - eq(a1.shape, a2.shape) - eq(a1.size, a2.size) - eq(a1.nbytes, a2.nbytes) - eq(a1.nchunks, a2.nchunks) - - a1.resize(400) - eq((400,), a1.shape) - eq(400, a1.size) - eq(400, a1.nbytes) - eq(40, a1.nchunks) - eq((300,), a2.shape) - eq(300, a2.size) - eq(300, a2.nbytes) - eq(30, a2.nchunks) From 9c3406b3c9df9660e11868330fc0bd68f582c05f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Oct 2016 17:54:31 -0400 Subject: [PATCH 09/12] more --- zarr/codecs.py | 26 +++- zarr/creation.py | 78 +++++++++- zarr/frame.py | 301 ++++++++------------------------------- zarr/storage.py | 11 +- zarr/tests/test_frame.py | 84 ++++------- 5 files changed, 187 insertions(+), 313 deletions(-) diff --git a/zarr/codecs.py b/zarr/codecs.py index b299bb8db5..b8ae94965b 100644 --- a/zarr/codecs.py +++ b/zarr/codecs.py @@ -6,7 +6,7 @@ import math import multiprocessing import atexit - +import pickle import numpy as np @@ -910,6 +910,30 @@ def __repr__(self): codec_registry[Categorize.codec_id] = Categorize +class PickleCodec(Codec): + + codec_id = 'x-pickle' + + def encode(self, buf): + return pickle.dumps(buf) + + def decode(self, buf, out=None): + dec = pickle.loads(buf) + if out is not None: + np.copyto(out, dec) + return out + else: + return dec + + def get_config(self): + return dict(id=self.codec_id) + + def __repr__(self): + return 'PickleCodec()' + + +codec_registry[PickleCodec.codec_id] = PickleCodec + __all__ = ['get_codec', 'codec_registry'] for _cls in codec_registry.values(): diff --git a/zarr/creation.py b/zarr/creation.py index 1812e11a80..a651e8314f 100644 --- a/zarr/creation.py 
+++ b/zarr/creation.py
@@ -7,17 +7,18 @@

 from zarr.core import Array
-from zarr.storage import DirectoryStore, init_array, contains_array, \
+from zarr.frame import Frame
+from zarr.storage import DirectoryStore, init_array, init_frame, contains_array, \
     contains_group, default_compressor, normalize_storage_path
 from zarr.codecs import codec_registry
 from zarr.errors import err_contains_array, err_contains_group, \
     err_array_not_found


-def create(shape, chunks=None, dtype=None, compressor='default',
-           fill_value=0, order='C', store=None, synchronizer=None,
-           overwrite=False, path=None, chunk_store=None, filters=None,
-           cache_metadata=True, **kwargs):
+def create_array(shape, chunks=None, dtype=None, compressor='default',
+                 fill_value=0, order='C', store=None, synchronizer=None,
+                 overwrite=False, path=None, chunk_store=None, filters=None,
+                 cache_metadata=True, **kwargs):
     """Create an array.

     Parameters
@@ -81,8 +82,8 @@ def create(shape, chunks=None, dtype=None, compressor='default',

     # initialize array metadata
     init_array(store, shape=shape, chunks=chunks, dtype=dtype,
-               compressor=compressor, fill_value=fill_value, order=order,
-               overwrite=overwrite, path=path, chunk_store=chunk_store,
+               compressor=compressor, fill_value=fill_value, order=order,
+               overwrite=overwrite, path=path, chunk_store=chunk_store,
                filters=filters)

     # instantiate array
@@ -90,6 +91,69 @@ def create(shape, chunks=None, dtype=None, compressor='default',
               synchronizer=synchronizer, cache_metadata=cache_metadata)
     return z

+create = create_array
+
+def create_frame(nrows, columns, dtypes, chunks=None, compressor='default',
+                 store=None, synchronizer=None,
+                 overwrite=False, path=None, chunk_store=None, filters=None,
+                 cache_metadata=True, **kwargs):
+    """Create a frame.
+
+    Parameters
+    ----------
+    nrows : int
+        Number of rows in the frame.
+    columns : list of strings
+        Column names, one per column.
+    dtypes : list of dtypes
+        Column dtypes, one per column.
+    chunks : int or tuple of ints, optional
+        Chunk shape. If not provided, will be guessed from `nrows` and `dtypes`.
+    compressor : Codec, optional
+        Primary compressor.
+    store : MutableMapping or string
+        Store or path to directory in file system.
+    synchronizer : object, optional
+        Frame synchronizer.
+    overwrite : bool, optional
+        If True, delete all pre-existing data in `store` at `path` before
+        creating the frame.
+    path : string, optional
+        Path under which the frame is stored.
+    chunk_store : MutableMapping, optional
+        Separate storage for chunks. If not provided, `store` will be used
+        for storage of both chunks and metadata.
+    filters : sequence of Codecs, optional
+        Sequence of filters to use to encode chunk data prior to compression.
+    cache_metadata : bool, optional
+        If True, frame configuration metadata will be cached for the
+        lifetime of the object. If False, frame metadata will be reloaded
+        prior to all data access and modification operations (may incur
+        overhead depending on storage and data access pattern).
+
+    Returns
+    -------
+    z : zarr.frame.Frame
+
+    Examples
+    --------
+
+    """  # flake8: noqa
+
+    # handle polymorphic store arg
+    store = _handle_store_arg(store)
+
+    # compatibility
+    compressor, _ = _handle_kwargs(compressor, None, kwargs)
+
+    # initialize frame metadata
+    init_frame(store, nrows=nrows, columns=column, dtypes=dtypes, chunks=chunks,
+               compressor=compressor, overwrite=overwrite, path=path,
+               chunk_store=chunk_store, filters=filters)
+
+    # instantiate frame
+    z = Frame(store, path=path, chunk_store=chunk_store,
+              synchronizer=synchronizer, cache_metadata=cache_metadata)
+
+    return z


 def _handle_store_arg(store):
diff --git a/zarr/frame.py b/zarr/frame.py
index d73ad1609e..bc3ef18144 100644
--- a/zarr/frame.py
+++ b/zarr/frame.py
@@ -13,7 +13,10 @@
 from zarr.meta import decode_frame_metadata, encode_frame_metadata
 from zarr.attrs import Attributes
 from zarr.errors import PermissionError, err_read_only, err_frame_not_found
-from zarr.codecs import get_codec
+from zarr.codecs import get_codec, PickleCodec
+
+from pandas import DataFrame, concat
+from pandas.api.types import is_object_dtype, is_categorical_dtype


 class Frame(Base):
@@ -98,18 +101,27 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None,
         self._attrs = Attributes(store, key=akey, read_only=read_only,
                                  synchronizer=synchronizer)
         # create our arrays
+        filters = self._filters
         self._arrays = {}
         for c, dtype in zip(self._columns, self._dtypes):
             path = self._key_prefix + '/data/' + c
+
+            if is_object_dtype(dtype):
+                filters = self._filters
+                if filters is None:
+                    filters = []
+                filters += [PickleCodec()]
+            else:
+                filters = self._filters
             init_array(store,
-                       (self._nrows, 1),
-                       chunks=self._chunks,
+                       self._nrows,
+                       chunks=self._chunks[0],
                        dtype=dtype,
                        compressor=self._compressor,
                        path=path,
                        chunk_store=self._chunk_store,
-                       filters=self._filters)
-            self._arrays[c] = Array(store, path=path, read_only=True)
+                       filters=filters)
+            self._arrays[c] = Array(store, path=path, read_only=False)

     def _load_metadata_nosync(self):
         try:
@@ -123,7 +135,9 @@ def _load_metadata_nosync(self):
         meta = decode_frame_metadata(meta_bytes)
         self._meta = meta
         self._nrows = meta['nrows']
-        self._columns = meta['columns']
+
+        from pandas import Index
+        self._columns = Index(meta['columns'])
         self._dtypes = meta['dtypes']
         self._chunks = meta['chunks']

@@ -199,8 +213,27 @@ def __eq__(self, other):
             self.path == other.path
         )

+    def _array_to_series(self, c, indexer):
+        """
+        Return a pandas Series for the array with name c.
+        Raises KeyError if not found.
+        """
+        from pandas import Series
+        arr = self._arrays[c]
+        arr = arr[indexer]
+        return Series(arr, name=c)
+
+    def _series_to_array(self, c, indexer, value):
+        """
+        Set the array with name c for this value (a Series)
+        and the indexer
+        """
+        arr = self._arrays[c]
+        arr[indexer] = value.values
+
     def __getitem__(self, item):
-        """Retrieve a column or columns. Always returns a DataFrame of the requires column or columns.
+        """
+        Retrieve a column or columns. Always returns a DataFrame of the requested column or columns.
Returns
        -------
@@ -215,251 +248,31 @@ def __getitem__(self, item):
         """

         # refresh metadata
         if not self._cache_metadata:
             self._load_metadata()
-
-        import pdb; pdb.set_trace()
-
-
-        # normalize selection
-        selection = normalize_array_selection(item, self._shape)
-
-        # determine output array shape
-        out_shape = tuple(s.stop - s.start for s in selection
-                          if isinstance(s, slice))
-
-        # setup output array
-        out = np.empty(out_shape, dtype=self._dtype, order=self._order)
-
-        # determine indices of chunks overlapping the selection
-        chunk_range = get_chunk_range(selection, self._chunks)
-
-        # iterate over chunks in range
-        for cidx in itertools.product(*chunk_range):
-
-            # determine chunk offset
-            offset = [i * c for i, c in zip(cidx, self._chunks)]
-
-            # determine region within output array
-            out_selection = tuple(
-                slice(max(0, o - s.start),
-                      min(o + c - s.start, s.stop - s.start))
-                for s, o, c, in zip(selection, offset, self._chunks)
-                if isinstance(s, slice)
-            )
-
-            # determine region within chunk
-            chunk_selection = tuple(
-                slice(max(0, s.start - o), min(c, s.stop - o))
-                if isinstance(s, slice)
-                else s - o
-                for s, o, c in zip(selection, offset, self._chunks)
-            )
-
-            # obtain the destination array as a view of the output array
-            if out_selection:
-                dest = out[out_selection]
-            else:
-                dest = out
-
-            # load chunk selection into output array
-            self._chunk_getitem(cidx, chunk_selection, dest)
-
-        if out.shape:
-            return out
-        else:
-            return out[()]
+        columns = self._columns[item]
+        return concat([self._array_to_series(c, slice(None))
+                       for c in columns],
+                      axis=1)

     def __setitem__(self, item, value):
-        raise NotImplementedError("__setitem__ is not implemented")
-
-    def _chunk_getitem(self, cidx, item, dest):
-        """Obtain part or whole of a chunk.
-
-        Parameters
-        ----------
-        cidx : tuple of ints
-            Indices of the chunk.
-        item : tuple of slices
-            Location of region within the chunk.
-        dest : ndarray
-            Numpy array to store result in.
-        """
+        """
+        Set particular data. `item` refers to a column or columns.
+        The shape and dtypes must match the existing store.

-        try:
-
-            # obtain compressed data for chunk
-            ckey = self._chunk_key(cidx)
-            cdata = self._chunk_store[ckey]
-
-        except KeyError:
-
-            # chunk not initialized
-            if self._fill_value is not None:
-                dest.fill(self._fill_value)
-
-        else:
-
-            if is_total_slice(item, self._chunks) and \
-                    not self._filters and \
-                    ((self._order == 'C' and dest.flags.c_contiguous) or
-                     (self._order == 'F' and dest.flags.f_contiguous)):
-
-                # optimization: we want the whole chunk, and the destination is
-                # contiguous, so we can decompress directly from the chunk
-                # into the destination array
-                if self._compressor:
-                    self._compressor.decode(cdata, dest)
-                else:
-                    arr = np.frombuffer(cdata, dtype=self._dtype)
-                    arr = arr.reshape(self._chunks, order=self._order)
-                    np.copyto(dest, arr)
-
-            else:
-
-                # decode chunk
-                chunk = self._decode_chunk(cdata)
-
-                # set data in output array
-                # (split into two lines for profiling)
-                tmp = chunk[item]
-                if dest.shape:
-                    dest[:] = tmp
-                else:
-                    dest[()] = tmp
-
-    def _chunk_setitem(self, cidx, item, value):
-        """Replace part or whole of a chunk.
-
-        Parameters
-        ----------
-        cidx : tuple of ints
-            Indices of the chunk.
-        item : tuple of slices
-            Location of region within the chunk.
-        value : scalar or ndarray
-            Value to set.
- + Examples + -------- """ - # synchronization - if self._synchronizer is None: - self._chunk_setitem_nosync(cidx, item, value) - else: - # synchronize on the chunk - ckey = self._chunk_key(cidx) - with self._synchronizer[ckey]: - self._chunk_setitem_nosync(cidx, item, value) - - def _chunk_setitem_nosync(self, cidx, item, value): - - # obtain key for chunk storage - ckey = self._chunk_key(cidx) - - if is_total_slice(item, self._chunks): - # totally replace chunk - - # optimization: we are completely replacing the chunk, so no need - # to access the existing chunk data - - if np.isscalar(value): - - # setup array filled with value - chunk = np.empty(self._chunks, dtype=self._dtype, - order=self._order) - chunk.fill(value) - - else: - - if not self._compressor and not self._filters: - - # https://github.com/alimanfoo/zarr/issues/79 - # Ensure a copy is taken so we don't end up storing - # a view into someone else's array. - # N.B., this assumes that filters or compressor always - # take a copy and never attempt to apply encoding in-place. - chunk = np.array(value, dtype=self._dtype, - order=self._order) - - else: - # ensure array is contiguous - if self._order == 'F': - chunk = np.asfortranarray(value, dtype=self._dtype) - else: - chunk = np.ascontiguousarray(value, dtype=self._dtype) - - else: - # partially replace the contents of this chunk - - try: - - # obtain compressed data for chunk - cdata = self._chunk_store[ckey] - - except KeyError: - - # chunk not initialized - chunk = np.empty(self._chunks, dtype=self._dtype, - order=self._order) - if self._fill_value is not None: - chunk.fill(self._fill_value) - - else: - - # decode chunk - chunk = self._decode_chunk(cdata) - if not chunk.flags.writeable: - chunk = chunk.copy(order='K') - - # modify - chunk[item] = value - - # encode chunk - cdata = self._encode_chunk(chunk) - - # store - self._chunk_store[ckey] = cdata - - def _chunk_key(self, cidx): - return self._key_prefix + '.'.join(map(str, cidx)) - - def _decode_chunk(self, cdata): - - # decompress - if self._compressor: - chunk = self._compressor.decode(cdata) - else: - chunk = cdata - - # apply filters - if self._filters: - for f in self._filters[::-1]: - chunk = f.decode(chunk) - - # view as correct dtype - if isinstance(chunk, np.ndarray): - chunk = chunk.view(self._dtype) - else: - chunk = np.frombuffer(chunk, self._dtype) - - # reshape - chunk = chunk.reshape(self._chunks, order=self._order) - - return chunk - - def _encode_chunk(self, chunk): - - # apply filters - if self._filters: - for f in self._filters: - chunk = f.encode(chunk) + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() - # compress - if self._compressor: - cdata = self._compressor.encode(chunk) - else: - cdata = chunk + # normalize selection + columns = self._columns[item] + if not isinstance(value, DataFrame): + raise ValueError("setting must be with a DataFrame") - return cdata + for c in columns: + self._series_to_array(c, slice(None), value[c]) def _repr_nosync(self): diff --git a/zarr/storage.py b/zarr/storage.py index 023e4ceb5f..9d2f3af2a4 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -332,7 +332,7 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, def init_frame(store, nrows, columns, dtypes, chunks=None, overwrite=False, path=None, - chunk_store=None): + compressor='default', chunk_store=None, filters=None): """initialize a frame store. 
Parameters @@ -347,6 +347,8 @@ def init_frame(store, nrows, columns, dtypes, chunks=None, overwrite=False, path list of dtypes chunks : int or tuple of ints, optional Chunk shape. If not provided, will be guessed from `shape` and `dtype`. + compressor : Codec, optional + Primary compressor. overwrite : bool, optional If True, erase all data in `store` prior to initialisation. path : string, optional @@ -354,6 +356,8 @@ def init_frame(store, nrows, columns, dtypes, chunks=None, overwrite=False, path chunk_store : MutableMapping, optional Separate storage for chunks. If not provided, `store` will be used for storage of both chunks and metadata. + filters : sequence, optional + Sequence of filters to use to encode chunk data prior to compression. """ @@ -366,9 +370,9 @@ def init_frame(store, nrows, columns, dtypes, chunks=None, overwrite=False, path # initialise metadata _init_frame_metadata(store=store, nrows=nrows, columns=columns, - dtypes=dtypes, chunks=chunks, + dtypes=dtypes, chunks=chunks, compressor=compressor, overwrite=overwrite, path=path, - chunk_store=chunk_store) + chunk_store=chunk_store, filters=filters) def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, @@ -397,6 +401,7 @@ def _init_frame_metadata(store, nrows, columns, dtypes, chunks=None, raise ValueError("columns must be a list-like") if not len(dtypes) == len(columns): raise ValueError("number of columns must equal number of dtypes") + columns = list(columns) # chunks are based on the rows; treat each rows as singular chunks = normalize_chunks(chunks, (nrows, len(dtypes)), sum([dtype.itemsize for dtype in dtypes])) diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py index 43ae49c57f..8fbba3880b 100644 --- a/zarr/tests/test_frame.py +++ b/zarr/tests/test_frame.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from pandas.util.testing import assert_frame_equal +from pandas.util.testing import assert_frame_equal, assert_series_equal from nose.tools import eq_ as eq, assert_is_instance, \ assert_raises, assert_true, assert_false, assert_is, assert_is_none @@ -32,17 +32,17 @@ def test_frame_init(self): # normal initialization store = dict() init_frame(store, nrows=100, columns=['float', 'int'], dtypes=[np.float64, np.int64]) - fr = Frame(store) - assert_is_instance(fr, Frame) - - assert repr(fr) - eq(["float", "int"], fr.columns) - eq((100,2), fr.shape) - eq((100,2), fr.chunks) - eq(100, fr.nrows) - eq('', fr.path) - assert_is_none(fr.name) - assert_is(store, fr.store) + a = Frame(store) + assert_is_instance(a, Frame) + + assert repr(a) + assert_true(pd.Index(["float", "int"]).equals(a.columns)) + eq((100,2), a.shape) + eq((100,2), a.chunks) + eq(100, a.nrows) + eq('', a.path) + assert_is_none(a.name) + assert_is(store, a.store) def create_frame(self, read_only=False, **kwargs): store = dict() @@ -54,60 +54,28 @@ def test_frame(self): df = pd.DataFrame({'A': [1, 2, 3], 'B': [1., 2., 3.], 'C': pd.date_range('20130101', periods=3), 'D': ['foo', 'bar', 'baz']}, - columnslist('ABCD')) + columns=list('ABCD')) - import pdb; pdb.set_trace() - fr = self.create_frame(nrows=len(df), columns=df.columns, dtypes=df.dtypes.values) + a = self.create_frame(nrows=len(df), columns=df.columns, dtypes=df.dtypes.values) # check properties - eq(len(a), len(z)) - eq(a.ndim, z.ndim) - eq(a.shape, z.shape) - eq(a.dtype, z.dtype) - eq((100,), z.chunks) - eq(a.nbytes, z.nbytes) - eq(11, z.nchunks) - eq(0, z.nchunks_initialized) - eq((11,), z.cdata_shape) + eq(len(a), len(df)) + eq(a.ndim, df.ndim) + eq(a.shape, 
df.shape) # check empty - b = z[:] - assert_is_instance(b, np.ndarray) + b = a[:] + assert_is_instance(b, pd.DataFrame) eq(a.shape, b.shape) - eq(a.dtype, b.dtype) + assert_series_equal(b.dtypes, df.dtypes) # check attributes - z.attrs['foo'] = 'bar' - eq('bar', z.attrs['foo']) + a.attrs['foo'] = 'bar' + eq('bar', a.attrs['foo']) # set data - z[:] = a + a[:] = df - # check properties - eq(a.nbytes, z.nbytes) - eq(11, z.nchunks) - eq(11, z.nchunks_initialized) - - # check slicing - assert_array_equal(a, np.array(z)) - assert_array_equal(a, z[:]) - assert_array_equal(a, z[...]) - # noinspection PyTypeChecker - assert_array_equal(a, z[slice(None)]) - assert_array_equal(a[:10], z[:10]) - assert_array_equal(a[10:20], z[10:20]) - assert_array_equal(a[-10:], z[-10:]) - # ...across chunk boundaries... - assert_array_equal(a[:110], z[:110]) - assert_array_equal(a[190:310], z[190:310]) - assert_array_equal(a[-110:], z[-110:]) - # single item - eq(a[0], z[0]) - eq(a[-1], z[-1]) - - # check partial assignment - b = np.arange(1e5, 2e5) - z[190:310] = b[190:310] - assert_array_equal(a[:190], z[:190]) - assert_array_equal(b[190:310], z[190:310]) - assert_array_equal(a[310:], z[310:]) + # get data + result = a[:] + assert_frame_equal(result, df) From d7ef9e474163716aac9177fca6ea9def7a9b494f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Oct 2016 18:30:10 -0400 Subject: [PATCH 10/12] fixed --- zarr/creation.py | 27 +++++++++++++++++- zarr/frame.py | 60 +++++++++++++++++++++++++++------------- zarr/hierarchy.py | 2 +- zarr/tests/test_frame.py | 22 +++++++++++++++ 4 files changed, 90 insertions(+), 21 deletions(-) diff --git a/zarr/creation.py b/zarr/creation.py index a651e8314f..f0ad883617 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -4,6 +4,7 @@ import numpy as np +import pandas as pd from zarr.core import Array @@ -145,7 +146,7 @@ def create_frame(nrows, columns, dtypes, chunks=None, compressor='default', compressor, _ = _handle_kwargs(compressor, None, kwargs) # initialize frame metadata - init_frame(store, nrows=nrows, columns=column, dtypes=dtypes, chunks=chunks, + init_frame(store, nrows=nrows, columns=columns, dtypes=dtypes, chunks=chunks, compressor=compressor, overwrite=overwrite, path=path, chunk_store=chunk_store, filters=filters) @@ -377,6 +378,30 @@ def array(data, **kwargs): return z +def frame(data, **kwargs): + """Create a frame filled with `data`. 
+ + Examples + -------- + + """ # flake8: noqa + + if not isinstance(data, pd.DataFrame): + raise ValueError("data must be a DataFrame") + + # instantiate frame + kwargs['nrows'] = len(data) + kwargs['columns'] = data.columns + kwargs['dtypes'] = data.dtypes + + z = create_frame(**kwargs) + + # fill with data + z[:] = data + + return z + + def open_array(store=None, mode='a', shape=None, chunks=None, dtype=None, compressor='default', fill_value=0, order='C', synchronizer=None, filters=None, cache_metadata=True, diff --git a/zarr/frame.py b/zarr/frame.py index bc3ef18144..fe9f86190a 100644 --- a/zarr/frame.py +++ b/zarr/frame.py @@ -8,7 +8,7 @@ from zarr.util import is_total_slice, normalize_array_selection, \ get_chunk_range, human_readable_size, normalize_resize_args, \ normalize_storage_path, normalize_shape, normalize_chunks -from zarr.storage import (frame_meta_key, attrs_key, listdir, getsize, +from zarr.storage import (frame_meta_key, array_meta_key, attrs_key, listdir, getsize, init_array, init_group) from zarr.meta import decode_frame_metadata, encode_frame_metadata from zarr.attrs import Attributes @@ -16,7 +16,7 @@ from zarr.codecs import get_codec, PickleCodec from pandas import DataFrame, concat -from pandas.api.types import is_object_dtype, is_categorical_dtype +from pandas.api.types import is_object_dtype, is_categorical_dtype, is_datetime64_dtype class Frame(Base): @@ -102,26 +102,38 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, synchronizer=synchronizer) # create our arrays filters = self._filters + + self._dtypes_map = dict(zip(self._columns, self._dtypes)) self._arrays = {} for c, dtype in zip(self._columns, self._dtypes): path = self._key_prefix + '/data/' + c + mkey = path + '/' + array_meta_key - if is_object_dtype(dtype): - filters = self._filters - if filters is None: - filters = [] - filters += [PickleCodec()] - else: + # create / read our arrays + if mkey not in store: filters = self._filters - init_array(store, - self._nrows, - chunks=self._chunks[0], - dtype=dtype, - compressor=self._compressor, - path=path, - chunk_store=self._chunk_store, - filters=filters) - self._arrays[c] = Array(store, path=path, read_only=False) + if is_object_dtype(dtype): + filters = self._filters + if filters is None: + filters = [] + filters += [PickleCodec()] + elif is_datetime64_dtype(dtype): + dtype = 'i8' + + init_array(store, + self._nrows, + chunks=self._chunks[0], + dtype=dtype, + compressor=self._compressor, + path=path, + chunk_store=self._chunk_store, + filters=filters) + pass + + self._arrays[c] = Array(self._store, path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) def _load_metadata_nosync(self): try: @@ -221,15 +233,25 @@ def _array_to_series(self, c, indexer): from pandas import Series arr = self._arrays[c] arr = arr[indexer] - return Series(arr, name=c) + + # re-create the actual dtypes + dtype = self._dtypes_map[c] + return Series(arr, name=c, dtype=dtype) def _series_to_array(self, c, indexer, value): """ Set the array with name c for this value (a Series) and the indexer """ + arr = self._arrays[c] - arr[indexer] = value.values + + if is_datetime64_dtype(value): + value = value.values.view('i8') + else: + value = value.values + + arr[indexer] = value def __getitem__(self, item): """ diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index a03003114d..5c204d05df 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -12,7 +12,7 @@ from zarr.storage import contains_array, 
contains_group, contains_frame, \ init_group, DictStore, DirectoryStore, group_meta_key, attrs_key, \ listdir, rmdir -from zarr.creation import array, create, empty, zeros, ones, full, \ +from zarr.creation import array, create, frame, create_frame, empty, zeros, ones, full, \ empty_like, zeros_like, ones_like, full_like from zarr.util import normalize_storage_path, normalize_shape from zarr.errors import PermissionError, err_contains_array, \ diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py index 8fbba3880b..c5f6c48ae0 100644 --- a/zarr/tests/test_frame.py +++ b/zarr/tests/test_frame.py @@ -10,12 +10,14 @@ import numpy as np import pandas as pd +from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal from nose.tools import eq_ as eq, assert_is_instance, \ assert_raises, assert_true, assert_false, assert_is, assert_is_none from zarr.storage import (DirectoryStore, ZipStore, init_array, init_frame, init_group) +from zarr.hierarchy import group from zarr.core import Array from zarr.frame import Frame from zarr.errors import PermissionError @@ -79,3 +81,23 @@ def test_frame(self): # get data result = a[:] assert_frame_equal(result, df) + + def test_mixed_frame(self): + + np.random.seed(1234) + N = 10000 + ngroups = 10 + strings = tm.rands_array(10, 1000) + + df = pd.DataFrame({'A': np.arange(N), + 'B': np.random.randint(0, ngroups, size=N), + 'C': np.random.randn(N), + 'D': pd.date_range('20130101', periods=ngroups).take(np.random.randint(0, ngroups, size=N)), + #'E': pd.Series(strings.take(np.random.randint(0, ngroups, size=N))).astype('category')}) + #'f': strings.take(np.random.randint(0, ngroups, size=N)), + }) + + g = group('foo') + g['df'] = df + result = g['df'][:] + assert_frame_equal(result, df) From b60c9a19f746addd2b6721873a5b4d24d7ce20dc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Oct 2016 18:46:31 -0400 Subject: [PATCH 11/12] typo --- zarr/tests/test_hierarchy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index b407b1547e..679beb1e0b 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -18,7 +18,7 @@ from zarr.storage import DictStore, DirectoryStore, ZipStore, init_group, \ init_array, attrs_key, array_meta_key, group_meta_key from zarr.core import Array -from zarra.frame import Frame +from zarr.frame import Frame from zarr.hierarchy import Group, group, open_group from zarr.attrs import Attributes from zarr.errors import PermissionError From d3ce812995c155e785f7db2849c2e9a4e7dc60d8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 16 Oct 2016 13:24:00 -0400 Subject: [PATCH 12/12] use dict() for storage fix normalized array path on frame storage add pandas as dep (for now) --- requirements.txt | 1 + zarr/frame.py | 2 +- zarr/tests/test_frame.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5186c748c2..fe27ef6893 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy fasteners +pandas diff --git a/zarr/frame.py b/zarr/frame.py index fe9f86190a..e3ccf42514 100644 --- a/zarr/frame.py +++ b/zarr/frame.py @@ -106,7 +106,7 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, self._dtypes_map = dict(zip(self._columns, self._dtypes)) self._arrays = {} for c, dtype in zip(self._columns, self._dtypes): - path = self._key_prefix + '/data/' + c + path = 
normalize_storage_path(self._key_prefix + '/data/' + c) mkey = path + '/' + array_meta_key # create / read our arrays diff --git a/zarr/tests/test_frame.py b/zarr/tests/test_frame.py index c5f6c48ae0..0d975567c7 100644 --- a/zarr/tests/test_frame.py +++ b/zarr/tests/test_frame.py @@ -94,10 +94,10 @@ def test_mixed_frame(self): 'C': np.random.randn(N), 'D': pd.date_range('20130101', periods=ngroups).take(np.random.randint(0, ngroups, size=N)), #'E': pd.Series(strings.take(np.random.randint(0, ngroups, size=N))).astype('category')}) - #'f': strings.take(np.random.randint(0, ngroups, size=N)), + 'F': strings.take(np.random.randint(0, ngroups, size=N)), }) - g = group('foo') + g = group(store=dict()) g['df'] = df result = g['df'][:] assert_frame_equal(result, df)
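
To close, a few usage sketches distilled from the tests and docstrings above; they illustrate the API this series builds and assume the full twelve-patch series applies cleanly. First, the `create_frame` entry point added in PATCH 09 (and repaired in PATCH 10) wires `init_frame` and `Frame` together the same way `create_array` wires `init_array` and `Array`. A direct call mirroring `test_frame_init`; the dict-backed store and the default chunk guess are assumptions based on the patches:

    import numpy as np
    from zarr.creation import create_frame

    # a frame with two columns, backed by a plain dict store
    z = create_frame(nrows=100,
                     columns=['float', 'int'],
                     dtypes=[np.float64, np.int64],
                     store=dict())

    print(z.shape)   # (100, 2)
    print(z.nrows)   # 100
    print(z.chunks)  # (100, 2) -- a frame this small fits in one chunk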
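The chunk guess in `_init_frame_metadata` (corrected in PATCH 08 to use `(nrows, len(dtypes))` rather than `(nrows, 1)`) treats one row as a single item whose size is the sum of the column itemsizes. Shown standalone with the existing `normalize_chunks` helper; the printed value is what `test_frame_init` asserts:

    import numpy as np
    from zarr.util import normalize_chunks

    nrows = 100
    dtypes = [np.dtype(np.float64), np.dtype(np.int64)]
    rowsize = sum(dtype.itemsize for dtype in dtypes)   # 16 bytes per row

    # chunks=None lets zarr guess; small frames come back whole
    chunks = normalize_chunks(None, (nrows, len(dtypes)), rowsize)
    print(chunks)   # (100, 2)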
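`PickleCodec` (PATCH 09) is what makes object-dtype columns storable: the whole chunk is round-tripped through `pickle` instead of being treated as a fixed-width buffer. A minimal round-trip, assuming the patched `zarr.codecs` is importable:

    import numpy as np
    from zarr.codecs import PickleCodec

    codec = PickleCodec()
    strings = np.array(['foo', 'bar', 'baz'], dtype=object)

    buf = codec.encode(strings)     # pickle.dumps under the hood
    out = np.empty(3, dtype=object)
    codec.decode(buf, out=out)      # pickle.loads, then np.copyto into out
    assert (out == strings).all()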
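For datetime64 columns, PATCH 10 stores the raw 'i8' view and relies on the pandas Series constructor to re-create the dtype on the way out. This is the `_series_to_array` / `_array_to_series` pair in miniature, as a standalone sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.date_range('20130101', periods=3), name='D')

    raw = s.values.view('i8')       # what _series_to_array writes to the Array
    restored = pd.Series(raw, name='D', dtype='datetime64[ns]')
    assert restored.equals(s)       # what _array_to_series hands back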
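Finally, the end-to-end flow exercised by `test_frame` and `test_mixed_frame`: assign a DataFrame into a group (which, judging by the hierarchy import changes in PATCH 10, routes through `frame` / `create_frame` and stores one zarr Array per column under `<path>/data/<column>`), then slice it back out:

    import pandas as pd
    from pandas.util.testing import assert_frame_equal
    from zarr.hierarchy import group

    df = pd.DataFrame({'A': [1, 2, 3],
                       'B': [1., 2., 3.],
                       'C': pd.date_range('20130101', periods=3),
                       'D': ['foo', 'bar', 'baz']},
                      columns=list('ABCD'))

    g = group(store=dict())
    g['df'] = df                 # store the frame, column by column
    result = g['df'][:]          # Frame.__getitem__ rebuilds a DataFrame

    assert_frame_equal(result, df)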