Skip to content

Opening from zarr.ZipStore fails to read (store???) unicode characters #3815

Open
@hmaarrfk

Description

@hmaarrfk

See upstream: zarr-developers/zarr-python#551

It seems that using a ZipStore creates 1 byte objects for Unicode string attributes.

For example, saving the same Dataset with a DirectoryStore and with a ZipStore creates an attribute for a unicode array that is 20 bytes in size in the former, but only 1 byte in size in the latter.

In fact, Ubuntu's File Roller won't even let me extract the files from the resulting zip archive.

I have a feeling it is due to the following note in the zarr documentation:

Note that Zip files do not provide any way to remove or replace existing entries.

https://zarr.readthedocs.io/en/stable/api/storage.html#zarr.storage.ZipStore

MCVE Code Sample

ZipStore

import xarray as xr
import zarr
x = xr.Dataset()
x['hello'] = 'world'
x
with zarr.ZipStore('test_store.zip', mode='w') as store:
    x.to_zarr(store)
with zarr.ZipStore('test_store.zip', mode='r') as store:
    x_read = xr.open_zarr(store).compute()
Issued error
---------------------------------------------------------------------------
BadZipFile                                Traceback (most recent call last)
<ipython-input-1-2a92a6db56ab> in <module>
      7     x.to_zarr(store)
      8 with zarr.ZipStore('test_store.zip', mode='r') as store:
----> 9     x_read = xr.open_zarr(store).compute()

~/miniconda3/envs/dev/lib/python3.7/site-packages/xarray/core/dataset.py in compute(self, **kwargs)
    803         """
    804         new = self.copy(deep=False)
--> 805         return new.load(**kwargs)
    806 
    807     def _persist_inplace(self, **kwargs) -> "Dataset":

~/miniconda3/envs/dev/lib/python3.7/site-packages/xarray/core/dataset.py in load(self, **kwargs)
    655         for k, v in self.variables.items():
    656             if k not in lazy_data:
--> 657                 v.load()
    658 
    659         return self

~/miniconda3/envs/dev/lib/python3.7/site-packages/xarray/core/variable.py in load(self, **kwargs)
    370             self._data = as_compatible_data(self._data.compute(**kwargs))
    371         elif not hasattr(self._data, "__array_function__"):
--> 372             self._data = np.asarray(self._data)
    373         return self
    374 

~/miniconda3/envs/dev/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     83 
     84     """
---> 85     return array(a, dtype, copy=False, order=order)
     86 
     87 

~/miniconda3/envs/dev/lib/python3.7/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    545     def __array__(self, dtype=None):
    546         array = as_indexable(self.array)
--> 547         return np.asarray(array[self.key], dtype=None)
    548 
    549     def transpose(self, order):

~/miniconda3/envs/dev/lib/python3.7/site-packages/xarray/backends/zarr.py in __getitem__(self, key)
     46         array = self.get_array()
     47         if isinstance(key, indexing.BasicIndexer):
---> 48             return array[key.tuple]
     49         elif isinstance(key, indexing.VectorizedIndexer):
     50             return array.vindex[

~/miniconda3/envs/dev/lib/python3.7/site-packages/zarr/core.py in __getitem__(self, selection)
    570 
    571         fields, selection = pop_fields(selection)
--> 572         return self.get_basic_selection(selection, fields=fields)
    573 
    574     def get_basic_selection(self, selection=Ellipsis, out=None, fields=None):

~/miniconda3/envs/dev/lib/python3.7/site-packages/zarr/core.py in get_basic_selection(self, selection, out, fields)
    693         if self._shape == ():
    694             return self._get_basic_selection_zd(selection=selection, out=out,
--> 695                                                 fields=fields)
    696         else:
    697             return self._get_basic_selection_nd(selection=selection, out=out,

~/miniconda3/envs/dev/lib/python3.7/site-packages/zarr/core.py in _get_basic_selection_zd(self, selection, out, fields)
    709             # obtain encoded data for chunk
    710             ckey = self._chunk_key((0,))
--> 711             cdata = self.chunk_store[ckey]
    712 
    713         except KeyError:

~/miniconda3/envs/dev/lib/python3.7/site-packages/zarr/storage.py in __getitem__(self, key)
   1249         with self.mutex:
   1250             with self.zf.open(key) as f:  # will raise KeyError
-> 1251                 return f.read()
   1252 
   1253     def __setitem__(self, key, value):

~/miniconda3/envs/dev/lib/python3.7/zipfile.py in read(self, n)
    914             self._offset = 0
    915             while not self._eof:
--> 916                 buf += self._read1(self.MAX_N)
    917             return buf
    918 

~/miniconda3/envs/dev/lib/python3.7/zipfile.py in _read1(self, n)
   1018         if self._left <= 0:
   1019             self._eof = True
-> 1020         self._update_crc(data)
   1021         return data
   1022 

~/miniconda3/envs/dev/lib/python3.7/zipfile.py in _update_crc(self, newdata)
    946         # Check the CRC if we're at the end of the file
    947         if self._eof and self._running_crc != self._expected_crc:
--> 948             raise BadZipFile("Bad CRC-32 for file %r" % self.name)
    949 
    950     def read1(self, n):

BadZipFile: Bad CRC-32 for file 'hello/0'
0
2
Untitled10.ipynb

Working Directory Store example

import xarray as xr
import zarr
x = xr.Dataset()
x['hello'] = 'world'
x
store = zarr.DirectoryStore('test_store2.zarr')
x.to_zarr(store)
x_read = xr.open_zarr(store)
x_read.compute()
assert x_read.hello == x.hello

Expected Output

The string metadata should work.

Output of xr.show_versions()

INSTALLED VERSIONS
------------------
commit: None
python: 3.7.6 | packaged by conda-forge | (default, Jan  7 2020, 22:33:48) 
[GCC 7.3.0]
python-bits: 64
OS: Linux
OS-release: 5.3.0-40-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_CA.UTF-8
LOCALE: en_CA.UTF-8
libhdf5: None
libnetcdf: None

xarray: 0.14.1
pandas: 1.0.0
numpy: 1.17.5
scipy: 1.4.1
netCDF4: None
pydap: None
h5netcdf: None
h5py: None
Nio: None
zarr: 2.4.0
cftime: None
nc_time_axis: None
PseudoNetCDF: None
rasterio: None
cfgrib: None
iris: None
bottleneck: None
dask: 2.10.1
distributed: 2.10.0
matplotlib: 3.1.3
cartopy: None
seaborn: None
numbagg: None
setuptools: 45.1.0.post20200119
pip: 20.0.2
conda: None
pytest: 5.3.1
IPython: 7.12.0
sphinx: 2.3.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions