diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index dd775417132..041e4c39837 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -49,6 +49,8 @@ Bug fixes
 
 - Fixed ``apply_ufunc`` with ``dask='parallelized'`` for scalar arguments
   (:issue:`1697`).
+- Fix "Chunksize cannot exceed dimension size" error when writing netCDF4 files
+  loaded from disk (:issue:`1225`). By `Stephan Hoyer <https://github.com/shoyer>`_.
 
 Testing
 ~~~~~~~
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 87bea8fe07c..5c80ae690b7 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -147,7 +147,11 @@ def _force_native_endianness(var):
 
 
 def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
-                                   lsd_okay=True, backend='netCDF4'):
+                                   lsd_okay=True, backend='netCDF4',
+                                   unlimited_dims=None):
+    if unlimited_dims is None:
+        unlimited_dims = ()
+
     encoding = variable.encoding.copy()
 
     safe_to_drop = set(['source', 'original_shape'])
@@ -156,10 +160,17 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
     if lsd_okay:
         valid_encodings.add('least_significant_digit')
 
-    if (encoding.get('chunksizes') is not None and
-            (encoding.get('original_shape', variable.shape) !=
-             variable.shape) and not raise_on_invalid):
-        del encoding['chunksizes']
+    if not raise_on_invalid and encoding.get('chunksizes') is not None:
+        # It's possible to get encoded chunksizes larger than a dimension size
+        # if the original file had an unlimited dimension. This is problematic
+        # if the new file no longer has an unlimited dimension.
+        chunksizes = encoding['chunksizes']
+        chunks_too_big = any(
+            c > d and dim not in unlimited_dims
+            for c, d, dim in zip(chunksizes, variable.shape, variable.dims))
+        changed_shape = encoding.get('original_shape') != variable.shape
+        if chunks_too_big or changed_shape:
+            del encoding['chunksizes']
 
     for k in safe_to_drop:
         if k in encoding:
@@ -346,7 +357,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
                 'NC_CHAR type.' % name)
 
         encoding = _extract_nc4_variable_encoding(
-            variable, raise_on_invalid=check_encoding)
+            variable, raise_on_invalid=check_encoding,
+            unlimited_dims=unlimited_dims)
 
         nc4_var = self.ds.createVariable(
             varname=name, datatype=datatype,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 53cb3c6d58f..03d05faa422 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -909,6 +909,21 @@ def test_compression_encoding(self):
         with self.roundtrip(expected) as actual:
             self.assertDatasetEqual(expected, actual)
 
+    def test_encoding_chunksizes_unlimited(self):
+        # regression test for GH1225
+        ds = Dataset({'x': [1, 2, 3], 'y': ('x', [2, 3, 4])})
+        ds.variables['x'].encoding = {
+            'zlib': False,
+            'shuffle': False,
+            'complevel': 0,
+            'fletcher32': False,
+            'contiguous': False,
+            'chunksizes': (2 ** 20,),
+            'original_shape': (3,),
+        }
+        with self.roundtrip(ds) as actual:
+            self.assertDatasetEqual(ds, actual)
+
     def test_mask_and_scale(self):
         with create_tmp_file() as tmp_file:
             with nc4.Dataset(tmp_file, mode='w') as nc:
@@ -1230,6 +1245,7 @@ def test_encoding_unlimited_dims(self):
                             save_kwargs=dict(unlimited_dims=['y'])) as actual:
             self.assertEqual(actual.encoding['unlimited_dims'], set('y'))
             self.assertDatasetEqual(ds, actual)
+        ds.encoding = {'unlimited_dims': ['y']}
         with self.roundtrip(ds) as actual:
             self.assertEqual(actual.encoding['unlimited_dims'], set('y'))
             self.assertDatasetEqual(ds, actual)
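For review convenience, the new encoding check restated in isolation; a minimal
sketch, where the helper name and the sample values are illustrative rather
than part of the patch:

    def drop_stale_chunksizes(encoding, shape, dims, unlimited_dims=()):
        """Mirror the logic added to _extract_nc4_variable_encoding."""
        encoding = dict(encoding)
        chunksizes = encoding.get('chunksizes')
        if chunksizes is not None:
            # Drop 'chunksizes' if any chunk exceeds its dimension length
            # (unless that dimension is unlimited), or if the shape changed.
            chunks_too_big = any(
                c > d and dim not in unlimited_dims
                for c, d, dim in zip(chunksizes, shape, dims))
            changed_shape = encoding.get('original_shape') != shape
            if chunks_too_big or changed_shape:
                del encoding['chunksizes']
        return encoding

    # A 2 ** 20 chunksize on a length-3 fixed dimension gets dropped...
    enc = {'chunksizes': (2 ** 20,), 'original_shape': (3,)}
    assert 'chunksizes' not in drop_stale_chunksizes(enc, (3,), ('x',))
    # ...but survives when 'x' is unlimited, matching netCDF4 semantics.
    assert 'chunksizes' in drop_stale_chunksizes(
        enc, (3,), ('x',), unlimited_dims={'x'})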
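And the user-visible failure mode this fixes, as a round-trip sketch (file
names are illustrative; assumes the netCDF4 backend is available):

    import xarray as xr

    ds = xr.Dataset({'y': ('x', [2.0, 3.0, 4.0])}, coords={'x': [1, 2, 3]})
    # Writing with an unlimited dimension lets the netCDF4 library pick a
    # default chunksize that can exceed the dimension's current length.
    ds.to_netcdf('unlimited.nc', unlimited_dims=['x'])

    # Re-saving the reloaded dataset (now without an unlimited dimension)
    # used to raise "Chunksize cannot exceed dimension size"; with this
    # patch the stale 'chunksizes' encoding is dropped instead.
    with xr.open_dataset('unlimited.nc') as reloaded:
        reloaded.to_netcdf('roundtrip.nc')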