From f5c1b575806e9b3af671bcc277bd3a1e3a57cece Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Fri, 10 Nov 2017 19:26:39 -0800
Subject: [PATCH 1/3] Fix "Chunksize cannot exceed dimension size"

Fixes GH1225
---
 doc/whats-new.rst             |  2 ++
 xarray/backends/netCDF4_.py   | 10 ++++++----
 xarray/tests/test_backends.py | 16 ++++++++++++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index dd775417132..041e4c39837 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -49,6 +49,8 @@ Bug fixes
 
 - Fixed ``apply_ufunc`` with ``dask='parallelized'`` for scalar arguments
   (:issue:`1697`).
+- Fix "Chunksize cannot exceed dimension size" error when writing netCDF4 files
+  loaded from disk (:issue:`1225`). By `Stephan Hoyer <https://github.com/shoyer>`_.
 
 Testing
 ~~~~~~~
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 87bea8fe07c..7bfadcc7509 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -156,10 +156,12 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
     if lsd_okay:
         valid_encodings.add('least_significant_digit')
 
-    if (encoding.get('chunksizes') is not None and
-            (encoding.get('original_shape', variable.shape) !=
-             variable.shape) and not raise_on_invalid):
-        del encoding['chunksizes']
+    if not raise_on_invalid and 'chunksizes' in encoding:
+        chunks_too_big = any(
+            c > d for c, d in zip(encoding['chunksizes'], variable.shape))
+        changed_shape = encoding.get('original_shape') != variable.shape
+        if chunks_too_big or changed_shape:
+            del encoding['chunksizes']
 
     for k in safe_to_drop:
         if k in encoding:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 53cb3c6d58f..03d05faa422 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -909,6 +909,21 @@ def test_compression_encoding(self):
         with self.roundtrip(expected) as actual:
             self.assertDatasetEqual(expected, actual)
 
+    def test_encoding_chunksizes_unlimited(self):
+        # regression test for GH1225
+        ds = Dataset({'x': [1, 2, 3], 'y': ('x', [2, 3, 4])})
+        ds.variables['x'].encoding = {
+            'zlib': False,
+            'shuffle': False,
+            'complevel': 0,
+            'fletcher32': False,
+            'contiguous': False,
+            'chunksizes': (2 ** 20,),
+            'original_shape': (3,),
+        }
+        with self.roundtrip(ds) as actual:
+            self.assertDatasetEqual(ds, actual)
+
     def test_mask_and_scale(self):
         with create_tmp_file() as tmp_file:
             with nc4.Dataset(tmp_file, mode='w') as nc:
@@ -1230,6 +1245,7 @@ def test_encoding_unlimited_dims(self):
                         save_kwargs=dict(unlimited_dims=['y'])) as actual:
             self.assertEqual(actual.encoding['unlimited_dims'], set('y'))
             self.assertDatasetEqual(ds, actual)
+        ds.encoding = {'unlimited_dims': ['y']}
         with self.roundtrip(ds) as actual:
             self.assertEqual(actual.encoding['unlimited_dims'], set('y'))
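
A minimal sketch of the failure mode this first patch addresses (illustrative
only: the file names are assumptions, and the exact default chunk sizes chosen
for unlimited dimensions depend on the netCDF4-python version):

import numpy as np
import xarray as xr

# Write a file whose 'x' dimension is unlimited; netCDF4 must chunk variables
# along unlimited dimensions, and the chunk size along 'x' need not match the
# dimension's current length.
ds = xr.Dataset({'y': ('x', np.arange(3))})
ds.to_netcdf('unlimited.nc', unlimited_dims=['x'])

# Reading the file back records the on-disk 'chunksizes' in .encoding. If a
# chunk size exceeds the now-fixed dimension length, re-saving used to fail
# with "Chunksize cannot exceed dimension size"; after this series the stale
# 'chunksizes' entry is dropped instead.
loaded = xr.open_dataset('unlimited.nc')
loaded.to_netcdf('resaved.nc')
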
From 069576f9ef0f56412cb44b38992020441a7e9e3b Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Sat, 11 Nov 2017 17:46:40 -0800
Subject: [PATCH 2/3] Fix chunksizes not iterable

---
 xarray/backends/netCDF4_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 7bfadcc7509..4a1bf9a8bbe 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -156,7 +156,7 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
     if lsd_okay:
         valid_encodings.add('least_significant_digit')
 
-    if not raise_on_invalid and 'chunksizes' in encoding:
+    if not raise_on_invalid and encoding.get('chunksizes') is not None:
         chunks_too_big = any(
             c > d for c, d in zip(encoding['chunksizes'], variable.shape))
         changed_shape = encoding.get('original_shape') != variable.shape

From 50e9c762e826ebf58cc1cc6f2655bb86cb7ef5e1 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Sat, 11 Nov 2017 21:05:46 -0800
Subject: [PATCH 3/3] Only remove big chunks if dimension is not unlimited

---
 xarray/backends/netCDF4_.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 4a1bf9a8bbe..5c80ae690b7 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -147,7 +147,11 @@ def _force_native_endianness(var):
 
 
 def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
-                                   lsd_okay=True, backend='netCDF4'):
+                                   lsd_okay=True, backend='netCDF4',
+                                   unlimited_dims=None):
+    if unlimited_dims is None:
+        unlimited_dims = ()
+
     encoding = variable.encoding.copy()
 
     safe_to_drop = set(['source', 'original_shape'])
@@ -157,8 +161,13 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
         valid_encodings.add('least_significant_digit')
 
     if not raise_on_invalid and encoding.get('chunksizes') is not None:
+        # It's possible to get encoded chunksizes larger than a dimension size
+        # if the original file had an unlimited dimension. This is problematic
+        # if the new file no longer has an unlimited dimension.
+        chunksizes = encoding['chunksizes']
         chunks_too_big = any(
-            c > d for c, d in zip(encoding['chunksizes'], variable.shape))
+            c > d and dim not in unlimited_dims
+            for c, d, dim in zip(chunksizes, variable.shape, variable.dims))
         changed_shape = encoding.get('original_shape') != variable.shape
         if chunks_too_big or changed_shape:
             del encoding['chunksizes']
@@ -348,7 +357,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
                 'NC_CHAR type.' % name)
 
         encoding = _extract_nc4_variable_encoding(
-            variable, raise_on_invalid=check_encoding)
+            variable, raise_on_invalid=check_encoding,
+            unlimited_dims=unlimited_dims)
 
         nc4_var = self.ds.createVariable(
             varname=name, datatype=datatype,
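
Taken together, the three patches reduce to the following check. This is a
standalone sketch, not xarray's actual API: the helper name and the plain-dict
interface are simplifications of _extract_nc4_variable_encoding above.

def _drop_stale_chunksizes(encoding, shape, dims, unlimited_dims=()):
    # Drop 'chunksizes' when any chunk exceeds a fixed (non-unlimited)
    # dimension, or when the variable's shape changed since it was read.
    if encoding.get('chunksizes') is None:
        return encoding
    chunks_too_big = any(
        c > d and dim not in unlimited_dims
        for c, d, dim in zip(encoding['chunksizes'], shape, dims))
    changed_shape = encoding.get('original_shape') != shape
    if chunks_too_big or changed_shape:
        encoding = dict(encoding)
        del encoding['chunksizes']
    return encoding

# The GH1225 case: a 2**20 chunk along a fixed dimension of length 3 is
# dropped, but kept when that dimension will be written as unlimited.
enc = {'chunksizes': (2 ** 20,), 'original_shape': (3,)}
assert 'chunksizes' not in _drop_stale_chunksizes(enc, shape=(3,), dims=('x',))
assert 'chunksizes' in _drop_stale_chunksizes(
    enc, shape=(3,), dims=('x',), unlimited_dims={'x'})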