Merge pull request #251 from xray/encoding-improvements

shoyer · shoyer · commit 6c5d272f09fe · 2014-10-13T17:23:33.000-07:00
Encoding improvements
diff --git a/xray/backends/netCDF4_.py b/xray/backends/netCDF4_.py
@@ -86,24 +86,19 @@ class NetCDF4DataStore(AbstractWritableDataStore):
     This store supports NetCDF3, NetCDF4 and OpenDAP datasets.
     """
     def __init__(self, filename, mode='r', clobber=True, diskless=False,
-                 persist=False, format='NETCDF4', group=None,
-                 *args, **kwdargs):
+                 persist=False, format='NETCDF4', group=None):
         import netCDF4 as nc4
         ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
                          diskless=diskless, persist=persist,
                          format=format)
         self.ds = _nc4_group(ds, group)
         self.format = format
         self._filename = filename
-        self._encoder_args = args
-        self._encoder_kwdargs = kwdargs
 
     def store(self, variables, attributes):
         # All NetCDF files get CF encoded by default, without this attempting
         # to write times, for example, would fail.
-        cf_variables, cf_attrs = cf_encoder(variables, attributes,
-                                            *self._encoder_args,
-                                            **self._encoder_kwdargs)
+        cf_variables, cf_attrs = cf_encoder(variables, attributes)
         AbstractWritableDataStore.store(self, cf_variables, cf_attrs)
 
     def open_store_variable(self, var):
diff --git a/xray/backends/scipy_.py b/xray/backends/scipy_.py
@@ -34,8 +34,7 @@ class ScipyDataStore(AbstractWritableDataStore):
 
     It only supports the NetCDF3 file-format.
     """
-    def __init__(self, filename_or_obj, mode='r', mmap=None,
-                 version=1, *args, **kwdargs):
+    def __init__(self, filename_or_obj, mode='r', mmap=None, version=1):
         import scipy
         if mode != 'r' and scipy.__version__ < '0.13':
             warnings.warn('scipy %s detected; '
@@ -53,15 +52,11 @@ def __init__(self, filename_or_obj, mode='r', mmap=None,
             filename_or_obj = BytesIO(filename_or_obj)
         self.ds = scipy.io.netcdf.netcdf_file(
             filename_or_obj, mode=mode, mmap=mmap, version=version)
-        self._encoder_args = args
-        self._encoder_kwdargs = kwdargs
 
     def store(self, variables, attributes):
         # All Scipy objects get CF encoded by default, without this attempting
         # to write times, for example, would fail.
-        cf_variables, cf_attrs = cf_encoder(variables, attributes,
-                                            *self._encoder_args,
-                                            **self._encoder_kwdargs)
+        cf_variables, cf_attrs = cf_encoder(variables, attributes)
         AbstractWritableDataStore.store(self, cf_variables, cf_attrs)
 
     def open_store_variable(self, var):
diff --git a/xray/conventions.py b/xray/conventions.py
@@ -1,3 +1,4 @@
+import functools
 import numpy as np
 import pandas as pd
 import warnings
@@ -372,6 +373,73 @@ def pop_to(source, dest, key, default=None):
     return value
 
 
+def _var_as_tuple(var):
+    return var.dims, var.values, var.attrs.copy(), var.encoding.copy()
+
+
+def maybe_encode_datetime(var):
+    if (np.issubdtype(var.dtype, np.datetime64)
+            or (var.dtype.kind == 'O'
+                and isinstance(var.values.flat[0], datetime))):
+
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        if 'units' in attrs or 'calendar' in attrs:
+            raise ValueError(
+                "Failed hard to prevent overwriting 'units' or 'calendar'")
+
+        (values, units, calendar) = encode_cf_datetime(
+            values, encoding.pop('units', None), encoding.pop('calendar', None))
+        attrs['units'] = units
+        attrs['calendar'] = calendar
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
+def maybe_encode_offset_and_scale(var, needs_copy=True):
+    if any(k in var.encoding for k in ['add_offset', 'scale_factor']):
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        values = np.array(values, dtype=float, copy=needs_copy)
+        needs_copy = False
+        if 'add_offset' in encoding:
+            values -= pop_to(encoding, attrs, 'add_offset')
+        if 'scale_factor' in encoding:
+            values /= pop_to(encoding, attrs, 'scale_factor')
+        var = Variable(dims, values, attrs, encoding)
+    return var, needs_copy
+
+
+def maybe_encode_fill_value(var, needs_copy=True):
+    # replace NaN with the fill value
+    if '_FillValue' in var.encoding:
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        fill_value = pop_to(encoding, attrs, '_FillValue')
+        if not pd.isnull(fill_value):
+            missing = pd.isnull(values)
+            if missing.any():
+                if needs_copy:
+                    values = values.copy()
+                    needs_copy = False
+                values[missing] = fill_value
+        var = Variable(dims, values, attrs, encoding)
+    return var, needs_copy
+
+
+def maybe_encode_dtype(var, needs_copy=True):
+    if 'dtype' in var.encoding:
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        dtype = np.dtype(encoding.pop('dtype'))
+        if dtype.kind != 'O':
+            if np.issubdtype(dtype, int):
+                out = np.empty_like(values) if needs_copy else values
+                values = np.around(values, out=out)  # keep rounded data
+            if dtype == 'S1' and values.dtype != 'S1':
+                values = string_to_char(np.asarray(values, 'S'))
+                dims = dims + ('string%s' % values.shape[-1],)
+            values = np.asarray(values, dtype=dtype)
+            var = Variable(dims, values, attrs, encoding)
+    return var
+
+
 def _infer_dtype(array):
     """Given an object array with no missing values, infer its dtype from its
     first element
@@ -390,7 +458,36 @@ def _infer_dtype(array):
     return dtype
 
 
-def encode_cf_variable(var):
+def ensure_dtype_not_object(var):
+    # TODO: move this from conventions to backends? (it's not CF related)
+    if var.dtype.kind == 'O':
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        missing = pd.isnull(values)
+        if missing.any():
+            non_missing_values = values[~missing]
+            inferred_dtype = _infer_dtype(non_missing_values)
+
+            if inferred_dtype.kind in ['S', 'U']:
+                # There is no safe bit-pattern for NA in typical binary string
+                # formats, so we can't set a fill_value. Unfortunately, this
+                # means we won't be able to restore string arrays with missing
+                # values.
+                fill_value = ''
+            else:
+                # insist on using float for numeric values
+                if not np.issubdtype(inferred_dtype, float):
+                    inferred_dtype = np.dtype(float)
+                fill_value = np.nan
+
+            values = np.array(values, dtype=inferred_dtype, copy=True)
+            values[missing] = fill_value
+        else:
+            values = np.asarray(values, dtype=_infer_dtype(values))
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
+def encode_cf_variable(var, needs_copy=True):
     """
     Converts an Variable into an Variable which follows some
     of the CF conventions:
@@ -410,86 +507,12 @@ def encode_cf_variable(var):
     out : xray.Variable
         A variable which has been encoded as described above.
     """
-    dimensions = var.dims
-    data = var.values
-    attributes = var.attrs.copy()
-    encoding = var.encoding.copy()
-
-    # convert datetimes into numbers
-    if (np.issubdtype(data.dtype, np.datetime64)
-            or (data.dtype.kind == 'O'
-                and isinstance(data.reshape(-1)[0], datetime))):
-        if 'units' in attributes or 'calendar' in attributes:
-            raise ValueError(
-                "Failed hard to prevent overwriting 'units' or 'calendar'")
-        (data, units, calendar) = encode_cf_datetime(
-            data, encoding.pop('units', None), encoding.pop('calendar', None))
-        attributes['units'] = units
-        attributes['calendar'] = calendar
-
-    # unscale/mask
-    if any(k in encoding for k in ['add_offset', 'scale_factor']):
-        data = np.array(data, dtype=float, copy=True)
-        if 'add_offset' in encoding:
-            data -= pop_to(encoding, attributes, 'add_offset')
-        if 'scale_factor' in encoding:
-            data /= pop_to(encoding, attributes, 'scale_factor')
-
-    # replace NaN with the fill value
-    if '_FillValue' in encoding:
-        fill_value = pop_to(encoding, attributes, '_FillValue')
-        if not pd.isnull(fill_value):
-            missing = pd.isnull(data)
-            if missing.any():
-                data = data.copy()
-                data[missing] = fill_value
-
-    # replace NaN with the missing_value
-    if 'missing_value' in encoding:
-        missing_value = pop_to(encoding, attributes, 'missing_value')
-        if not pd.isnull(missing_value):
-            missing = pd.isnull(data)
-            if missing.any():
-                data = data.copy()
-                data[missing] = missing_value
-
-    # cast to encoded dtype
-    if 'dtype' in encoding:
-        dtype = np.dtype(encoding.pop('dtype'))
-        if dtype.kind != 'O':
-            if np.issubdtype(dtype, int):
-                data = data.round()
-            if dtype == 'S1' and data.dtype != 'S1':
-                data = string_to_char(np.asarray(data, 'S'))
-                dimensions = dimensions + ('string%s' % data.shape[-1],)
-            data = np.asarray(data, dtype=dtype)
-
-    # infer a valid dtype if necessary
-    # TODO: move this from conventions to backends (it's not CF related)
-    if data.dtype.kind == 'O':
-        missing = pd.isnull(data)
-        if missing.any():
-            non_missing_data = data[~missing]
-            inferred_dtype = _infer_dtype(non_missing_data)
-
-            if inferred_dtype.kind in ['S', 'U']:
-                # There is no safe bit-pattern for NA in typical binary string
-                # formats, we so can't set a fill_value. Unfortunately, this
-                # means we won't be able to restore string arrays with missing
-                # values.
-                fill_value = ''
-            else:
-                # insist on using float for numeric data
-                if not np.issubdtype(inferred_dtype, float):
-                    inferred_dtype = np.dtype(float)
-                fill_value = np.nan
-
-            data = np.array(data, dtype=inferred_dtype, copy=True)
-            data[missing] = fill_value
-        else:
-            data = np.asarray(data, dtype=_infer_dtype(data))
-
-    return Variable(dimensions, data, attributes, encoding=encoding)
+    var = maybe_encode_datetime(var)
+    var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
+    var, needs_copy = maybe_encode_fill_value(var, needs_copy)
+    var = maybe_encode_dtype(var, needs_copy)
+    var = ensure_dtype_not_object(var)
+    return var
 
 
 def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
@@ -539,15 +562,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
             data = CharToStringArray(data)
 
     if mask_and_scale:
-        # missing_value is deprecated, but we still want to support it.
-        missing_value = pop_to(attributes, encoding, 'missing_value')
+        if 'missing_value' in attributes:
+            # missing_value is deprecated, but we still want to support it as
+            # an alias for _FillValue.
+            assert ('_FillValue' not in attributes
+                    or utils.equivalent(attributes['_FillValue'],
+                                        attributes['missing_value']))
+            attributes['_FillValue'] = attributes.pop('missing_value')
+
         fill_value = pop_to(attributes, encoding, '_FillValue')
-        # if missing_value is given but not fill_value we use missing_value
-        if fill_value is None and missing_value is not None:
-            fill_value = missing_value
-        # if both were given we make sure they are the same.
-        if fill_value is not None and missing_value is not None:
-            assert fill_value == missing_value
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
         if ((fill_value is not None and not pd.isnull(fill_value))
diff --git a/xray/core/dataset.py b/xray/core/dataset.py
@@ -84,11 +84,14 @@ def open_dataset(nc, decode_cf=True, mask_and_scale=True, decode_times=True,
         # If nc is a file-like object we read it using
         # the scipy.io.netcdf package
         store = backends.ScipyDataStore(nc, *args, **kwargs)
-    decoder = conventions.cf_decoder if decode_cf else None
-    return Dataset.load_store(store, decoder=decoder,
-                              mask_and_scale=mask_and_scale,
-                              decode_times=decode_times,
-                              concat_characters=concat_characters)
+    if decode_cf:
+        decoder = functools.partial(conventions.cf_decoder,
+                                    mask_and_scale=mask_and_scale,
+                                    decode_times=decode_times,
+                                    concat_characters=concat_characters)
+    else:
+        decoder = None
+    return Dataset.load_store(store, decoder=decoder)
 
 
 # list of attributes of pd.DatetimeIndex that are ndarrays of time info
@@ -399,14 +402,13 @@ def _set_init_vars_and_dims(self, vars, coords):
                                      check_coord_names=False)
 
     @classmethod
-    def load_store(cls, store, decoder=None, *args, **kwdargs):
+    def load_store(cls, store, decoder=None):
         """Create a new dataset from the contents of a backends.*DataStore
         object
         """
         variables, attributes = store.load()
         if decoder:
-            variables, attributes = decoder(variables, attributes,
-                                            *args, **kwdargs)
+            variables, attributes = decoder(variables, attributes)
         obj = cls(variables, attrs=attributes)
         obj._file_obj = store
         return obj
@@ -785,13 +787,11 @@ def reset_coords(self, names=None, drop=False, inplace=False):
                 del obj._arrays[name]
         return obj
 
-    def dump_to_store(self, store, encoder=None,
-                      *args, **kwdargs):
+    def dump_to_store(self, store, encoder=None):
         """Store dataset contents to a backends.*DataStore object."""
         variables, attributes = self, self.attrs
         if encoder:
-            variables, attributes = encoder(variables, attributes,
-                                            *args, **kwdargs)
+            variables, attributes = encoder(variables, attributes)
         store.store(variables, attributes)
         store.sync()
 
diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py
@@ -8,7 +8,8 @@
 import numpy as np
 import pandas as pd
 
-from xray import align, concat, backends, Dataset, DataArray, Variable
+from xray import (align, concat, conventions, backends, Dataset, DataArray,
+                  Variable)
 from xray.core import indexing, utils
 from xray.core.pycompat import iteritems, OrderedDict
 
@@ -1020,8 +1021,8 @@ def test_lazy_load(self):
         store = InaccessibleVariableDataStore()
         create_test_data().dump_to_store(store)
 
-        for decode_cf in [False, True]:
-            ds = Dataset.load_store(store, decode_cf=decode_cf)
+        for decoder in [None, conventions.cf_decoder]:
+            ds = Dataset.load_store(store, decoder=decoder)
             with self.assertRaises(UnexpectedDataAccess):
                 ds.load_data()
             with self.assertRaises(UnexpectedDataAccess):