Validate output dimension sizes with apply_ufunc (#2155)

shoyer · web-flow · commit 0e21fdfc8df7 · 2018-05-31T08:40:03.000-07:00
* Validate output dimension sizes with apply_ufunc

Fixes GH1931

Uses of apply_ufunc that change dimension size now raise an explicit error,
e.g.,

    >>> xr.apply_ufunc(lambda x: x[:5], xr.Variable('x', np.arange(10)))
    ValueError: size of dimension 'x' on inputs was unexpectedly changed by
    applied function from 10 to 5. Only dimensions specified in ``exclude_dims``
    with xarray.apply_ufunc are allowed to change size.

* lint

* More output validation for apply_ufunc
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -67,6 +67,10 @@ Enhancements
 Bug fixes
 ~~~~~~~~~
 
+- :py:func:`apply_ufunc` now directly validates output variables
+  (:issue:`1931`).
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
+
 - Fixed a bug where ``to_netcdf(..., unlimited_dims='bar')`` yielded NetCDF
   files with spurious 0-length dimensions (i.e. ``b``, ``a``, and ``r``)
   (:issue:`2134`).
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
@@ -513,7 +513,7 @@ def broadcast_compat_data(variable, broadcast_dims, core_dims):
 def apply_variable_ufunc(func, *args, **kwargs):
     """apply_variable_ufunc(func, *args, signature, exclude_dims=frozenset())
     """
-    from .variable import Variable
+    from .variable import Variable, as_compatible_data
 
     signature = kwargs.pop('signature')
     exclude_dims = kwargs.pop('exclude_dims', _DEFAULT_FROZEN_SET)
@@ -559,20 +559,42 @@ def func(*arrays):
                              'apply_ufunc: {}'.format(dask))
     result_data = func(*input_data)
 
-    if signature.num_outputs > 1:
-        output = []
-        for dims, data in zip(output_dims, result_data):
-            var = Variable(dims, data)
-            if keep_attrs and isinstance(args[0], Variable):
-                var.attrs.update(args[0].attrs)
-            output.append(var)
-        return tuple(output)
-    else:
-        dims, = output_dims
-        var = Variable(dims, result_data)
+    if signature.num_outputs == 1:
+        result_data = (result_data,)
+    elif (not isinstance(result_data, tuple) or
+            len(result_data) != signature.num_outputs):
+        raise ValueError('applied function does not have the number of '
+                         'outputs specified in the ufunc signature. '
+                         'Result is not a tuple of {} elements: {!r}'
+                         .format(signature.num_outputs, result_data))
+
+    output = []
+    for dims, data in zip(output_dims, result_data):
+        data = as_compatible_data(data)
+        if data.ndim != len(dims):
+            raise ValueError(
+                'applied function returned data with unexpected '
+                'number of dimensions: {} vs {}, for dimensions {}'
+                .format(data.ndim, len(dims), dims))
+
+        var = Variable(dims, data, fastpath=True)
+        for dim, new_size in var.sizes.items():
+            if dim in dim_sizes and new_size != dim_sizes[dim]:
+                raise ValueError(
+                    'size of dimension {!r} on inputs was unexpectedly '
+                    'changed by applied function from {} to {}. Only '
+                    'dimensions specified in ``exclude_dims`` with '
+                    'xarray.apply_ufunc are allowed to change size.'
+                    .format(dim, dim_sizes[dim], new_size))
+
         if keep_attrs and isinstance(args[0], Variable):
             var.attrs.update(args[0].attrs)
-        return var
+        output.append(var)
+
+    if signature.num_outputs == 1:
+        return output[0]
+    else:
+        return tuple(output)
 
 
 def _apply_with_dask_atop(func, args, input_dims, output_dims, signature,
@@ -719,7 +741,8 @@ def apply_ufunc(func, *args, **kwargs):
         Core dimensions on the inputs to exclude from alignment and
         broadcasting entirely. Any input coordinates along these dimensions
         will be dropped. Each excluded dimension must also appear in
-        ``input_core_dims`` for at least one argument.
+        ``input_core_dims`` for at least one argument. Only dimensions listed
+        here are allowed to change size between input and output objects.
     vectorize : bool, optional
         If True, then assume ``func`` only takes arrays defined over core
         dimensions as input and vectorize it automatically with
@@ -777,15 +800,38 @@ def apply_ufunc(func, *args, **kwargs):
 
     Examples
     --------
-    For illustrative purposes only, here are examples of how you could use
-    ``apply_ufunc`` to write functions to (very nearly) replicate existing
-    xarray functionality:
 
-    Calculate the vector magnitude of two arguments::
+    Calculate the vector magnitude of two arguments:
+
+    >>> def magnitude(a, b):
+    ...     func = lambda x, y: np.sqrt(x ** 2 + y ** 2)
+    ...     return xr.apply_ufunc(func, a, b)
+
+    You can now apply ``magnitude()`` to ``xr.DataArray`` and ``xr.Dataset``
+    objects, with automatically preserved dimensions and coordinates, e.g.,
+
+    >>> array = xr.DataArray([1, 2, 3], coords=[('x', [0.1, 0.2, 0.3])])
+    >>> magnitude(array, -array)
+    <xarray.DataArray (x: 3)>
+    array([1.414214, 2.828427, 4.242641])
+    Coordinates:
+      * x        (x) float64 0.1 0.2 0.3
+
+    Plain scalars, numpy arrays and a mix of these with xarray objects are also
+    supported:
+
+    >>> magnitude(4, 5)
+    5.0
+    >>> magnitude(3, np.array([0, 4]))
+    array([3., 5.])
+    >>> magnitude(array, 0)
+    <xarray.DataArray (x: 3)>
+    array([1., 2., 3.])
+    Coordinates:
+      * x        (x) float64 0.1 0.2 0.3
 
-        def magnitude(a, b):
-            func = lambda x, y: np.sqrt(x ** 2 + y ** 2)
-            return xr.apply_func(func, a, b)
+    Other examples of how you could use ``apply_ufunc`` to write functions to
+    (very nearly) replicate existing xarray functionality:
 
     Compute the mean (``.mean``) over one dimension::
 
@@ -795,7 +841,7 @@ def mean(obj, dim):
                                input_core_dims=[[dim]],
                                kwargs={'axis': -1})
 
-    Inner product over a specific dimension::
+    Inner product over a specific dimension (like ``xr.dot``)::
 
         def _inner(x, y):
             result = np.matmul(x[..., np.newaxis, :], y[..., :, np.newaxis])
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py
@@ -752,6 +752,94 @@ def test_vectorize_dask():
     assert_identical(expected, actual)
 
 
+def test_output_wrong_number():
+    variable = xr.Variable('x', np.arange(10))
+
+    def identity(x):
+        return x
+
+    def tuple3x(x):
+        return (x, x, x)
+
+    with raises_regex(ValueError, 'number of outputs'):
+        apply_ufunc(identity, variable, output_core_dims=[(), ()])
+
+    with raises_regex(ValueError, 'number of outputs'):
+        apply_ufunc(tuple3x, variable, output_core_dims=[(), ()])
+
+
+def test_output_wrong_dims():
+    variable = xr.Variable('x', np.arange(10))
+
+    def add_dim(x):
+        return x[..., np.newaxis]
+
+    def remove_dim(x):
+        return x[..., 0]
+
+    with raises_regex(ValueError, 'unexpected number of dimensions'):
+        apply_ufunc(add_dim, variable, output_core_dims=[('y', 'z')])
+
+    with raises_regex(ValueError, 'unexpected number of dimensions'):
+        apply_ufunc(add_dim, variable)
+
+    with raises_regex(ValueError, 'unexpected number of dimensions'):
+        apply_ufunc(remove_dim, variable)
+
+
+def test_output_wrong_dim_size():
+    array = np.arange(10)
+    variable = xr.Variable('x', array)
+    data_array = xr.DataArray(variable, [('x', -array)])
+    dataset = xr.Dataset({'y': variable}, {'x': -array})
+
+    def truncate(array):
+        return array[:5]
+
+    def apply_truncate_broadcast_invalid(obj):
+        return apply_ufunc(truncate, obj)
+
+    with raises_regex(ValueError, 'size of dimension'):
+        apply_truncate_broadcast_invalid(variable)
+    with raises_regex(ValueError, 'size of dimension'):
+        apply_truncate_broadcast_invalid(data_array)
+    with raises_regex(ValueError, 'size of dimension'):
+        apply_truncate_broadcast_invalid(dataset)
+
+    def apply_truncate_x_x_invalid(obj):
+        return apply_ufunc(truncate, obj, input_core_dims=[['x']],
+                           output_core_dims=[['x']])
+
+    with raises_regex(ValueError, 'size of dimension'):
+        apply_truncate_x_x_invalid(variable)
+    with raises_regex(ValueError, 'size of dimension'):
+        apply_truncate_x_x_invalid(data_array)
+    with raises_regex(ValueError, 'size of dimension'):
+        apply_truncate_x_x_invalid(dataset)
+
+    def apply_truncate_x_z(obj):
+        return apply_ufunc(truncate, obj, input_core_dims=[['x']],
+                           output_core_dims=[['z']])
+
+    assert_identical(xr.Variable('z', array[:5]),
+                     apply_truncate_x_z(variable))
+    assert_identical(xr.DataArray(array[:5], dims=['z']),
+                     apply_truncate_x_z(data_array))
+    assert_identical(xr.Dataset({'y': ('z', array[:5])}),
+                     apply_truncate_x_z(dataset))
+
+    def apply_truncate_x_x_valid(obj):
+        return apply_ufunc(truncate, obj, input_core_dims=[['x']],
+                           output_core_dims=[['x']], exclude_dims={'x'})
+
+    assert_identical(xr.Variable('x', array[:5]),
+                     apply_truncate_x_x_valid(variable))
+    assert_identical(xr.DataArray(array[:5], dims=['x']),
+                     apply_truncate_x_x_valid(data_array))
+    assert_identical(xr.Dataset({'y': ('x', array[:5])}),
+                     apply_truncate_x_x_valid(dataset))
+
+
 @pytest.mark.parametrize('use_dask', [True, False])
 def test_dot(use_dask):
     if use_dask: