Refactor nanops #2236


Merged · 29 commits · Aug 16, 2018

Commits
f36f8f2
Inhouse nanops
fujiisoup Jun 17, 2018
76218b2
Cleanup nanops
fujiisoup Jun 17, 2018
943e2b1
remove NAT_TYPES
fujiisoup Jun 18, 2018
84fc69e
flake8.
fujiisoup Jun 18, 2018
11d735f
another flake8
fujiisoup Jun 18, 2018
7a079f6
recover nat types
fujiisoup Jun 18, 2018
441be59
remove keep_dims option from nanops (to make them compatible with num…
fujiisoup Jun 19, 2018
f95054b
Test aggregation over multiple dimensions
fujiisoup Jun 19, 2018
9211b64
Remove print.
fujiisoup Jun 19, 2018
491ce2f
Docs. More cleanup.
fujiisoup Jun 20, 2018
f0fc8bf
Merge branch 'master' into refactor_nanops
fujiisoup Jun 20, 2018
5dda535
flake8
fujiisoup Jun 20, 2018
5ddc4eb
Bug fix. Better test coverage.
fujiisoup Jun 20, 2018
c37de0e
using isnull, where_method. Remove unnecessary conditional branching.
fujiisoup Jun 20, 2018
7aedd02
More refactoring based on the comments
fujiisoup Jun 21, 2018
ba903db
remove dtype from nanmedian
fujiisoup Jun 22, 2018
5b09714
Fix for nanmedian
fujiisoup Jun 22, 2018
e8fdac2
Merge branch 'master' into refactor_nanops
fujiisoup Jul 8, 2018
a65c579
Merge branch 'master' into refactor_nanops
fujiisoup Aug 11, 2018
5c82628
Add tests for dataset
fujiisoup Aug 11, 2018
06319ac
Add tests with resample.
fujiisoup Aug 11, 2018
737118e
lint
fujiisoup Aug 11, 2018
85b5650
updated whatsnew
fujiisoup Aug 11, 2018
623016b
Merge branch 'master' into refactor_nanops
fujiisoup Aug 16, 2018
015e85c
Revise from comments.
fujiisoup Aug 16, 2018
01a1419
Use .any and .all method instead of np.any / np.all
fujiisoup Aug 16, 2018
a5b18fc
Avoid using numpy methods
fujiisoup Aug 16, 2018
e4e1d1e
Avoid casting to int for dask array
fujiisoup Aug 16, 2018
b72a1c8
Update whatsnew
fujiisoup Aug 16, 2018
9 changes: 9 additions & 0 deletions doc/whats-new.rst
@@ -36,6 +36,12 @@ Documentation
Enhancements
~~~~~~~~~~~~

- ``min_count`` option is newly supported in :py:meth:`~xarray.DataArray.sum`,
  :py:meth:`~xarray.DataArray.prod`, :py:meth:`~xarray.Dataset.sum`, and
  :py:meth:`~xarray.Dataset.prod`. (:issue:`2230`)
  By `Keisuke Fujii <https://github.com/fujiisoup>`_.

- :py:meth:`plot()` now accepts the kwargs ``xscale, yscale, xlim, ylim, xticks, yticks`` just like Pandas. Also ``xincrease=False, yincrease=False`` now use matplotlib's axis inverting methods instead of setting limits.
By `Deepak Cherian <https://github.com/dcherian>`_. (:issue:`2224`)

@@ -65,6 +71,9 @@ Bug fixes
- Tests can be run in parallel with pytest-xdist
By `Tony Tung <https://github.com/ttung>`_.

- Follow the renaming in dask from ``dask.ghost`` to ``dask.overlap``.
  By `Keisuke Fujii <https://github.com/fujiisoup>`_.

- Now raises a ValueError when there is a conflict between dimension names and
level names of MultiIndex. (:issue:`2299`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
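The ``min_count`` entry above can be illustrated with a plain-NumPy sketch of the intended semantics (the helper name here is hypothetical; the real implementation lives in the new ``xarray.core.nanops`` module):

```python
import numpy as np

def nansum_min_count(values, axis=None, min_count=1):
    # Hypothetical sketch of min_count semantics: skip NaNs when
    # summing, but yield NaN wherever fewer than `min_count`
    # valid (non-NaN) values contributed to the result.
    valid = np.sum(~np.isnan(values), axis=axis)
    total = np.nansum(values, axis=axis)
    return np.where(valid >= min_count, total, np.nan)

a = np.array([[1.0, np.nan], [np.nan, np.nan]])
print(nansum_min_count(a, axis=1, min_count=1))  # [1., nan]
```

With ``min_count=1`` an all-NaN slice sums to NaN instead of 0, matching the pandas option of the same name.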
39 changes: 20 additions & 19 deletions xarray/core/common.py
@@ -2,6 +2,7 @@

import warnings
from distutils.version import LooseVersion
from textwrap import dedent

import numpy as np
import pandas as pd
@@ -27,20 +28,20 @@ def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
allow_lazy=True, **kwargs)
return wrapped_func

_reduce_extra_args_docstring = \
"""dim : str or sequence of str, optional
_reduce_extra_args_docstring = dedent("""\
dim : str or sequence of str, optional
Dimension(s) over which to apply `{name}`.
axis : int or sequence of int, optional
Axis(es) over which to apply `{name}`. Only one of the 'dim'
and 'axis' arguments can be supplied. If neither are supplied, then
`{name}` is calculated over axes."""
`{name}` is calculated over axes.""")

_cum_extra_args_docstring = \
"""dim : str or sequence of str, optional
_cum_extra_args_docstring = dedent("""\
dim : str or sequence of str, optional
Dimension over which to apply `{name}`.
axis : int or sequence of int, optional
Axis over which to apply `{name}`. Only one of the 'dim'
and 'axis' arguments can be supplied."""
and 'axis' arguments can be supplied.""")
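Wrapping the templates in ``dedent`` matters because the triple-quoted strings are indented to match the surrounding source; a minimal sketch of the effect:

```python
from textwrap import dedent

# The template is indented in the source file; dedent strips the
# common leading whitespace so the rendered docstring lines up.
template = dedent("""\
    dim : str or sequence of str, optional
        Dimension over which to apply `{name}`.""")

print(template.format(name='sum'))
# dim : str or sequence of str, optional
#     Dimension over which to apply `sum`.
```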


class ImplementsDatasetReduce(object):
@@ -308,12 +309,12 @@ def assign_coords(self, **kwargs):
assigned : same type as caller
A new object with the new coordinates in addition to the existing
data.

Examples
--------

Convert longitude coordinates from 0-359 to -180-179:

>>> da = xr.DataArray(np.random.rand(4),
... coords=[np.array([358, 359, 0, 1])],
... dims='lon')
@@ -445,11 +446,11 @@ def groupby(self, group, squeeze=True):
grouped : GroupBy
A `GroupBy` object patterned after `pandas.GroupBy` that can be
iterated over in the form of `(unique_value, grouped_array)` pairs.

Examples
--------
Calculate daily anomalies for daily data:

>>> da = xr.DataArray(np.linspace(0, 1826, num=1827),
... coords=[pd.date_range('1/1/2000', '31/12/2004',
... freq='D')],
@@ -465,7 +466,7 @@ def groupby(self, group, squeeze=True):
Coordinates:
* time (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 ...
dayofyear (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...

See Also
--------
core.groupby.DataArrayGroupBy
@@ -589,7 +590,7 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
closed=None, label=None, base=0, keep_attrs=False, **indexer):
"""Returns a Resample object for performing resampling operations.

Handles both downsampling and upsampling. If any intervals contain no
Handles both downsampling and upsampling. If any intervals contain no
values from the original object, they will be given the value ``NaN``.

Parameters
@@ -616,11 +617,11 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
-------
resampled : same type as caller
This object resampled.

Examples
--------
Downsample monthly time-series data to seasonal data:

>>> da = xr.DataArray(np.linspace(0, 11, num=12),
... coords=[pd.date_range('15/12/1999',
... periods=12, freq=pd.DateOffset(months=1))],
@@ -637,13 +638,13 @@ def resample(self, freq=None, dim=None, how=None, skipna=None,
* time (time) datetime64[ns] 1999-12-01 2000-03-01 2000-06-01 2000-09-01

Upsample monthly time-series data to daily data:

>>> da.resample(time='1D').interpolate('linear')
<xarray.DataArray (time: 337)>
array([ 0. , 0.032258, 0.064516, ..., 10.935484, 10.967742, 11. ])
Coordinates:
* time (time) datetime64[ns] 1999-12-15 1999-12-16 1999-12-17 ...

References
----------

@@ -957,8 +958,8 @@ def contains_cftime_datetimes(var):
sample = sample.item()
return isinstance(sample, cftime_datetime)
else:
return False
return False


def _contains_datetime_like_objects(var):
"""Check if a variable contains datetime like objects (either
3 changes: 3 additions & 0 deletions xarray/core/dtypes.py
@@ -98,6 +98,9 @@ def maybe_promote(dtype):
return np.dtype(dtype), fill_value


NAT_TYPES = (np.datetime64('NaT'), np.timedelta64('NaT'))


def get_fill_value(dtype):
"""Return an appropriate fill value for this dtype.

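The reintroduced ``NAT_TYPES`` tuple collects the two NaT sentinels. Note that NaT compares unequal to everything, including itself, so code must test with ``np.isnat`` rather than ``==`` (a quick illustration, not part of the PR):

```python
import numpy as np

# The same tuple the diff adds to xarray/core/dtypes.py.
NAT_TYPES = (np.datetime64('NaT'), np.timedelta64('NaT'))

value = np.datetime64('NaT')
print(value == value)   # False: NaT never equals anything, even itself
print(np.isnat(value))  # True
```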
186 changes: 44 additions & 142 deletions xarray/core/duck_array_ops.py
@@ -17,14 +17,6 @@
from .nputils import nanfirst, nanlast
from .pycompat import dask_array_type

try:
import bottleneck as bn
has_bottleneck = True
except ImportError:
# use numpy methods instead
bn = np
has_bottleneck = False

try:
import dask.array as dask_array
from . import dask_array_compat
@@ -175,7 +167,7 @@ def array_notnull_equiv(arr1, arr2):
def count(data, axis=None):
"""Count the number of non-NA in this array along the given axis or axes
"""
return sum(~isnull(data), axis=axis)
return np.sum(~isnull(data), axis=axis)
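The change to an explicit ``np.sum`` presumably avoids picking up the module-level ``sum`` wrapper defined further down in ``duck_array_ops``. A simplified, NumPy-only version of ``count`` (xarray's ``isnull`` additionally recognizes NaT and missing values in object arrays):

```python
import numpy as np

def count(data, axis=None):
    # Simplified: treat NaN as the only missing value; xarray's
    # isnull() also handles NaT and None in object arrays.
    return np.sum(~np.isnan(data), axis=axis)

a = np.array([[1.0, np.nan, 3.0], [np.nan, np.nan, 6.0]])
print(count(a, axis=1))  # [2 1]
```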


def where(condition, x, y):
@@ -213,159 +205,69 @@ def _ignore_warnings_if(condition):
yield


def _nansum_object(value, axis=None, **kwargs):
""" In house nansum for object array """
value = fillna(value, 0)
return _dask_or_eager_func('sum')(value, axis=axis, **kwargs)


def _nan_minmax_object(func, get_fill_value, value, axis=None, **kwargs):
""" In house nanmin and nanmax for object array """
fill_value = get_fill_value(value.dtype)
valid_count = count(value, axis=axis)
filled_value = fillna(value, fill_value)
data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs)
if not hasattr(data, 'dtype'): # scalar case
data = dtypes.fill_value(value.dtype) if valid_count == 0 else data
return np.array(data, dtype=value.dtype)
return where_method(data, valid_count != 0)


def _nan_argminmax_object(func, get_fill_value, value, axis=None, **kwargs):
""" In house nanargmin, nanargmax for object arrays. Always return integer
type """
fill_value = get_fill_value(value.dtype)
valid_count = count(value, axis=axis)
value = fillna(value, fill_value)
data = _dask_or_eager_func(func)(value, axis=axis, **kwargs)
# dask seems return non-integer type
if isinstance(value, dask_array_type):
data = data.astype(int)

if (valid_count == 0).any():
raise ValueError('All-NaN slice encountered')

return np.array(data, dtype=int)


def _nanmean_ddof_object(ddof, value, axis=None, **kwargs):
""" In house nanmean. ddof argument will be used in _nanvar method """
valid_count = count(value, axis=axis)
value = fillna(value, 0)
# As dtype inference is impossible for object dtype, we assume float
# https://github.com/dask/dask/issues/3162
dtype = kwargs.pop('dtype', None)
if dtype is None and value.dtype.kind == 'O':
dtype = value.dtype if value.dtype.kind in ['cf'] else float

data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs)
data = data / (valid_count - ddof)
return where_method(data, valid_count != 0)
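The fill-then-sum-then-divide pattern above (which this PR moves into the new ``nanops`` module) reduces to the following NumPy sketch:

```python
import numpy as np

def nanmean_ddof(values, ddof=0, axis=None):
    # Fill NaNs with zero so they contribute nothing to the sum,
    # then divide by the count of valid entries minus ddof (the
    # ddof argument is what lets nanvar reuse this helper).
    valid_count = np.sum(~np.isnan(values), axis=axis)
    filled = np.where(np.isnan(values), 0.0, values)
    return np.sum(filled, axis=axis) / (valid_count - ddof)

print(nanmean_ddof(np.array([1.0, 3.0, np.nan])))          # 2.0
print(nanmean_ddof(np.array([1.0, 3.0, np.nan]), ddof=1))  # 4.0
```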


def _nanvar_object(value, axis=None, **kwargs):
ddof = kwargs.pop('ddof', 0)
kwargs_mean = kwargs.copy()
kwargs_mean.pop('keepdims', None)
value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis,
keepdims=True, **kwargs_mean)
squared = (value.astype(value_mean.dtype) - value_mean)**2
return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs)


_nan_object_funcs = {
'sum': _nansum_object,
'min': partial(_nan_minmax_object, 'min', dtypes.get_pos_infinity),
'max': partial(_nan_minmax_object, 'max', dtypes.get_neg_infinity),
'argmin': partial(_nan_argminmax_object, 'argmin',
dtypes.get_pos_infinity),
'argmax': partial(_nan_argminmax_object, 'argmax',
dtypes.get_neg_infinity),
'mean': partial(_nanmean_ddof_object, 0),
'var': _nanvar_object,
}


def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
no_bottleneck=False, coerce_strings=False):
def _create_nan_agg_method(name, coerce_strings=False):
from . import nanops

def f(values, axis=None, skipna=None, **kwargs):
if kwargs.pop('out', None) is not None:
raise TypeError('`out` is not valid for {}'.format(name))

# If dtype is supplied, we use numpy's method.
dtype = kwargs.get('dtype', None)
values = asarray(values)

# dask requires dtype argument for object dtype
if (values.dtype == 'object' and name in ['sum', ]):
kwargs['dtype'] = values.dtype if dtype is None else dtype

if coerce_strings and values.dtype.kind in 'SU':
values = values.astype(object)

func = None
if skipna or (skipna is None and values.dtype.kind in 'cfO'):
if values.dtype.kind not in ['u', 'i', 'f', 'c']:
func = _nan_object_funcs.get(name, None)
using_numpy_nan_func = True
if func is None or values.dtype.kind not in 'Ob':
raise NotImplementedError(
'skipna=True not yet implemented for %s with dtype %s'
% (name, values.dtype))
else:
nanname = 'nan' + name
if (isinstance(axis, tuple) or not values.dtype.isnative or
no_bottleneck or (dtype is not None and
np.dtype(dtype) != values.dtype)):
# bottleneck can't handle multiple axis arguments or
# non-native endianness
if np_compat:
eager_module = npcompat
else:
eager_module = np
else:
kwargs.pop('dtype', None)
eager_module = bn
func = _dask_or_eager_func(nanname, eager_module)
using_numpy_nan_func = (eager_module is np or
eager_module is npcompat)
nanname = 'nan' + name
func = getattr(nanops, nanname)
else:
func = _dask_or_eager_func(name)
using_numpy_nan_func = False
with _ignore_warnings_if(using_numpy_nan_func):
try:
return func(values, axis=axis, **kwargs)
except AttributeError:
if isinstance(values, dask_array_type):
try: # dask/dask#3133 dask sometimes needs dtype argument
return func(values, axis=axis, dtype=values.dtype,
**kwargs)
except AttributeError:
msg = '%s is not yet implemented on dask arrays' % name
else:
assert using_numpy_nan_func
msg = ('%s is not available with skipna=False with the '
'installed version of numpy; upgrade to numpy 1.12 '
'or newer to use skipna=True or skipna=None' % name)
raise NotImplementedError(msg)
f.numeric_only = numeric_only

try:
return func(values, axis=axis, **kwargs)
except AttributeError:
if isinstance(values, dask_array_type):
try: # dask/dask#3133 dask sometimes needs dtype argument
# if func does not accept dtype, then raises TypeError
return func(values, axis=axis, dtype=values.dtype,
**kwargs)
except (AttributeError, TypeError):
msg = '%s is not yet implemented on dask arrays' % name
else:
msg = ('%s is not available with skipna=False with the '
'installed version of numpy; upgrade to numpy 1.12 '
'or newer to use skipna=True or skipna=None' % name)
raise NotImplementedError(msg)

f.__name__ = name
return f
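The simplified factory keeps only the dispatch decision: ``skipna`` (explicit, or implied by a complex/float/object dtype) routes to the NaN-aware variant. A self-contained sketch, with NumPy's reductions standing in for the ``nanops`` module:

```python
import numpy as np

def _create_nan_agg_method(name):
    # Sketch of the factory above; np.nan<name> stands in for the
    # corresponding function in xarray.core.nanops, and the dask /
    # dtype fallback handling is omitted.
    def f(values, axis=None, skipna=None):
        values = np.asarray(values)
        if skipna or (skipna is None and values.dtype.kind in 'cfO'):
            func = getattr(np, 'nan' + name)  # e.g. np.nansum
        else:
            func = getattr(np, name)
        return func(values, axis=axis)
    f.__name__ = name
    return f

sum_ = _create_nan_agg_method('sum')
print(sum_([1.0, np.nan]))                # 1.0 (float dtype -> skip NaN)
print(sum_([1.0, np.nan], skipna=False))  # nan
```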


# Attributes `numeric_only` and `available_min_count` are used for docs.
# See ops.inject_reduce_methods
argmax = _create_nan_agg_method('argmax', coerce_strings=True)
argmin = _create_nan_agg_method('argmin', coerce_strings=True)
max = _create_nan_agg_method('max', coerce_strings=True)
min = _create_nan_agg_method('min', coerce_strings=True)
sum = _create_nan_agg_method('sum', numeric_only=True)
mean = _create_nan_agg_method('mean', numeric_only=True)
std = _create_nan_agg_method('std', numeric_only=True)
var = _create_nan_agg_method('var', numeric_only=True)
median = _create_nan_agg_method('median', numeric_only=True)
prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True)
cumprod_1d = _create_nan_agg_method(
'cumprod', numeric_only=True, no_bottleneck=True)
cumsum_1d = _create_nan_agg_method(
'cumsum', numeric_only=True, no_bottleneck=True)
sum = _create_nan_agg_method('sum')
sum.numeric_only = True
sum.available_min_count = True
mean = _create_nan_agg_method('mean')
mean.numeric_only = True
std = _create_nan_agg_method('std')
std.numeric_only = True
var = _create_nan_agg_method('var')
var.numeric_only = True
median = _create_nan_agg_method('median')
median.numeric_only = True
prod = _create_nan_agg_method('prod')
prod.numeric_only = True
prod.available_min_count = True
cumprod_1d = _create_nan_agg_method('cumprod')
cumprod_1d.numeric_only = True
cumsum_1d = _create_nan_agg_method('cumsum')
cumsum_1d.numeric_only = True


def _nd_cum_func(cum_func, array, axis, **kwargs):