Implement DataFrame.__array_ufunc__

TomAugspurger · TomAugspurger · commit 0d725e87f4c7 · 2020-10-07T13:39:29.000-05:00
For some cases, this will preserve extension types of arrays by calling
the ufunc blockwise.

```python
In [1]: import pandas as pd; import numpy as np
In [2]: df = pd.DataFrame({"A": pd.array([0, 1], dtype="Sparse")})

In [3]: np.sin(df).dtypes
Out[3]:
A    Sparse[float64, nan]
dtype: object
```

We don't currently handle the multi-input case well (aside from ufuncs that
are implemented as dunder ops like `np.add`). For these, we fall back to
the old implementation of converting to an ndarray.
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -189,6 +189,8 @@ Other enhancements
 - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
 - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
+- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`).
+- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 
 .. _whatsnew_120.api_breaking.python:
@@ -289,6 +291,7 @@ Deprecations
 - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
 - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`)
 - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
+- Using "outer" ufuncs on DataFrames to return a 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`)
 - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -556,6 +556,10 @@ def __init__(
 
         NDFrame.__init__(self, mgr)
 
+    # ----------------------------------------------------------------------
+    # Array interface
+    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
+
     # ----------------------------------------------------------------------
 
     @property
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -87,11 +87,11 @@
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas as pd
-from pandas.core import missing, nanops
+from pandas.core import missing, nanops, ops
 import pandas.core.algorithms as algos
 from pandas.core.base import PandasObject, SelectionMixin
 import pandas.core.common as com
-from pandas.core.construction import create_series_with_explicit_dtype
+from pandas.core.construction import create_series_with_explicit_dtype, extract_array
 from pandas.core.flags import Flags
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index
@@ -1912,6 +1912,102 @@ def __array_wrap__(
             self, method="__array_wrap__"
         )
 
+    @ops.defer_or_dispatch_ufunc
+    def __array_ufunc__(
+        self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
+    ):
+        """
+        Apply a NumPy ufunc, aligning NDFrame inputs and re-wrapping the result.
+
+        Multiple NDFrame inputs of one type are reindexed to the union of their
+        axes. A single-output ufunc on a lone DataFrame without ``kwargs`` runs
+        through ``_mgr.apply``; other cases run on arrays and receive ``kwargs``.
+        """
+        # align all the inputs.
+        types = tuple(type(x) for x in inputs)
+        alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
+
+        if len(alignable) > 1:
+            # This triggers alignment. No ufunc currently takes more than two
+            # inputs, so this is just x1.index | x2.index, written for *args.
+            if len(set(types)) > 1:
+                # We don't handle ufunc(DataFrame, Series) well; it used to raise
+                # an internal ValueError. We might support it someday.
+                raise NotImplementedError(
+                    f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
+                )
+            axes = self.axes
+            for obj in alignable[1:]:
+                # relies on the fact that we aren't handling mixed series / frame
+                for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
+                    axes[i] = ax1 | ax2
+
+            reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
+            inputs = tuple(
+                x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
+                for x, t in zip(inputs, types)
+            )
+        else:
+            reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
+
+        if self.ndim == 1:
+            names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
+            name = names[0] if len(set(names)) == 1 else None
+            reconstruct_kwargs = {"name": name}
+        else:
+            reconstruct_kwargs = {}
+
+        def reconstruct(result):
+            if lib.is_scalar(result):
+                return result
+            if result.ndim != self.ndim:
+                if method == "outer":
+                    if self.ndim == 2:
+                        # we already deprecated for Series
+                        msg = (
+                            "outer method for ufunc {} is not implemented on "
+                            "pandas objects. Returning an ndarray, but in the "
+                            "future this will raise a 'NotImplementedError'. "
+                            "Consider explicitly converting the DataFrame "
+                            "to an array with '.to_numpy()' first."
+                        )
+                        warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4)
+                        return result
+                    raise NotImplementedError
+                return result
+            if isinstance(result, BlockManager):
+                # we went through BlockManager.apply
+                return self._constructor(result, **reconstruct_kwargs, copy=False)
+            else:
+                # we converted an array, lost our axes
+                return self._constructor(
+                    result, **reconstruct_axes, **reconstruct_kwargs, copy=False
+                )
+
+        if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1 or kwargs):
+            # Just give up on preserving types in the complex case.
+            # In theory we could preserve them:
+            # * nout>1 is doable if BlockManager.apply took nout and
+            #   returned a Tuple[BlockManager].
+            # * len(inputs) > 1 is doable once we know blocks / dtypes are aligned.
+            # kwargs also land here: the blockwise path below does not forward them.
+            inputs = tuple(np.asarray(x) for x in inputs)
+            result = getattr(ufunc, method)(*inputs, **kwargs)
+        elif self.ndim == 1:
+            # ufunc(series, ...)
+            inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
+            result = getattr(ufunc, method)(*inputs, **kwargs)
+        else:
+            # ufunc(dataframe)
+            mgr = inputs[0]._mgr
+            result = mgr.apply(getattr(ufunc, method))
+
+        if ufunc.nout > 1:
+            result = tuple(reconstruct(x) for x in result)
+        else:
+            result = reconstruct(result)
+        return result
+
     # ideally we would define this to avoid the getattr checks, but
     # is slower
     # @property
diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
@@ -27,7 +27,10 @@
     get_array_op,
     logical_op,
 )
-from pandas.core.ops.common import unpack_zerodim_and_defer
+from pandas.core.ops.common import (  # noqa:F401
+    defer_or_dispatch_ufunc,
+    unpack_zerodim_and_defer,
+)
 from pandas.core.ops.docstrings import (
     _arith_doc_FRAME,
     _flex_comp_doc_FRAME,
diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py
@@ -2,9 +2,12 @@
 Boilerplate functions used in defining binary operations.
 """
 from functools import wraps
-from typing import Callable
+from typing import Any, Callable
+
+import numpy as np
 
 from pandas._libs.lib import item_from_zerodim
+from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
 from pandas._typing import F
 
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
@@ -65,3 +68,51 @@ def new_method(self, other):
         return method(self, other)
 
     return new_method
+
+
+def defer_or_dispatch_ufunc(meth):
+    """
+    Wrap an ``__array_ufunc__`` method with dunder-op dispatch and deferral checks.
+
+    Returns ``NotImplemented`` if an input has a higher ``__array_priority__`` or
+    an unhandled ``__array_ufunc__`` (see ``_HANDLED_TYPES``); else calls ``meth``.
+
+    Parameters
+    ----------
+    meth : ``__array_ufunc__`` implementation to wrap
+
+    Returns
+    -------
+    method
+    """
+
+    @wraps(meth)
+    def new_method(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
+        cls = type(self)
+
+        # for binary ops, use our custom dunder methods
+        result = maybe_dispatch_ufunc_to_dunder_op(
+            self, ufunc, method, *inputs, **kwargs
+        )
+        if result is not NotImplemented:
+            return result
+
+        # Determine if we should defer.
+        no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)
+
+        for item in inputs:
+            higher_priority = (
+                hasattr(item, "__array_priority__")
+                and item.__array_priority__ > self.__array_priority__
+            )
+            has_array_ufunc = (
+                hasattr(item, "__array_ufunc__")
+                and type(item).__array_ufunc__ not in no_defer
+                and not isinstance(item, self._HANDLED_TYPES)
+            )
+            if higher_priority or has_array_ufunc:
+                return NotImplemented
+
+        return meth(self, ufunc, method, *inputs, **kwargs)
+
+    return new_method
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -683,81 +683,6 @@ def view(self, dtype=None) -> "Series":
     # NDArray Compat
     _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
 
-    def __array_ufunc__(
-        self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
-    ):
-        # TODO: handle DataFrame
-        cls = type(self)
-
-        # for binary ops, use our custom dunder methods
-        result = ops.maybe_dispatch_ufunc_to_dunder_op(
-            self, ufunc, method, *inputs, **kwargs
-        )
-        if result is not NotImplemented:
-            return result
-
-        # Determine if we should defer.
-        no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)
-
-        for item in inputs:
-            higher_priority = (
-                hasattr(item, "__array_priority__")
-                and item.__array_priority__ > self.__array_priority__
-            )
-            has_array_ufunc = (
-                hasattr(item, "__array_ufunc__")
-                and type(item).__array_ufunc__ not in no_defer
-                and not isinstance(item, self._HANDLED_TYPES)
-            )
-            if higher_priority or has_array_ufunc:
-                return NotImplemented
-
-        # align all the inputs.
-        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-        types = tuple(type(x) for x in inputs)
-        # TODO: dataframe
-        alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)]
-
-        if len(alignable) > 1:
-            # This triggers alignment.
-            # At the moment, there aren't any ufuncs with more than two inputs
-            # so this ends up just being x1.index | x2.index, but we write
-            # it to handle *args.
-            index = alignable[0].index
-            for s in alignable[1:]:
-                index |= s.index
-            inputs = tuple(
-                x.reindex(index) if issubclass(t, Series) else x
-                for x, t in zip(inputs, types)
-            )
-        else:
-            index = self.index
-
-        inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
-        result = getattr(ufunc, method)(*inputs, **kwargs)
-
-        name = names[0] if len(set(names)) == 1 else None
-
-        def construct_return(result):
-            if lib.is_scalar(result):
-                return result
-            elif result.ndim > 1:
-                # e.g. np.subtract.outer
-                if method == "outer":
-                    # GH#27198
-                    raise NotImplementedError
-                return result
-            return self._constructor(result, index=index, name=name, copy=False)
-
-        if type(result) is tuple:
-            # multiple return values
-            return tuple(construct_return(x) for x in result)
-        elif method == "at":
-            # no return value
-            return None
-        else:
-            return construct_return(result)
-
     def __array__(self, dtype=None) -> np.ndarray:
         """
         Return the values as a NumPy array.
diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py