API: rolling.apply will pass Series to function (#20584)

jreback · web-flow · commit 4a344972722c · 2018-04-16T10:54:03.000-04:00
closes #5071
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -65,6 +65,35 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp
    pd.get_dummies(df, columns=['c'], dtype=bool).dtypes
 
 
+.. _whatsnew_0230.enhancements.window_raw:
+
+Rolling/Expanding.apply() accepts a ``raw`` keyword to pass a ``Series`` to the function
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
+:func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` have gained a ``raw=None`` parameter.
+This is similar to :func:`DataFame.apply`. This parameter, if ``True`` allows one to send a ``np.ndarray`` to the applied function. If ``False`` a ``Series`` will be passed. The
+default is ``None``, which preserves backward compatibility, so this will default to ``True``, sending an ``np.ndarray``.
+In a future version the default will be changed to ``False``, sending a ``Series``. (:issue:`5071`, :issue:`20584`)
+
+.. ipython:: python
+
+   s = pd.Series(np.arange(5), np.arange(5) + 1)
+   s
+
+Pass a ``Series``:
+
+.. ipython:: python
+
+   s.rolling(2, min_periods=1).apply(lambda x: x.iloc[-1], raw=False)
+
+Mimic the original behavior of passing a ndarray:
+
+.. ipython:: python
+
+   s.rolling(2, min_periods=1).apply(lambda x: x[-1], raw=True)
+
+
 .. _whatsnew_0230.enhancements.merge_on_columns_and_levels:
 
 Merging on a combination of columns and index levels
@@ -817,6 +846,7 @@ Other API Changes
 - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`)
 - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`).
 - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`)
+- A user-defined-function that is passed to :func:`Series.rolling().aggregate() <pandas.core.window.Rolling.aggregate>`, :func:`DataFrame.rolling().aggregate() <pandas.core.window.Rolling.aggregate>`, or its expanding cousins, will now *always* be passed a ``Series``, rather than an ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here <whatsnew_0230.enhancements.window_raw>`. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`)
 
 .. _whatsnew_0230.deprecations:
 
@@ -845,6 +875,8 @@ Deprecations
 - ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`)
 - ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`)
 - The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`).
+- :func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
+  :func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`)
 
 .. _whatsnew_0230.prior_deprecations:
 
diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
@@ -1432,39 +1432,44 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
     return output
 
 
-def roll_generic(ndarray[float64_t, cast=True] input,
+def roll_generic(object obj,
                  int64_t win, int64_t minp, object index, object closed,
-                 int offset, object func,
+                 int offset, object func, bint raw,
                  object args, object kwargs):
     cdef:
         ndarray[double_t] output, counts, bufarr
+        ndarray[float64_t, cast=True] arr
         float64_t *buf
         float64_t *oldbuf
         int64_t nobs = 0, i, j, s, e, N
         bint is_variable
         ndarray[int64_t] start, end
 
-    if not input.flags.c_contiguous:
-        input = input.copy('C')
-
-    n = len(input)
+    n = len(obj)
     if n == 0:
-        return input
+        return obj
+
+    arr = np.asarray(obj)
+
+    # ndarray input
+    if raw:
+        if not arr.flags.c_contiguous:
+            arr = arr.copy('C')
 
-    counts = roll_sum(np.concatenate([np.isfinite(input).astype(float),
+    counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float),
                                       np.array([0.] * offset)]),
                       win, minp, index, closed)[offset:]
 
-    start, end, N, win, minp, is_variable = get_window_indexer(input, win,
+    start, end, N, win, minp, is_variable = get_window_indexer(arr, win,
                                                                minp, index,
                                                                closed,
                                                                floor=0)
 
     output = np.empty(N, dtype=float)
 
     if is_variable:
+        # variable window arr or series
 
-        # variable window
         if offset != 0:
             raise ValueError("unable to roll_generic with a non-zero offset")
 
@@ -1473,7 +1478,20 @@ def roll_generic(ndarray[float64_t, cast=True] input,
             e = end[i]
 
             if counts[i] >= minp:
-                output[i] = func(input[s:e], *args, **kwargs)
+                if raw:
+                    output[i] = func(arr[s:e], *args, **kwargs)
+                else:
+                    output[i] = func(obj.iloc[s:e], *args, **kwargs)
+            else:
+                output[i] = NaN
+
+    elif not raw:
+        # series
+        for i from 0 <= i < N:
+            if counts[i] >= minp:
+                sl = slice(int_max(i + offset - win + 1, 0),
+                           int_min(i + offset + 1, N))
+                output[i] = func(obj.iloc[sl], *args, **kwargs)
             else:
                 output[i] = NaN
 
@@ -1482,12 +1500,12 @@ def roll_generic(ndarray[float64_t, cast=True] input,
         # truncated windows at the beginning, through first full-length window
         for i from 0 <= i < (int_min(win, N) - offset):
             if counts[i] >= minp:
-                output[i] = func(input[0: (i + offset + 1)], *args, **kwargs)
+                output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs)
             else:
                 output[i] = NaN
 
         # remaining full-length windows
-        buf = <float64_t *> input.data
+        buf = <float64_t *> arr.data
         bufarr = np.empty(win, dtype=float)
         oldbuf = <float64_t *> bufarr.data
         for i from (win - offset) <= i < (N - offset):
@@ -1502,7 +1520,7 @@ def roll_generic(ndarray[float64_t, cast=True] input,
         # truncated windows at the end
         for i from int_max(N - offset, 0) <= i < N:
             if counts[i] >= minp:
-                output[i] = func(input[int_max(i + offset - win + 1, 0): N],
+                output[i] = func(arr[int_max(i + offset - win + 1, 0): N],
                                  *args,
                                  **kwargs)
             else:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4292,6 +4292,8 @@ def pipe(self, func, *args, **kwargs):
     Notes
     -----
     `agg` is an alias for `aggregate`. Use the alias.
+
+    A passed user-defined-function will be passed a Series for evaluation.
     """)
 
     _shared_docs['transform'] = ("""
diff --git a/pandas/core/window.py b/pandas/core/window.py
@@ -314,7 +314,7 @@ def _center_window(self, result, window):
     def aggregate(self, arg, *args, **kwargs):
         result, how = self._aggregate(arg, *args, **kwargs)
         if result is None:
-            return self.apply(arg, args=args, kwargs=kwargs)
+            return self.apply(arg, raw=False, args=args, kwargs=kwargs)
         return result
 
     agg = aggregate
@@ -954,23 +954,53 @@ def count(self):
     Parameters
     ----------
     func : function
-        Must produce a single value from an ndarray input
-        \*args and \*\*kwargs are passed to the function""")
+        Must produce a single value from an ndarray input if ``raw=True``
+        or a Series if ``raw=False``
+    raw : bool, default None
+        * ``False`` : passes each row or column as a Series to the
+          function.
+        * ``True`` or ``None`` : the passed function will receive ndarray
+          objects instead.
+          If you are just applying a NumPy reduction function this will
+          achieve much better performance.
+
+        The `raw` parameter is required and will show a FutureWarning if
+        not passed. In the future `raw` will default to False.
+
+        .. versionadded:: 0.23.0
+
+    \*args and \*\*kwargs are passed to the function""")
+
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        from pandas import Series
 
-    def apply(self, func, args=(), kwargs={}):
         # TODO: _level is unused?
         _level = kwargs.pop('_level', None)  # noqa
         window = self._get_window()
         offset = _offset(window, self.center)
         index, indexi = self._get_index()
 
+        # TODO: default is for backward compat
+        # change to False in the future
+        if raw is None:
+            warnings.warn(
+                "Currently, 'apply' passes the values as ndarrays to the "
+                "applied function. In the future, this will change to passing "
+                "it as Series objects. You need to specify 'raw=True' to keep "
+                "the current behaviour, and you can pass 'raw=False' to "
+                "silence this warning", FutureWarning, stacklevel=3)
+            raw = True
+
         def f(arg, window, min_periods, closed):
             minp = _use_window(min_periods, window)
-            return _window.roll_generic(arg, window, minp, indexi, closed,
-                                        offset, func, args, kwargs)
+            if not raw:
+                arg = Series(arg, index=self.obj.index)
+            return _window.roll_generic(
+                arg, window, minp, indexi,
+                closed, offset, func, raw, args, kwargs)
 
         return self._apply(f, func, args=args, kwargs=kwargs,
-                           center=False)
+                           center=False, raw=raw)
 
     def sum(self, *args, **kwargs):
         nv.validate_window_func('sum', args, kwargs)
@@ -1498,8 +1528,9 @@ def count(self):
     @Substitution(name='rolling')
     @Appender(_doc_template)
     @Appender(_shared_docs['apply'])
-    def apply(self, func, args=(), kwargs={}):
-        return super(Rolling, self).apply(func, args=args, kwargs=kwargs)
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        return super(Rolling, self).apply(
+            func, raw=raw, args=args, kwargs=kwargs)
 
     @Substitution(name='rolling')
     @Appender(_shared_docs['sum'])
@@ -1756,8 +1787,9 @@ def count(self, **kwargs):
     @Substitution(name='expanding')
     @Appender(_doc_template)
     @Appender(_shared_docs['apply'])
-    def apply(self, func, args=(), kwargs={}):
-        return super(Expanding, self).apply(func, args=args, kwargs=kwargs)
+    def apply(self, func, raw=None, args=(), kwargs={}):
+        return super(Expanding, self).apply(
+            func, raw=raw, args=args, kwargs=kwargs)
 
     @Substitution(name='expanding')
     @Appender(_shared_docs['sum'])
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py