diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 016e8d90e7d21..3bfb507d2e140 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -265,6 +265,7 @@ Deprecations
 - Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`)
 - Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`)
 - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
+- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
 
 .. ---------------------------------------------------------------------------
 
@@ -404,6 +405,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
 - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
 - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
+- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 5f60b884c6ada..c6fd569247b90 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -89,62 +89,6 @@ cdef bint is_monotonic_start_end_bounds(
 # Physical description: 366 p.
 # Series: Prentice-Hall Series in Automatic Computation
 
-# ----------------------------------------------------------------------
-# Rolling count
-# this is only an impl for index not None, IOW, freq aware
-
-
-def roll_count(
-    ndarray[float64_t] values,
-    ndarray[int64_t] start,
-    ndarray[int64_t] end,
-    int64_t minp,
-):
-    cdef:
-        float64_t val, count_x = 0.0
-        int64_t s, e, nobs, N = len(values)
-        Py_ssize_t i, j
-        ndarray[float64_t] output
-
-    output = np.empty(N, dtype=float)
-
-    with nogil:
-
-        for i in range(0, N):
-            s = start[i]
-            e = end[i]
-
-            if i == 0:
-
-                # setup
-                count_x = 0.0
-                for j in range(s, e):
-                    val = values[j]
-                    if notnan(val):
-                        count_x += 1.0
-
-            else:
-
-                # calculate deletes
-                for j in range(start[i - 1], s):
-                    val = values[j]
-                    if notnan(val):
-                        count_x -= 1.0
-
-                # calculate adds
-                for j in range(end[i - 1], e):
-                    val = values[j]
-                    if notnan(val):
-                        count_x += 1.0
-
-            if count_x >= minp:
-                output[i] = count_x
-            else:
-                output[i] = NaN
-
-    return output
-
-
 # ----------------------------------------------------------------------
 # Rolling sum
 
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 6ab42dda865e7..f207ea4cd67d4 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -17,6 +17,7 @@
     Type,
     Union,
 )
+import warnings
 
 import numpy as np
 
@@ -469,14 +470,18 @@ def _get_window_indexer(self, window: int) -> BaseIndexer:
             return VariableWindowIndexer(index_array=self._on.asi8, window_size=window)
         return FixedWindowIndexer(window_size=window)
 
-    def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series":
+    def _apply_series(
+        self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None
+    ) -> "Series":
         """
         Series version of _apply_blockwise
         """
         obj = self._create_data(self._selected_obj)
 
         try:
-            values = self._prep_values(obj.values)
+            # GH 12541: Special case for count where we support date-like types
+            raw = obj.values if name != "count" else notna(obj.values).astype(int)
+            values = self._prep_values(raw)
         except (TypeError, NotImplementedError) as err:
             raise DataError("No numeric types to aggregate") from err
 
@@ -484,16 +489,20 @@ def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series":
         return obj._constructor(result, index=obj.index, name=obj.name)
 
     def _apply_blockwise(
-        self, homogeneous_func: Callable[..., ArrayLike]
+        self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None
     ) -> FrameOrSeriesUnion:
         """
         Apply the given function to the DataFrame broken down into homogeneous
         sub-frames.
         """
         if self._selected_obj.ndim == 1:
-            return self._apply_series(homogeneous_func)
+            return self._apply_series(homogeneous_func, name)
 
         obj = self._create_data(self._selected_obj)
+        if name == "count":
+            # GH 12541: Special case for count where we support date-like types
+            obj = notna(obj).astype(int)
+            obj._mgr = obj._mgr.consolidate()
         mgr = obj._mgr
 
         def hfunc(bvalues: ArrayLike) -> ArrayLike:
@@ -606,7 +615,7 @@ def calc(x):
 
             return result
 
-        return self._apply_blockwise(homogeneous_func)
+        return self._apply_blockwise(homogeneous_func, name)
 
     def aggregate(self, func, *args, **kwargs):
         result, how = self._aggregate(func, *args, **kwargs)
@@ -1265,33 +1274,8 @@ class RollingAndExpandingMixin(BaseWindow):
     )
 
     def count(self):
-        # GH 32865. Using count with custom BaseIndexer subclass
-        # implementations shouldn't end up here
-        assert not isinstance(self.window, BaseIndexer)
-
-        obj = self._create_data(self._selected_obj)
-
-        def hfunc(values: np.ndarray) -> np.ndarray:
-            result = notna(values)
-            result = result.astype(int)
-            frame = type(obj)(result.T)
-            result = self._constructor(
-                frame,
-                window=self._get_window(),
-                min_periods=self.min_periods or 0,
-                center=self.center,
-                axis=self.axis,
-                closed=self.closed,
-            ).sum()
-            return result.values.T
-
-        new_mgr = obj._mgr.apply(hfunc)
-        out = obj._constructor(new_mgr)
-        if obj.ndim == 1:
-            out.name = obj.name
-        else:
-            self._insert_on_column(out, obj)
-        return out
+        window_func = self._get_cython_func_type("roll_sum")
+        return self._apply(window_func, center=self.center, name="count")
 
     _shared_docs["apply"] = dedent(
         r"""
@@ -2050,14 +2034,23 @@ def aggregate(self, func, *args, **kwargs):
     @Substitution(name="rolling")
     @Appender(_shared_docs["count"])
     def count(self):
-
-        # different impl for freq counting
-        # GH 32865. Use a custom count function implementation
-        # when using a BaseIndexer subclass as a window
-        if self.is_freq_type or isinstance(self.window, BaseIndexer):
-            window_func = self._get_roll_func("roll_count")
-            return self._apply(window_func, center=self.center, name="count")
-
+        if self.min_periods is None:
+            warnings.warn(
+                (
+                    "min_periods=None will default to the size of window "
+                    "consistent with other methods in a future version. "
+                    "Specify min_periods=0 instead."
+                ),
+                FutureWarning,
+            )
+            # Apply the legacy default only for this call; leaving
+            # ``self.min_periods`` at 0 would change later aggregations
+            # (sum, mean, ...) computed from the same Rolling object.
+            self.min_periods = 0
+            try:
+                return super().count()
+            finally:
+                self.min_periods = None
         return super().count()
 
     @Substitution(name="rolling")
diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py
index dfcbdde466d44..99c2c4dd0045b 100644
--- a/pandas/tests/window/moments/test_moments_consistency_rolling.py
+++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py
@@ -452,7 +452,7 @@ def test_moment_functions_zero_length():
     df2_expected = df2
 
     functions = [
-        lambda x: x.rolling(window=10).count(),
+        lambda x: x.rolling(window=10, min_periods=0).count(),
         lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False),
         lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False),
         lambda x: x.rolling(window=10, min_periods=5).max(),
diff --git a/pandas/tests/window/moments/test_moments_rolling_functions.py b/pandas/tests/window/moments/test_moments_rolling_functions.py
index 98c7a0a055bd3..abe75c7289ed4 100644
--- a/pandas/tests/window/moments/test_moments_rolling_functions.py
+++ b/pandas/tests/window/moments/test_moments_rolling_functions.py
@@ -12,7 +12,12 @@
     [
         [np.mean, "mean", {}],
         [np.nansum, "sum", {}],
-        [lambda x: np.isfinite(x).astype(float).sum(), "count", {}],
+        pytest.param(
+            lambda x: np.isfinite(x).astype(float).sum(),
+            "count",
+            {},
+            marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"),
+        ),
         [np.median, "median", {}],
         [np.min, "min", {}],
         [np.max, "max", {}],
@@ -33,7 +38,12 @@ def test_series(series, compare_func, roll_func, kwargs):
     [
         [np.mean, "mean", {}],
         [np.nansum, "sum", {}],
-        [lambda x: np.isfinite(x).astype(float).sum(), "count", {}],
+        pytest.param(
+            lambda x: np.isfinite(x).astype(float).sum(),
+            "count",
+            {},
+            marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"),
+        ),
         [np.median, "median", {}],
         [np.min, "min", {}],
         [np.max, "max", {}],
diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py
index f681b19d57600..7f2d58effe1ae 100644
--- a/pandas/tests/window/test_base_indexer.py
+++ b/pandas/tests/window/test_base_indexer.py
@@ -138,6 +138,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
         ),
     ],
 )
+@pytest.mark.filterwarnings("ignore:min_periods:FutureWarning")
 def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs):
     # GH 32865
     values = np.arange(10.0)
@@ -253,3 +254,12 @@ def test_non_fixed_variable_window_indexer(closed, expected_data):
     result = df.rolling(indexer, closed=closed).sum()
     expected = DataFrame(expected_data, index=index)
     tm.assert_frame_equal(result, expected)
+
+
+def test_fixed_forward_indexer_count():
+    # GH: 35579
+    df = DataFrame({"b": [None, None, None, 7]})
+    indexer = FixedForwardWindowIndexer(window_size=2)
+    result = df.rolling(window=indexer, min_periods=0).count()
+    expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py
index 245b48b351684..fc7a51834780f 100644
--- a/pandas/tests/window/test_dtypes.py
+++ b/pandas/tests/window/test_dtypes.py
@@ -21,82 +21,111 @@ def get_dtype(dtype, coerce_int=None):
 
 
 @pytest.mark.parametrize(
-    "method, data, expected_data, coerce_int",
+    "method, data, expected_data, coerce_int, min_periods",
     [
-        ("count", np.arange(5), [1, 2, 2, 2, 2], True),
-        ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True),
-        ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False),
-        ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True),
-        ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True),
-        ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False),
-        ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True),
-        ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True),
-        ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False),
-        ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True),
-        ("sum", np.arange(10, 0, -2), [np.nan, 18, 14, 10, 6], True),
-        ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False),
-        ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True),
-        ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True),
-        ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False),
-        ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True),
-        ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True),
+        ("count", np.arange(5), [1, 2, 2, 2, 2], True, 0),
+        ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True, 0),
+        ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False, 0),
+        ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True, None),
+        ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True, None),
+        ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False, None),
+        ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True, None),
+        ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True, None),
+        ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False, None),
+        ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True, None),
+        ("sum", np.arange(10, 0, -2), [np.nan, 18, 14, 10, 6], True, None),
+        ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False, None),
+        ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True, None),
+        ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True, None),
+        ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False, None),
+        ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True, None),
+        ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True, None),
         (
             "std",
             [0, 1, 2, np.nan, 4],
             [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2,
             False,
+            None,
+        ),
+        ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True, None),
+        ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True, None),
+        ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False, None),
+        ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True, None),
+        ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True, None),
+        (
+            "median",
+            [0, 1, 2, np.nan, 4],
+            [np.nan, 0.5, 1.5, np.nan, np.nan],
+            False,
+            None,
         ),
-        ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True),
-        ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True),
-        ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False),
-        ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True),
-        ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True),
-        ("median", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False),
     ],
 )
-def test_series_dtypes(method, data, expected_data, coerce_int, dtypes):
+def test_series_dtypes(method, data, expected_data, coerce_int, dtypes, min_periods):
     s = Series(data, dtype=get_dtype(dtypes, coerce_int=coerce_int))
     if dtypes in ("m8[ns]", "M8[ns]") and method != "count":
         msg = "No numeric types to aggregate"
         with pytest.raises(DataError, match=msg):
-            getattr(s.rolling(2), method)()
+            getattr(s.rolling(2, min_periods=min_periods), method)()
     else:
-        result = getattr(s.rolling(2), method)()
+        result = getattr(s.rolling(2, min_periods=min_periods), method)()
         expected = Series(expected_data, dtype="float64")
         tm.assert_almost_equal(result, expected)
 
 
 @pytest.mark.parametrize(
-    "method, expected_data",
+    "method, expected_data, min_periods",
     [
-        ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}),
-        ("max", {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}),
-        ("min", {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}),
+        ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, 0),
+        (
+            "max",
+            {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])},
+            None,
+        ),
+        (
+            "min",
+            {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])},
+            None,
+        ),
         (
             "sum",
             {0: Series([np.nan, 2, 6, 10, 14]), 1: Series([np.nan, 4, 8, 12, 16])},
+            None,
+        ),
+        (
+            "mean",
+            {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])},
+            None,
         ),
-        ("mean", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}),
         (
             "std",
             {
                 0: Series([np.nan] + [np.sqrt(2)] * 4),
                 1: Series([np.nan] + [np.sqrt(2)] * 4),
             },
+            None,
+        ),
+        (
+            "var",
+            {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])},
+            None,
+        ),
+        (
+            "median",
+            {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])},
+            None,
         ),
-        ("var", {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}),
-        ("median", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}),
     ],
 )
-def test_dataframe_dtypes(method, expected_data, dtypes):
+def test_dataframe_dtypes(method, expected_data, dtypes, min_periods):
     if dtypes == "category":
         pytest.skip("Category dataframe testing not implemented.")
     df = DataFrame(np.arange(10).reshape((5, 2)), dtype=get_dtype(dtypes))
     if dtypes in ("m8[ns]", "M8[ns]") and method != "count":
         msg = "No numeric types to aggregate"
         with pytest.raises(DataError, match=msg):
-            getattr(df.rolling(2), method)()
+            getattr(df.rolling(2, min_periods=min_periods), method)()
     else:
-        result = getattr(df.rolling(2), method)()
+        result = getattr(df.rolling(2, min_periods=min_periods), method)()
         expected = DataFrame(expected_data, dtype="float64")
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py
index 0eebd657e97b7..7cfac7c6a752a 100644
--- a/pandas/tests/window/test_grouper.py
+++ b/pandas/tests/window/test_grouper.py
@@ -45,9 +45,9 @@ def test_getitem_multiple(self):
 
         # GH 13174
         g = self.frame.groupby("A")
-        r = g.rolling(2)
+        r = g.rolling(2, min_periods=0)
         g_mutated = get_groupby(self.frame, by="A", mutated=True)
-        expected = g_mutated.B.apply(lambda x: x.rolling(2).count())
+        expected = g_mutated.B.apply(lambda x: x.rolling(2, min_periods=0).count())
 
         result = r.B.count()
         tm.assert_series_equal(result, expected)
@@ -56,7 +56,19 @@ def test_getitem_multiple(self):
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"]
+        "f",
+        [
+            "sum",
+            "mean",
+            "min",
+            "max",
+            pytest.param(
+                "count",
+                marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"),
+            ),
+            "kurt",
+            "skew",
+        ],
     )
     def test_rolling(self, f):
         g = self.frame.groupby("A")
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index 9ac4871ad24a1..ffa39cfa4d8be 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -460,7 +460,9 @@ def test_rolling_count_default_min_periods_with_null_values(constructor):
     values = [1, 2, 3, np.nan, 4, 5, 6]
     expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0]
 
-    result = constructor(values).rolling(3).count()
+    # GH 31302
+    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+        result = constructor(values).rolling(3).count()
     expected = constructor(expected_counts)
     tm.assert_equal(result, expected)
 
diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
index ea4d7df6700e9..d9fcb538c97c1 100644
--- a/pandas/tests/window/test_timeseries_window.py
+++ b/pandas/tests/window/test_timeseries_window.py
@@ -593,7 +593,10 @@ def test_freqs_ops(self, freq, op, result_data):
         [
             "sum",
             "mean",
-            "count",
+            pytest.param(
+                "count",
+                marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"),
+            ),
             "median",
             "std",
             "var",