From 2fd90b90a2ad23a8fb19f9866a61baf234da2233 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 21:02:38 +0200 Subject: [PATCH 1/5] Add var to masked interface --- pandas/core/array_algos/masked_reductions.py | 18 ++++++++++++++++-- pandas/core/arrays/masked.py | 15 ++++++++++++++- pandas/tests/series/test_reductions.py | 10 ++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3e59a267f7191..3ee394ce364ee 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -22,6 +22,7 @@ def _sumprod( skipna: bool = True, min_count: int = 0, axis: int | None = None, + **kwargs, ): """ Sum or product for 1D masked array. @@ -45,14 +46,14 @@ def _sumprod( if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: - return func(values, axis=axis) + return func(values, axis=axis, **kwargs) else: if check_below_min_count(values.shape, mask, min_count) and ( axis is None or values.ndim == 1 ): return libmissing.NA - return func(values, where=~mask, axis=axis) + return func(values, where=~mask, axis=axis, **kwargs) def sum( @@ -147,3 +148,16 @@ def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True): count = np.count_nonzero(~mask) mean_value = _sum / count return mean_value + + +def var( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: int | None = None, + ddof: int = 1, +): + return _sumprod( + np.var, values=values, mask=mask, skipna=skipna, axis=axis, **{"ddof": ddof} + ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c5f6dea7157ab..838bd789abf46 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1036,7 +1036,7 @@ def _quantile( # Reductions def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if name in {"any", "all", "min", "max", "sum", "prod"}: + if name in {"any", "all", "min", "max", "sum", "prod", "var"}: return getattr(self, name)(skipna=skipna, **kwargs) data = self._data @@ -1107,6 +1107,19 @@ def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): "prod", result, skipna=skipna, axis=axis, **kwargs ) + def var(self, *, skipna=True, axis: int | None = 0, ddof: int = 1, **kwargs): + nv.validate_stat_ddof_func((), kwargs, fname="var") + result = masked_reductions.var( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ddof=ddof, + ) + return self._wrap_reduction_result( + "var", result, skipna=skipna, axis=axis, **kwargs + ) + def min(self, *, skipna=True, axis: int | None = 0, **kwargs): nv.validate_min((), kwargs) return masked_reductions.min( diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index a552d9d84329f..2dbdc2ad217e1 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -132,6 +132,16 @@ def test_validate_median_initial(): ser.median(overwrite_input=True) +def test_var_masked_array(): + # GH# + ser = Series([1, 2, 3, 4, 5], dtype="Int64") + ser_numpy_dtype = Series([1, 2, 3, 4, 5], dtype="int64") + result = ser.var() + result_numpy_dtype = ser_numpy_dtype.var() + assert result == result_numpy_dtype + assert result == 2.5 + + def test_validate_stat_keepdims(): ser = Series([1, 2]) msg = ( From f7b00ef550e648356549890ec5aa384fabc0ee57 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 21:12:28 +0200 Subject: [PATCH 2/5] Move test and add whatsnew --- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/core/array_algos/masked_reductions.py | 13 ++++++++----- pandas/tests/reductions/test_reductions.py | 11 ++++++++++- pandas/tests/series/test_reductions.py | 10 ---------- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 848e87f0bc029..c992b7a5c1dd3 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -102,6 +102,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement in ``var`` for nullable dtypes (:issue:`40000`). - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3ee394ce364ee..6bf0361d81df8 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -14,7 +14,7 @@ from pandas.core.nanops import check_below_min_count -def _sumprod( +def _reductions( func: Callable, values: np.ndarray, mask: npt.NDArray[np.bool_], @@ -64,7 +64,7 @@ def sum( min_count: int = 0, axis: int | None = None, ): - return _sumprod( + return _reductions( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -77,7 +77,7 @@ def prod( min_count: int = 0, axis: int | None = None, ): - return _sumprod( + return _reductions( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -144,7 +144,7 @@ def max( def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True): if not values.size or mask.all(): return libmissing.NA - _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) + _sum = _reductions(np.sum, values=values, mask=mask, skipna=skipna) count = np.count_nonzero(~mask) mean_value = _sum / count return mean_value @@ -158,6 +158,9 @@ def var( axis: int | None = None, ddof: int = 1, ): - return _sumprod( + if not values.size or mask.all(): + return libmissing.NA + + return _reductions( np.var, values=values, mask=mask, skipna=skipna, axis=axis, **{"ddof": ddof} ) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fa53ed47dbdba..a231e14c8a17e 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -696,7 +696,7 @@ def test_empty_multi(self, method, unit): expected = Series([1, np.nan], index=["a", "b"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("method", ["mean"]) + @pytest.mark.parametrize("method", ["mean", "var"]) @pytest.mark.parametrize("dtype", ["Float64", "Int64", "boolean"]) def test_ops_consistency_on_empty_nullable(self, method, dtype): @@ -775,6 +775,15 @@ def test_sum_overflow_float(self, use_bottleneck, dtype): result = s.max(skipna=False) assert np.allclose(float(result), v[-1]) + def test_var_masked_array(self): + # GH# + ser = Series([1, 2, 3, 4, 5], dtype="Int64") + ser_numpy_dtype = Series([1, 2, 3, 4, 5], dtype="int64") + result = ser.var() + result_numpy_dtype = ser_numpy_dtype.var() + assert result == result_numpy_dtype + assert result == 2.5 + @pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]")) @pytest.mark.parametrize("skipna", [True, False]) def test_empty_timeseries_reductions_return_nat(self, dtype, skipna): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 2dbdc2ad217e1..a552d9d84329f 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -132,16 +132,6 @@ def test_validate_median_initial(): ser.median(overwrite_input=True) -def test_var_masked_array(): - # GH# - ser = Series([1, 2, 3, 4, 5], dtype="Int64") - ser_numpy_dtype = Series([1, 2, 3, 4, 5], dtype="int64") - result = ser.var() - result_numpy_dtype = ser_numpy_dtype.var() - assert result == result_numpy_dtype - assert result == 2.5 - - def test_validate_stat_keepdims(): ser = Series([1, 2]) msg = ( From 825a49bef9fc66552caa574ff611c45d90d84d39 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Sep 2022 21:21:37 +0200 Subject: [PATCH 3/5] Add gh ref --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/tests/reductions/test_reductions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index c992b7a5c1dd3..bc114394ec7f6 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -102,7 +102,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) -- Performance improvement in ``var`` for nullable dtypes (:issue:`40000`). +- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index a231e14c8a17e..f9f104bcb727d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -776,7 +776,7 @@ def test_sum_overflow_float(self, use_bottleneck, dtype): assert np.allclose(float(result), v[-1]) def test_var_masked_array(self): - # GH# + # GH#48379 ser = Series([1, 2, 3, 4, 5], dtype="Int64") ser_numpy_dtype = Series([1, 2, 3, 4, 5], dtype="int64") result = ser.var() From 7752e215e67f4856bd979cc8b6f5d66307b4bc60 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 12 Sep 2022 20:29:30 +0200 Subject: [PATCH 4/5] Update pandas/core/array_algos/masked_reductions.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/array_algos/masked_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index d7693a67e3bd0..979d3ddac63c2 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -164,5 +164,5 @@ def var( return libmissing.NA return _reductions( - np.var, values=values, mask=mask, skipna=skipna, axis=axis, **{"ddof": ddof} + np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof ) From 01bf0cf9adae056f7355bd95d3ecbbacc3dc1a7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 20:30:57 +0200 Subject: [PATCH 5/5] Parametrize test --- pandas/tests/reductions/test_reductions.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index e6c598e7550f4..66f263b84de4d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -787,14 +787,15 @@ def test_mean_masked_overflow(self): assert result_masked - result_numpy == 0 assert result_masked == 1e17 - def test_var_masked_array(self): + @pytest.mark.parametrize("ddof, exp", [(1, 2.5), (0, 2.0)]) + def test_var_masked_array(self, ddof, exp): # GH#48379 ser = Series([1, 2, 3, 4, 5], dtype="Int64") ser_numpy_dtype = Series([1, 2, 3, 4, 5], dtype="int64") - result = ser.var() - result_numpy_dtype = ser_numpy_dtype.var() + result = ser.var(ddof=ddof) + result_numpy_dtype = ser_numpy_dtype.var(ddof=ddof) assert result == result_numpy_dtype - assert result == 2.5 + assert result == exp @pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]")) @pytest.mark.parametrize("skipna", [True, False])