diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1de0d3b58dc5f..7d71c3bfb0368 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -100,6 +100,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex @@ -5598,6 +5599,82 @@ def update( # ---------------------------------------------------------------------- # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... 
index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.DataFrameGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) _shared_docs[ "pivot" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b896721469f1f..d8da0af413eca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7273,19 +7273,10 @@ def clip( return result - def groupby( - self, - by=None, - axis=0, - level=None, - as_index: bool_t = True, - sort: bool_t = True, - group_keys: bool_t = True, - squeeze: bool_t = False, - observed: bool_t = False, - ): - """ - Group DataFrame or Series using a mapper or by a Series of columns. + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7330,9 +7321,8 @@ def groupby( Returns ------- - DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that - contains information about the groups. + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. 
See Also -------- @@ -7343,69 +7333,7 @@ def groupby( ----- See the `user guide <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`_ for more. - - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level=1).mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - """ - from pandas.core.groupby.groupby import get_groupby - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - - return get_groupby( - self, - by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, - observed=observed, - ) + """ def asfreq( self, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 37ec05c40940e..a7471cc646777 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -41,6 +41,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex +from pandas.core import groupby import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -113,6 +114,7 @@ def 
_groupby_and_merge( by = [by] lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf @@ -132,7 +134,7 @@ def _groupby_and_merge( try: rby = right.groupby(by, sort=False) except KeyError: - rby = None + pass for key, lhs in lby: diff --git a/pandas/core/series.py b/pandas/core/series.py index 36e26e088935c..aa5af9bb893fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -59,7 +59,7 @@ is_empty_data, sanitize_array, ) -from pandas.core.generic import _shared_docs +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( @@ -1431,7 +1431,7 @@ def to_string( """ ) @Substitution(klass="Series") - @Appender(_shared_docs["to_markdown"]) + @Appender(generic._shared_docs["to_markdown"]) def to_markdown( self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, ) -> Optional[str]: @@ -1568,6 +1568,89 @@ def _set_name(self, name, inplace=False): ser.name = name return ser + @Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... 
['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.SeriesGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods