Skip to content

Commit 328a765

Browse files
feat: Add Groupby.describe() (#2088)
1 parent a2daa3f commit 328a765

File tree

9 files changed

+310
-111
lines changed

9 files changed

+310
-111
lines changed

bigframes/core/blocks.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1780,15 +1780,19 @@ def pivot(
17801780
else:
17811781
return result_block.with_column_labels(columns_values)
17821782

1783-
def stack(self, how="left", levels: int = 1):
1783+
def stack(
1784+
self, how="left", levels: int = 1, *, override_labels: Optional[pd.Index] = None
1785+
):
17841786
"""Unpivot last column axis level into row axis"""
17851787
if levels == 0:
17861788
return self
17871789

17881790
# These are the values that will be turned into rows
17891791

17901792
col_labels, row_labels = utils.split_index(self.column_labels, levels=levels)
1791-
row_labels = row_labels.drop_duplicates()
1793+
row_labels = (
1794+
row_labels.drop_duplicates() if override_labels is None else override_labels
1795+
)
17921796

17931797
if col_labels is None:
17941798
result_index: pd.Index = pd.Index([None])

bigframes/core/compile/api.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,18 @@
1515

1616
from typing import TYPE_CHECKING
1717

18-
from bigframes.core import rewrite
19-
from bigframes.core.compile.ibis_compiler import ibis_compiler
20-
2118
if TYPE_CHECKING:
2219
import bigframes.core.nodes
2320

2421

2522
def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
2623
"""Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
24+
from bigframes.core.compile.ibis_compiler import ibis_compiler
25+
import bigframes.core.rewrite
2726
import bigframes.core.schema
2827

2928
node = ibis_compiler._replace_unsupported_ops(node)
30-
node = rewrite.bake_order(node)
29+
node = bigframes.core.rewrite.bake_order(node)
3130
ir = ibis_compiler.compile_node(node)
3231
items = tuple(
3332
bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id))

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,20 @@ def head(self, n: int = 5) -> df.DataFrame:
149149
)
150150
)
151151

152+
def describe(self, include: None | Literal["all"] = None) -> df.DataFrame:
    """Compute per-group summary statistics for the selected columns.

    Args:
        include: ``None`` or ``"all"``. Forwarded unchanged to the shared
            ``_describe`` helper, which decides which of the selected
            columns are summarised.

    Returns:
        A DataFrame wrapping the block that ``_describe`` builds from this
        group-by's block, selected columns, grouping column ids and its
        ``as_index`` / ``dropna`` settings.
    """
    # Function-scope import kept from the original; presumably this avoids a
    # circular import with bigframes.pandas -- TODO confirm.
    from bigframes.pandas.core.methods import describe

    # Keyword arguments mirror the SeriesGroupBy call of the same helper so
    # the two call sites cannot silently drift if its signature is reordered.
    described_block = describe._describe(
        self._block,
        columns=self._selected_cols,
        include=include,
        as_index=self._as_index,
        by_col_ids=self._by_col_ids,
        dropna=self._dropna,
    )
    return df.DataFrame(described_block)
165+
152166
def size(self) -> typing.Union[df.DataFrame, series.Series]:
153167
agg_block, _ = self._block.aggregate_size(
154168
by_column_ids=self._by_col_ids,

bigframes/core/groupby/series_group_by.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,20 @@ def head(self, n: int = 5) -> series.Series:
7575
)
7676
)
7777

78+
def describe(self, include: None | Literal["all"] = None) -> df.DataFrame:
    """Compute per-group summary statistics for the grouped value column.

    Args:
        include: ``None`` or ``"all"``. Forwarded unchanged to the shared
            ``_describe`` helper.

    Returns:
        A DataFrame built from the ``_describe`` result for the single value
        column, with column level 0 dropped.
    """
    # Function-scope import kept from the original; presumably this avoids a
    # circular import with bigframes.pandas -- TODO confirm.
    from bigframes.pandas.core.methods import describe

    described_block = describe._describe(
        self._block,
        columns=[self._value_column],
        include=include,
        as_index=True,
        by_col_ids=self._by_col_ids,
        dropna=self._dropna,
    )
    # Only one value column is described, so the outermost column level is
    # dropped here; the remaining level(s) are what callers see.
    return df.DataFrame(described_block).droplevel(level=0, axis=1)
91+
7892
def all(self) -> series.Series:
7993
return self._aggregate(agg_ops.all_op)
8094

bigframes/core/rewrite/implicit_align.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818
from typing import cast, Optional, Sequence, Set, Tuple
1919

2020
import bigframes.core.expression
21-
import bigframes.core.guid
2221
import bigframes.core.identifiers
23-
import bigframes.core.join_def
2422
import bigframes.core.nodes
25-
import bigframes.core.window_spec
26-
import bigframes.operations.aggregations
2723

2824
# Combination of selects and additive nodes can be merged as an explicit keyless "row join"
2925
ALIGNABLE_NODES = (

bigframes/operations/aggregations.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -251,12 +251,7 @@ def name(self):
251251
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
    """Return the first input type unchanged.

    Raises:
        TypeError: If ``dtypes.is_orderable`` rejects the first input type.
    """
    first_type = input_types[0]
    if dtypes.is_orderable(first_type):
        # The result type always equals the input type; no promotion happens.
        return first_type
    raise TypeError(f"Type {input_types[0]} is not orderable")
260255

261256

262257
@dataclasses.dataclass(frozen=True)

bigframes/pandas/core/methods/describe.py

Lines changed: 90 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,15 @@
1616

1717
import typing
1818

19+
import pandas as pd
20+
1921
from bigframes import dataframe, dtypes, series
20-
from bigframes.core.reshape import api as rs
22+
from bigframes.core import agg_expressions, blocks
23+
from bigframes.operations import aggregations
24+
25+
_DEFAULT_DTYPES = (
26+
dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
27+
)
2128

2229

2330
def describe(
@@ -30,100 +37,88 @@ def describe(
3037
elif not isinstance(input, dataframe.DataFrame):
3138
raise TypeError(f"Unsupported type: {type(input)}")
3239

40+
block = input._block
41+
42+
describe_block = _describe(block, columns=block.value_columns, include=include)
43+
# we override default stack behavior, because we want very specific ordering
44+
stack_cols = pd.Index(
45+
[
46+
"count",
47+
"nunique",
48+
"top",
49+
"freq",
50+
"mean",
51+
"std",
52+
"min",
53+
"25%",
54+
"50%",
55+
"75%",
56+
"max",
57+
]
58+
).intersection(describe_block.column_labels.get_level_values(-1))
59+
describe_block = describe_block.stack(override_labels=stack_cols)
60+
61+
return dataframe.DataFrame(describe_block).droplevel(level=0)
62+
63+
64+
def _describe(
    block: blocks.Block,
    columns: typing.Sequence[str],
    include: None | typing.Literal["all"] = None,
    *,
    as_index: bool = True,
    by_col_ids: typing.Sequence[str] = (),
    dropna: bool = False,
) -> blocks.Block:
    """Aggregate summary statistics for ``columns`` of ``block``.

    One aggregation is built per (column, statistic) pair. Each output column
    is labelled with the source column's label extended by the statistic's
    name as an extra, innermost column level.

    Args:
        block: Block whose columns are summarised.
        columns: Ids of the candidate columns.
        include: With ``None``, only columns whose dtype is in
            ``_DEFAULT_DTYPES`` are summarised, unless no candidate has such
            a dtype, in which case it behaves like ``"all"``. With ``"all"``,
            every candidate is considered; columns for which
            ``_get_aggs_for_dtype`` returns no ops add no output columns.
        as_index: When false, ``reset_index(drop=False)`` is applied to the
            aggregated block before it is returned.
        by_col_ids: Column ids passed to ``block.aggregate`` as
            ``by_column_ids``. Defaults to an empty tuple (an immutable
            default rather than a shared list).
        dropna: Passed through to ``block.aggregate``.

    Returns:
        The aggregated block.
    """
    # NOTE(review): values of ``include`` other than None/"all" are not
    # rejected here; they are treated like None below. Confirm that callers
    # validate the argument.
    stats: list[agg_expressions.Aggregation] = []
    column_labels: list[typing.Hashable] = []

    # include=None behaves like include='all' if no numeric columns present
    if include is None:
        if not any(
            block.expr.get_column_type(col) in _DEFAULT_DTYPES for col in columns
        ):
            include = "all"

    # Loop-invariant: whether source labels are scalars (one level) or tuples.
    single_level = block.column_labels.nlevels == 1

    for col_id in columns:
        label = block.col_id_to_label[col_id]
        dtype = block.expr.get_column_type(col_id)
        if include != "all" and dtype not in _DEFAULT_DTYPES:
            continue
        agg_ops = _get_aggs_for_dtype(dtype)
        stats.extend(op.as_expr(col_id) for op in agg_ops)
        # Normalise the label to a tuple so the statistic name can be
        # appended as the innermost level.
        label_tuple = (label,) if single_level else label
        column_labels.extend((*label_tuple, op.name) for op in agg_ops)  # type: ignore

    agg_block, _ = block.aggregate(
        by_column_ids=by_col_ids,
        aggregations=stats,
        dropna=dropna,
        column_labels=pd.Index(column_labels, name=(*block.column_labels.names, None)),
    )
    return agg_block if as_index else agg_block.reset_index(drop=False)
100+
101+
102+
def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
    """Return the aggregate ops used to describe a column of ``dtype``.

    The order of the returned ops is the order in which the statistics are
    emitted. Dtypes that match none of the groups below yield an empty list.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE:
        quartile_ops = [aggregations.ApproxQuartilesOp(q) for q in (1, 2, 3)]
        return [
            aggregations.count_op,
            aggregations.mean_op,
            aggregations.std_op,
            aggregations.min_op,
            *quartile_ops,
            aggregations.max_op,
        ]

    if dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES:
        # Temporal columns only get a count.
        return [aggregations.count_op]

    count_and_distinct_dtypes = (
        dtypes.STRING_DTYPE,
        dtypes.BOOL_DTYPE,
        dtypes.BYTES_DTYPE,
        dtypes.TIME_DTYPE,
    )
    if dtype in count_and_distinct_dtypes:
        return [aggregations.count_op, aggregations.nunique_op]

    return []

0 commit comments

Comments
 (0)