Skip to content

Commit c56a78c

Browse files
authored
feat: add GroupBy.__iter__ (#1394)
* feat: add `GroupBy.__iter__`
* iterate over keys
* match by key
* implement it
* refactor
* revert notebook change
1 parent 10a38d7 commit c56a78c

File tree

8 files changed

+495
-6
lines changed

8 files changed

+495
-6
lines changed

bigframes/core/blocks.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,10 +1375,16 @@ def aggregate(
13751375
) -> typing.Tuple[Block, typing.Sequence[str]]:
13761376
"""
13771377
Apply aggregations to the block.
1378+
13781379
Arguments:
13791380
by_column_id: column id of the aggregation key, this is preserved through the transform and used as index.
13801381
aggregations: input_column_id, operation tuples
13811382
dropna: whether null keys should be dropped
1383+
1384+
Returns:
1385+
Tuple[Block, Sequence[str]]:
1386+
The first element is the grouped block. The second is the
1387+
column IDs corresponding to each applied aggregation.
13821388
"""
13831389
if column_labels is None:
13841390
column_labels = pd.Index(range(len(aggregations)))

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Optional, Sequence, Tuple, Union
19+
from typing import Iterable, Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -29,7 +29,7 @@
2929
from bigframes.core import log_adapter
3030
import bigframes.core.block_transforms as block_ops
3131
import bigframes.core.blocks as blocks
32-
from bigframes.core.groupby import aggs, series_group_by
32+
from bigframes.core.groupby import aggs, group_by, series_group_by
3333
import bigframes.core.ordering as order
3434
import bigframes.core.utils as utils
3535
import bigframes.core.validations as validations
@@ -54,6 +54,7 @@ def __init__(
5454
selected_cols: typing.Optional[typing.Sequence[str]] = None,
5555
dropna: bool = True,
5656
as_index: bool = True,
57+
by_key_is_singular: bool = False,
5758
):
5859
# TODO(tbergeron): Support more group-by expression types
5960
self._block = block
@@ -64,6 +65,9 @@ def __init__(
6465
)
6566
}
6667
self._by_col_ids = by_col_ids
68+
self._by_key_is_singular = by_key_is_singular
69+
if by_key_is_singular:
70+
assert len(by_col_ids) == 1, "singular key should be exactly one group key"
6771

6872
self._dropna = dropna
6973
self._as_index = as_index
@@ -163,6 +167,16 @@ def describe(self, include: None | Literal["all"] = None):
163167
)
164168
)
165169

170+
def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]:
    """Iterate over the groups, yielding ``(group key, DataFrame)`` pairs.

    The key is a scalar when grouping by a single column/level and a
    tuple otherwise, matching pandas ``GroupBy.__iter__`` semantics.
    """
    group_iter = group_by.block_groupby_iter(
        self._block,
        by_col_ids=self._by_col_ids,
        by_key_is_singular=self._by_key_is_singular,
        dropna=self._dropna,
    )
    for group_keys, filtered_block in group_iter:
        # Wrap each filtered block in a DataFrame before handing it out.
        yield group_keys, df.DataFrame(filtered_block)
179+
166180
def size(self) -> typing.Union[df.DataFrame, series.Series]:
167181
agg_block, _ = self._block.aggregate_size(
168182
by_column_ids=self._by_col_ids,

bigframes/core/groupby/group_by.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import functools
18+
from typing import Sequence
19+
20+
import pandas as pd
21+
22+
from bigframes.core import blocks
23+
from bigframes.core import expression as ex
24+
import bigframes.enums
25+
import bigframes.operations as ops
26+
27+
28+
def block_groupby_iter(
    block: blocks.Block,
    *,
    by_col_ids: Sequence[str],
    by_key_is_singular: bool,
    dropna: bool,
):
    """Iterate over the groups of a block, pandas-``GroupBy.__iter__``-style.

    Arguments:
        block:
            The block to group.
        by_col_ids:
            Column IDs of the grouping keys.
        by_key_is_singular:
            If True, ``by_col_ids`` must contain exactly one column and the
            yielded key is a scalar instead of a 1-tuple, matching pandas'
            distinction between ``groupby("a")`` and ``groupby(["a"])``.
        dropna:
            Whether rows with null group keys are omitted.

    Yields:
        Tuples of ``(group key(s), filtered block)``, one per distinct
        group key, where the filtered block has the original index restored.
    """
    original_index_columns = block._index_columns
    original_index_labels = block._index_labels
    block = block.reset_index(
        level=None,
        # Keep the original index columns so they can be recovered.
        drop=False,
        allow_duplicates=True,
        replacement=bigframes.enums.DefaultIndexKind.NULL,
    ).set_index(
        by_col_ids,
        # Keep by_col_ids in-place so the ordering doesn't change.
        drop=False,
        append=False,
    )
    block.cached(
        force=True,
        # All DataFrames will be filtered by by_col_ids, so
        # force block.cached() to cluster by the new index by explicitly
        # setting `session_aware=False`. This will ensure that the filters
        # are more efficient.
        session_aware=False,
    )
    # Compute the distinct group keys server-side; dropna here controls
    # whether null keys produce a group.
    keys_block, _ = block.aggregate(by_col_ids, dropna=dropna)
    for chunk in keys_block.to_pandas_batches():
        # Convert to MultiIndex to make sure we get tuples,
        # even for singular keys.
        by_keys_index = chunk.index
        if not isinstance(by_keys_index, pd.MultiIndex):
            by_keys_index = pd.MultiIndex.from_frame(by_keys_index.to_frame())

        for by_keys in by_keys_index:
            filtered_block = (
                # To ensure the cache is used, filter first, then reset the
                # index before yielding the DataFrame.
                block.filter(
                    functools.reduce(
                        ops.and_op.as_expr,
                        (
                            ops.eq_op.as_expr(by_col, ex.const(by_key))
                            for by_col, by_key in zip(by_col_ids, by_keys)
                        ),
                    ),
                ).set_index(
                    original_index_columns,
                    # We retained by_col_ids in the set_index call above,
                    # so it's safe to drop the duplicates now.
                    drop=True,
                    append=False,
                    index_labels=original_index_labels,
                )
            )

            if by_key_is_singular:
                yield by_keys[0], filtered_block
            else:
                yield by_keys, filtered_block

bigframes/core/groupby/series_group_by.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Sequence, Union
19+
from typing import Iterable, Literal, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -28,7 +28,7 @@
2828
from bigframes.core import log_adapter
2929
import bigframes.core.block_transforms as block_ops
3030
import bigframes.core.blocks as blocks
31-
from bigframes.core.groupby import aggs
31+
from bigframes.core.groupby import aggs, group_by
3232
import bigframes.core.ordering as order
3333
import bigframes.core.utils as utils
3434
import bigframes.core.validations as validations
@@ -52,6 +52,8 @@ def __init__(
5252
by_col_ids: typing.Sequence[str],
5353
value_name: blocks.Label = None,
5454
dropna=True,
55+
*,
56+
by_key_is_singular: bool = False,
5557
):
5658
# TODO(tbergeron): Support more group-by expression types
5759
self._block = block
@@ -60,6 +62,10 @@ def __init__(
6062
self._value_name = value_name
6163
self._dropna = dropna # Applies to aggregations but not windowing
6264

65+
self._by_key_is_singular = by_key_is_singular
66+
if by_key_is_singular:
67+
assert len(by_col_ids) == 1, "singular key should be exactly one group key"
68+
6369
@property
6470
def _session(self) -> session.Session:
6571
return self._block.session
@@ -89,6 +95,19 @@ def describe(self, include: None | Literal["all"] = None):
8995
)
9096
).droplevel(level=0, axis=1)
9197

98+
def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]:
    """Iterate over the groups, yielding ``(group key, Series)`` pairs.

    The key is a scalar when grouping by a single column/level and a
    tuple otherwise, matching pandas ``GroupBy.__iter__`` semantics.
    """
    group_iter = group_by.block_groupby_iter(
        self._block,
        by_col_ids=self._by_col_ids,
        by_key_is_singular=self._by_key_is_singular,
        dropna=self._dropna,
    )
    for group_keys, filtered_block in group_iter:
        # Narrow each group's block down to the value column and restore
        # the Series name before handing it out.
        group_series = series.Series(filtered_block.select_column(self._value_column))
        group_series.name = self._value_name
        yield group_keys, group_series
110+
92111
def all(self) -> series.Series:
93112
return self._aggregate(agg_ops.all_op)
94113

bigframes/dataframe.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3913,11 +3913,17 @@ def _groupby_level(
39133913
as_index: bool = True,
39143914
dropna: bool = True,
39153915
):
3916+
if utils.is_list_like(level):
3917+
by_key_is_singular = False
3918+
else:
3919+
by_key_is_singular = True
3920+
39163921
return groupby.DataFrameGroupBy(
39173922
self._block,
39183923
by_col_ids=self._resolve_levels(level),
39193924
as_index=as_index,
39203925
dropna=dropna,
3926+
by_key_is_singular=by_key_is_singular,
39213927
)
39223928

39233929
def _groupby_series(
@@ -3930,10 +3936,14 @@ def _groupby_series(
39303936
as_index: bool = True,
39313937
dropna: bool = True,
39323938
):
3939+
# Pandas makes a distinction between groupby with a list of keys
3940+
# versus groupby with a single item in some methods, like __iter__.
39333941
if not isinstance(by, bigframes.series.Series) and utils.is_list_like(by):
39343942
by = list(by)
3943+
by_key_is_singular = False
39353944
else:
39363945
by = [typing.cast(typing.Union[blocks.Label, bigframes.series.Series], by)]
3946+
by_key_is_singular = True
39373947

39383948
block = self._block
39393949
col_ids: typing.Sequence[str] = []
@@ -3963,6 +3973,7 @@ def _groupby_series(
39633973
by_col_ids=col_ids,
39643974
as_index=as_index,
39653975
dropna=dropna,
3976+
by_key_is_singular=by_key_is_singular,
39663977
)
39673978

39683979
def abs(self) -> DataFrame:

bigframes/series.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1854,12 +1854,18 @@ def _groupby_level(
18541854
level: int | str | typing.Sequence[int] | typing.Sequence[str],
18551855
dropna: bool = True,
18561856
) -> bigframes.core.groupby.SeriesGroupBy:
1857+
if utils.is_list_like(level):
1858+
by_key_is_singular = False
1859+
else:
1860+
by_key_is_singular = True
1861+
18571862
return groupby.SeriesGroupBy(
18581863
self._block,
18591864
self._value_column,
18601865
by_col_ids=self._resolve_levels(level),
18611866
value_name=self.name,
18621867
dropna=dropna,
1868+
by_key_is_singular=by_key_is_singular,
18631869
)
18641870

18651871
def _groupby_values(
@@ -1871,8 +1877,10 @@ def _groupby_values(
18711877
) -> bigframes.core.groupby.SeriesGroupBy:
18721878
if not isinstance(by, Series) and _is_list_like(by):
18731879
by = list(by)
1880+
by_key_is_singular = False
18741881
else:
18751882
by = [typing.cast(typing.Union[blocks.Label, Series], by)]
1883+
by_key_is_singular = True
18761884

18771885
block = self._block
18781886
grouping_cols: typing.Sequence[str] = []
@@ -1904,6 +1912,7 @@ def _groupby_values(
19041912
by_col_ids=grouping_cols,
19051913
value_name=self.name,
19061914
dropna=dropna,
1915+
by_key_is_singular=by_key_is_singular,
19071916
)
19081917

19091918
def apply(

0 commit comments

Comments
 (0)