Skip to content

Commit c9ca02c

Browse files
authored
feat: add DataFrame.resample and Series.resample (#2213)
* feat: add DataFrame.resample and Series.resample
* raise for unsupported values
* add docstrings
* fix dataframe tests
1 parent 0a3e172 commit c9ca02c

File tree

10 files changed

+288
-123
lines changed

10 files changed

+288
-123
lines changed

bigframes/core/blocks.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1996,6 +1996,31 @@ def _generate_resample_label(
19961996
Literal["epoch", "start", "start_day", "end", "end_day"],
19971997
] = "start_day",
19981998
) -> Block:
1999+
if not isinstance(rule, str):
2000+
raise NotImplementedError(
2001+
f"Only offset strings are currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}"
2002+
)
2003+
2004+
if rule in ("ME", "YE", "QE", "BME", "BA", "BQE", "W"):
2005+
raise NotImplementedError(
2006+
f"Offset strings 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', 'W' are not currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}"
2007+
)
2008+
2009+
if closed == "right":
2010+
raise NotImplementedError(
2011+
f"Only closed='left' is currently supported. {constants.FEEDBACK_LINK}",
2012+
)
2013+
2014+
if label == "right":
2015+
raise NotImplementedError(
2016+
f"Only label='left' is currently supported. {constants.FEEDBACK_LINK}",
2017+
)
2018+
2019+
if origin not in ("epoch", "start", "start_day"):
2020+
raise NotImplementedError(
2021+
f"Only origin='epoch', 'start', 'start_day' are currently supported, but got {repr(origin)}. {constants.FEEDBACK_LINK}"
2022+
)
2023+
19992024
# Validate and resolve the index or column to use for grouping
20002025
if on is None:
20012026
if len(self.index_columns) == 0:

bigframes/dataframe.py

Lines changed: 5 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -4182,10 +4182,12 @@ def _split(
41824182
return [DataFrame(block) for block in blocks]
41834183

41844184
@validations.requires_ordering()
4185-
def _resample(
4185+
def resample(
41864186
self,
41874187
rule: str,
41884188
*,
4189+
closed: Optional[Literal["right", "left"]] = None,
4190+
label: Optional[Literal["right", "left"]] = None,
41894191
on: blocks.Label = None,
41904192
level: Optional[LevelsType] = None,
41914193
origin: Union[
@@ -4195,64 +4197,10 @@ def _resample(
41954197
Literal["epoch", "start", "start_day", "end", "end_day"],
41964198
] = "start_day",
41974199
) -> bigframes.core.groupby.DataFrameGroupBy:
4198-
"""Internal function to support resample. Resample time-series data.
4199-
4200-
**Examples:**
4201-
4202-
>>> import bigframes.pandas as bpd
4203-
>>> data = {
4204-
... "timestamp_col": pd.date_range(
4205-
... start="2021-01-01 13:00:00", periods=30, freq="1s"
4206-
... ),
4207-
... "int64_col": range(30),
4208-
... "int64_too": range(10, 40),
4209-
... }
4210-
4211-
Resample on a DataFrame with index:
4212-
4213-
>>> df = bpd.DataFrame(data).set_index("timestamp_col")
4214-
>>> df._resample(rule="7s").min()
4215-
int64_col int64_too
4216-
2021-01-01 12:59:55 0 10
4217-
2021-01-01 13:00:02 2 12
4218-
2021-01-01 13:00:09 9 19
4219-
2021-01-01 13:00:16 16 26
4220-
2021-01-01 13:00:23 23 33
4221-
<BLANKLINE>
4222-
[5 rows x 2 columns]
4223-
4224-
Resample with column and origin set to 'start':
4225-
4226-
>>> df = bpd.DataFrame(data)
4227-
>>> df._resample(rule="7s", on = "timestamp_col", origin="start").min()
4228-
int64_col int64_too
4229-
2021-01-01 13:00:00 0 10
4230-
2021-01-01 13:00:07 7 17
4231-
2021-01-01 13:00:14 14 24
4232-
2021-01-01 13:00:21 21 31
4233-
2021-01-01 13:00:28 28 38
4234-
<BLANKLINE>
4235-
[5 rows x 2 columns]
4236-
4237-
Args:
4238-
rule (str):
4239-
The offset string representing target conversion.
4240-
on (str, default None):
4241-
For a DataFrame, column to use instead of index for resampling. Column
4242-
must be datetime-like.
4243-
level (str or int, default None):
4244-
For a MultiIndex, level (name or number) to use for resampling.
4245-
level must be datetime-like.
4246-
origin(str, default 'start_day'):
4247-
The timestamp on which to adjust the grouping. Must be one of the following:
4248-
'epoch': origin is 1970-01-01
4249-
'start': origin is the first value of the timeseries
4250-
'start_day': origin is the first day at midnight of the timeseries
4251-
Returns:
4252-
DataFrameGroupBy: DataFrameGroupBy object.
4253-
"""
42544200
block = self._block._generate_resample_label(
42554201
rule=rule,
4202+
closed=closed,
4203+
label=label,
42564204
on=on,
42574205
level=level,
42584206
origin=origin,

bigframes/series.py

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2505,7 +2505,7 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
25052505
)
25062506

25072507
@validations.requires_ordering()
2508-
def _resample(
2508+
def resample(
25092509
self,
25102510
rule: str,
25112511
*,
@@ -2519,43 +2519,6 @@ def _resample(
25192519
Literal["epoch", "start", "start_day", "end", "end_day"],
25202520
] = "start_day",
25212521
) -> bigframes.core.groupby.SeriesGroupBy:
2522-
"""Internal function to support resample. Resample time-series data.
2523-
2524-
**Examples:**
2525-
2526-
>>> import bigframes.pandas as bpd
2527-
>>> data = {
2528-
... "timestamp_col": pd.date_range(
2529-
... start="2021-01-01 13:00:00", periods=30, freq="1s"
2530-
... ),
2531-
... "int64_col": range(30),
2532-
... }
2533-
>>> s = bpd.DataFrame(data).set_index("timestamp_col")
2534-
>>> s._resample(rule="7s", origin="epoch").min()
2535-
int64_col
2536-
2021-01-01 12:59:56 0
2537-
2021-01-01 13:00:03 3
2538-
2021-01-01 13:00:10 10
2539-
2021-01-01 13:00:17 17
2540-
2021-01-01 13:00:24 24
2541-
<BLANKLINE>
2542-
[5 rows x 1 columns]
2543-
2544-
2545-
Args:
2546-
rule (str):
2547-
The offset string representing target conversion.
2548-
level (str or int, default None):
2549-
For a MultiIndex, level (name or number) to use for resampling.
2550-
level must be datetime-like.
2551-
origin(str, default 'start_day'):
2552-
The timestamp on which to adjust the grouping. Must be one of the following:
2553-
'epoch': origin is 1970-01-01
2554-
'start': origin is the first value of the timeseries
2555-
'start_day': origin is the first day at midnight of the timeseries
2556-
Returns:
2557-
SeriesGroupBy: SeriesGroupBy object.
2558-
"""
25592522
block = self._block._generate_resample_label(
25602523
rule=rule,
25612524
closed=closed,

tests/system/small/test_dataframe.py

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5915,21 +5915,15 @@ def test_dataframe_explode_xfail(col_names):
59155915
pytest.param("datetime_col", "5M", "epoch"),
59165916
pytest.param("datetime_col", "3Q", "start_day"),
59175917
pytest.param("datetime_col", "3YE", "start"),
5918-
pytest.param(
5919-
"int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError)
5920-
),
5921-
pytest.param(
5922-
"datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError)
5923-
),
59245918
],
59255919
)
5926-
def test__resample_with_column(
5920+
def test_resample_with_column(
59275921
scalars_df_index, scalars_pandas_df_index, on, rule, origin
59285922
):
59295923
# TODO: supply a reason why this isn't compatible with pandas 1.x
59305924
pytest.importorskip("pandas", minversion="2.0.0")
59315925
bf_result = (
5932-
scalars_df_index._resample(rule=rule, on=on, origin=origin)[
5926+
scalars_df_index.resample(rule=rule, on=on, origin=origin)[
59335927
["int64_col", "int64_too"]
59345928
]
59355929
.max()
@@ -5943,30 +5937,54 @@ def test__resample_with_column(
59435937
)
59445938

59455939

5940+
@pytest.mark.parametrize("index_col", ["timestamp_col", "datetime_col"])
5941+
@pytest.mark.parametrize(
5942+
("index_append", "level"),
5943+
[(True, 1), (False, None), (False, 0)],
5944+
)
59465945
@pytest.mark.parametrize(
5947-
("append", "level", "col", "rule"),
5946+
"rule",
59485947
[
5949-
pytest.param(False, None, "timestamp_col", "100d"),
5950-
pytest.param(True, 1, "timestamp_col", "1200h"),
5951-
pytest.param(False, None, "datetime_col", "100d"),
5948+
# TODO(tswast): support timedeltas and date offsets.
5949+
# TODO(tswast): support bins that default to "right".
5950+
"100d",
5951+
"1200h",
59525952
],
59535953
)
5954-
def test__resample_with_index(
5955-
scalars_df_index, scalars_pandas_df_index, append, level, col, rule
5954+
# TODO(tswast): support "right"
5955+
@pytest.mark.parametrize("closed", ["left", None])
5956+
# TODO(tswast): support "right"
5957+
@pytest.mark.parametrize("label", ["left", None])
5958+
@pytest.mark.parametrize(
5959+
"origin",
5960+
["epoch", "start", "start_day"], # TODO(tswast): support end, end_day.
5961+
)
5962+
def test_resample_with_index(
5963+
scalars_df_index,
5964+
scalars_pandas_df_index,
5965+
index_append,
5966+
level,
5967+
index_col,
5968+
rule,
5969+
closed,
5970+
origin,
5971+
label,
59565972
):
59575973
# TODO: supply a reason why this isn't compatible with pandas 1.x
59585974
pytest.importorskip("pandas", minversion="2.0.0")
5959-
scalars_df_index = scalars_df_index.set_index(col, append=append)
5960-
scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)
5975+
scalars_df_index = scalars_df_index.set_index(index_col, append=index_append)
5976+
scalars_pandas_df_index = scalars_pandas_df_index.set_index(
5977+
index_col, append=index_append
5978+
)
59615979
bf_result = (
59625980
scalars_df_index[["int64_col", "int64_too"]]
5963-
._resample(rule=rule, level=level)
5981+
.resample(rule=rule, level=level, closed=closed, origin=origin, label=label)
59645982
.min()
59655983
.to_pandas()
59665984
)
59675985
pd_result = (
59685986
scalars_pandas_df_index[["int64_col", "int64_too"]]
5969-
.resample(rule=rule, level=level)
5987+
.resample(rule=rule, level=level, closed=closed, origin=origin, label=label)
59705988
.min()
59715989
)
59725990
assert_pandas_df_equal(bf_result, pd_result)
@@ -6010,15 +6028,15 @@ def test__resample_with_index(
60106028
),
60116029
],
60126030
)
6013-
def test__resample_start_time(rule, origin, data):
6031+
def test_resample_start_time(rule, origin, data):
60146032
# TODO: supply a reason why this isn't compatible with pandas 1.x
60156033
pytest.importorskip("pandas", minversion="2.0.0")
60166034
col = "timestamp_col"
60176035
scalars_df_index = bpd.DataFrame(data).set_index(col)
60186036
scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
60196037
scalars_pandas_df_index.index.name = None
60206038

6021-
bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
6039+
bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas()
60226040

60236041
pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()
60246042

tests/system/small/test_series.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4856,14 +4856,14 @@ def test_series_explode_null(data):
48564856
pytest.param(True, "timestamp_col", "timestamp_col", "1YE"),
48574857
],
48584858
)
4859-
def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule):
4859+
def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule):
48604860
# TODO: supply a reason why this isn't compatible with pandas 1.x
48614861
pytest.importorskip("pandas", minversion="2.0.0")
48624862
scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"]
48634863
scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[
48644864
"int64_col"
48654865
]
4866-
bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas()
4866+
bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas()
48674867
pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min()
48684868
pd.testing.assert_series_equal(bf_result, pd_result)
48694869

tests/system/small/test_unordered.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,20 +248,24 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session):
248248
),
249249
],
250250
)
251-
def test__resample_with_index(unordered_session, rule, origin, data):
251+
def test_resample_with_index(unordered_session, rule, origin, data):
252252
# TODO: supply a reason why this isn't compatible with pandas 1.x
253253
pytest.importorskip("pandas", minversion="2.0.0")
254254
col = "timestamp_col"
255255
scalars_df_index = bpd.DataFrame(data, session=unordered_session).set_index(col)
256256
scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
257257
scalars_pandas_df_index.index.name = None
258258

259-
bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
260-
259+
bf_result = scalars_df_index.resample(rule=rule, origin=origin).min()
261260
pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()
262261

262+
assert isinstance(bf_result.index, bpd.DatetimeIndex)
263+
assert isinstance(pd_result.index, pd.DatetimeIndex)
263264
pd.testing.assert_frame_equal(
264-
bf_result, pd_result, check_dtype=False, check_index_type=False
265+
bf_result.to_pandas(),
266+
pd_result,
267+
check_index_type=False,
268+
check_dtype=False,
265269
)
266270

267271

tests/unit/test_dataframe.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,68 @@ def test_dataframe_repr_with_uninitialized_object():
4242
assert "DataFrame" in got
4343

4444

45+
@pytest.mark.parametrize(
46+
"rule",
47+
[
48+
pd.DateOffset(weeks=1),
49+
pd.Timedelta(hours=8),
50+
# According to
51+
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html
52+
# these all default to "right" for closed and label, which isn't yet supported.
53+
"ME",
54+
"YE",
55+
"QE",
56+
"BME",
57+
"BA",
58+
"BQE",
59+
"W",
60+
],
61+
)
62+
def test_dataframe_rule_not_implememented(
63+
monkeypatch: pytest.MonkeyPatch,
64+
rule,
65+
):
66+
dataframe = mocks.create_dataframe(monkeypatch)
67+
68+
with pytest.raises(NotImplementedError, match="rule"):
69+
dataframe.resample(rule=rule)
70+
71+
72+
def test_dataframe_closed_not_implememented(
73+
monkeypatch: pytest.MonkeyPatch,
74+
):
75+
dataframe = mocks.create_dataframe(monkeypatch)
76+
77+
with pytest.raises(NotImplementedError, match="Only closed='left'"):
78+
dataframe.resample(rule="1d", closed="right")
79+
80+
81+
def test_dataframe_label_not_implememented(
82+
monkeypatch: pytest.MonkeyPatch,
83+
):
84+
dataframe = mocks.create_dataframe(monkeypatch)
85+
86+
with pytest.raises(NotImplementedError, match="Only label='left'"):
87+
dataframe.resample(rule="1d", label="right")
88+
89+
90+
@pytest.mark.parametrize(
91+
"origin",
92+
[
93+
"end",
94+
"end_day",
95+
],
96+
)
97+
def test_dataframe_origin_not_implememented(
98+
monkeypatch: pytest.MonkeyPatch,
99+
origin,
100+
):
101+
dataframe = mocks.create_dataframe(monkeypatch)
102+
103+
with pytest.raises(NotImplementedError, match="origin"):
104+
dataframe.resample(rule="1d", origin=origin)
105+
106+
45107
def test_dataframe_setattr_with_uninitialized_object():
46108
"""Ensures DataFrame can be subclassed without trying to set attributes as columns."""
47109
# Avoid calling __init__ since it might be called later in a subclass.

0 commit comments

Comments
 (0)