feat: add DataFrame.resample and Series.resample (#2213)

tswast · web-flow · commit c9ca02c5194c · 2025-11-04T11:03:58.000-08:00
* feat: add DataFrame.resample and Series.resample

* raise for unsupported values

* add docstrings

* fix dataframe tests
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -1996,6 +1996,31 @@ def _generate_resample_label(
             Literal["epoch", "start", "start_day", "end", "end_day"],
         ] = "start_day",
     ) -> Block:
+        if not isinstance(rule, str):
+            raise NotImplementedError(
+                f"Only offset strings are currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}"
+            )
+
+        if rule in ("ME", "YE", "QE", "BME", "BA", "BQE", "W"):
+            raise NotImplementedError(
+                f"Offset strings 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', 'W' are not currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}"
+            )
+
+        if closed == "right":
+            raise NotImplementedError(
+                f"Only closed='left' is currently supported. {constants.FEEDBACK_LINK}",
+            )
+
+        if label == "right":
+            raise NotImplementedError(
+                f"Only label='left' is currently supported. {constants.FEEDBACK_LINK}",
+            )
+
+        if origin not in ("epoch", "start", "start_day"):
+            raise NotImplementedError(
+                f"Only origin='epoch', 'start', 'start_day' are currently supported, but got {repr(origin)}. {constants.FEEDBACK_LINK}"
+            )
+
         # Validate and resolve the index or column to use for grouping
         if on is None:
             if len(self.index_columns) == 0:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -4182,10 +4182,12 @@ def _split(
         return [DataFrame(block) for block in blocks]
 
     @validations.requires_ordering()
-    def _resample(
+    def resample(
         self,
         rule: str,
         *,
+        closed: Optional[Literal["right", "left"]] = None,
+        label: Optional[Literal["right", "left"]] = None,
         on: blocks.Label = None,
         level: Optional[LevelsType] = None,
         origin: Union[
@@ -4195,64 +4197,10 @@ def _resample(
             Literal["epoch", "start", "start_day", "end", "end_day"],
         ] = "start_day",
     ) -> bigframes.core.groupby.DataFrameGroupBy:
-        """Internal function to support resample. Resample time-series data.
-
-        **Examples:**
-
-        >>> import bigframes.pandas as bpd
-        >>> data = {
-        ...     "timestamp_col": pd.date_range(
-        ...         start="2021-01-01 13:00:00", periods=30, freq="1s"
-        ...     ),
-        ...     "int64_col": range(30),
-        ...     "int64_too": range(10, 40),
-        ... }
-
-        Resample on a DataFrame with index:
-
-        >>> df = bpd.DataFrame(data).set_index("timestamp_col")
-        >>> df._resample(rule="7s").min()
-                             int64_col  int64_too
-        2021-01-01 12:59:55          0         10
-        2021-01-01 13:00:02          2         12
-        2021-01-01 13:00:09          9         19
-        2021-01-01 13:00:16         16         26
-        2021-01-01 13:00:23         23         33
-        <BLANKLINE>
-        [5 rows x 2 columns]
-
-        Resample with column and origin set to 'start':
-
-        >>> df = bpd.DataFrame(data)
-        >>> df._resample(rule="7s", on = "timestamp_col", origin="start").min()
-                             int64_col  int64_too
-        2021-01-01 13:00:00          0         10
-        2021-01-01 13:00:07          7         17
-        2021-01-01 13:00:14         14         24
-        2021-01-01 13:00:21         21         31
-        2021-01-01 13:00:28         28         38
-        <BLANKLINE>
-        [5 rows x 2 columns]
-
-        Args:
-            rule (str):
-                The offset string representing target conversion.
-            on (str, default None):
-                For a DataFrame, column to use instead of index for resampling. Column
-                must be datetime-like.
-            level (str or int, default None):
-                For a MultiIndex, level (name or number) to use for resampling.
-                level must be datetime-like.
-            origin(str, default 'start_day'):
-                The timestamp on which to adjust the grouping. Must be one of the following:
-                'epoch': origin is 1970-01-01
-                'start': origin is the first value of the timeseries
-                'start_day': origin is the first day at midnight of the timeseries
-        Returns:
-            DataFrameGroupBy: DataFrameGroupBy object.
-        """
         block = self._block._generate_resample_label(
             rule=rule,
+            closed=closed,
+            label=label,
             on=on,
             level=level,
             origin=origin,
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -2505,7 +2505,7 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
         )
 
     @validations.requires_ordering()
-    def _resample(
+    def resample(
         self,
         rule: str,
         *,
@@ -2519,43 +2519,6 @@ def _resample(
             Literal["epoch", "start", "start_day", "end", "end_day"],
         ] = "start_day",
     ) -> bigframes.core.groupby.SeriesGroupBy:
-        """Internal function to support resample. Resample time-series data.
-
-        **Examples:**
-
-        >>> import bigframes.pandas as bpd
-        >>> data = {
-        ...     "timestamp_col": pd.date_range(
-        ...         start="2021-01-01 13:00:00", periods=30, freq="1s"
-        ...     ),
-        ...     "int64_col": range(30),
-        ... }
-        >>> s = bpd.DataFrame(data).set_index("timestamp_col")
-        >>> s._resample(rule="7s", origin="epoch").min()
-                             int64_col
-        2021-01-01 12:59:56          0
-        2021-01-01 13:00:03          3
-        2021-01-01 13:00:10         10
-        2021-01-01 13:00:17         17
-        2021-01-01 13:00:24         24
-        <BLANKLINE>
-        [5 rows x 1 columns]
-
-
-        Args:
-            rule (str):
-                The offset string representing target conversion.
-            level (str or int, default None):
-                For a MultiIndex, level (name or number) to use for resampling.
-                level must be datetime-like.
-            origin(str, default 'start_day'):
-                The timestamp on which to adjust the grouping. Must be one of the following:
-                'epoch': origin is 1970-01-01
-                'start': origin is the first value of the timeseries
-                'start_day': origin is the first day at midnight of the timeseries
-        Returns:
-            SeriesGroupBy: SeriesGroupBy object.
-        """
         block = self._block._generate_resample_label(
             rule=rule,
             closed=closed,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -5915,21 +5915,15 @@ def test_dataframe_explode_xfail(col_names):
         pytest.param("datetime_col", "5M", "epoch"),
         pytest.param("datetime_col", "3Q", "start_day"),
         pytest.param("datetime_col", "3YE", "start"),
-        pytest.param(
-            "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError)
-        ),
-        pytest.param(
-            "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError)
-        ),
     ],
 )
-def test__resample_with_column(
+def test_resample_with_column(
     scalars_df_index, scalars_pandas_df_index, on, rule, origin
 ):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     bf_result = (
-        scalars_df_index._resample(rule=rule, on=on, origin=origin)[
+        scalars_df_index.resample(rule=rule, on=on, origin=origin)[
             ["int64_col", "int64_too"]
         ]
         .max()
@@ -5943,30 +5937,54 @@ def test__resample_with_column(
     )
 
 
+@pytest.mark.parametrize("index_col", ["timestamp_col", "datetime_col"])
+@pytest.mark.parametrize(
+    ("index_append", "level"),
+    [(True, 1), (False, None), (False, 0)],
+)
 @pytest.mark.parametrize(
-    ("append", "level", "col", "rule"),
+    "rule",
     [
-        pytest.param(False, None, "timestamp_col", "100d"),
-        pytest.param(True, 1, "timestamp_col", "1200h"),
-        pytest.param(False, None, "datetime_col", "100d"),
+        # TODO(tswast): support Timedeltas and DateOffsets.
+        # TODO(tswast): support bins that default to "right".
+        "100d",
+        "1200h",
     ],
 )
-def test__resample_with_index(
-    scalars_df_index, scalars_pandas_df_index, append, level, col, rule
+# TODO(tswast): support closed="right"
+@pytest.mark.parametrize("closed", ["left", None])
+# TODO(tswast): support label="right"
+@pytest.mark.parametrize("label", ["left", None])
+@pytest.mark.parametrize(
+    "origin",
+    ["epoch", "start", "start_day"],  # TODO(tswast): support end, end_day.
+)
+def test_resample_with_index(
+    scalars_df_index,
+    scalars_pandas_df_index,
+    index_append,
+    level,
+    index_col,
+    rule,
+    closed,
+    origin,
+    label,
 ):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
-    scalars_df_index = scalars_df_index.set_index(col, append=append)
-    scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)
+    scalars_df_index = scalars_df_index.set_index(index_col, append=index_append)
+    scalars_pandas_df_index = scalars_pandas_df_index.set_index(
+        index_col, append=index_append
+    )
     bf_result = (
         scalars_df_index[["int64_col", "int64_too"]]
-        ._resample(rule=rule, level=level)
+        .resample(rule=rule, level=level, closed=closed, origin=origin, label=label)
         .min()
         .to_pandas()
     )
     pd_result = (
         scalars_pandas_df_index[["int64_col", "int64_too"]]
-        .resample(rule=rule, level=level)
+        .resample(rule=rule, level=level, closed=closed, origin=origin, label=label)
         .min()
     )
     assert_pandas_df_equal(bf_result, pd_result)
@@ -6010,15 +6028,15 @@ def test__resample_with_index(
         ),
     ],
 )
-def test__resample_start_time(rule, origin, data):
+def test_resample_start_time(rule, origin, data):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     col = "timestamp_col"
     scalars_df_index = bpd.DataFrame(data).set_index(col)
     scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
     scalars_pandas_df_index.index.name = None
 
-    bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
+    bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas()
 
     pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()
 
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -4856,14 +4856,14 @@ def test_series_explode_null(data):
         pytest.param(True, "timestamp_col", "timestamp_col", "1YE"),
     ],
 )
-def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule):
+def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"]
     scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[
         "int64_col"
     ]
-    bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas()
+    bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas()
     pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min()
     pd.testing.assert_series_equal(bf_result, pd_result)
 
diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py
@@ -248,20 +248,24 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session):
         ),
     ],
 )
-def test__resample_with_index(unordered_session, rule, origin, data):
+def test_resample_with_index(unordered_session, rule, origin, data):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     col = "timestamp_col"
     scalars_df_index = bpd.DataFrame(data, session=unordered_session).set_index(col)
     scalars_pandas_df_index = pd.DataFrame(data).set_index(col)
     scalars_pandas_df_index.index.name = None
 
-    bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas()
-
+    bf_result = scalars_df_index.resample(rule=rule, origin=origin).min()
     pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min()
 
+    assert isinstance(bf_result.index, bpd.DatetimeIndex)
+    assert isinstance(pd_result.index, pd.DatetimeIndex)
     pd.testing.assert_frame_equal(
-        bf_result, pd_result, check_dtype=False, check_index_type=False
+        bf_result.to_pandas(),
+        pd_result,
+        check_index_type=False,
+        check_dtype=False,
     )
 
 
diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py
@@ -42,6 +42,68 @@ def test_dataframe_repr_with_uninitialized_object():
     assert "DataFrame" in got
 
 
+@pytest.mark.parametrize(
+    "rule",
+    [
+        pd.DateOffset(weeks=1),
+        pd.Timedelta(hours=8),
+        # According to
+        # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html
+        # these all default to "right" for closed and label, which isn't yet supported.
+        "ME",
+        "YE",
+        "QE",
+        "BME",
+        "BA",
+        "BQE",
+        "W",
+    ],
+)
+def test_dataframe_rule_not_implememented(
+    monkeypatch: pytest.MonkeyPatch,
+    rule,
+):
+    dataframe = mocks.create_dataframe(monkeypatch)
+
+    with pytest.raises(NotImplementedError, match="rule"):
+        dataframe.resample(rule=rule)
+
+
+def test_dataframe_closed_not_implememented(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    dataframe = mocks.create_dataframe(monkeypatch)
+
+    with pytest.raises(NotImplementedError, match="Only closed='left'"):
+        dataframe.resample(rule="1d", closed="right")
+
+
+def test_dataframe_label_not_implememented(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    dataframe = mocks.create_dataframe(monkeypatch)
+
+    with pytest.raises(NotImplementedError, match="Only label='left'"):
+        dataframe.resample(rule="1d", label="right")
+
+
+@pytest.mark.parametrize(
+    "origin",
+    [
+        "end",
+        "end_day",
+    ],
+)
+def test_dataframe_origin_not_implememented(
+    monkeypatch: pytest.MonkeyPatch,
+    origin,
+):
+    dataframe = mocks.create_dataframe(monkeypatch)
+
+    with pytest.raises(NotImplementedError, match="origin"):
+        dataframe.resample(rule="1d", origin=origin)
+
+
 def test_dataframe_setattr_with_uninitialized_object():
     """Ensures DataFrame can be subclassed without trying to set attributes as columns."""
     # Avoid calling __init__ since it might be called later in a subclass.
diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py