Skip to content

Commit 5990732

Browse files
authored
Merge branch 'main' into refactor-isnull-op
2 parents a475b72 + 40e7638 commit 5990732

File tree

10 files changed

+163
-1
lines changed

10 files changed

+163
-1
lines changed

bigframes/ml/metrics/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
auc,
1919
confusion_matrix,
2020
f1_score,
21+
mean_absolute_error,
2122
mean_squared_error,
2223
precision_score,
2324
r2_score,
@@ -36,6 +37,7 @@
3637
"confusion_matrix",
3738
"precision_score",
3839
"f1_score",
40+
"mean_absolute_error",
3941
"mean_squared_error",
4042
"pairwise",
4143
]

bigframes/ml/metrics/_metrics.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,3 +344,17 @@ def mean_squared_error(
344344
mean_squared_error.__doc__ = inspect.getdoc(
345345
vendored_metrics_regression.mean_squared_error
346346
)
347+
348+
349+
def mean_absolute_error(
    y_true: Union[bpd.DataFrame, bpd.Series],
    y_pred: Union[bpd.DataFrame, bpd.Series],
) -> float:
    # No inline docstring on purpose: __doc__ is assigned from the vendored
    # stub immediately after this definition.
    truth, predicted = utils.batch_convert_to_series(y_true, y_pred)

    # Mean absolute error: total of |prediction - truth| divided by the
    # number of ground-truth rows.
    total_abs_error = (predicted - truth).abs().sum()
    return total_abs_error / len(truth)


mean_absolute_error.__doc__ = inspect.getdoc(
    vendored_metrics_regression.mean_absolute_error
)

bigframes/operations/datetimes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,18 @@ def day(self) -> series.Series:
4949
def dayofweek(self) -> series.Series:
5050
return self._apply_unary_op(ops.dayofweek_op)
5151

52+
@property
def day_of_week(self) -> series.Series:
    # Underscore-spelled alias: delegates entirely to the `dayofweek`
    # property, so both names yield the same series.
    same_as_dayofweek = self.dayofweek
    return same_as_dayofweek
55+
5256
@property
5357
def dayofyear(self) -> series.Series:
5458
return self._apply_unary_op(ops.dayofyear_op)
5559

60+
@property
def day_of_year(self) -> series.Series:
    # Underscore-spelled alias: delegates entirely to the `dayofyear`
    # property, so both names yield the same series.
    same_as_dayofyear = self.dayofyear
    return same_as_dayofyear
63+
5664
@property
5765
def date(self) -> series.Series:
5866
return self._apply_unary_op(ops.date_op)

bigframes/session/loader.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,8 @@ def read_gbq_query(
983983
)
984984
job_config.dry_run = True
985985
query_job = self._bqclient.query(query, job_config=job_config)
986+
if self._metrics is not None:
987+
self._metrics.count_job_stats(query_job=query_job)
986988
return dry_runs.get_query_stats_with_inferred_dtypes(
987989
query_job, list(columns), index_cols
988990
)

bigframes/session/metrics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ def count_job_stats(
5151
write_stats_to_disk(len(query), total_bytes_processed)
5252
return
5353

54+
if query_job.configuration.dry_run:
55+
write_stats_to_disk(len(query_job.query), 0, 0, 0)
56+
5457
stats = get_performance_stats(query_job)
5558
if stats is not None:
5659
query_char_count, bytes_processed, slot_millis, execution_secs = stats

scripts/run_and_publish_benchmark.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,12 @@ def collect_benchmark_result(
100100
== len(local_seconds_files)
101101
):
102102
raise ValueError(
103-
"Mismatch in the number of report files for bytes, millis, seconds and query char count."
103+
"Mismatch in the number of report files for bytes, millis, seconds and query char count: \n"
104+
f"millis_files: {len(millis_files)}\n"
105+
f"bq_seconds_files: {len(bq_seconds_files)}\n"
106+
f"bytes_files: {len(bytes_files)}\n"
107+
f"query_char_count_files: {len(query_char_count_files)}\n"
108+
f"local_seconds_files: {len(local_seconds_files)}\n"
104109
)
105110

106111
has_full_metrics = len(bq_seconds_files) == len(local_seconds_files)

tests/system/small/ml/test_metrics.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,3 +818,10 @@ def test_mean_squared_error(session: bigframes.Session):
818818
df = session.read_pandas(pd_df)
819819
mse = metrics.mean_squared_error(df["y_true"], df["y_pred"])
820820
assert mse == 0.375
821+
822+
823+
def test_mean_absolute_error(session: bigframes.Session):
    # Absolute errors per row are 0.5, 0.5, 0 and 1, so the mean is 0.5.
    local_frame = pd.DataFrame(
        {
            "y_true": [3, -0.5, 2, 7],
            "y_pred": [2.5, 0.0, 2, 8],
        }
    )
    remote_frame = session.read_pandas(local_frame)
    mae = metrics.mean_absolute_error(remote_frame["y_true"], remote_frame["y_pred"])
    assert mae == 0.5

tests/system/small/operations/test_datetimes.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,28 @@ def test_dt_dayofweek(scalars_dfs, col_name):
8686
pytest.importorskip("pandas", minversion="2.0.0")
8787
scalars_df, scalars_pandas_df = scalars_dfs
8888
bf_series: bigframes.series.Series = scalars_df[col_name]
89+
8990
bf_result = bf_series.dt.dayofweek.to_pandas()
9091
pd_result = scalars_pandas_df[col_name].dt.dayofweek
9192

9293
assert_series_equal(pd_result, bf_result, check_dtype=False)
9394

9495

96+
@pytest.mark.parametrize(("col_name",), DATE_COLUMNS)
def test_dt_day_of_week(scalars_dfs, col_name):
    # Skipped unless pandas >= 2.0.0 is importable.
    pytest.importorskip("pandas", minversion="2.0.0")
    bf_frame, pd_frame = scalars_dfs

    # Evaluate the BigFrames side first, then the pandas reference.
    actual = bf_frame[col_name].dt.day_of_week.to_pandas()
    expected = pd_frame[col_name].dt.day_of_week

    # Only values and index are compared; dtype is not checked.
    assert_series_equal(expected, actual, check_dtype=False)
109+
110+
95111
@pytest.mark.parametrize(
96112
("col_name",),
97113
DATE_COLUMNS,
@@ -100,12 +116,28 @@ def test_dt_dayofyear(scalars_dfs, col_name):
100116
pytest.importorskip("pandas", minversion="2.0.0")
101117
scalars_df, scalars_pandas_df = scalars_dfs
102118
bf_series: bigframes.series.Series = scalars_df[col_name]
119+
103120
bf_result = bf_series.dt.dayofyear.to_pandas()
104121
pd_result = scalars_pandas_df[col_name].dt.dayofyear
105122

106123
assert_series_equal(pd_result, bf_result, check_dtype=False)
107124

108125

126+
@pytest.mark.parametrize(("col_name",), DATE_COLUMNS)
def test_dt_day_of_year(scalars_dfs, col_name):
    # Skipped unless pandas >= 2.0.0 is importable.
    pytest.importorskip("pandas", minversion="2.0.0")
    bf_frame, pd_frame = scalars_dfs

    # Evaluate the BigFrames side first, then the pandas reference.
    actual = bf_frame[col_name].dt.day_of_year.to_pandas()
    expected = pd_frame[col_name].dt.day_of_year

    # Only values and index are compared; dtype is not checked.
    assert_series_equal(expected, actual, check_dtype=False)
139+
140+
109141
@pytest.mark.parametrize(
110142
("col_name",),
111143
DATETIME_COL_NAMES,

third_party/bigframes_vendored/pandas/core/indexes/accessor.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,40 @@ def dayofweek(self):
6666

6767
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
6868

69+
@property
70+
def day_of_week(self):
71+
"""The day of the week with Monday=0, Sunday=6.
72+
73+
Return the day of the week. It is assumed the week starts on
74+
Monday, which is denoted by 0 and ends on Sunday, which is denoted
75+
by 6.
76+
77+
**Examples:**
78+
79+
>>> import pandas as pd
80+
>>> import bigframes.pandas as bpd
81+
>>> bpd.options.display.progress_bar = None
82+
>>> s = bpd.Series(
83+
... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series()
84+
... )
85+
>>> s.dt.day_of_week
86+
2016-12-31 00:00:00 5
87+
2017-01-01 00:00:00 6
88+
2017-01-02 00:00:00 0
89+
2017-01-03 00:00:00 1
90+
2017-01-04 00:00:00 2
91+
2017-01-05 00:00:00 3
92+
2017-01-06 00:00:00 4
93+
2017-01-07 00:00:00 5
94+
2017-01-08 00:00:00 6
95+
dtype: Int64
96+
97+
Returns:
98+
Series: Containing integers indicating the day number.
99+
"""
100+
101+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
102+
69103
@property
70104
def dayofyear(self):
71105
"""The ordinal day of the year.
@@ -94,6 +128,34 @@ def dayofyear(self):
94128

95129
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
96130

131+
@property
132+
def day_of_year(self):
133+
"""The ordinal day of the year.
134+
135+
**Examples:**
136+
137+
>>> import pandas as pd
138+
>>> import bigframes.pandas as bpd
139+
>>> bpd.options.display.progress_bar = None
140+
>>> s = bpd.Series(
141+
... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series()
142+
... )
143+
>>> s.dt.day_of_year
144+
2016-12-28 00:00:00 363
145+
2016-12-29 00:00:00 364
146+
2016-12-30 00:00:00 365
147+
2016-12-31 00:00:00 366
148+
2017-01-01 00:00:00 1
149+
2017-01-02 00:00:00 2
150+
2017-01-03 00:00:00 3
151+
dtype: Int64
152+
153+
Returns:
154+
Series: Containing integers indicating the day number.
155+
"""
156+
157+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
158+
97159
@property
98160
def date(self):
99161
"""Returns a Series with the date part of Timestamps without time and

third_party/bigframes_vendored/sklearn/metrics/_regression.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,30 @@ def mean_squared_error(y_true, y_pred) -> float:
9191
float: Mean squared error.
9292
"""
9393
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
94+
95+
96+
def mean_absolute_error(y_true, y_pred) -> float:
97+
"""Mean absolute error regression loss.
98+
99+
**Examples:**
100+
101+
>>> import bigframes.pandas as bpd
102+
>>> import bigframes.ml.metrics
103+
>>> bpd.options.display.progress_bar = None
104+
105+
>>> y_true = bpd.DataFrame([3, -0.5, 2, 7])
106+
>>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8])
107+
>>> mae = bigframes.ml.metrics.mean_absolute_error(y_true, y_pred)
108+
>>> mae
109+
np.float64(0.5)
110+
111+
Args:
112+
y_true (Series or DataFrame of shape (n_samples,)):
113+
Ground truth (correct) target values.
114+
y_pred (Series or DataFrame of shape (n_samples,)):
115+
Estimated target values.
116+
117+
Returns:
118+
float: Mean absolute error.
119+
"""
120+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)