Skip to content

Commit a2daa3f

Browse files
authored
fix: Transformers with non-standard column names throw errors (#2089)
* fix: Transformers with non-standard column names throw errors * fix
1 parent fd4b264 commit a2daa3f

File tree

4 files changed

+45
-3
lines changed

4 files changed

+45
-3
lines changed

bigframes/ml/compose.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
from bigframes.core import log_adapter
3131
import bigframes.core.compile.googlesql as sql_utils
32+
import bigframes.core.utils as core_utils
3233
from bigframes.ml import base, core, globals, impute, preprocessing, utils
3334
import bigframes.pandas as bpd
3435

@@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"):
103104
# TODO: More robust unescaping
104105
self._target_column = target_column.replace("`", "")
105106

106-
PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE)
107-
108107
def _compile_to_sql(
109108
self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None
110109
) -> List[str]:
111110
if columns is None:
112111
columns = X.columns
112+
columns, _ = core_utils.get_standardized_ids(columns)
113113
result = []
114114
for column in columns:
115115
current_sql = self._sql.format(sql_utils.identifier(column))

bigframes/ml/impute.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import bigframes_vendored.sklearn.impute._base
2424

2525
from bigframes.core import log_adapter
26+
import bigframes.core.utils as core_utils
2627
from bigframes.ml import base, core, globals, utils
2728
import bigframes.pandas as bpd
2829

@@ -62,6 +63,7 @@ def _compile_to_sql(
6263
Returns: a list of tuples sql_expr."""
6364
if columns is None:
6465
columns = X.columns
66+
columns, _ = core_utils.get_standardized_ids(columns)
6567
return [
6668
self._base_sql_generator.ml_imputer(
6769
column, self.strategy, f"imputer_{column}"

bigframes/ml/preprocessing.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import bigframes_vendored.sklearn.preprocessing._polynomial
2828

2929
from bigframes.core import log_adapter
30+
import bigframes.core.utils as core_utils
3031
from bigframes.ml import base, core, globals, utils
3132
import bigframes.pandas as bpd
3233

@@ -59,6 +60,7 @@ def _compile_to_sql(
5960
Returns: a list of tuples sql_expr."""
6061
if columns is None:
6162
columns = X.columns
63+
columns, _ = core_utils.get_standardized_ids(columns)
6264
return [
6365
self._base_sql_generator.ml_standard_scaler(
6466
column, f"standard_scaled_{column}"
@@ -136,6 +138,7 @@ def _compile_to_sql(
136138
Returns: a list of tuples sql_expr."""
137139
if columns is None:
138140
columns = X.columns
141+
columns, _ = core_utils.get_standardized_ids(columns)
139142
return [
140143
self._base_sql_generator.ml_max_abs_scaler(
141144
column, f"max_abs_scaled_{column}"
@@ -214,6 +217,7 @@ def _compile_to_sql(
214217
Returns: a list of tuples sql_expr."""
215218
if columns is None:
216219
columns = X.columns
220+
columns, _ = core_utils.get_standardized_ids(columns)
217221
return [
218222
self._base_sql_generator.ml_min_max_scaler(
219223
column, f"min_max_scaled_{column}"
@@ -304,6 +308,7 @@ def _compile_to_sql(
304308
Returns: a list of tuples sql_expr."""
305309
if columns is None:
306310
columns = X.columns
311+
columns, _ = core_utils.get_standardized_ids(columns)
307312
array_split_points = {}
308313
if self.strategy == "uniform":
309314
for column in columns:
@@ -433,6 +438,7 @@ def _compile_to_sql(
433438
Returns: a list of tuples sql_expr."""
434439
if columns is None:
435440
columns = X.columns
441+
columns, _ = core_utils.get_standardized_ids(columns)
436442
drop = self.drop if self.drop is not None else "none"
437443
# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
438444
top_k = (
@@ -547,6 +553,7 @@ def _compile_to_sql(
547553
Returns: a list of tuples sql_expr."""
548554
if columns is None:
549555
columns = X.columns
556+
columns, _ = core_utils.get_standardized_ids(columns)
550557

551558
# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
552559
top_k = (
@@ -644,6 +651,7 @@ def _compile_to_sql(
644651
Returns: a list of tuples sql_expr."""
645652
if columns is None:
646653
columns = X.columns
654+
columns, _ = core_utils.get_standardized_ids(columns)
647655
output_name = "poly_feat"
648656
return [
649657
self._base_sql_generator.ml_polynomial_expand(

tests/system/small/ml/test_preprocessing.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import bigframes.features
2121
from bigframes.ml import preprocessing
22+
import bigframes.pandas as bpd
2223
from bigframes.testing import utils
2324

2425
ONE_HOT_ENCODED_DTYPE = (
@@ -62,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
6263
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
6364

6465

65-
def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
66+
def test_standard_scaler_normalizes_fit_transform(new_penguins_df):
6667
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
6768
scaler = preprocessing.StandardScaler()
6869
result = scaler.fit_transform(
@@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
114115
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
115116

116117

118+
def test_standard_scaler_normalizes_non_standard_column_names(
119+
new_penguins_df: bpd.DataFrame,
120+
):
121+
new_penguins_df = new_penguins_df.rename(
122+
columns={
123+
"culmen_length_mm": "culmen?metric",
124+
"culmen_depth_mm": "culmen/metric",
125+
}
126+
)
127+
scaler = preprocessing.StandardScaler()
128+
result = scaler.fit_transform(
129+
new_penguins_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]]
130+
).to_pandas()
131+
132+
# If standard-scaled correctly, mean should be 0.0
133+
for column in result.columns:
134+
assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3)
135+
136+
expected = pd.DataFrame(
137+
{
138+
"standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118],
139+
"standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848],
140+
"standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
141+
},
142+
dtype="Float64",
143+
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
144+
)
145+
146+
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
147+
148+
117149
def test_standard_scaler_save_load(new_penguins_df, dataset_id):
118150
transformer = preprocessing.StandardScaler()
119151
transformer.fit(

0 commit comments

Comments
 (0)