fix: Transformers with non-standard column names throw errors (#2089)

GarrettWu · web-flow · commit a2daa3fffe67 · 2025-09-18T10:34:28.000-07:00
* fix: Transformers with non-standard column names through errors

* fix
diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
@@ -29,6 +29,7 @@
 
 from bigframes.core import log_adapter
 import bigframes.core.compile.googlesql as sql_utils
+import bigframes.core.utils as core_utils
 from bigframes.ml import base, core, globals, impute, preprocessing, utils
 import bigframes.pandas as bpd
 
@@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"):
         # TODO: More robust unescaping
         self._target_column = target_column.replace("`", "")
 
-    PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE)
-
     def _compile_to_sql(
         self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None
     ) -> List[str]:
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         result = []
         for column in columns:
             current_sql = self._sql.format(sql_utils.identifier(column))
diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py
@@ -23,6 +23,7 @@
 import bigframes_vendored.sklearn.impute._base
 
 from bigframes.core import log_adapter
+import bigframes.core.utils as core_utils
 from bigframes.ml import base, core, globals, utils
 import bigframes.pandas as bpd
 
@@ -62,6 +63,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         return [
             self._base_sql_generator.ml_imputer(
                 column, self.strategy, f"imputer_{column}"
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
@@ -27,6 +27,7 @@
 import bigframes_vendored.sklearn.preprocessing._polynomial
 
 from bigframes.core import log_adapter
+import bigframes.core.utils as core_utils
 from bigframes.ml import base, core, globals, utils
 import bigframes.pandas as bpd
 
@@ -59,6 +60,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         return [
             self._base_sql_generator.ml_standard_scaler(
                 column, f"standard_scaled_{column}"
@@ -136,6 +138,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         return [
             self._base_sql_generator.ml_max_abs_scaler(
                 column, f"max_abs_scaled_{column}"
@@ -214,6 +217,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         return [
             self._base_sql_generator.ml_min_max_scaler(
                 column, f"min_max_scaled_{column}"
@@ -304,6 +308,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         array_split_points = {}
         if self.strategy == "uniform":
             for column in columns:
@@ -433,6 +438,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         drop = self.drop if self.drop is not None else "none"
         # minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
         top_k = (
@@ -547,6 +553,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
 
         # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that.
         top_k = (
@@ -644,6 +651,7 @@ def _compile_to_sql(
         Returns: a list of tuples sql_expr."""
         if columns is None:
             columns = X.columns
+        columns, _ = core_utils.get_standardized_ids(columns)
         output_name = "poly_feat"
         return [
             self._base_sql_generator.ml_polynomial_expand(
diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py
@@ -19,6 +19,7 @@
 
 import bigframes.features
 from bigframes.ml import preprocessing
+import bigframes.pandas as bpd
 from bigframes.testing import utils
 
 ONE_HOT_ENCODED_DTYPE = (
@@ -62,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
     pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
-def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
+def test_standard_scaler_normalizes_fit_transform(new_penguins_df):
     # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
     scaler = preprocessing.StandardScaler()
     result = scaler.fit_transform(
@@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
     pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
+def test_standard_scaler_normalizes_non_standard_column_names(
+    new_penguins_df: bpd.DataFrame,
+):
+    new_penguins_df = new_penguins_df.rename(
+        columns={
+            "culmen_length_mm": "culmen?metric",
+            "culmen_depth_mm": "culmen/metric",
+        }
+    )
+    scaler = preprocessing.StandardScaler()
+    result = scaler.fit_transform(
+        new_penguins_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]]
+    ).to_pandas()
+
+    # If standard-scaled correctly, mean should be 0.0
+    for column in result.columns:
+        assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3)
+
+    expected = pd.DataFrame(
+        {
+            "standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118],
+            "standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848],
+            "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
+
+
 def test_standard_scaler_save_load(new_penguins_df, dataset_id):
     transformer = preprocessing.StandardScaler()
     transformer.fit(