Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bigframes/ml/compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from bigframes.core import log_adapter
import bigframes.core.compile.googlesql as sql_utils
import bigframes.core.utils as core_utils
from bigframes.ml import base, core, globals, impute, preprocessing, utils
import bigframes.pandas as bpd

Expand Down Expand Up @@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"):
# TODO: More robust unescaping
self._target_column = target_column.replace("`", "")

PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE)

def _compile_to_sql(
self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None
) -> List[str]:
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
result = []
for column in columns:
current_sql = self._sql.format(sql_utils.identifier(column))
Expand Down
2 changes: 2 additions & 0 deletions bigframes/ml/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import bigframes_vendored.sklearn.impute._base

from bigframes.core import log_adapter
import bigframes.core.utils as core_utils
from bigframes.ml import base, core, globals, utils
import bigframes.pandas as bpd

Expand Down Expand Up @@ -62,6 +63,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_imputer(
column, self.strategy, f"imputer_{column}"
Expand Down
8 changes: 8 additions & 0 deletions bigframes/ml/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import bigframes_vendored.sklearn.preprocessing._polynomial

from bigframes.core import log_adapter
import bigframes.core.utils as core_utils
from bigframes.ml import base, core, globals, utils
import bigframes.pandas as bpd

Expand Down Expand Up @@ -59,6 +60,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_standard_scaler(
column, f"standard_scaled_{column}"
Expand Down Expand Up @@ -136,6 +138,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_max_abs_scaler(
column, f"max_abs_scaled_{column}"
Expand Down Expand Up @@ -214,6 +217,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_min_max_scaler(
column, f"min_max_scaled_{column}"
Expand Down Expand Up @@ -304,6 +308,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
array_split_points = {}
if self.strategy == "uniform":
for column in columns:
Expand Down Expand Up @@ -433,6 +438,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
drop = self.drop if self.drop is not None else "none"
# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
top_k = (
Expand Down Expand Up @@ -547,6 +553,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)

# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
top_k = (
Expand Down Expand Up @@ -644,6 +651,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
output_name = "poly_feat"
return [
self._base_sql_generator.ml_polynomial_expand(
Expand Down
34 changes: 33 additions & 1 deletion tests/system/small/ml/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import bigframes.features
from bigframes.ml import preprocessing
import bigframes.pandas as bpd
from bigframes.testing import utils

ONE_HOT_ENCODED_DTYPE = (
Expand Down Expand Up @@ -62,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
def test_standard_scaler_normalizes_fit_transform(new_penguins_df):
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
scaler = preprocessing.StandardScaler()
result = scaler.fit_transform(
Expand Down Expand Up @@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_normalizes_non_standard_column_names(
    new_penguins_df: bpd.DataFrame,
):
    # Columns whose names contain characters that are not valid in BigQuery
    # identifiers ("?" and "/") should still be transformable; the output
    # column names are expected to carry the sanitized (standardized) ids.
    renamed_df = new_penguins_df.rename(
        columns={
            "culmen_length_mm": "culmen?metric",
            "culmen_depth_mm": "culmen/metric",
        }
    )

    transformer = preprocessing.StandardScaler()
    result = transformer.fit_transform(
        renamed_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]]
    ).to_pandas()

    # A correctly standard-scaled column has mean approximately 0.0.
    assert all(
        math.isclose(result[col].mean(), 0.0, abs_tol=1e-3)
        for col in result.columns
    )

    expected = pd.DataFrame(
        {
            "standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118],
            "standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848],
            "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
        },
        dtype="Float64",
        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
    )

    pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_save_load(new_penguins_df, dataset_id):
transformer = preprocessing.StandardScaler()
transformer.fit(
Expand Down