Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bigframes/ml/compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from bigframes.core import log_adapter
import bigframes.core.compile.googlesql as sql_utils
import bigframes.core.utils as core_utils
from bigframes.ml import base, core, globals, impute, preprocessing, utils
import bigframes.pandas as bpd

Expand Down Expand Up @@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"):
# TODO: More robust unescaping
self._target_column = target_column.replace("`", "")

PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE)

def _compile_to_sql(
self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None
) -> List[str]:
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
result = []
for column in columns:
current_sql = self._sql.format(sql_utils.identifier(column))
Expand Down
2 changes: 2 additions & 0 deletions bigframes/ml/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import bigframes_vendored.sklearn.impute._base

from bigframes.core import log_adapter
import bigframes.core.utils as core_utils
from bigframes.ml import base, core, globals, utils
import bigframes.pandas as bpd

Expand Down Expand Up @@ -62,6 +63,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_imputer(
column, self.strategy, f"imputer_{column}"
Expand Down
8 changes: 8 additions & 0 deletions bigframes/ml/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import bigframes_vendored.sklearn.preprocessing._polynomial

from bigframes.core import log_adapter
import bigframes.core.utils as core_utils
from bigframes.ml import base, core, globals, utils
import bigframes.pandas as bpd

Expand Down Expand Up @@ -59,6 +60,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_standard_scaler(
column, f"standard_scaled_{column}"
Expand Down Expand Up @@ -136,6 +138,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_max_abs_scaler(
column, f"max_abs_scaled_{column}"
Expand Down Expand Up @@ -214,6 +217,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
return [
self._base_sql_generator.ml_min_max_scaler(
column, f"min_max_scaled_{column}"
Expand Down Expand Up @@ -304,6 +308,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
array_split_points = {}
if self.strategy == "uniform":
for column in columns:
Expand Down Expand Up @@ -433,6 +438,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
drop = self.drop if self.drop is not None else "none"
# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
top_k = (
Expand Down Expand Up @@ -547,6 +553,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)

# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
top_k = (
Expand Down Expand Up @@ -644,6 +651,7 @@ def _compile_to_sql(
Returns: a list of tuples sql_expr."""
if columns is None:
columns = X.columns
columns, _ = core_utils.get_standardized_ids(columns)
output_name = "poly_feat"
return [
self._base_sql_generator.ml_polynomial_expand(
Expand Down
32 changes: 32 additions & 0 deletions tests/system/small/ml/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import bigframes.features
from bigframes.ml import preprocessing
import bigframes.pandas as bpd
from bigframes.testing import utils

ONE_HOT_ENCODED_DTYPE = (
Expand Down Expand Up @@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_normalizes_non_standard_column_names(
    new_penguins_df: bpd.DataFrame,
):
    """StandardScaler should accept column names containing characters that
    are invalid in BigQuery identifiers (here '?' and '/').

    Both renamed columns sanitize to the same base name ("culmen_metric"),
    so the second output column is deduplicated with a "_1" suffix —
    presumably via the standardized-id helper added in this change; confirm
    against ``core_utils.get_standardized_ids``.
    """
    new_penguins_df = new_penguins_df.rename(
        columns={
            "culmen_length_mm": "culmen?metric",
            "culmen_depth_mm": "culmen/metric",
        }
    )
    scaler = preprocessing.StandardScaler()
    result = scaler.fit_transform(
        new_penguins_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]]
    ).to_pandas()

    # If standard-scaled correctly, each column's mean should be ~0.0.
    for column in result.columns:
        assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3)

    expected = pd.DataFrame(
        {
            "standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118],
            "standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848],
            "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
        },
        dtype="Float64",
        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
    )

    pd.testing.assert_frame_equal(result, expected, rtol=0.1)


def test_standard_scaler_save_load(new_penguins_df, dataset_id):
transformer = preprocessing.StandardScaler()
transformer.fit(
Expand Down