Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit bbb0c0e

Browse files
Merge branch 'main' into validate_join_type
2 parents 80e3c19 + a2daa3f commit bbb0c0e

File tree

10 files changed

+126
-27
lines changed

10 files changed

+126
-27
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [2.21.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.20.0...v2.21.0) (2025-09-17)
8+
9+
10+
### Features
11+
12+
* Add bigframes.bigquery.to_json ([#2078](https://github.com/googleapis/python-bigquery-dataframes/issues/2078)) ([0fc795a](https://github.com/googleapis/python-bigquery-dataframes/commit/0fc795a9fb56f469b62603462c3f0f56f52bfe04))
13+
* Support average='binary' in precision_score() ([#2080](https://github.com/googleapis/python-bigquery-dataframes/issues/2080)) ([920f381](https://github.com/googleapis/python-bigquery-dataframes/commit/920f381aec7e0a0b986886cdbc333e86335c6d7d))
14+
* Support pandas series in ai.generate_bool ([#2086](https://github.com/googleapis/python-bigquery-dataframes/issues/2086)) ([a3de53f](https://github.com/googleapis/python-bigquery-dataframes/commit/a3de53f68b2a24f4ed85a474dfaff9b59570a2f1))
15+
16+
17+
### Bug Fixes
18+
19+
* Allow bigframes.options.bigquery.credentials to be `None` ([#2092](https://github.com/googleapis/python-bigquery-dataframes/issues/2092)) ([78f4001](https://github.com/googleapis/python-bigquery-dataframes/commit/78f4001e8fcfc77fc82f3893d58e0d04c0f6d3db))
20+
721
## [2.20.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.19.0...v2.20.0) (2025-09-16)
822

923

bigframes/bigquery/_operations/ai.py

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,25 @@
1919
from __future__ import annotations
2020

2121
import json
22-
from typing import Any, List, Literal, Mapping, Tuple
22+
from typing import Any, List, Literal, Mapping, Tuple, Union
2323

24-
from bigframes import clients, dtypes, series
25-
from bigframes.core import log_adapter
24+
import pandas as pd
25+
26+
from bigframes import clients, dtypes, series, session
27+
from bigframes.core import convert, log_adapter
2628
from bigframes.operations import ai_ops
2729

30+
PROMPT_TYPE = Union[
31+
series.Series,
32+
pd.Series,
33+
List[Union[str, series.Series, pd.Series]],
34+
Tuple[Union[str, series.Series, pd.Series], ...],
35+
]
36+
2837

2938
@log_adapter.method_logger(custom_base_name="bigquery_ai")
3039
def generate_bool(
31-
prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
40+
prompt: PROMPT_TYPE,
3241
*,
3342
connection_id: str | None = None,
3443
endpoint: str | None = None,
@@ -51,7 +60,7 @@ def generate_bool(
5160
0 {'result': True, 'full_response': '{"candidate...
5261
1 {'result': True, 'full_response': '{"candidate...
5362
2 {'result': False, 'full_response': '{"candidat...
54-
dtype: struct<result: bool, full_response: string, status: string>[pyarrow]
63+
dtype: struct<result: bool, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]
5564
5665
>>> bbq.ai.generate_bool((df["col_1"], " is a ", df["col_2"])).struct.field("result")
5766
0 True
@@ -60,8 +69,9 @@ def generate_bool(
6069
Name: result, dtype: boolean
6170
6271
Args:
63-
prompt (series.Series | List[str|series.Series] | Tuple[str|series.Series, ...]):
64-
A mixture of Series and string literals that specifies the prompt to send to the model.
72+
prompt (Series | List[str|Series] | Tuple[str|Series, ...]):
73+
A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series
74+
or pandas Series.
6575
connection_id (str, optional):
6676
Specifies the connection to use to communicate with the model. For example, `myproject.us.myconnection`.
6777
If not provided, the connection from the current session will be used.
@@ -84,7 +94,7 @@ def generate_bool(
8494
Returns:
8595
bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:
8696
* "result": a BOOL value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI.
87-
* "full_response": a STRING value containing the JSON response from the projects.locations.endpoints.generateContent call to the model.
97+
* "full_response": a JSON value containing the response from the projects.locations.endpoints.generateContent call to the model.
8898
The generated text is in the text element.
8999
* "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful.
90100
"""
@@ -104,7 +114,7 @@ def generate_bool(
104114

105115

106116
def _separate_context_and_series(
107-
prompt: series.Series | List[str | series.Series] | Tuple[str | series.Series, ...],
117+
prompt: PROMPT_TYPE,
108118
) -> Tuple[List[str | None], List[series.Series]]:
109119
"""
110120
Returns two values. The first value is the prompt with all series replaced by None. The second value is all the series
@@ -123,18 +133,19 @@ def _separate_context_and_series(
123133
return [None], [prompt]
124134

125135
prompt_context: List[str | None] = []
126-
series_list: List[series.Series] = []
136+
series_list: List[series.Series | pd.Series] = []
127137

138+
session = None
128139
for item in prompt:
129140
if isinstance(item, str):
130141
prompt_context.append(item)
131142

132-
elif isinstance(item, series.Series):
143+
elif isinstance(item, (series.Series, pd.Series)):
133144
prompt_context.append(None)
134145

135-
if item.dtype == dtypes.OBJ_REF_DTYPE:
136-
# Multi-model support
137-
item = item.blob.read_url()
146+
if isinstance(item, series.Series) and session is None:
147+
# Use the first available BF session if there's any.
148+
session = item._session
138149
series_list.append(item)
139150

140151
else:
@@ -143,7 +154,20 @@ def _separate_context_and_series(
143154
if not series_list:
144155
raise ValueError("Please provide at least one Series in the prompt")
145156

146-
return prompt_context, series_list
157+
converted_list = [_convert_series(s, session) for s in series_list]
158+
159+
return prompt_context, converted_list
160+
161+
162+
def _convert_series(
163+
s: series.Series | pd.Series, session: session.Session | None
164+
) -> series.Series:
165+
result = convert.to_bf_series(s, default_index=None, session=session)
166+
167+
if result.dtype == dtypes.OBJ_REF_DTYPE:
168+
# Support multimodel
169+
return result.blob.read_url()
170+
return result
147171

148172

149173
def _resolve_connection_id(series: series.Series, connection_id: str | None):

bigframes/ml/compose.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
from bigframes.core import log_adapter
3131
import bigframes.core.compile.googlesql as sql_utils
32+
import bigframes.core.utils as core_utils
3233
from bigframes.ml import base, core, globals, impute, preprocessing, utils
3334
import bigframes.pandas as bpd
3435

@@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"):
103104
# TODO: More robust unescaping
104105
self._target_column = target_column.replace("`", "")
105106

106-
PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE)
107-
108107
def _compile_to_sql(
109108
self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None
110109
) -> List[str]:
111110
if columns is None:
112111
columns = X.columns
112+
columns, _ = core_utils.get_standardized_ids(columns)
113113
result = []
114114
for column in columns:
115115
current_sql = self._sql.format(sql_utils.identifier(column))

bigframes/ml/impute.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import bigframes_vendored.sklearn.impute._base
2424

2525
from bigframes.core import log_adapter
26+
import bigframes.core.utils as core_utils
2627
from bigframes.ml import base, core, globals, utils
2728
import bigframes.pandas as bpd
2829

@@ -62,6 +63,7 @@ def _compile_to_sql(
6263
Returns: a list of tuples sql_expr."""
6364
if columns is None:
6465
columns = X.columns
66+
columns, _ = core_utils.get_standardized_ids(columns)
6567
return [
6668
self._base_sql_generator.ml_imputer(
6769
column, self.strategy, f"imputer_{column}"

bigframes/ml/preprocessing.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import bigframes_vendored.sklearn.preprocessing._polynomial
2828

2929
from bigframes.core import log_adapter
30+
import bigframes.core.utils as core_utils
3031
from bigframes.ml import base, core, globals, utils
3132
import bigframes.pandas as bpd
3233

@@ -59,6 +60,7 @@ def _compile_to_sql(
5960
Returns: a list of tuples sql_expr."""
6061
if columns is None:
6162
columns = X.columns
63+
columns, _ = core_utils.get_standardized_ids(columns)
6264
return [
6365
self._base_sql_generator.ml_standard_scaler(
6466
column, f"standard_scaled_{column}"
@@ -136,6 +138,7 @@ def _compile_to_sql(
136138
Returns: a list of tuples sql_expr."""
137139
if columns is None:
138140
columns = X.columns
141+
columns, _ = core_utils.get_standardized_ids(columns)
139142
return [
140143
self._base_sql_generator.ml_max_abs_scaler(
141144
column, f"max_abs_scaled_{column}"
@@ -214,6 +217,7 @@ def _compile_to_sql(
214217
Returns: a list of tuples sql_expr."""
215218
if columns is None:
216219
columns = X.columns
220+
columns, _ = core_utils.get_standardized_ids(columns)
217221
return [
218222
self._base_sql_generator.ml_min_max_scaler(
219223
column, f"min_max_scaled_{column}"
@@ -304,6 +308,7 @@ def _compile_to_sql(
304308
Returns: a list of tuples sql_expr."""
305309
if columns is None:
306310
columns = X.columns
311+
columns, _ = core_utils.get_standardized_ids(columns)
307312
array_split_points = {}
308313
if self.strategy == "uniform":
309314
for column in columns:
@@ -433,6 +438,7 @@ def _compile_to_sql(
433438
Returns: a list of tuples sql_expr."""
434439
if columns is None:
435440
columns = X.columns
441+
columns, _ = core_utils.get_standardized_ids(columns)
436442
drop = self.drop if self.drop is not None else "none"
437443
# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
438444
top_k = (
@@ -547,6 +553,7 @@ def _compile_to_sql(
547553
Returns: a list of tuples sql_expr."""
548554
if columns is None:
549555
columns = X.columns
556+
columns, _ = core_utils.get_standardized_ids(columns)
550557

551558
# minus one here since BQML's implementation always includes index 0, and top_k is on top of that.
552559
top_k = (
@@ -644,6 +651,7 @@ def _compile_to_sql(
644651
Returns: a list of tuples sql_expr."""
645652
if columns is None:
646653
columns = X.columns
654+
columns, _ = core_utils.get_standardized_ids(columns)
647655
output_name = "poly_feat"
648656
return [
649657
self._base_sql_generator.ml_polynomial_expand(

bigframes/operations/ai_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
4040
pa.struct(
4141
(
4242
pa.field("result", pa.bool_()),
43-
pa.field("full_response", pa.string()),
43+
pa.field("full_response", dtypes.JSON_ARROW_TYPE),
4444
pa.field("status", pa.string()),
4545
)
4646
)

bigframes/version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
__version__ = "2.20.0"
15+
__version__ = "2.21.0"
1616

1717
# {x-release-please-start-date}
18-
__release_date__ = "2025-09-16"
18+
__release_date__ = "2025-09-17"
1919
# {x-release-please-end}

tests/system/small/bigquery/test_ai.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import pyarrow as pa
1919
import pytest
2020

21-
from bigframes import series
21+
from bigframes import dtypes, series
2222
import bigframes.bigquery as bbq
2323
import bigframes.pandas as bpd
2424

@@ -35,7 +35,26 @@ def test_ai_generate_bool(session):
3535
pa.struct(
3636
(
3737
pa.field("result", pa.bool_()),
38-
pa.field("full_response", pa.string()),
38+
pa.field("full_response", dtypes.JSON_ARROW_TYPE),
39+
pa.field("status", pa.string()),
40+
)
41+
)
42+
)
43+
44+
45+
def test_ai_generate_bool_with_pandas(session):
46+
s1 = pd.Series(["apple", "bear"])
47+
s2 = bpd.Series(["fruit", "tree"], session=session)
48+
prompt = (s1, " is a ", s2)
49+
50+
result = bbq.ai.generate_bool(prompt, endpoint="gemini-2.5-flash")
51+
52+
assert _contains_no_nulls(result)
53+
assert result.dtype == pd.ArrowDtype(
54+
pa.struct(
55+
(
56+
pa.field("result", pa.bool_()),
57+
pa.field("full_response", dtypes.JSON_ARROW_TYPE),
3958
pa.field("status", pa.string()),
4059
)
4160
)
@@ -62,7 +81,7 @@ def test_ai_generate_bool_with_model_params(session):
6281
pa.struct(
6382
(
6483
pa.field("result", pa.bool_()),
65-
pa.field("full_response", pa.string()),
84+
pa.field("full_response", dtypes.JSON_ARROW_TYPE),
6685
pa.field("status", pa.string()),
6786
)
6887
)
@@ -81,7 +100,7 @@ def test_ai_generate_bool_multi_model(session):
81100
pa.struct(
82101
(
83102
pa.field("result", pa.bool_()),
84-
pa.field("full_response", pa.string()),
103+
pa.field("full_response", dtypes.JSON_ARROW_TYPE),
85104
pa.field("status", pa.string()),
86105
)
87106
)

tests/system/small/ml/test_preprocessing.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import bigframes.features
2121
from bigframes.ml import preprocessing
22+
import bigframes.pandas as bpd
2223
from bigframes.testing import utils
2324

2425
ONE_HOT_ENCODED_DTYPE = (
@@ -62,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
6263
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
6364

6465

65-
def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
66+
def test_standard_scaler_normalizes_fit_transform(new_penguins_df):
6667
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
6768
scaler = preprocessing.StandardScaler()
6869
result = scaler.fit_transform(
@@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
114115
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
115116

116117

118+
def test_standard_scaler_normalizes_non_standard_column_names(
119+
new_penguins_df: bpd.DataFrame,
120+
):
121+
new_penguins_df = new_penguins_df.rename(
122+
columns={
123+
"culmen_length_mm": "culmen?metric",
124+
"culmen_depth_mm": "culmen/metric",
125+
}
126+
)
127+
scaler = preprocessing.StandardScaler()
128+
result = scaler.fit_transform(
129+
new_penguins_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]]
130+
).to_pandas()
131+
132+
# If standard-scaled correctly, mean should be 0.0
133+
for column in result.columns:
134+
assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3)
135+
136+
expected = pd.DataFrame(
137+
{
138+
"standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118],
139+
"standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848],
140+
"standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
141+
},
142+
dtype="Float64",
143+
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
144+
)
145+
146+
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
147+
148+
117149
def test_standard_scaler_save_load(new_penguins_df, dataset_id):
118150
transformer = preprocessing.StandardScaler()
119151
transformer.fit(

third_party/bigframes_vendored/version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
__version__ = "2.20.0"
15+
__version__ = "2.21.0"
1616

1717
# {x-release-please-start-date}
18-
__release_date__ = "2025-09-16"
18+
__release_date__ = "2025-09-17"
1919
# {x-release-please-end}

0 commit comments

Comments
 (0)