python-bigquery-dataframes/bigframes/bigquery/_operations/ml.py at 50037a6328145b2f29c4649080ba494a4aff9aef · googleapis/python-bigquery-dataframes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import cast, Mapping, Optional, Union

import bigframes_vendored.constants
import google.cloud.bigquery
import pandas as pd

import bigframes.core.log_adapter as log_adapter
import bigframes.core.sql.ml
import bigframes.dataframe as dataframe
import bigframes.ml.base
import bigframes.session


# Helper to convert DataFrame to SQL string
def _to_sql(df_or_sql: Union[pd.DataFrame, dataframe.DataFrame, str]) -> str:
    """Return SQL text for a DataFrame, or pass a string through unchanged.

    Args:
        df_or_sql:
            A string (returned as-is), a pandas DataFrame (loaded with
            ``bigframes.pandas.read_pandas`` first), or a BigQuery DataFrames
            DataFrame.

    Returns:
        str:
            The input string, or the query text produced by the cached
            DataFrame's ``_to_sql_query(include_index=False)``.
    """
    import bigframes.pandas as bpd

    if isinstance(df_or_sql, str):
        return df_or_sql

    bf_df = (
        bpd.read_pandas(df_or_sql)
        if isinstance(df_or_sql, pd.DataFrame)
        else cast(dataframe.DataFrame, df_or_sql)
    )

    # Force a cache before generating SQL: a cached DataFrame is a full copy
    # and never a snapshot of the base table. This works around internal
    # issue b/310266666.
    bf_df.cache()
    query, _, _ = bf_df._to_sql_query(include_index=False)
    return query


def _get_model_name_and_session(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    # Other dataframe arguments to extract session from
    *dataframes: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]],
) -> tuple[str, Optional[bigframes.session.Session]]:
    """Resolve the model name and the session that should run the query.

    Args:
        model:
            One of:

            * a string, used as the model name unchanged;
            * a pandas Series holding a ``modelReference`` entry with
              ``projectId``, ``datasetId`` and ``modelId`` keys, which are
              joined into ``project.dataset.model``;
            * any other object, treated as an estimator whose ``_bqml_model``
              supplies both the model name and the session.
        *dataframes:
            Other inputs of the calling operation. When ``model`` is a string
            or Series, the session of the first BigQuery DataFrames DataFrame
            among them is used; pandas DataFrames, strings and ``None`` are
            skipped.

    Returns:
        tuple[str, Optional[bigframes.session.Session]]:
            The model name and the session to use. The session is ``None``
            when ``model`` is a string or Series and no BigQuery DataFrames
            DataFrame was passed.

    Raises:
        ValueError:
            If the Series lacks a usable ``modelReference`` (missing entry,
            missing sub-key, or a value that is not subscriptable by key), or
            if the estimator has not been fitted.
    """
    if isinstance(model, pd.Series):
        try:
            model_ref = model["modelReference"]
            model_name = f"{model_ref['projectId']}.{model_ref['datasetId']}.{model_ref['modelId']}"  # type: ignore
        except (KeyError, TypeError) as err:
            # TypeError covers a modelReference that is None/NaN/a plain
            # string; report it the same way as a missing entry and keep the
            # original error attached as the cause.
            raise ValueError(
                "modelReference must be present in the pandas Series."
            ) from err
    elif isinstance(model, str):
        model_name = model
    else:
        if model._bqml_model is None:
            raise ValueError("Model must be fitted to be used in ML operations.")
        return model._bqml_model.model_name, model._bqml_model.session

    # Only BigQuery DataFrames objects carry a session; take the first one.
    session = next(
        (df._session for df in dataframes if isinstance(df, dataframe.DataFrame)),
        None,
    )
    return model_name, session


def _get_model_metadata(
    *,
    bqclient: google.cloud.bigquery.Client,
    model_name: str,
) -> pd.Series:
    """Look up a model and return its API representation as a pandas Series.

    Args:
        bqclient:
            Client whose ``get_model`` is called with ``model_name``.
        model_name:
            Name of the model to look up.

    Returns:
        pandas.Series:
            The result of the model's ``to_api_repr()`` wrapped in a Series.
    """
    return pd.Series(bqclient.get_model(model_name).to_api_repr())


@log_adapter.method_logger(custom_base_name="bigquery_ml")
def create_model(
    model_name: str,
    *,
    replace: bool = False,
    if_not_exists: bool = False,
    # TODO(tswast): Also support bigframes.ml transformer classes and/or
    # bigframes.pandas functions?
    transform: Optional[list[str]] = None,
    input_schema: Optional[Mapping[str, str]] = None,
    output_schema: Optional[Mapping[str, str]] = None,
    connection_name: Optional[str] = None,
    options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None,
    training_data: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]] = None,
    custom_holiday: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]] = None,
    session: Optional[bigframes.session.Session] = None,
) -> pd.Series:
    """
    Creates a BigQuery ML model.

    Builds a CREATE MODEL statement from the arguments, runs it, and then
    fetches the metadata of the resulting model. See the `BigQuery ML CREATE
    MODEL DDL syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create>`_
    for additional reference.

    Args:
        model_name (str):
            The name of the model in BigQuery.
        replace (bool, default False):
            Whether to replace the model if it already exists.
        if_not_exists (bool, default False):
            Whether to ignore the error if the model already exists.
        transform (list[str], optional):
            SQL transformations for the TRANSFORM clause, i.e. the
            preprocessing steps applied to the input data.
        input_schema (Mapping[str, str], optional):
            The INPUT clause, describing the schema of the input data.
        output_schema (Mapping[str, str], optional):
            The OUTPUT clause, describing the schema of the output data.
        connection_name (str, optional):
            The connection to use for the model.
        options (Mapping[str, Union[str, int, float, bool, list]], optional):
            The OPTIONS clause, holding the model options.
        training_data (Union[pandas.DataFrame, bigframes.pandas.DataFrame, str], optional):
            The query or DataFrame to train the model on.
        custom_holiday (Union[pandas.DataFrame, bigframes.pandas.DataFrame, str], optional):
            The query or DataFrame providing custom holiday data.
        session (bigframes.session.Session, optional):
            The session to use. When omitted, the session of the first
            BigQuery DataFrames input (``training_data``, then
            ``custom_holiday``) is used, falling back to the default session.

    Returns:
        pandas.Series:
            A Series with object dtype containing the model metadata. Reference
            the `BigQuery Model REST API reference
            <https://docs.cloud.google.com/bigquery/docs/reference/rest/v2/models#Model>`_
            for available fields.

    """
    import bigframes.pandas as bpd

    training_data_sql = None if training_data is None else _to_sql(training_data)
    custom_holiday_sql = None if custom_holiday is None else _to_sql(custom_holiday)

    # No explicit session: borrow the one attached to the first BigQuery
    # DataFrames input, if any.
    if session is None:
        session = next(
            (
                candidate._session
                for candidate in (training_data, custom_holiday)
                if isinstance(candidate, dataframe.DataFrame)
            ),
            None,
        )

    ddl = bigframes.core.sql.ml.create_model_ddl(
        model_name=model_name,
        replace=replace,
        if_not_exists=if_not_exists,
        transform=transform,
        input_schema=input_schema,
        output_schema=output_schema,
        connection_name=connection_name,
        options=options,
        training_data=training_data_sql,
        custom_holiday=custom_holiday_sql,
    )

    if session is not None:
        session.read_gbq_query(ddl)
    else:
        # Run through the module-level API, then pick up the global session so
        # its BigQuery client can be used for the metadata lookup below.
        bpd.read_gbq_query(ddl)
        session = bpd.get_global_session()
        assert (
            session is not None
        ), f"Missing connection to BigQuery. Please report how you encountered this error at {bigframes_vendored.constants.FEEDBACK_LINK}."

    return _get_model_metadata(bqclient=session.bqclient, model_name=model_name)


@log_adapter.method_logger(custom_base_name="bigquery_ml")
def evaluate(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    input_: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]] = None,
    *,
    perform_aggregation: Optional[bool] = None,
    horizon: Optional[int] = None,
    confidence_level: Optional[float] = None,
) -> dataframe.DataFrame:
    """
    Evaluates a BigQuery ML model.

    See the `BigQuery ML EVALUATE function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator, str, or pandas.Series):
            The model to evaluate: a fitted estimator, a model name, or a
            Series of model metadata containing ``modelReference``.
        input_ (Union[pandas.DataFrame, bigframes.pandas.DataFrame, str], optional):
            The DataFrame or query to evaluate against. If not provided, the
            evaluation data from training is used.
        perform_aggregation (bool, optional):
            A BOOL value that selects the level of evaluation for forecasting
            accuracy. TRUE evaluates at the time series level; FALSE evaluates
            at the timestamp level. The default value is TRUE.
        horizon (int, optional):
            An INT64 value giving the number of forecasted time points against
            which the evaluation metrics are computed. The default is the
            horizon specified in the CREATE MODEL statement for the time
            series model, or 1000 if unspecified. When several time series are
            evaluated at once, this applies to each of them.
        confidence_level (float, optional):
            A FLOAT64 value giving the percentage of future values that fall
            in the prediction interval. The default value is 0.95. The valid
            input range is ``[0, 1)``.

    Returns:
        bigframes.pandas.DataFrame:
            The evaluation results.
    """
    import bigframes.pandas as bpd

    model_name, session = _get_model_name_and_session(model, input_)
    table_sql = None if input_ is None else _to_sql(input_)

    query = bigframes.core.sql.ml.evaluate(
        model_name=model_name,
        table=table_sql,
        perform_aggregation=perform_aggregation,
        horizon=horizon,
        confidence_level=confidence_level,
    )

    # Prefer the session tied to the model or input; otherwise use the
    # module-level API.
    if session is not None:
        return session.read_gbq_query(query)
    return bpd.read_gbq_query(query)


@log_adapter.method_logger(custom_base_name="bigquery_ml")
def predict(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    input_: Union[pd.DataFrame, dataframe.DataFrame, str],
    *,
    threshold: Optional[float] = None,
    keep_original_columns: Optional[bool] = None,
    trial_id: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Runs prediction on a BigQuery ML model.

    See the `BigQuery ML PREDICT function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-predict>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator, str, or pandas.Series):
            The model to predict with: a fitted estimator, a model name, or a
            Series of model metadata containing ``modelReference``.
        input_ (Union[pandas.DataFrame, bigframes.pandas.DataFrame, str]):
            The DataFrame or query to run prediction on.
        threshold (float, optional):
            The threshold to use for classification models.
        keep_original_columns (bool, optional):
            Whether to keep the original columns in the output.
        trial_id (int, optional):
            An INT64 value identifying the hyperparameter tuning trial that
            the function should evaluate. The optimal trial is used by
            default. Only specify this argument if you ran hyperparameter
            tuning when creating the model.

    Returns:
        bigframes.pandas.DataFrame:
            The prediction results.
    """
    import bigframes.pandas as bpd

    model_name, session = _get_model_name_and_session(model, input_)
    table_sql = _to_sql(input_)

    query = bigframes.core.sql.ml.predict(
        model_name=model_name,
        table=table_sql,
        threshold=threshold,
        keep_original_columns=keep_original_columns,
        trial_id=trial_id,
    )

    # Prefer the session tied to the model or input; otherwise use the
    # module-level API.
    if session is not None:
        return session.read_gbq_query(query)
    return bpd.read_gbq_query(query)


@log_adapter.method_logger(custom_base_name="bigquery_ml")
def explain_predict(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    input_: Union[pd.DataFrame, dataframe.DataFrame, str],
    *,
    top_k_features: Optional[int] = None,
    threshold: Optional[float] = None,
    integrated_gradients_num_steps: Optional[int] = None,
    approx_feature_contrib: Optional[bool] = None,
) -> dataframe.DataFrame:
    """
    Runs explainable prediction on a BigQuery ML model.

    See the `BigQuery ML EXPLAIN_PREDICT function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-explain-predict>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator, str, or pandas.Series):
            The model to predict with: a fitted estimator, a model name, or a
            Series of model metadata containing ``modelReference``.
        input_ (Union[pandas.DataFrame, bigframes.pandas.DataFrame, str]):
            The DataFrame or query to run prediction on.
        top_k_features (int, optional):
            The number of top features to return.
        threshold (float, optional):
            The threshold for binary classification models.
        integrated_gradients_num_steps (int, optional):
            An INT64 value giving the number of steps to sample between the
            example being explained and its baseline. It is used to
            approximate the integral in integrated gradients attribution
            methods. Larger values improve the precision of feature
            attributions, but can be slower and more computationally
            expensive.
        approx_feature_contrib (bool, optional):
            A BOOL value that indicates whether to use an approximate feature
            contribution method in the XGBoost model explanation.

    Returns:
        bigframes.pandas.DataFrame:
            The prediction results with explanations.
    """
    import bigframes.pandas as bpd

    model_name, session = _get_model_name_and_session(model, input_)
    table_sql = _to_sql(input_)

    query = bigframes.core.sql.ml.explain_predict(
        model_name=model_name,
        table=table_sql,
        top_k_features=top_k_features,
        threshold=threshold,
        integrated_gradients_num_steps=integrated_gradients_num_steps,
        approx_feature_contrib=approx_feature_contrib,
    )

    # Prefer the session tied to the model or input; otherwise use the
    # module-level API.
    if session is not None:
        return session.read_gbq_query(query)
    return bpd.read_gbq_query(query)


@log_adapter.method_logger(custom_base_name="bigquery_ml")
def global_explain(
    model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
    *,
    class_level_explain: Optional[bool] = None,
) -> dataframe.DataFrame:
    """
    Gets global explanations for a BigQuery ML model.

    See the `BigQuery ML GLOBAL_EXPLAIN function syntax
    <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-global-explain>`_
    for additional reference.

    Args:
        model (bigframes.ml.base.BaseEstimator, str, or pandas.Series):
            The model to explain: a fitted estimator, a model name, or a
            Series of model metadata containing ``modelReference``.
        class_level_explain (bool, optional):
            Whether to return class-level explanations.

    Returns:
        bigframes.pandas.DataFrame:
            The global explanation results.
    """
    import bigframes.pandas as bpd

    # No DataFrame inputs here, so a session is only available when the model
    # is an estimator.
    model_name, session = _get_model_name_and_session(model)
    query = bigframes.core.sql.ml.global_explain(
        model_name=model_name,
        class_level_explain=class_level_explain,
    )

    if session is not None:
        return session.read_gbq_query(query)
    return bpd.read_gbq_query(query)