diff --git a/packages/bigframes/bigframes/extensions/bigframes/series_accessor.py b/packages/bigframes/bigframes/extensions/bigframes/series_accessor.py index b67d007b88e7..8379e6a145a0 100644 --- a/packages/bigframes/bigframes/extensions/bigframes/series_accessor.py +++ b/packages/bigframes/bigframes/extensions/bigframes/series_accessor.py @@ -20,41 +20,68 @@ from typing import Optional, TypeVar, cast -import bigframes.extensions.core.series_accessor as core_accessor -import bigframes.series -import bigframes.session +from bigframes import dataframe, series, session from bigframes.core.logging import log_adapter +from bigframes.extensions.core import series_accessor as core_accessor -S = TypeVar("S", bound="bigframes.series.Series") +T = TypeVar("T", bound="dataframe.DataFrame") +S = TypeVar("S", bound="series.Series") @log_adapter.class_logger -class BigframesBigQuerySeriesAccessor(core_accessor.BigQuerySeriesAccessor[S]): +class BigframesBigQuerySeriesAccessor(core_accessor.BigQuerySeriesAccessor[T, S]): def __init__(self, bf_obj: S): super().__init__(bf_obj) def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> bigframes.series.Series: + self, session: Optional[session.Session] = None + ) -> series.Series: return self._obj - def _to_series(self, bf_series: bigframes.series.Series) -> S: + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: series.Series) -> S: return cast(S, bf_series) @property - def aead(self) -> BigframesAeadSeriesAccessor[S]: + def ai(self) -> BigframesAiSeriesAccessor[T, S]: + return BigframesAiSeriesAccessor(self._obj) + + @property + def aead(self) -> BigframesAeadSeriesAccessor[T, S]: return BigframesAeadSeriesAccessor(self._obj) @log_adapter.class_logger -class BigframesAeadSeriesAccessor(core_accessor.AeadSeriesAccessor[S]): +class BigframesAiSeriesAccessor(core_accessor.AiSeriesAccessor[T, S]): def __init__(self, bf_obj: S): super().__init__(bf_obj) def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> bigframes.series.Series: + self, session: Optional[session.Session] = None + ) -> series.Series: return self._obj - def _to_series(self, bf_series: bigframes.series.Series) -> S: + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: series.Series) -> S: + return cast(S, bf_series) + + +@log_adapter.class_logger +class BigframesAeadSeriesAccessor(core_accessor.AeadSeriesAccessor[T, S]): + def __init__(self, bf_obj: S): + super().__init__(bf_obj) + + def _bf_from_series( + self, session: Optional[session.Session] = None + ) -> series.Series: + return self._obj + + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: series.Series) -> S: return cast(S, bf_series) diff --git a/packages/bigframes/bigframes/extensions/core/abstract_series_accessor.py b/packages/bigframes/bigframes/extensions/core/abstract_series_accessor.py new file mode 100644 index 000000000000..22d098618770 --- /dev/null +++ b/packages/bigframes/bigframes/extensions/core/abstract_series_accessor.py @@ -0,0 +1,50 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# DO NOT MODIFY THIS FILE DIRECTLY. +# This file was generated by the script: scripts/generate_bigframes_bigquery.py +# + +from __future__ import annotations + +import abc +from typing import ( + Generic, + Optional, + TypeVar, +) + +from bigframes import dataframe, series, session + +T = TypeVar("T") +S = TypeVar("S") + + +class AbstractBigQuerySeriesAccessor(abc.ABC, Generic[T, S]): + def __init__(self, obj: S): + self._obj = obj + + @abc.abstractmethod + def _bf_from_series( + self, session: Optional[session.Session] = None + ) -> series.Series: + """Convert the accessor's object to a BigFrames Series.""" + + @abc.abstractmethod + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + """Convert a BigFrames DataFrame to the accessor's object type.""" + + @abc.abstractmethod + def _to_series(self, bf_series: series.Series) -> S: + """Convert a BigFrames Series to the accessor's object type.""" diff --git a/packages/bigframes/bigframes/extensions/core/series_accessor.py b/packages/bigframes/bigframes/extensions/core/series_accessor.py index 96d0eb8d045e..86f34e9ab603 100644 --- a/packages/bigframes/bigframes/extensions/core/series_accessor.py +++ b/packages/bigframes/bigframes/extensions/core/series_accessor.py @@ -21,7 +21,6 @@ import abc from typing import ( Any, - Generic, Literal, Optional, TypeVar, @@ -29,51 +28,43 @@ cast, ) -import bigframes.core.col -import bigframes.core.sentinels as sentinels -import bigframes.series as series -import bigframes.session +from bigframes import series, session +from bigframes.core import col, sentinels +from bigframes.extensions.core import abstract_series_accessor, series_tvf_mixins +T = TypeVar("T") S = TypeVar("S") -class AbstractBigQuerySeriesAccessor(abc.ABC, Generic[S]): - def __init__(self, obj: S): - self._obj = obj - - @abc.abstractmethod - def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> series.Series: - """Convert the accessor's object to a BigFrames Series.""" +class BigQuerySeriesAccessor( + abstract_series_accessor.AbstractBigQuerySeriesAccessor[T, S] +): + """Series accessor for BigQuery functions.""" + @property @abc.abstractmethod - def _to_series(self, bf_series: series.Series) -> S: - """Convert a BigFrames Series to the accessor's object type.""" - - -class BigQuerySeriesAccessor(AbstractBigQuerySeriesAccessor[S]): - """Series accessor for BigQuery functions.""" + def ai(self) -> AiSeriesAccessor[T, S]: + """Accessor for BigQuery ai functions.""" @property @abc.abstractmethod - def aead(self) -> AeadSeriesAccessor[S]: + def aead(self) -> AeadSeriesAccessor[T, S]: """Accessor for BigQuery aead functions.""" def deterministic_decrypt_bytes( self, ciphertext: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Uses the matching key from `keyset` to decrypt `ciphertext` and verifies the integrity of the data using `additional_data`. Returns an error if decryption fails.""" from bigframes.operations.googlesql.global_namespace.aead_encryption import ( @@ -82,7 +73,7 @@ def deterministic_decrypt_bytes( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( ciphertext, @@ -101,16 +92,16 @@ def deterministic_decrypt_string( self, ciphertext: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Like `DETERMINISTIC_DECRYPT_BYTES`, but where plaintext is of type STRING.""" from bigframes.operations.googlesql.global_namespace.aead_encryption import ( @@ -119,7 +110,7 @@ def deterministic_decrypt_string( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( ciphertext, @@ -138,16 +129,16 @@ def deterministic_encrypt( self, plaintext: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], additional_data: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Encrypts `plaintext` using the primary cryptographic key in `keyset` using deterministic AEAD. The algorithm of the primary key must be `DETERMINISTIC_AEAD_AES_SIV_CMAC_256`. Binds the ciphertext to the context defined by `additional_data`. Returns `NULL` if any input is `NULL`.""" from bigframes.operations.googlesql.global_namespace.aead_encryption import ( @@ -156,7 +147,7 @@ def deterministic_encrypt( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( plaintext, @@ -175,11 +166,11 @@ def array_concat( self, array_expression_2: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Concatenates one or more arrays with the same element type into a single array.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -188,7 +179,7 @@ def array_concat( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( array_expression_2, @@ -204,7 +195,7 @@ def array_concat( def array_first( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Takes an array and returns the first element in the array.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -221,11 +212,11 @@ def array_first_n( self, n: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Returns a prefix of `input_array` consisting of the first `n` elements.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -234,7 +225,7 @@ def array_first_n( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( n, @@ -251,11 +242,11 @@ def array_includes( self, search_value: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Takes an array and returns `TRUE` if there is an element in the array that is equal to the search_value.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -264,7 +255,7 @@ def array_includes( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( search_value, @@ -281,11 +272,11 @@ def array_includes_all( self, search_values: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Takes an array to search and an array of search values. Returns `TRUE` if all search values are in the array to search, otherwise returns `FALSE`.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -294,7 +285,7 @@ def array_includes_all( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( search_values, @@ -311,11 +302,11 @@ def array_includes_any( self, search_values: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Any, Literal[sentinels.Sentinel.ARGUMENT_DEFAULT]], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Takes an array to search and an array of search values. Returns `TRUE` if any search values are in the array to search, otherwise returns `FALSE`.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -324,7 +315,7 @@ def array_includes_any( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( search_values, @@ -340,7 +331,7 @@ def array_includes_any( def array_is_distinct( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Returns `TRUE` if the array contains no repeated elements, using the same equality comparison logic as `SELECT DISTINCT`.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -356,7 +347,7 @@ def array_is_distinct( def array_last( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Takes an array and returns the last element in the array.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -372,7 +363,7 @@ def array_last( def array_length( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Compute the length of each array element in the Series. @@ -435,7 +426,7 @@ def array_length( def array_reverse( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Returns the input `ARRAY` with elements in reverse order.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -452,16 +443,16 @@ def array_slice( self, start_offset: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], ], end_offset: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Returns an array containing zero or more consecutive elements from the input array.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -470,7 +461,7 @@ def array_slice( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( start_offset, @@ -489,16 +480,16 @@ def array_to_string( self, delimiter: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], null_text: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ] = sentinels.Sentinel.ARGUMENT_DEFAULT, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts array elements within a Series into delimited strings. @@ -553,7 +544,7 @@ def array_to_string( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( delimiter, @@ -572,11 +563,11 @@ def flatten( self, depth: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], int], ] = sentinels.Sentinel.ARGUMENT_DEFAULT, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Takes an array of nested data and flattens a specific part of it into a single, flat array with the [array elements field access operator][array-el-field-operator]. Returns `NULL` if the input value is `NULL`.""" from bigframes.operations.googlesql.global_namespace.array import ( @@ -585,7 +576,7 @@ def flatten( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( depth, @@ -601,7 +592,7 @@ def flatten( def bool_( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a JSON boolean to a SQL BOOL value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -618,11 +609,11 @@ def double( self, wide_number_mode: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ] = sentinels.Sentinel.ARGUMENT_DEFAULT, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a JSON number to a SQL FLOAT64 value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -631,7 +622,7 @@ def double( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( wide_number_mode, @@ -648,11 +639,11 @@ def float64( self, wide_number_mode: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ] = sentinels.Sentinel.ARGUMENT_DEFAULT, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a JSON number to a SQL FLOAT64 value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -661,7 +652,7 @@ def float64( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( wide_number_mode, @@ -677,7 +668,7 @@ def float64( def int64( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a JSON number to a SQL INT64 value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -693,7 +684,7 @@ def int64( def parse_bignumeric( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a STRING to a BIGNUMERIC value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -709,7 +700,7 @@ def parse_bignumeric( def parse_numeric( self, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a STRING to a NUMERIC value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -726,11 +717,11 @@ def string( self, timezone: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ] = sentinels.Sentinel.ARGUMENT_DEFAULT, *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Converts a value to a STRING value.""" from bigframes.operations.googlesql.global_namespace.conversion import ( @@ -739,7 +730,7 @@ def string( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( timezone, @@ -753,23 +744,27 @@ def string( return self._to_series(cast(series.Series, result)) -class AeadSeriesAccessor(AbstractBigQuerySeriesAccessor[S]): +class AiSeriesAccessor(series_tvf_mixins.AITVFMixin[T, S]): + """Series accessor for BigQuery ai functions.""" + + +class AeadSeriesAccessor(abstract_series_accessor.AbstractBigQuerySeriesAccessor[T, S]): """Series accessor for BigQuery aead functions.""" def decrypt_bytes( self, ciphertext: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Uses the matching key from keyset to decrypt ciphertext and verifies the integrity of the data using additional_data. Returns an error if decryption or verification fails.""" from bigframes.operations.googlesql.aead import ( @@ -778,7 +773,7 @@ def decrypt_bytes( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( ciphertext, @@ -797,16 +792,16 @@ def decrypt_string( self, ciphertext: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes], ], additional_data: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], str], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Like AEAD.DECRYPT_BYTES, but where additional_data is of type STRING.""" from bigframes.operations.googlesql.aead import ( @@ -815,7 +810,7 @@ def decrypt_string( # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( ciphertext, @@ -834,23 +829,23 @@ def encrypt( self, plaintext: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], additional_data: Union[ series.Series, - bigframes.core.col.Expression, + col.Expression, Union[Literal[sentinels.Sentinel.ARGUMENT_DEFAULT], bytes, str], ], *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """Encrypts plaintext using the primary cryptographic key in keyset. The algorithm of the primary key must be AEAD_AES_GCM_256. Binds the ciphertext to the context defined by additional_data. Returns NULL if any input is NULL.""" from bigframes.operations.googlesql.aead import encrypt as encrypt_impl # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( plaintext, diff --git a/packages/bigframes/bigframes/extensions/core/series_tvf_mixins.py b/packages/bigframes/bigframes/extensions/core/series_tvf_mixins.py new file mode 100644 index 000000000000..673978bbe46c --- /dev/null +++ b/packages/bigframes/bigframes/extensions/core/series_tvf_mixins.py @@ -0,0 +1,129 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import List, Mapping, TypeVar + +import pandas as pd + +from bigframes import session +from bigframes.extensions.core import abstract_series_accessor +from bigframes.ml import base as ml_base + +T = TypeVar("T") +S = TypeVar("S") + + +class AITVFMixin(abstract_series_accessor.AbstractBigQuerySeriesAccessor[T, S]): + def generate_embedding( + self, + model: ml_base.BaseEstimator | str | pd.Series, + *, + output_dimensionality: int | None = None, + task_type: str | None = None, + start_second: float | None = None, + end_second: float | None = None, + interval_seconds: float | None = None, + trial_id: int | None = None, + session: session.Session | None = None, + ) -> T: + """ + Creates embeddings that describe an entity — for example, a piece of text or an image. + + This is an accessor for :func:`bigframes.bigquery.ai.generate_embedding`. See that + function's documentation for detailed parameter descriptions and examples. + """ + import bigframes.bigquery.ai + + bf_series = self._bf_from_series(session) + result = bigframes.bigquery.ai.generate_embedding( + model, + bf_series, + output_dimensionality=output_dimensionality, + task_type=task_type, + start_second=start_second, + end_second=end_second, + interval_seconds=interval_seconds, + trial_id=trial_id, + ) + return self._to_dataframe(result) + + def generate_text( + self, + model: ml_base.BaseEstimator | str | pd.Series, + *, + temperature: float | None = None, + max_output_tokens: int | None = None, + top_k: int | None = None, + top_p: float | None = None, + stop_sequences: List[str] | None = None, + ground_with_google_search: bool | None = None, + request_type: str | None = None, + session: session.Session | None = None, + ) -> T: + """ + Generates text using a BigQuery ML model. + + This is an accessor for :func:`bigframes.bigquery.ai.generate_text`. See that + function's documentation for detailed parameter descriptions and examples. + """ + import bigframes.bigquery.ai + + bf_series = self._bf_from_series(session) + result = bigframes.bigquery.ai.generate_text( + model, + bf_series, + temperature=temperature, + max_output_tokens=max_output_tokens, + top_k=top_k, + top_p=top_p, + stop_sequences=stop_sequences, + ground_with_google_search=ground_with_google_search, + request_type=request_type, + ) + return self._to_dataframe(result) + + def generate_table( + self, + model: ml_base.BaseEstimator | str | pd.Series, + *, + output_schema: str | Mapping[str, str], + temperature: float | None = None, + top_p: float | None = None, + max_output_tokens: int | None = None, + stop_sequences: List[str] | None = None, + request_type: str | None = None, + session: session.Session | None = None, + ) -> T: + """ + Generates a table using a BigQuery ML model. + + This is an accessor for :func:`bigframes.bigquery.ai.generate_table`. See that + function's documentation for detailed parameter descriptions and examples. + """ + import bigframes.bigquery.ai + + bf_series = self._bf_from_series(session) + result = bigframes.bigquery.ai.generate_table( + model, + bf_series, + output_schema=output_schema, + temperature=temperature, + top_p=top_p, + max_output_tokens=max_output_tokens, + stop_sequences=stop_sequences, + request_type=request_type, + ) + return self._to_dataframe(result) diff --git a/packages/bigframes/bigframes/extensions/pandas/series_accessor.py b/packages/bigframes/bigframes/extensions/pandas/series_accessor.py index 837664c6e1f5..204f1e0d2cf3 100644 --- a/packages/bigframes/bigframes/extensions/pandas/series_accessor.py +++ b/packages/bigframes/bigframes/extensions/pandas/series_accessor.py @@ -23,47 +23,76 @@ import pandas import pandas.api.extensions -import bigframes.core.global_session as bf_session -import bigframes.extensions.core.series_accessor as core_accessor -import bigframes.series -import bigframes.session +from bigframes import dataframe, series, session +from bigframes.core import global_session as bf_session from bigframes.core.logging import log_adapter +from bigframes.extensions.core import series_accessor as core_accessor +T = TypeVar("T", bound="pandas.DataFrame") S = TypeVar("S", bound="pandas.Series") @pandas.api.extensions.register_series_accessor("bigquery") @log_adapter.class_logger -class PandasBigQuerySeriesAccessor(core_accessor.BigQuerySeriesAccessor[S]): +class PandasBigQuerySeriesAccessor(core_accessor.BigQuerySeriesAccessor[T, S]): def __init__(self, pandas_obj: S): super().__init__(pandas_obj) def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> bigframes.series.Series: + self, session: Optional[session.Session] = None + ) -> series.Series: if session is None: session = bf_session.get_global_session() - return cast(bigframes.series.Series, session.read_pandas(self._obj)) + return cast(series.Series, session.read_pandas(self._obj)) - def _to_series(self, bf_series: bigframes.series.Series) -> S: + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: series.Series) -> S: return cast(S, bf_series.to_pandas(ordered=True)) @property - def aead(self) -> PandasAeadSeriesAccessor[S]: + def ai(self) -> PandasAiSeriesAccessor[T, S]: + return PandasAiSeriesAccessor(self._obj) + + @property + def aead(self) -> PandasAeadSeriesAccessor[T, S]: return PandasAeadSeriesAccessor(self._obj) @log_adapter.class_logger -class PandasAeadSeriesAccessor(core_accessor.AeadSeriesAccessor[S]): +class PandasAiSeriesAccessor(core_accessor.AiSeriesAccessor[T, S]): + def __init__(self, pandas_obj: S): + super().__init__(pandas_obj) + + def _bf_from_series( + self, session: Optional[session.Session] = None + ) -> series.Series: + if session is None: + session = bf_session.get_global_session() + return cast(series.Series, session.read_pandas(self._obj)) + + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: series.Series) -> S: + return cast(S, bf_series.to_pandas(ordered=True)) + + +@log_adapter.class_logger +class PandasAeadSeriesAccessor(core_accessor.AeadSeriesAccessor[T, S]): def __init__(self, pandas_obj: S): super().__init__(pandas_obj) def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> bigframes.series.Series: + self, session: Optional[session.Session] = None + ) -> series.Series: if session is None: session = bf_session.get_global_session() - return cast(bigframes.series.Series, session.read_pandas(self._obj)) + return cast(series.Series, session.read_pandas(self._obj)) + + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) - def _to_series(self, bf_series: bigframes.series.Series) -> S: + def _to_series(self, bf_series: series.Series) -> S: return cast(S, bf_series.to_pandas(ordered=True)) diff --git a/packages/bigframes/scripts/data/sql-functions/ai.yaml b/packages/bigframes/scripts/data/sql-functions/ai.yaml new file mode 100644 index 000000000000..f3238c8178b7 --- /dev/null +++ b/packages/bigframes/scripts/data/sql-functions/ai.yaml @@ -0,0 +1 @@ +urn: extension:google:bq_scalar_functions diff --git a/packages/bigframes/scripts/generate_bigframes_bigquery.py b/packages/bigframes/scripts/generate_bigframes_bigquery.py index bb232a6cdf8c..999f17b10215 100755 --- a/packages/bigframes/scripts/generate_bigframes_bigquery.py +++ b/packages/bigframes/scripts/generate_bigframes_bigquery.py @@ -489,6 +489,11 @@ def process_yaml_file(yaml_file, templates): is_global = "global_namespace" in module_path.parts namespace = get_namespace(yaml_file) + + if not data or not isinstance(data, dict) or "scalar_functions" not in data: + # If the file is empty or has no functions, just create the namespace. + return [{"namespace": namespace}] + ops_list, functions_list = parse_scalar_functions( data, module_name, @@ -611,8 +616,9 @@ def generate_series_accessors(functions: list[dict], templates: dict): # Populate functions for func in functions: - ns = func["namespace"] or () - ns_by_tuple[ns]["functions"].append(func) + if "name" in func: + ns = func["namespace"] or () + ns_by_tuple[ns]["functions"].append(func) # Populate children properties for ns in sorted_namespaces: diff --git a/packages/bigframes/scripts/templates/bigframes_series_accessor.py.j2 b/packages/bigframes/scripts/templates/bigframes_series_accessor.py.j2 index f5ed3d045485..8ce37d67321b 100644 --- a/packages/bigframes/scripts/templates/bigframes_series_accessor.py.j2 +++ b/packages/bigframes/scripts/templates/bigframes_series_accessor.py.j2 @@ -10,30 +10,33 @@ from __future__ import annotations from typing import cast, Optional, TypeVar from bigframes.core.logging import log_adapter -import bigframes.extensions.core.series_accessor as core_accessor -import bigframes.series -import bigframes.session +from bigframes.extensions.core import series_accessor as core_accessor +from bigframes import series, dataframe, session -S = TypeVar("S", bound="bigframes.series.Series") +T = TypeVar("T", bound="dataframe.DataFrame") +S = TypeVar("S", bound="series.Series") {% for ns in namespaces %} @log_adapter.class_logger -class {{ ns.bigframes_class_name }}(core_accessor.{{ ns.class_name }}[S]): +class {{ ns.bigframes_class_name }}(core_accessor.{{ ns.class_name }}[T, S]): def __init__(self, bf_obj: S): super().__init__(bf_obj) def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> bigframes.series.Series: + self, session: Optional[session.Session] = None + ) -> series.Series: return self._obj - def _to_series(self, bf_series: bigframes.series.Series) -> S: + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df) + + def _to_series(self, bf_series: series.Series) -> S: return cast(S, bf_series) {% for child in ns.children %} @property - def {{ child.prop_name }}(self) -> {{ child.bigframes_class_name }}[S]: + def {{ child.prop_name }}(self) -> {{ child.bigframes_class_name }}[T, S]: return {{ child.bigframes_class_name }}(self._obj) {% endfor %} diff --git a/packages/bigframes/scripts/templates/core_series_accessor.py.j2 b/packages/bigframes/scripts/templates/core_series_accessor.py.j2 index 5881fe6963b9..5a64b7590398 100644 --- a/packages/bigframes/scripts/templates/core_series_accessor.py.j2 +++ b/packages/bigframes/scripts/templates/core_series_accessor.py.j2 @@ -10,45 +10,33 @@ from __future__ import annotations import abc from typing import ( Any, - cast, - Generic, Literal, Optional, TypeVar, Union, + cast, ) -from bigframes import dtypes -import bigframes.core.col -import bigframes.core.sentinels as sentinels -import bigframes.series as series -import bigframes.session +from bigframes import series, session +from bigframes.core import col, sentinels +from bigframes.extensions.core import abstract_series_accessor, series_tvf_mixins +T = TypeVar("T") S = TypeVar("S") -class AbstractBigQuerySeriesAccessor(abc.ABC, Generic[S]): - def __init__(self, obj: S): - self._obj = obj - - @abc.abstractmethod - def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> series.Series: - """Convert the accessor's object to a BigFrames Series.""" - - @abc.abstractmethod - def _to_series(self, bf_series: series.Series) -> S: - """Convert a BigFrames Series to the accessor's object type.""" - {% for ns in namespaces %} -class {{ ns.class_name }}(AbstractBigQuerySeriesAccessor[S]): +{% if ns.class_name == "AiSeriesAccessor" %} +class {{ ns.class_name }}(series_tvf_mixins.AITVFMixin[T, S]): +{% else %} +class {{ ns.class_name }}(abstract_series_accessor.AbstractBigQuerySeriesAccessor[T, S]): +{% endif %} """{{ ns.description }}""" {% for child in ns.children %} @property @abc.abstractmethod - def {{ child.prop_name }}(self) -> {{ child.class_name }}[S]: + def {{ child.prop_name }}(self) -> {{ child.class_name }}[T, S]: """Accessor for BigQuery {{ child.prop_name }} functions.""" {% endfor %} @@ -56,10 +44,10 @@ class {{ ns.class_name }}(AbstractBigQuerySeriesAccessor[S]): def {{ func.name }}( self, {% for arg in func.args if arg.name != func.series_accessor_arg %} - {{ arg.name }}: Union[series.Series, bigframes.core.col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, + {{ arg.name }}: Union[series.Series, col.Expression, {{ arg.type_hint }}]{% if arg.default %} = {{ arg.default }}{% endif %}, {% endfor %} *, - session: Optional[bigframes.session.Session] = None, + session: Optional[session.Session] = None, ) -> S: """{{ func.description | indent(8) }}""" from {{ func.import_module }} import {{ func.name }} as {{ func.name }}_impl @@ -67,7 +55,7 @@ class {{ ns.class_name }}(AbstractBigQuerySeriesAccessor[S]): # Resolve session from other arguments if not passed if session is None: - import bigframes.core.googlesql as googlesql + from bigframes.core import googlesql session = googlesql._find_session( {% for arg in func.args if arg.name != func.series_accessor_arg %} {{ arg.name }}, diff --git a/packages/bigframes/scripts/templates/pandas_series_accessor.py.j2 b/packages/bigframes/scripts/templates/pandas_series_accessor.py.j2 index 76f3d4797531..150546655613 100644 --- a/packages/bigframes/scripts/templates/pandas_series_accessor.py.j2 +++ b/packages/bigframes/scripts/templates/pandas_series_accessor.py.j2 @@ -12,12 +12,12 @@ from typing import cast, Optional, TypeVar import pandas import pandas.api.extensions -import bigframes.core.global_session as bf_session +from bigframes import dataframe, series, session +from bigframes.core import global_session as bf_session from bigframes.core.logging import log_adapter -import bigframes.extensions.core.series_accessor as core_accessor -import bigframes.series -import bigframes.session +from bigframes.extensions.core import series_accessor as core_accessor +T = TypeVar("T", bound="pandas.DataFrame") S = TypeVar("S", bound="pandas.Series") @@ -26,23 +26,26 @@ S = TypeVar("S", bound="pandas.Series") @pandas.api.extensions.register_series_accessor("bigquery") {% endif %} @log_adapter.class_logger -class {{ ns.pandas_class_name }}(core_accessor.{{ ns.class_name }}[S]): +class {{ ns.pandas_class_name }}(core_accessor.{{ ns.class_name }}[T, S]): def __init__(self, pandas_obj: S): super().__init__(pandas_obj) def _bf_from_series( - self, session: Optional[bigframes.session.Session] = None - ) -> bigframes.series.Series: + self, session: Optional[session.Session] = None + ) -> series.Series: if session is None: session = bf_session.get_global_session() - return cast(bigframes.series.Series, session.read_pandas(self._obj)) + return cast(series.Series, session.read_pandas(self._obj)) - def _to_series(self, bf_series: bigframes.series.Series) -> S: + def _to_dataframe(self, bf_df: dataframe.DataFrame) -> T: + return cast(T, bf_df.to_pandas(ordered=True)) + + def _to_series(self, bf_series: series.Series) -> S: return cast(S, bf_series.to_pandas(ordered=True)) {% for child in ns.children %} @property - def {{ child.prop_name }}(self) -> {{ child.pandas_class_name }}[S]: + def {{ child.prop_name }}(self) -> {{ child.pandas_class_name }}[T, S]: return {{ child.pandas_class_name }}(self._obj) {% endfor %} diff --git a/packages/bigframes/tests/unit/extensions/core/test_dataframe_accessor.py b/packages/bigframes/tests/unit/extensions/core/test_dataframe_accessor.py index 7ab4f5176980..2f3352116aff 100644 --- a/packages/bigframes/tests/unit/extensions/core/test_dataframe_accessor.py +++ b/packages/bigframes/tests/unit/extensions/core/test_dataframe_accessor.py @@ -26,16 +26,16 @@ def test_ai_forecast(monkeypatch): bf_df = mock.create_autospec(bpd.DataFrame) session.read_pandas.return_value = bf_df - def mock_ai_forecast(df, **kwargs): - assert df is bf_df - result_df = mock.create_autospec(bpd.DataFrame) - result_df.to_pandas.return_value = kwargs - return result_df + mock_forecast = mock.MagicMock() + forecast_result_df = mock.create_autospec(bpd.DataFrame) + mock_forecast.return_value = forecast_result_df + expected_result = mock.create_autospec(pd.DataFrame) + forecast_result_df.to_pandas.return_value = expected_result - monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast) + monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_forecast) df = pd.DataFrame({"date": ["2020-01-01"], "value": [1.0]}) - result = df.bigquery.ai.forecast( + actual_result = df.bigquery.ai.forecast( timestamp_col="date", data_col="value", horizon=5, @@ -43,30 +43,31 @@ def mock_ai_forecast(df, **kwargs): ) session.read_pandas.assert_called_once() - assert result == { - "timestamp_col": "date", - "data_col": "value", - "model": "TimesFM 2.0", - "id_cols": None, - "horizon": 5, - "confidence_level": 0.95, - "context_window": None, - "output_historical_time_series": False, - } + mock_forecast.assert_called_once_with( + bf_df, + timestamp_col="date", + data_col="value", + model="TimesFM 2.0", + id_cols=None, + horizon=5, + confidence_level=0.95, + context_window=None, + output_historical_time_series=False, + ) + forecast_result_df.to_pandas.assert_called_once() + assert actual_result is expected_result -def test_bigframes_ai_forecast(monkeypatch): - session = mock.create_autospec(bigframes.session.Session) - bf_df = mock.create_autospec(bpd.DataFrame) - def mock_ai_forecast(df, **kwargs): - assert df is bf_df - result_df = mock.create_autospec(bpd.DataFrame) - return result_df +def test_bigframes_ai_forecast(scalar_types_df: bpd.DataFrame, monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + forecast_result = mock.create_autospec(bpd.DataFrame) + mock_forecast = mock.MagicMock() + mock_forecast.return_value = forecast_result - monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_ai_forecast) + monkeypatch.setattr(bigframes.bigquery.ai, "forecast", mock_forecast) - result = bf_df.bigquery.ai.forecast( + actual_result = scalar_types_df.bigquery.ai.forecast( timestamp_col="date", data_col="value", horizon=5, @@ -74,21 +75,37 @@ def mock_ai_forecast(df, **kwargs): ) session.read_pandas.assert_not_called() + mock_forecast.assert_called_once() + args, kwargs = mock_forecast.call_args + assert args[0] is scalar_types_df + assert kwargs == { + "timestamp_col": "date", + "data_col": "value", + "model": "TimesFM 2.0", + "id_cols": None, + "horizon": 5, + "confidence_level": 0.95, + "context_window": None, + "output_historical_time_series": False, + } # BigFrames accessor returns the bf_df directly without calling to_pandas - assert result is not None + forecast_result.to_pandas.assert_not_called() + assert actual_result is forecast_result def test_ai_generate(monkeypatch): - def mock_generate(prompt, **kwargs): - result_series = mock.create_autospec(bpd.Series) - result_series.to_pandas.return_value = (prompt, kwargs) - return result_series + mock_generate = mock.MagicMock() + result_series = mock.create_autospec(bpd.Series) + mock_generate.return_value = result_series + expected_result = mock.create_autospec(pd.Series) + result_series.to_pandas.return_value = expected_result monkeypatch.setattr(bigframes.bigquery.ai, "generate", mock_generate) + prompt = mock.create_autospec(pd.Series) df = pd.DataFrame({"text_input": ["Is this a positive review?"]}) - result = df.bigquery.ai.generate( - df["text_input"], + actual_result = df.bigquery.ai.generate( + prompt, connection_id="conn", endpoint="endpoint", request_type="dedicated", @@ -96,29 +113,28 @@ def mock_generate(prompt, **kwargs): output_schema={"res": "STRING"}, ) - assert isinstance(result, tuple) - assert len(result) == 2 - pd.testing.assert_series_equal(result[0], df["text_input"]) - assert result[1] == { - "connection_id": "conn", - "endpoint": "endpoint", - "request_type": "dedicated", - "model_params": {"temp": 0.5}, - "output_schema": {"res": "STRING"}, - } + mock_generate.assert_called_once_with( + prompt, + connection_id="conn", + endpoint="endpoint", + request_type="dedicated", + model_params={"temp": 0.5}, + output_schema={"res": "STRING"}, + ) + result_series.to_pandas.assert_called_once() + assert actual_result is expected_result def test_bigframes_ai_generate(scalar_types_df: bpd.DataFrame, monkeypatch): bf_series = mock.create_autospec(bpd.Series) result_series = mock.create_autospec(bpd.Series) - def mock_generate(prompt, **kwargs): - assert prompt is bf_series - return result_series + mock_generate = mock.MagicMock() + mock_generate.return_value = result_series monkeypatch.setattr(bigframes.bigquery.ai, "generate", mock_generate) - result = scalar_types_df.bigquery.ai.generate( + actual_result = scalar_types_df.bigquery.ai.generate( bf_series, connection_id="conn", endpoint="endpoint", @@ -127,48 +143,60 @@ def mock_generate(prompt, **kwargs): output_schema={"res": "STRING"}, ) - assert result is result_series + mock_generate.assert_called_once() + args, kwargs = mock_generate.call_args + assert args[0] is bf_series + assert kwargs == { + "connection_id": "conn", + "endpoint": "endpoint", + "request_type": "dedicated", + "model_params": {"temp": 0.5}, + "output_schema": {"res": "STRING"}, + } + result_series.to_pandas.assert_not_called() + assert actual_result is result_series def test_ai_generate_bool(monkeypatch): - def mock_generate_bool(prompt, **kwargs): - result_series = mock.create_autospec(bpd.Series) - result_series.to_pandas.return_value = (prompt, kwargs) - return result_series + mock_generate_bool = mock.MagicMock() + result_series = mock.create_autospec(bpd.Series) + mock_generate_bool.return_value = result_series + expected_result = mock.create_autospec(pd.Series) + result_series.to_pandas.return_value = expected_result monkeypatch.setattr(bigframes.bigquery.ai, "generate_bool", mock_generate_bool) + prompt = mock.create_autospec(pd.Series) df = pd.DataFrame({"text_input": ["Is this a positive review?"]}) - result = df.bigquery.ai.generate_bool( - df["text_input"], + actual_result = df.bigquery.ai.generate_bool( + prompt, connection_id="conn", endpoint="endpoint", request_type="dedicated", model_params={"temp": 0.5}, ) - assert isinstance(result, tuple) - assert len(result) == 2 - pd.testing.assert_series_equal(result[0], df["text_input"]) - assert result[1] == { - "connection_id": "conn", - "endpoint": "endpoint", - "request_type": "dedicated", - "model_params": {"temp": 0.5}, - } + mock_generate_bool.assert_called_once_with( + prompt, + connection_id="conn", + endpoint="endpoint", + request_type="dedicated", + model_params={"temp": 0.5}, + ) + result_series.to_pandas.assert_called_once() + assert actual_result is expected_result def test_bigframes_ai_generate_bool(scalar_types_df: bpd.DataFrame, monkeypatch): bf_series = mock.create_autospec(bpd.Series) result_series = mock.create_autospec(bpd.Series) - def mock_generate_bool(prompt, **kwargs): - assert prompt is bf_series - return result_series + mock_generate_bool = mock.MagicMock() + mock_generate_bool.return_value = result_series monkeypatch.setattr(bigframes.bigquery.ai, "generate_bool", mock_generate_bool) - result = scalar_types_df.bigquery.ai.generate_bool( + actual_result = scalar_types_df.bigquery.ai.generate_bool( bf_series, connection_id="conn", endpoint="endpoint", @@ -176,48 +204,59 @@ def mock_generate_bool(prompt, **kwargs): model_params={"temp": 0.5}, ) - assert result is result_series + mock_generate_bool.assert_called_once() + args, kwargs = mock_generate_bool.call_args + assert args[0] is bf_series + assert kwargs == { + "connection_id": "conn", + "endpoint": "endpoint", + "request_type": "dedicated", + "model_params": {"temp": 0.5}, + } + result_series.to_pandas.assert_not_called() + assert actual_result is result_series def test_ai_generate_int(monkeypatch): - def mock_generate_int(prompt, **kwargs): - result_series = mock.create_autospec(bpd.Series) - result_series.to_pandas.return_value = (prompt, kwargs) - return result_series + mock_generate_int = mock.MagicMock() + result_series = mock.create_autospec(bpd.Series) + mock_generate_int.return_value = result_series + expected_result = mock.create_autospec(pd.Series) + result_series.to_pandas.return_value = expected_result monkeypatch.setattr(bigframes.bigquery.ai, "generate_int", mock_generate_int) + prompt = mock.create_autospec(pd.Series) df = pd.DataFrame({"text_input": ["How many legs?"]}) - result = df.bigquery.ai.generate_int( - df["text_input"], + actual_result = df.bigquery.ai.generate_int( + prompt, connection_id="conn", endpoint="endpoint", request_type="dedicated", model_params={"temp": 0.5}, ) - assert isinstance(result, tuple) - assert len(result) == 2 - pd.testing.assert_series_equal(result[0], df["text_input"]) - assert result[1] == { - "connection_id": "conn", - "endpoint": "endpoint", - "request_type": "dedicated", - "model_params": {"temp": 0.5}, - } + mock_generate_int.assert_called_once_with( + prompt, + connection_id="conn", + endpoint="endpoint", + request_type="dedicated", + model_params={"temp": 0.5}, + ) + result_series.to_pandas.assert_called_once() + assert actual_result is expected_result def test_bigframes_ai_generate_int(scalar_types_df: bpd.DataFrame, monkeypatch): bf_series = mock.create_autospec(bpd.Series) result_series = mock.create_autospec(bpd.Series) - def mock_generate_int(prompt, **kwargs): - assert prompt is bf_series - return result_series + mock_generate_int = mock.MagicMock() + mock_generate_int.return_value = result_series monkeypatch.setattr(bigframes.bigquery.ai, "generate_int", mock_generate_int) - result = scalar_types_df.bigquery.ai.generate_int( + actual_result = scalar_types_df.bigquery.ai.generate_int( bf_series, connection_id="conn", endpoint="endpoint", @@ -225,48 +264,59 @@ def mock_generate_int(prompt, **kwargs): model_params={"temp": 0.5}, ) - assert result is result_series + mock_generate_int.assert_called_once() + args, kwargs = mock_generate_int.call_args + assert args[0] is bf_series + assert kwargs == { + "connection_id": "conn", + "endpoint": "endpoint", + "request_type": "dedicated", + "model_params": {"temp": 0.5}, + } + result_series.to_pandas.assert_not_called() + assert actual_result is result_series def test_ai_generate_double(monkeypatch): - def mock_generate_double(prompt, **kwargs): - result_series = mock.create_autospec(bpd.Series) - result_series.to_pandas.return_value = (prompt, kwargs) - return result_series + mock_generate_double = mock.MagicMock() + result_series = mock.create_autospec(bpd.Series) + mock_generate_double.return_value = result_series + expected_result = mock.create_autospec(pd.Series) + result_series.to_pandas.return_value = expected_result monkeypatch.setattr(bigframes.bigquery.ai, "generate_double", mock_generate_double) + prompt = mock.create_autospec(pd.Series) df = pd.DataFrame({"text_input": ["How tall?"]}) - result = df.bigquery.ai.generate_double( - df["text_input"], + actual_result = df.bigquery.ai.generate_double( + prompt, connection_id="conn", endpoint="endpoint", request_type="dedicated", model_params={"temp": 0.5}, ) - assert isinstance(result, tuple) - assert len(result) == 2 - pd.testing.assert_series_equal(result[0], df["text_input"]) - assert result[1] == { - "connection_id": "conn", - "endpoint": "endpoint", - "request_type": "dedicated", - "model_params": {"temp": 0.5}, - } + mock_generate_double.assert_called_once_with( + prompt, + connection_id="conn", + endpoint="endpoint", + request_type="dedicated", + model_params={"temp": 0.5}, + ) + result_series.to_pandas.assert_called_once() + assert actual_result is expected_result def test_bigframes_ai_generate_double(scalar_types_df: bpd.DataFrame, monkeypatch): bf_series = mock.create_autospec(bpd.Series) result_series = mock.create_autospec(bpd.Series) - def mock_generate_double(prompt, **kwargs): - assert prompt is bf_series - return result_series + mock_generate_double = mock.MagicMock() + mock_generate_double.return_value = result_series monkeypatch.setattr(bigframes.bigquery.ai, "generate_double", mock_generate_double) - result = scalar_types_df.bigquery.ai.generate_double( + actual_result = scalar_types_df.bigquery.ai.generate_double( bf_series, connection_id="conn", endpoint="endpoint", @@ -274,4 +324,14 @@ def mock_generate_double(prompt, **kwargs): model_params={"temp": 0.5}, ) - assert result is result_series + mock_generate_double.assert_called_once() + args, kwargs = mock_generate_double.call_args + assert args[0] is bf_series + assert kwargs == { + "connection_id": "conn", + "endpoint": "endpoint", + "request_type": "dedicated", + "model_params": {"temp": 0.5}, + } + result_series.to_pandas.assert_not_called() + assert actual_result is result_series diff --git a/packages/bigframes/tests/unit/extensions/core/test_series_tvf_mixins.py b/packages/bigframes/tests/unit/extensions/core/test_series_tvf_mixins.py new file mode 100644 index 000000000000..9d5a24d2db93 --- /dev/null +++ b/packages/bigframes/tests/unit/extensions/core/test_series_tvf_mixins.py @@ -0,0 +1,248 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest.mock as mock + +import pandas as pd + +import bigframes.bigquery.ai +import bigframes.pandas as bpd +import bigframes.session + + +def test_ai_generate_embedding(monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + bf_series = mock.create_autospec(bpd.Series) + session.read_pandas.return_value = bf_series + + mock_generate_embedding = mock.MagicMock() + result_df = mock.create_autospec(bpd.DataFrame) + mock_generate_embedding.return_value = result_df + expected_result = mock.create_autospec(pd.DataFrame) + result_df.to_pandas.return_value = expected_result + + monkeypatch.setattr( + bigframes.bigquery.ai, "generate_embedding", mock_generate_embedding + ) + + series = pd.Series(["apple"], name="content") + actual_result = series.bigquery.ai.generate_embedding( # type: ignore + model="my_model", + output_dimensionality=256, + task_type="retrieval_document", + start_second=1.0, + end_second=2.0, + interval_seconds=3.0, + trial_id=4, + session=session, + ) + + session.read_pandas.assert_called_once() + mock_generate_embedding.assert_called_once_with( + "my_model", + bf_series, + output_dimensionality=256, + task_type="retrieval_document", + start_second=1.0, + end_second=2.0, + interval_seconds=3.0, + trial_id=4, + ) + result_df.to_pandas.assert_called_once() + assert actual_result is expected_result + + +def test_bigframes_ai_generate_embedding(scalar_types_df: bpd.DataFrame, monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + result_df = mock.create_autospec(bpd.DataFrame) + + mock_generate_embedding = mock.MagicMock() + mock_generate_embedding.return_value = result_df + + monkeypatch.setattr( + bigframes.bigquery.ai, "generate_embedding", mock_generate_embedding + ) + + scalar_types_series = scalar_types_df["string_col"] + actual_result = scalar_types_series.bigquery.ai.generate_embedding( + model="my_model", + output_dimensionality=256, + session=session, + ) + + session.read_pandas.assert_not_called() + mock_generate_embedding.assert_called_once() + args, kwargs = mock_generate_embedding.call_args + assert args[0] == "my_model" + assert args[1] is scalar_types_series + assert kwargs == { + "output_dimensionality": 256, + "task_type": None, + "start_second": None, + "end_second": None, + "interval_seconds": None, + "trial_id": None, + } + result_df.to_pandas.assert_not_called() + assert actual_result is result_df + + +def test_ai_generate_text(monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + bf_series = mock.create_autospec(bpd.Series) + session.read_pandas.return_value = bf_series + + mock_generate_text = mock.MagicMock() + result_df = mock.create_autospec(bpd.DataFrame) + mock_generate_text.return_value = result_df + expected_result = mock.create_autospec(pd.DataFrame) + result_df.to_pandas.return_value = expected_result + + monkeypatch.setattr(bigframes.bigquery.ai, "generate_text", mock_generate_text) + + series = pd.Series(["write a poem"], name="prompt") + actual_result = series.bigquery.ai.generate_text( # type: ignore + model="my_model", + temperature=0.7, + max_output_tokens=100, + top_k=50, + top_p=0.9, + stop_sequences=["\n"], + ground_with_google_search=True, + request_type="dedicated", + session=session, + ) + + session.read_pandas.assert_called_once() + mock_generate_text.assert_called_once_with( + "my_model", + bf_series, + temperature=0.7, + max_output_tokens=100, + top_k=50, + top_p=0.9, + stop_sequences=["\n"], + ground_with_google_search=True, + request_type="dedicated", + ) + result_df.to_pandas.assert_called_once() + assert actual_result is expected_result + + +def test_bigframes_ai_generate_text(scalar_types_df: bpd.DataFrame, monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + result_df = mock.create_autospec(bpd.DataFrame) + + mock_generate_text = mock.MagicMock() + mock_generate_text.return_value = result_df + + monkeypatch.setattr(bigframes.bigquery.ai, "generate_text", mock_generate_text) + + scalar_types_series = scalar_types_df["string_col"] + actual_result = scalar_types_series.bigquery.ai.generate_text( + model="my_model", + temperature=0.7, + session=session, + ) + + session.read_pandas.assert_not_called() + mock_generate_text.assert_called_once() + args, kwargs = mock_generate_text.call_args + assert args[0] == "my_model" + assert args[1] is scalar_types_series + assert kwargs == { + "temperature": 0.7, + "max_output_tokens": None, + "top_k": None, + "top_p": None, + "stop_sequences": None, + "ground_with_google_search": None, + "request_type": None, + } + result_df.to_pandas.assert_not_called() + assert actual_result is result_df + + +def test_ai_generate_table(monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + bf_series = mock.create_autospec(bpd.Series) + session.read_pandas.return_value = bf_series + + mock_generate_table = mock.MagicMock() + result_df = mock.create_autospec(bpd.DataFrame) + mock_generate_table.return_value = result_df + expected_result = mock.create_autospec(pd.DataFrame) + result_df.to_pandas.return_value = expected_result + + monkeypatch.setattr(bigframes.bigquery.ai, "generate_table", mock_generate_table) + + series = pd.Series(["generate something"], name="prompt") + actual_result = series.bigquery.ai.generate_table( # type: ignore + model="my_model", + output_schema="category STRING", + temperature=0.7, + top_p=0.9, + max_output_tokens=100, + stop_sequences=["\n"], + request_type="dedicated", + session=session, + ) + + session.read_pandas.assert_called_once() + mock_generate_table.assert_called_once_with( + "my_model", + bf_series, + output_schema="category STRING", + temperature=0.7, + top_p=0.9, + max_output_tokens=100, + stop_sequences=["\n"], + request_type="dedicated", + ) + result_df.to_pandas.assert_called_once() + assert actual_result is expected_result + + +def test_bigframes_ai_generate_table(scalar_types_df: bpd.DataFrame, monkeypatch): + session = mock.create_autospec(bigframes.session.Session) + result_df = mock.create_autospec(bpd.DataFrame) + + mock_generate_table = mock.MagicMock() + mock_generate_table.return_value = result_df + + monkeypatch.setattr(bigframes.bigquery.ai, "generate_table", mock_generate_table) + + scalar_types_series = scalar_types_df["string_col"] + actual_result = scalar_types_series.bigquery.ai.generate_table( + model="my_model", + output_schema="category STRING", + temperature=0.7, + session=session, + ) + + session.read_pandas.assert_not_called() + mock_generate_table.assert_called_once() + args, kwargs = mock_generate_table.call_args + assert args[0] == "my_model" + assert args[1] is scalar_types_series + assert kwargs == { + "output_schema": "category STRING", + "temperature": 0.7, + "top_p": None, + "max_output_tokens": None, + "stop_sequences": None, + "request_type": None, + } + result_df.to_pandas.assert_not_called() + assert actual_result is result_df