python-bigquery-dataframes/bigframes/bigquery/_operations/search.py at 76e72d6cbc0c7bddac7a1cbadcdede02603b160c · googleapis/python-bigquery-dataframes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import json
import typing
from typing import Collection, Literal, Mapping, Optional, Union

import google.cloud.bigquery as bigquery

import bigframes.ml.utils as utils

if typing.TYPE_CHECKING:
    import bigframes.dataframe as dataframe
    import bigframes.series as series
    import bigframes.session

"""
Search functions defined from
https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
"""


def create_vector_index(
    table_id: str,
    column_name: str,
    *,
    replace: bool = False,
    index_name: Optional[str] = None,
    distance_type="cosine",
    stored_column_names: Collection[str] = (),
    index_type: str = "ivf",
    ivf_options: Optional[Mapping] = None,
    tree_ah_options: Optional[Mapping] = None,
    session: Optional[bigframes.session.Session] = None,
) -> None:
    """
    Creates a new vector index on a column of a table.

    This method calls the `CREATE VECTOR INDEX DDL statement
    <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#create_vector_index_statement>`_.

    """
    import bigframes.pandas

    if index_name is None:
        table_ref = bigquery.TableReference.from_string(table_id)
        index_name = table_ref.table_id

    options = {
        "index_type": index_type.upper(),
        "distance_type": distance_type.upper(),
    }

    if ivf_options is not None:
        options["ivf_options"] = json.dumps(ivf_options)

    if tree_ah_options is not None:
        options["tree_ah_options"] = json.dumps(tree_ah_options)

    sql = bigframes.core.sql.create_vector_index_ddl(
        replace=replace,
        index_name=index_name,
        table_name=table_id,
        column_name=column_name,
        stored_column_names=stored_column_names,
        options=options,
    )

    # Use global read_gbq to execute this for better location autodetection.
    if session is None:
        read_gbq_query = bigframes.pandas.read_gbq_query
    else:
        read_gbq_query = session.read_gbq_query

    read_gbq_query(sql)


def vector_search(
    base_table: str,
    column_to_search: str,
    query: Union[dataframe.DataFrame, series.Series],
    *,
    query_column_to_search: Optional[str] = None,
    top_k: Optional[int] = None,
    distance_type: Optional[Literal["euclidean", "cosine", "dot_product"]] = None,
    fraction_lists_to_search: Optional[float] = None,
    use_brute_force: Optional[bool] = None,
    allow_large_results: Optional[bool] = None,
) -> dataframe.DataFrame:
    """
    Conduct vector search which searches embeddings to find semantically similar entities.

    This method calls the `VECTOR_SEARCH() SQL function
    <https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions#vector_search>`_.

    **Examples:**


        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

    DataFrame embeddings for which to find nearest neighbors. The ``ARRAY<FLOAT64>`` column
    is used as the search query:

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
        >>> bbq.vector_search(
        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...             column_to_search="my_embedding",
        ...             query=search_query,
        ...             top_k=2).sort_values("id")
          query_id  embedding  id my_embedding  distance
        0      dog    [1. 2.]   1      [1. 2.]       0.0
        1      cat  [3.  5.2]   2      [2. 4.]   1.56205
        0      dog    [1. 2.]   4    [1.  3.2]       1.2
        1      cat  [3.  5.2]   5    [5.  5.4]  2.009975
        <BLANKLINE>
        [4 rows x 5 columns]

    Series embeddings for which to find nearest neighbors:

        >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]],
        ...                            index=["dog", "cat"],
        ...                            name="embedding")
        >>> bbq.vector_search(
        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...             column_to_search="my_embedding",
        ...             query=search_query,
        ...             top_k=2,
        ...             use_brute_force=True).sort_values("id")
             embedding  id my_embedding  distance
        dog    [1. 2.]   1      [1. 2.]       0.0
        cat  [3.  5.2]   2      [2. 4.]   1.56205
        dog    [1. 2.]   4    [1.  3.2]       1.2
        cat  [3.  5.2]   5    [5.  5.4]  2.009975
        <BLANKLINE>
        [4 rows x 4 columns]

    You can specify the name of the column in the query DataFrame embeddings and distance type.
    If you specify query_column_to_search_value, it will use the provided column which contains
    the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value.

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],
        ...                               "another_embedding": [[0.7, 2.2], [3.3, 5.2]]})
        >>> bbq.vector_search(
        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...             column_to_search="my_embedding",
        ...             query=search_query,
        ...             distance_type="cosine",
        ...             query_column_to_search="another_embedding",
        ...             top_k=2).sort_values("id")
          query_id  embedding another_embedding  id my_embedding  distance
        1      cat  [3.  5.2]         [3.3 5.2]   1      [1. 2.]  0.005181
        1      cat  [3.  5.2]         [3.3 5.2]   2      [2. 4.]  0.005181
        0      dog    [1. 2.]         [0.7 2.2]   3    [1.5 7. ]  0.004697
        0      dog    [1. 2.]         [0.7 2.2]   4    [1.  3.2]  0.000013
        <BLANKLINE>
        [4 rows x 6 columns]

    Args:
        base_table (str):
            The table to search for nearest neighbor embeddings.
        column_to_search (str):
            The name of the base table column to search for nearest neighbor embeddings.
            The column must have a type of ``ARRAY<FLOAT64>``. All elements in the array must be non-NULL.
        query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series):
            A Series or DataFrame that provides the embeddings for which to find nearest neighbors.
        query_column_to_search (str):
            Specifies the name of the column in the query that contains the embeddings for which to
            find nearest neighbors. The column must have a type of ``ARRAY<FLOAT64>``. All elements in
            the array must be non-NULL and all values in the column must have the same array dimensions
            as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame.
        top_k (int):
            Sepecifies the number of nearest neighbors to return. Default to 10.
        distance_type (str, defalt "euclidean"):
            Specifies the type of metric to use to compute the distance between two vectors.
            Possible values are "euclidean", "cosine" and "dot_product".
            Default to "euclidean".
        fraction_lists_to_search (float, range in [0.0, 1.0]):
            Specifies the percentage of lists to search. Specifying a higher percentage leads to
            higher recall and slower performance, and the converse is true when specifying a lower
            percentage. It is only used when a vector index is also used. You can only specify
            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
        use_brute_force (bool):
            Determines whether to use brute force search by skipping the vector index if one is available.
            Default to False.
        allow_large_results (bool, optional):
            Whether to allow large query results. If ``True``, the query
            results can be larger than the maximum response size.
            Defaults to ``bpd.options.compute.allow_large_results``.

    Returns:
        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.
    """
    import bigframes.series

    if (
        isinstance(query, bigframes.series.Series)
        and query_column_to_search is not None
    ):
        raise ValueError(
            "You can't specify query_column_to_search when query is a Series."
        )

    # Only populate options if not set to the default value.
    # This avoids accidentally setting options that are mutually exclusive.
    options = None
    if fraction_lists_to_search is not None:
        options = {} if options is None else options
        options["fraction_lists_to_search"] = fraction_lists_to_search
    if use_brute_force is not None:
        options = {} if options is None else options
        options["use_brute_force"] = use_brute_force

    (query,) = utils.batch_convert_to_dataframe(query)
    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)

    sql = bigframes.core.sql.create_vector_search_sql(
        sql_string=sql_string,
        base_table=base_table,
        column_to_search=column_to_search,
        query_column_to_search=query_column_to_search,
        top_k=top_k,
        distance_type=distance_type,
        options=options,
    )
    if index_col_ids is not None:
        df = query._session.read_gbq_query(
            sql, index_col=index_col_ids, allow_large_results=allow_large_results
        )
        df.index.names = index_labels
    else:
        df = query._session.read_gbq_query(sql, allow_large_results=allow_large_results)

    return df