Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 38c4d6f

Browse files
committed
fix unit tests
1 parent 2505296 commit 38c4d6f

File tree

10 files changed

+179
-102
lines changed

10 files changed

+179
-102
lines changed

bigframes/bigquery/_operations/sql.py

Lines changed: 57 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,31 @@
1616

1717
from __future__ import annotations
1818

19-
from typing import Sequence
19+
from typing import cast, Optional, Sequence, Union
2020

2121
import google.cloud.bigquery
2222

2323
from bigframes.core.compile.sqlglot import sql
24+
import bigframes.dataframe
2425
import bigframes.dtypes
2526
import bigframes.operations
2627
import bigframes.series
2728

2829

30+
def _format_names(sql_template: str, dataframe: bigframes.dataframe.DataFrame):
31+
"""Turn sql_template from a template that uses names to one that uses
32+
numbers.
33+
"""
34+
names_to_numbers = {name: f"{{{i}}}" for i, name in enumerate(dataframe.columns)}
35+
numbers = [f"{{{i}}}" for i in range(len(dataframe.columns))]
36+
return sql_template.format(*numbers, **names_to_numbers)
37+
38+
2939
def sql_scalar(
3040
sql_template: str,
31-
columns: Sequence[bigframes.series.Series],
41+
columns: Union[bigframes.dataframe.DataFrame, Sequence[bigframes.series.Series]],
42+
*,
43+
output_dtype: Optional[bigframes.dtypes.Dtype] = None,
3244
) -> bigframes.series.Series:
3345
"""Create a Series from a SQL template.
3446
@@ -37,6 +49,9 @@ def sql_scalar(
3749
>>> import bigframes.pandas as bpd
3850
>>> import bigframes.bigquery as bbq
3951
52+
Either pass in a sequence of series, in which case use integers in the
53+
format strings.
54+
4055
>>> s = bpd.Series(["1.5", "2.5", "3.5"])
4156
>>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9)))
4257
>>> bbq.sql_scalar("ROUND({0}, 0, 'ROUND_HALF_EVEN')", [s])
@@ -45,13 +60,29 @@ def sql_scalar(
4560
2 4.000000000
4661
dtype: decimal128(38, 9)[pyarrow]
4762
63+
Or pass in a DataFrame, in which case use the column names in the format
64+
strings.
65+
66+
>>> df = bpd.DataFrame({"a": ["1.5", "2.5", "3.5"]})
67+
>>> df = df.astype({"a": pd.ArrowDtype(pa.decimal128(38, 9))})
68+
>>> bbq.sql_scalar("ROUND({a}, 0, 'ROUND_HALF_EVEN')", df)
69+
0 2.000000000
70+
1 2.000000000
71+
2 4.000000000
72+
dtype: decimal128(38, 9)[pyarrow]
73+
4874
Args:
4975
sql_template (str):
5076
A SQL format string with Python-style {0} placeholders for each of
5177
the Series objects in ``columns``.
52-
columns (Sequence[bigframes.pandas.Series]):
78+
columns (
79+
Sequence[bigframes.pandas.Series] | bigframes.pandas.DataFrame
80+
):
5381
Series objects representing the column inputs to the
5482
``sql_template``. Must contain at least one Series.
83+
output_dtype (a BigQuery DataFrames compatible dtype, optional):
84+
If provided, BigQuery DataFrames uses this to determine the output
85+
of the returned Series. This avoids a dry run query.
5586
5687
Returns:
5788
bigframes.pandas.Series:
@@ -60,28 +91,38 @@ def sql_scalar(
6091
Raises:
6192
ValueError: If ``columns`` is empty.
6293
"""
94+
if isinstance(columns, bigframes.dataframe.DataFrame):
95+
sql_template = _format_names(sql_template, columns)
96+
columns = [
97+
cast(bigframes.series.Series, columns[column]) for column in columns.columns
98+
]
99+
63100
if len(columns) == 0:
64101
raise ValueError("Must provide at least one column in columns")
65102

103+
base_series = columns[0]
104+
66105
# To integrate this into our expression trees, we need to get the output
67106
# type, so we do some manual compilation and a dry run query to get that.
68107
# Another benefit of this is that if there is a syntax error in the SQL
69108
# template, then this will fail with an error earlier in the process,
70109
# aiding users in debugging.
71-
literals_sql = [sql.to_sql(sql.literal(None, column.dtype)) for column in columns]
72-
select_sql = sql_template.format(*literals_sql)
73-
dry_run_sql = f"SELECT {select_sql}"
74-
75-
# Use the executor directly, because we want the original column IDs, not
76-
# the user-friendly column names that block.to_sql_query() would produce.
77-
base_series = columns[0]
78-
bqclient = base_series._session.bqclient
79-
job = bqclient.query(
80-
dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)
81-
)
82-
_, output_type = bigframes.dtypes.convert_schema_field(job.schema[0])
110+
if output_dtype is None:
111+
literals_sql = [
112+
sql.to_sql(sql.literal(None, column.dtype)) for column in columns
113+
]
114+
select_sql = sql_template.format(*literals_sql)
115+
dry_run_sql = f"SELECT {select_sql}"
116+
117+
# Use the executor directly, because we want the original column IDs, not
118+
# the user-friendly column names that block.to_sql_query() would produce.
119+
bqclient = base_series._session.bqclient
120+
job = bqclient.query(
121+
dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)
122+
)
123+
_, output_dtype = bigframes.dtypes.convert_schema_field(job.schema[0])
83124

84125
op = bigframes.operations.SqlScalarOp(
85-
_output_type=output_type, sql_template=sql_template
126+
_output_type=output_dtype, sql_template=sql_template
86127
)
87128
return base_series._apply_nary_op(op, columns[1:])

bigframes/core/blocks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2666,7 +2666,11 @@ def _array_value_for_output(
26662666
)
26672667

26682668
def to_sql_query(
2669-
self, include_index: bool, enable_cache: bool = True
2669+
self,
2670+
include_index: bool,
2671+
enable_cache: bool = True,
2672+
*,
2673+
ordered=False,
26702674
) -> Tuple[str, list[str], list[Label]]:
26712675
"""
26722676
Compiles this DataFrame's expression tree to SQL, optionally
@@ -2688,7 +2692,9 @@ def to_sql_query(
26882692
# Note: this uses the sql from the executor, so is coupled tightly to execution
26892693
# implementation. It will reference cached tables instead of original data sources.
26902694
# Maybe should just compile raw BFET? Depends on user intent.
2691-
sql = self.session._executor.to_sql(array_value, enable_cache=enable_cache)
2695+
sql = self.session._executor.to_sql(
2696+
array_value, enable_cache=enable_cache, ordered=ordered
2697+
)
26922698
return (
26932699
sql,
26942700
idx_ids,

bigframes/dataframe.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ def _to_placeholder_table(self, dry_run: bool = False) -> bigquery.TableReferenc
447447
)
448448

449449
def _to_sql_query(
450-
self, include_index: bool, enable_cache: bool = True
450+
self, include_index: bool, enable_cache: bool = True, *, ordered: bool = False
451451
) -> Tuple[str, list[str], list[blocks.Label]]:
452452
"""Compiles this DataFrame's expression tree to SQL, optionally
453453
including index columns.
@@ -461,7 +461,9 @@ def _to_sql_query(
461461
If include_index is set to False, index_column_id_list and index_column_label_list
462462
return empty lists.
463463
"""
464-
return self._block.to_sql_query(include_index, enable_cache=enable_cache)
464+
return self._block.to_sql_query(
465+
include_index, enable_cache=enable_cache, ordered=ordered
466+
)
465467

466468
@property
467469
def sql(self) -> str:

bigframes/extensions/pandas/dataframe_accessor.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class BigQueryDataFrameAccessor:
3232
def __init__(self, pandas_obj: pandas.DataFrame):
3333
self._obj = pandas_obj
3434

35-
def sql_scalar(self, sql_template: str, session=None):
35+
def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None):
3636
"""
3737
Compute a new pandas Series by applying a SQL scalar function to the DataFrame.
3838
@@ -44,22 +44,24 @@ def sql_scalar(self, sql_template: str, session=None):
4444
sql_template (str):
4545
A SQL format string with Python-style {0}, {1}, etc. placeholders for each of
4646
the columns in the DataFrame (in the order they appear in ``df.columns``).
47+
output_dtype (a BigQuery DataFrames compatible dtype, optional):
48+
If provided, BigQuery DataFrames uses this to determine the output
49+
of the returned Series. This avoids a dry run query.
4750
session (bigframes.session.Session, optional):
4851
The BigFrames session to use. If not provided, the default global session is used.
4952
5053
Returns:
5154
pandas.Series:
5255
The result of the SQL scalar function as a pandas Series.
5356
"""
54-
if session is None:
55-
session = bf_session.get_global_session()
56-
57-
bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj))
58-
5957
# Import bigframes.bigquery here to avoid circular imports
6058
import bigframes.bigquery
6159

62-
columns = [cast(bpd.Series, bf_df[col]) for col in bf_df.columns]
63-
result = bigframes.bigquery.sql_scalar(sql_template, columns)
60+
if session is None:
61+
session = bf_session.get_global_session()
6462

65-
return result.to_pandas()
63+
bf_df = cast(bpd.DataFrame, session.read_pandas(self._obj))
64+
result = bigframes.bigquery.sql_scalar(
65+
sql_template, bf_df, output_dtype=output_dtype
66+
)
67+
return result.to_pandas(ordered=True)

notebooks/getting_started/pandas_extensions.ipynb

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,36 @@
1111
},
1212
{
1313
"cell_type": "code",
14-
"execution_count": 4,
14+
"execution_count": 1,
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"import pandas as pd\n",
1919
"import bigframes # This import registers the bigquery accessor."
2020
]
2121
},
22+
{
23+
"cell_type": "markdown",
24+
"metadata": {},
25+
"source": [
26+
"By default, BigQuery DataFrames selects a location to process data based on the\n",
27+
"data location, but using a pandas object doesn't provide such informat. If\n",
28+
"processing location is important to you, configure the location before using the\n",
29+
"accessor."
30+
]
31+
},
32+
{
33+
"cell_type": "code",
34+
"execution_count": 2,
35+
"metadata": {},
36+
"outputs": [],
37+
"source": [
38+
"import bigframes.pandas as bpd\n",
39+
"\n",
40+
"bpd.reset_session()\n",
41+
"bpd.options.bigquery.location = \"US\""
42+
]
43+
},
2244
{
2345
"cell_type": "markdown",
2446
"metadata": {},
@@ -30,7 +52,7 @@
3052
},
3153
{
3254
"cell_type": "code",
33-
"execution_count": 5,
55+
"execution_count": 3,
3456
"metadata": {},
3557
"outputs": [
3658
{
@@ -56,7 +78,7 @@
5678
"dtype: Float64"
5779
]
5880
},
59-
"execution_count": 5,
81+
"execution_count": 3,
6082
"metadata": {},
6183
"output_type": "execute_result"
6284
}
@@ -76,7 +98,7 @@
7698
},
7799
{
78100
"cell_type": "code",
79-
"execution_count": 6,
101+
"execution_count": 4,
80102
"metadata": {},
81103
"outputs": [
82104
{
@@ -102,14 +124,14 @@
102124
"dtype: Int64"
103125
]
104126
},
105-
"execution_count": 6,
127+
"execution_count": 4,
106128
"metadata": {},
107129
"output_type": "execute_result"
108130
}
109131
],
110132
"source": [
111133
"df = pd.DataFrame({\"a\": [1, 2, 3], \"b\": [10, 20, 30]})\n",
112-
"result = df.bigquery.sql_scalar(\"{0} + {1}\")\n",
134+
"result = df.bigquery.sql_scalar(\"{a} + {b}\")\n",
113135
"result"
114136
]
115137
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
SELECT
2+
`rowindex`,
3+
ROUND(`int64_col` + `int64_too`) AS `0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` AS `bft_0`
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import unittest.mock as mock
16+
17+
import pandas as pd
18+
19+
import bigframes.pandas as bpd
20+
import bigframes.session
21+
22+
23+
def test_sql_scalar(scalar_types_df: bpd.DataFrame, snapshot, monkeypatch):
24+
session = mock.create_autospec(bigframes.session.Session)
25+
session.read_pandas.return_value = scalar_types_df
26+
27+
def to_pandas(series, ordered=True):
28+
sql, _, _ = series.to_frame()._to_sql_query(include_index=True, ordered=ordered)
29+
return sql
30+
31+
monkeypatch.setattr(bpd.Series, "to_pandas", to_pandas)
32+
33+
df = pd.DataFrame({"int64_col": [1, 2], "int64_too": [3, 4]})
34+
result = df.bigquery.sql_scalar(
35+
"ROUND({int64_col} + {int64_too})",
36+
output_dtype=pd.Int64Dtype(),
37+
session=session,
38+
)
39+
40+
session.read_pandas.assert_called_once()
41+
snapshot.assert_match(result, "out.sql")

0 commit comments

Comments
 (0)