Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 46c9d48

Browse files
committed
fix: avoid views when querying BigLake tables from SQL cells
1 parent af49ca2 commit 46c9d48

File tree

4 files changed

+94
-17
lines changed

4 files changed

+94
-17
lines changed

bigframes/core/compile/ibis_compiler/ibis_compiler.py

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -221,18 +221,15 @@ def _table_to_ibis(
221221
physical_schema = ibis_bigquery.BigQuerySchema.to_ibis(
222222
list(source.table.physical_schema)
223223
)
224-
if source.at_time is not None or source.sql_predicate is not None:
225-
import bigframes.session._io.bigquery
226-
227-
sql = bigframes.session._io.bigquery.to_query(
228-
full_table_name,
229-
columns=scan_cols,
230-
sql_predicate=source.sql_predicate,
231-
time_travel_timestamp=source.at_time,
232-
)
233-
return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql)
234-
else:
235-
return ibis_api.table(physical_schema, full_table_name).select(scan_cols)
224+
import bigframes.session._io.bigquery
225+
226+
sql = bigframes.session._io.bigquery.to_query(
227+
full_table_name,
228+
columns=scan_cols,
229+
sql_predicate=source.sql_predicate,
230+
time_travel_timestamp=source.at_time,
231+
)
232+
return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql)
236233

237234

238235
@_compile_node.register

bigframes/core/pyformat.py

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121

2222
import string
2323
import typing
24-
from typing import Any, Optional, Union
24+
from typing import Any, Optional, Tuple, Union
2525

2626
import google.cloud.bigquery
2727
import pandas
@@ -39,7 +39,11 @@
3939

4040

4141
def _table_to_sql(table: _BQ_TABLE_TYPES) -> str:
42-
return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"
42+
# BiglakeIcebergTable IDs have 4 parts. BigFrames packs catalog.namespace
43+
# into the dataset_id.
44+
dataset_parts = table.dataset_id.split(".")
45+
dataset_sql = ".".join(f"`{part}`" for part in dataset_parts)
46+
return f"`{table.project}`.{dataset_sql}.`{table.table_id}`"
4347

4448

4549
def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str:
@@ -102,6 +106,24 @@ def _field_to_template_value(
102106
return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name)
103107

104108
if isinstance(value, bigframes.dataframe.DataFrame):
109+
import bigframes.core.bq_data as bq_data
110+
import bigframes.core.nodes as nodes
111+
112+
# TODO(b/493608478): Remove this workaround for BigLake/Iceberg tables,
113+
# which cannot currently be used in views, once a fix rolls out.
114+
def is_biglake(
115+
node: nodes.BigFrameNode, child_results: Tuple[bool, ...]
116+
) -> bool:
117+
if isinstance(node, nodes.ReadTableNode):
118+
return isinstance(node.source.table, bq_data.BiglakeIcebergTable)
119+
return any(child_results)
120+
121+
contains_biglake = value._block.expr.node.reduce_up(is_biglake)
122+
123+
if contains_biglake:
124+
sql_query, _, _ = value._to_sql_query(include_index=False)
125+
return f"({sql_query})"
126+
105127
return _table_to_sql(value._to_placeholder_table(dry_run=dry_run))
106128

107129
if isinstance(value, str):

bigframes/session/_io/bigquery/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -519,9 +519,13 @@ def to_query(
519519
time_travel_timestamp: Optional[datetime.datetime] = None,
520520
) -> str:
521521
"""Compile query_or_table with conditions(filters, wildcards) to query."""
522-
sub_query = (
523-
f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`"
524-
)
522+
if is_query(query_or_table):
523+
sub_query = f"({query_or_table})"
524+
else:
525+
# Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe.
526+
# See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
527+
parts = query_or_table.split(".")
528+
sub_query = ".".join(f"`{part}`" for part in parts)
525529

526530
# TODO(b/338111344): Generate an index based on DefaultIndexKind if we
527531
# don't have index columns specified.

tests/unit/core/test_pyformat.py

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -500,6 +500,15 @@ def test_pyformat_with_query_string_replaces_variables(session):
500500
),
501501
"SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`",
502502
),
503+
(
504+
google.cloud.bigquery.TableReference(
505+
google.cloud.bigquery.DatasetReference(
506+
"my-project", "my-catalog.my-namespace"
507+
),
508+
"my-table",
509+
),
510+
"SELECT * FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`",
511+
),
503512
),
504513
)
505514
def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session):
@@ -511,3 +520,48 @@ def test_pyformat_with_table_replaces_variables(table, expected_sql, session=ses
511520
sql = "SELECT * FROM {table}"
512521
got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
513522
assert got_sql == expected_sql
523+
524+
525+
def test_pyformat_with_bigframes_dataframe_biglake_table(session):
526+
# Create a real BigFrames DataFrame that points to a BigLake table.
527+
import bigframes.core.array_value as array_value
528+
import bigframes.core.blocks as blocks
529+
import bigframes.core.bq_data as bq_data
530+
import bigframes.dataframe
531+
532+
# Define the BigLake table
533+
project_id = "my-project"
534+
catalog_id = "my-catalog"
535+
namespace_id = "my-namespace"
536+
table_id = "my-table"
537+
schema = (google.cloud.bigquery.SchemaField("col", "INTEGER"),)
538+
539+
biglake_table = bq_data.BiglakeIcebergTable(
540+
project_id=project_id,
541+
catalog_id=catalog_id,
542+
namespace_id=namespace_id,
543+
table_id=table_id,
544+
physical_schema=schema,
545+
cluster_cols=(),
546+
metadata=bq_data.TableMetadata(
547+
location=bq_data.BigQueryRegion("us-central1"),
548+
type="TABLE",
549+
),
550+
)
551+
552+
# ArrayValue.from_table is what read_gbq uses.
553+
av = array_value.ArrayValue.from_table(biglake_table, session)
554+
block = blocks.Block(av, index_columns=[], column_labels=["col"])
555+
df = bigframes.dataframe.DataFrame(block)
556+
557+
pyformat_args = {"df": df}
558+
sql = "SELECT * FROM {df}"
559+
560+
got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
561+
562+
# For BigLake, we now expect a SUBQUERY, not a view reference.
563+
# The subquery should have correctly quoted 4-part ID.
564+
assert "SELECT" in got_sql
565+
assert "FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`" in got_sql
566+
assert got_sql.startswith("SELECT * FROM (SELECT")
567+
assert got_sql.endswith(")")

0 commit comments

Comments (0)