@@ -449,13 +449,6 @@ def _query_to_destination(
449449 index_cols : List [str ],
450450 api_name : str ,
451451 ) -> Tuple [Optional [bigquery .TableReference ], Optional [bigquery .QueryJob ]]:
452- # If there are no index columns, then there's no reason to cache to a
453- # (clustered) session table, as we'll just have to query it again to
454- # create a default index & ordering.
455- if not index_cols :
456- _ , query_job = self ._start_query (query )
457- return query_job .destination , query_job
458-
459452 # If a dry_run indicates this is not a query type job, then don't
460453 # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
461454 dry_run_config = bigquery .QueryJobConfig ()
@@ -465,15 +458,24 @@ def _query_to_destination(
465458 _ , query_job = self ._start_query (query )
466459 return query_job .destination , query_job
467460
468- # Make sure we cluster by the index column(s) so that subsequent
469- # operations are as speedy as they can be.
461+ # Create a table to work around the BigQuery 10 GB query results limit. See:
462+ # internal issue 303057336.
463+ # Since we have a `statement_type == 'SELECT'`, schema should be populated.
464+ schema = typing .cast (Iterable [bigquery .SchemaField ], dry_run_job .schema )
465+ temp_table = self ._create_session_table_empty (api_name , schema , index_cols )
466+
467+ job_config = bigquery .QueryJobConfig ()
468+ job_config .destination = temp_table
469+
470470 try :
471- ibis_expr = self .ibis_client .sql (query )
472- return self ._ibis_to_session_table (ibis_expr , index_cols , api_name ), None
471+ # Write to temp table to work around the BigQuery 10 GB query results
472+ # limit. See: internal issue 303057336.
473+ _ , query_job = self ._start_query (query , job_config = job_config )
474+ return query_job .destination , query_job
473475 except google .api_core .exceptions .BadRequest :
474- # Some SELECT statements still aren't compatible with CREATE TEMP
475- # TABLE ... AS SELECT ... statements. For example, if the query has
476- # a top-level ORDER BY, this conflicts with our ability to cluster
476+ # Some SELECT statements still aren't compatible with clustered
477+ # tables as the destination. For example, if the query has a
478+ # top-level ORDER BY, this conflicts with our ability to cluster
477479 # the table by the index column(s).
478480 _ , query_job = self ._start_query (query )
479481 return query_job .destination , query_job
@@ -1231,6 +1233,54 @@ def _create_session_table(self) -> bigquery.TableReference:
12311233 )
12321234 return dataset .table (table_name )
12331235
def _create_session_table_empty(
    self,
    api_name: str,
    schema: Iterable[bigquery.SchemaField],
    cluster_cols: List[str],
) -> bigquery.TableReference:
    """Create an empty temporary session table, clustered where possible.

    The table is created with a ``CREATE TEMP TABLE`` DDL statement so that
    a later query job can use it as its destination table.

    Args:
        api_name: Name of the calling API; attached to the DDL job as the
            ``bigframes-api`` label.
        schema: Fields of the table to create.
        cluster_cols: Candidate clustering columns, in priority order.
            Names that are missing from ``schema`` or whose field is
            rejected by ``_can_cluster_bq`` are dropped, and at most
            ``_MAX_CLUSTER_COLUMNS`` of the remainder are used.

    Returns:
        The ``destination`` reported by the DDL query job.
    """
    # ``schema`` is only promised to be an Iterable but is walked more than
    # once below; materialize it so a one-shot iterator isn't exhausted by
    # the first pass.
    schema = list(schema)

    # Can't set a table in _SESSION as destination via query job API, so we
    # run DDL, instead.
    table = self._create_session_table()
    schema_sql = bigframes_io.bq_schema_to_sql(schema)

    # Keep the caller's column order (not the schema's): the order of the
    # CLUSTER BY columns determines how the data is sorted, and the
    # truncation below should keep the highest-priority columns.
    # ``dict.fromkeys`` drops duplicate names while preserving that order.
    fields_by_name = {field.name: field for field in schema}
    clusterable_cols = [
        name
        for name in dict.fromkeys(cluster_cols)
        if name in fields_by_name and _can_cluster_bq(fields_by_name[name])
    ][:_MAX_CLUSTER_COLUMNS]

    if clusterable_cols:
        cluster_cols_sql = ", ".join(
            f"`{cluster_col}`" for cluster_col in clusterable_cols
        )
        cluster_sql = f"CLUSTER BY {cluster_cols_sql}"
    else:
        cluster_sql = ""

    ddl_text = f"""
    CREATE TEMP TABLE
    `_SESSION`.`{table.table_id}`
    ({schema_sql})
    {cluster_sql}
    """

    job_config = bigquery.QueryJobConfig()

    # Include a label so that Dataplex Lineage can identify temporary
    # tables that BigQuery DataFrames creates. Googlers: See internal issue
    # 296779699. We're labeling the job instead of the table because
    # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
    # supported`.
    job_config.labels = {
        "source": "bigquery-dataframes-temp",
        "bigframes-api": api_name,
    }

    _, query_job = self._start_query(ddl_text, job_config=job_config)

    # Use fully-qualified name instead of `_SESSION` name so that the
    # created table can be used as the destination table.
    return query_job.destination
1283+
12341284 def _create_sequential_ordering (
12351285 self ,
12361286 table : ibis_types .Table ,
@@ -1249,7 +1299,9 @@ def _create_sequential_ordering(
12491299 cluster_cols = list (index_cols ) + [default_ordering_name ],
12501300 api_name = api_name ,
12511301 )
1252- table = self .ibis_client .sql (f"SELECT * FROM `{ table_ref .table_id } `" )
1302+ table = self .ibis_client .table (
1303+ f"{ table_ref .project } .{ table_ref .dataset_id } .{ table_ref .table_id } "
1304+ )
12531305 ordering_reference = core .OrderingColumnReference (default_ordering_name )
12541306 ordering = core .ExpressionOrdering (
12551307 ordering_value_columns = [ordering_reference ],
@@ -1264,55 +1316,13 @@ def _ibis_to_session_table(
12641316 cluster_cols : Iterable [str ],
12651317 api_name : str ,
12661318 ) -> bigquery .TableReference :
1267- clusterable_cols = [
1268- col for col in cluster_cols if _can_cluster (table [col ].type ())
1269- ][:_MAX_CLUSTER_COLUMNS ]
1270- return self ._query_to_session_table (
1319+ desination , _ = self ._query_to_destination (
12711320 self .ibis_client .compile (table ),
1272- cluster_cols = clusterable_cols ,
1321+ index_cols = list ( cluster_cols ) ,
12731322 api_name = api_name ,
12741323 )
1275-
1276- def _query_to_session_table (
1277- self ,
1278- query_text : str ,
1279- cluster_cols : Iterable [str ],
1280- api_name : str ,
1281- ) -> bigquery .TableReference :
1282- if len (list (cluster_cols )) > _MAX_CLUSTER_COLUMNS :
1283- raise ValueError (
1284- f"Too many cluster columns: { list (cluster_cols )} , max { _MAX_CLUSTER_COLUMNS } allowed."
1285- )
1286- # Can't set a table in _SESSION as destination via query job API, so we
1287- # run DDL, instead.
1288- table = self ._create_session_table ()
1289- cluster_cols_sql = ", " .join (f"`{ cluster_col } `" for cluster_col in cluster_cols )
1290-
1291- # TODO(swast): This might not support multi-statement SQL queries (scripts).
1292- ddl_text = f"""
1293- CREATE TEMP TABLE `_SESSION`.`{ table .table_id } `
1294- CLUSTER BY { cluster_cols_sql }
1295- AS { query_text }
1296- """
1297-
1298- job_config = bigquery .QueryJobConfig ()
1299-
1300- # Include a label so that Dataplex Lineage can identify temporary
1301- # tables that BigQuery DataFrames creates. Googlers: See internal issue
1302- # 296779699. We're labeling the job instead of the table because
1303- # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
1304- # supported`.
1305- job_config .labels = {"source" : "bigquery-dataframes-temp" }
1306- job_config .labels ["bigframes-api" ] = api_name
1307-
1308- try :
1309- self ._start_query (
1310- ddl_text , job_config = job_config
1311- ) # Wait for the job to complete
1312- except google .api_core .exceptions .Conflict :
1313- # Allow query retry to succeed.
1314- pass
1315- return table
1324+ # There should always be a destination table for this query type.
1325+ return typing .cast (bigquery .TableReference , desination )
13161326
13171327 def remote_function (
13181328 self ,
@@ -1494,14 +1504,21 @@ def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Sessi
14941504 return Session (context )
14951505
14961506
1497- def _can_cluster ( ibis_type : ibis_dtypes . DataType ):
1507+ def _can_cluster_bq ( field : bigquery . SchemaField ):
14981508 # https://cloud.google.com/bigquery/docs/clustered-tables
14991509 # Notably, float is excluded
1500- return (
1501- ibis_type .is_integer ()
1502- or ibis_type .is_string ()
1503- or ibis_type .is_decimal ()
1504- or ibis_type .is_date ()
1505- or ibis_type .is_timestamp ()
1506- or ibis_type .is_boolean ()
1510+ type_ = field .field_type
1511+ return type_ in (
1512+ "INTEGER" ,
1513+ "INT64" ,
1514+ "STRING" ,
1515+ "NUMERIC" ,
1516+ "DECIMAL" ,
1517+ "BIGNUMERIC" ,
1518+ "BIGDECIMAL" ,
1519+ "DATE" ,
1520+ "DATETIME" ,
1521+ "TIMESTAMP" ,
1522+ "BOOL" ,
1523+ "BOOLEAN" ,
15071524 )
0 commit comments