Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit b0429e8

Browse files
committed
chore: Make graph visualization more scalable
We now only include query results directly in the HTML when the query results are smaller than 100 KB. For larger query results, we store only a reference to the destination table in the HTML, and have the Python code re-read the query results from the destination table during the callback. Also, added a hard limit of 5 MB on the query result size, beyond which graph visualization is not supported at all.
1 parent 4aea24d commit b0429e8

5 files changed

Lines changed: 459 additions & 46 deletions

File tree

bigquery_magics/bigquery.py

Lines changed: 78 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@
117117
import IPython # type: ignore
118118
from IPython.core import magic_arguments # type: ignore
119119
from IPython.core.getipython import get_ipython
120-
from google.api_core import client_info
121120
from google.api_core.exceptions import NotFound
122121
from google.cloud import bigquery
123122
from google.cloud.bigquery import exceptions
@@ -126,13 +125,12 @@
126125
from google.cloud.bigquery.job import QueryJobConfig
127126
import pandas
128127

129-
from bigquery_magics import environment
130128
from bigquery_magics import line_arg_parser as lap
131129
import bigquery_magics._versions_helpers
132130
import bigquery_magics.config
133131
import bigquery_magics.graph_server as graph_server
132+
from bigquery_magics import core
134133
import bigquery_magics.pyformat
135-
import bigquery_magics.version
136134

137135
try:
138136
from google.cloud import bigquery_storage # type: ignore
@@ -147,24 +145,6 @@
147145
context = bigquery_magics.config.context
148146

149147

150-
def _get_user_agent():
151-
identities = [
152-
f"ipython-{IPython.__version__}",
153-
f"bigquery-magics/{bigquery_magics.version.__version__}",
154-
]
155-
156-
if environment.is_vscode():
157-
identities.append("vscode")
158-
if environment.is_vscode_google_cloud_code_extension_installed():
159-
identities.append(environment.GOOGLE_CLOUD_CODE_EXTENSION_NAME)
160-
elif environment.is_jupyter():
161-
identities.append("jupyter")
162-
if environment.is_jupyter_bigquery_plugin_installed():
163-
identities.append(environment.BIGQUERY_JUPYTER_PLUGIN_NAME)
164-
165-
return " ".join(identities)
166-
167-
168148
def _handle_error(error, destination_var=None):
169149
"""Process a query execution error.
170150
@@ -565,23 +545,9 @@ def _query_with_pandas(query: str, params: List[Any], args: Any):
565545

566546

567547
def _create_clients(args: Any) -> Tuple[bigquery.Client, Any]:
568-
bigquery_client_options = copy.deepcopy(context.bigquery_client_options)
569-
if args.bigquery_api_endpoint:
570-
if isinstance(bigquery_client_options, dict):
571-
bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint
572-
else:
573-
bigquery_client_options.api_endpoint = args.bigquery_api_endpoint
574-
575-
bq_client = bigquery.Client(
576-
project=args.project or context.project,
577-
credentials=context.credentials,
578-
default_query_job_config=context.default_query_job_config,
579-
client_info=client_info.ClientInfo(user_agent=_get_user_agent()),
580-
client_options=bigquery_client_options,
581-
location=args.location,
548+
bq_client = core.create_bq_client(
549+
args.project, args.bigquery_api_endpoint, args.location
582550
)
583-
if context._connection:
584-
bq_client._connection = context._connection
585551

586552
# Check and instantiate bq storage client
587553
if args.use_bqstorage_api is not None:
@@ -633,8 +599,9 @@ def _handle_result(result, args):
633599

634600

635601
def _colab_query_callback(query: str, params: str):
    """Colab callback that converts embedded query results into graph JSON.

    Args:
        query: The original query text (unused by the conversion itself).
        params: JSON-encoded parameter object containing a "query_result" key.

    Returns:
        IPython.core.display.JSON: The converted graph data for display.
    """
    query_result = json.loads(params)["query_result"]
    graph_data = graph_server.convert_graph_data(query_results=query_result)
    return IPython.core.display.JSON(graph_data)
639606

640607

@@ -663,7 +630,50 @@ def _colab_node_expansion_callback(request: dict, params_str: str):
663630
singleton_server_thread: threading.Thread = None
664631

665632

666-
def _add_graph_widget(query_result):
633+
MAX_GRAPH_VISUALIZATION_SIZE = 5000000
634+
MAX_GRAPH_VISUALIZATION_QUERY_RESULT_SIZE = 100000
635+
636+
637+
def _estimate_json_size(df: pandas.DataFrame) -> int:
638+
"""Approximates the length of df.to_json(orient='records')
639+
without materializing the string.
640+
"""
641+
num_rows, num_cols = df.shape
642+
if num_rows == 0:
643+
return 2 # "[]"
644+
645+
# 1. Key overhead: "column_name": (repeated for every row)
646+
# Includes quotes, colon, and comma separator per field
647+
key_overhead = sum(len(f'"{col}":') + 1 for col in df.columns) * num_rows
648+
649+
# 2. Row structural overhead: { } per row and [ ] for the list
650+
# Plus commas between rows (num_rows - 1)
651+
structural_overhead = (2 * num_rows) + 2 + (num_rows - 1)
652+
653+
# 3. Value lengths
654+
total_val_len = 0
655+
for col in df.columns:
656+
series = df[col]
657+
658+
if pandas.api.types.is_bool_dtype(series):
659+
# true (4) or false (5)
660+
total_val_len += series.map({True: 4, False: 5}).sum()
661+
elif pandas.api.types.is_numeric_dtype(series):
662+
# Numeric values (no quotes). Sample for average length to save memory.
663+
sample_size = min(len(series), 1000)
664+
avg_len = series.sample(sample_size).astype(str).str.len().mean()
665+
total_val_len += avg_len * num_rows
666+
else:
667+
raise ValueError('Got here #102')
668+
# Strings/Objects: "value" + quotes (2) + rough escaping factor
669+
# .str.len() is relatively memory-efficient
670+
val_chars = series.astype(str).str.len().sum()
671+
total_val_len += val_chars + (2 * num_rows)
672+
673+
return int(key_overhead + structural_overhead + total_val_len)
674+
675+
676+
def _add_graph_widget(query_result: pandas.DataFrame, query_job: Any, args: Any):
667677
try:
668678
from spanner_graphs.graph_visualization import generate_visualization_html
669679
except ImportError as err:
@@ -698,10 +708,36 @@ def _add_graph_widget(query_result):
698708
port = graph_server.graph_server.port
699709

700710
# Create html to invoke the graph server
711+
args_dict = {
712+
"bigquery_api_endpoint": args.bigquery_api_endpoint,
713+
"project": args.project,
714+
"location": args.location,
715+
}
716+
717+
estimated_size = _estimate_json_size(query_result)
718+
if estimated_size > MAX_GRAPH_VISUALIZATION_SIZE:
719+
IPython.display.display(
720+
IPython.core.display.HTML(
721+
"<big><b>Error:</b> The query result is too large for graph visualization.</big>"
722+
)
723+
)
724+
return
725+
726+
table_dict = {
727+
"projectId": query_job.configuration.destination.project,
728+
"datasetId": query_job.configuration.destination.dataset_id,
729+
"tableId": query_job.configuration.destination.table_id,
730+
}
731+
732+
params_dict = {"destination_table": table_dict, "args": args_dict}
733+
if estimated_size < MAX_GRAPH_VISUALIZATION_QUERY_RESULT_SIZE:
734+
params_dict["query_result"] = json.loads(query_result.to_json())
735+
736+
params_str = json.dumps(params_dict)
701737
html_content = generate_visualization_html(
702738
query="placeholder query",
703739
port=port,
704-
params=query_result.to_json().replace("\\", "\\\\").replace('"', '\\"'),
740+
params=params_str.replace("\\", "\\\\").replace('"', '\\"'),
705741
)
706742
IPython.display.display(IPython.core.display.HTML(html_content))
707743

@@ -810,7 +846,7 @@ def _make_bq_query(
810846
result = result.to_dataframe(**dataframe_kwargs)
811847

812848
if args.graph and _supports_graph_widget(result):
813-
_add_graph_widget(result)
849+
_add_graph_widget(result, query_job, args)
814850
return _handle_result(result, args)
815851

816852

@@ -904,7 +940,7 @@ def _make_bqstorage_client(client, client_options):
904940

905941
return client._ensure_bqstorage_client(
906942
client_options=client_options,
907-
client_info=gapic_client_info.ClientInfo(user_agent=_get_user_agent()),
943+
client_info=gapic_client_info.ClientInfo(user_agent=core._get_user_agent()),
908944
)
909945

910946

bigquery_magics/core.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright 2024 Google LLC
2+
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import copy
16+
from google.api_core import client_info
17+
from google.cloud import bigquery
18+
import IPython # type: ignore
19+
from bigquery_magics import environment
20+
import bigquery_magics.config
21+
import bigquery_magics.version
22+
23+
context = bigquery_magics.config.context
24+
25+
26+
def _get_user_agent():
    """Builds the user-agent string sent with BigQuery API requests.

    Identifies the IPython version, this package's version, and — when
    detectable — the hosting environment (VS Code or Jupyter) plus any
    installed Google Cloud extensions/plugins.
    """
    identities = []
    identities.append(f"ipython-{IPython.__version__}")
    identities.append(f"bigquery-magics/{bigquery_magics.version.__version__}")

    if environment.is_vscode():
        identities.append("vscode")
        if environment.is_vscode_google_cloud_code_extension_installed():
            identities.append(environment.GOOGLE_CLOUD_CODE_EXTENSION_NAME)
    elif environment.is_jupyter():
        identities.append("jupyter")
        if environment.is_jupyter_bigquery_plugin_installed():
            identities.append(environment.BIGQUERY_JUPYTER_PLUGIN_NAME)

    return " ".join(identities)
42+
43+
44+
def create_bq_client(project: str, bigquery_api_endpoint: str, location: str):
    """Creates a BigQuery client.

    Args:
        project: Project to use for api calls, None to obtain the project from the context.
        bigquery_api_endpoint: Bigquery client endpoint.
        location: Cloud region to use for api calls.

    Returns:
        google.cloud.bigquery.client.Client: The BigQuery client.
    """
    client_options = copy.deepcopy(context.bigquery_client_options)
    if bigquery_api_endpoint:
        # context.bigquery_client_options may be a plain dict or an options object.
        if isinstance(client_options, dict):
            client_options["api_endpoint"] = bigquery_api_endpoint
        else:
            client_options.api_endpoint = bigquery_api_endpoint

    client = bigquery.Client(
        project=project if project else context.project,
        credentials=context.credentials,
        default_query_job_config=context.default_query_job_config,
        client_info=client_info.ClientInfo(user_agent=_get_user_agent()),
        client_options=client_options,
        location=location,
    )
    # Honor an explicitly configured connection on the shared context, if any.
    if context._connection:
        client._connection = context._connection

    return client

bigquery_magics/graph_server.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,18 @@
1313
# limitations under the License.
1414

1515
import atexit
16+
import copy
1617
import http.server
1718
import json
1819
import socketserver
1920
import threading
2021
from typing import Any, Dict, List
2122

23+
import IPython # type: ignore
24+
from google.cloud import bigquery
25+
26+
from bigquery_magics import core
27+
2228

2329
def execute_node_expansion(params, request):
2430
return {"error": "Node expansion not yet implemented"}
@@ -251,7 +257,25 @@ def handle_post_ping(self):
251257

252258
def handle_post_query(self):
    """Handles a graph-query POST request.

    Uses the query results embedded in the request params when present;
    otherwise re-reads them from the query's BigQuery destination table
    (the case where results were too large to embed in the HTML).
    """
    post_data = self.parse_post_data()
    params = json.loads(post_data["params"])

    if "query_result" in params:
        query_results = params["query_result"]
    else:
        # Results were not embedded; fetch them from the destination table.
        args = params["args"]
        bq_client = core.create_bq_client(
            args["project"],
            args["bigquery_api_endpoint"],
            args["location"],
        )
        table_ref = bigquery.TableReference.from_api_repr(
            params["destination_table"]
        )
        dataframe = bq_client.list_rows(table_ref).to_dataframe()
        query_results = json.loads(dataframe.to_json())

    self.do_data_response(convert_graph_data(query_results=query_results))
256280

257281
def handle_post_node_expansion(self):

0 commit comments

Comments
 (0)