Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit 9e132e7

Browse files
committed
chore: Make graph visualization more scalable
We now embed query results directly in the HTML only when they are smaller than 100 KB. For larger results, the HTML stores only a reference to the destination table, and the Python code re-reads the results from that table during the callback. This change also adds a hard limit of 5 MB on the query result size, beyond which graph visualization is not supported at all.
1 parent 4aea24d commit 9e132e7

4 files changed

Lines changed: 368 additions & 47 deletions

File tree

bigquery_magics/bigquery.py

Lines changed: 79 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@
117117
import IPython # type: ignore
118118
from IPython.core import magic_arguments # type: ignore
119119
from IPython.core.getipython import get_ipython
120-
from google.api_core import client_info
121120
from google.api_core.exceptions import NotFound
122121
from google.cloud import bigquery
123122
from google.cloud.bigquery import exceptions
@@ -126,13 +125,11 @@
126125
from google.cloud.bigquery.job import QueryJobConfig
127126
import pandas
128127

129-
from bigquery_magics import environment
130128
from bigquery_magics import line_arg_parser as lap
131129
import bigquery_magics._versions_helpers
132130
import bigquery_magics.config
133131
import bigquery_magics.graph_server as graph_server
134132
import bigquery_magics.pyformat
135-
import bigquery_magics.version
136133

137134
try:
138135
from google.cloud import bigquery_storage # type: ignore
@@ -147,24 +144,6 @@
147144
context = bigquery_magics.config.context
148145

149146

150-
def _get_user_agent():
151-
identities = [
152-
f"ipython-{IPython.__version__}",
153-
f"bigquery-magics/{bigquery_magics.version.__version__}",
154-
]
155-
156-
if environment.is_vscode():
157-
identities.append("vscode")
158-
if environment.is_vscode_google_cloud_code_extension_installed():
159-
identities.append(environment.GOOGLE_CLOUD_CODE_EXTENSION_NAME)
160-
elif environment.is_jupyter():
161-
identities.append("jupyter")
162-
if environment.is_jupyter_bigquery_plugin_installed():
163-
identities.append(environment.BIGQUERY_JUPYTER_PLUGIN_NAME)
164-
165-
return " ".join(identities)
166-
167-
168147
def _handle_error(error, destination_var=None):
169148
"""Process a query execution error.
170149
@@ -565,23 +544,7 @@ def _query_with_pandas(query: str, params: List[Any], args: Any):
565544

566545

567546
def _create_clients(args: Any) -> Tuple[bigquery.Client, Any]:
568-
bigquery_client_options = copy.deepcopy(context.bigquery_client_options)
569-
if args.bigquery_api_endpoint:
570-
if isinstance(bigquery_client_options, dict):
571-
bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint
572-
else:
573-
bigquery_client_options.api_endpoint = args.bigquery_api_endpoint
574-
575-
bq_client = bigquery.Client(
576-
project=args.project or context.project,
577-
credentials=context.credentials,
578-
default_query_job_config=context.default_query_job_config,
579-
client_info=client_info.ClientInfo(user_agent=_get_user_agent()),
580-
client_options=bigquery_client_options,
581-
location=args.location,
582-
)
583-
if context._connection:
584-
bq_client._connection = context._connection
547+
bq_client = graph_server.create_bq_client(args)
585548

586549
# Check and instantiate bq storage client
587550
if args.use_bqstorage_api is not None:
@@ -633,8 +596,9 @@ def _handle_result(result, args):
633596

634597

635598
def _colab_query_callback(query: str, params: str):
    """Return graph data for the Colab front end.

    Args:
        query: Query text passed by the visualization; not used here.
        params: JSON-encoded parameters embedded in the generated HTML. It
            carries ``args`` and ``destination_table``, plus ``query_result``
            when the result was small enough to be inlined.

    Returns:
        An ``IPython.core.display.JSON`` object wrapping the converted graph
        data.
    """
    import types  # Local import: only needed for the fallback path below.

    parsed_params = json.loads(params)
    if "query_result" in parsed_params:
        query_result = parsed_params["query_result"]
    else:
        # Larger results are not inlined into the HTML; only a reference to
        # the destination table is stored, so re-read the rows from it. This
        # mirrors the fallback in the graph server's query handler.
        client_args = types.SimpleNamespace(
            bigquery_api_endpoint=parsed_params["args"]["bigquery_api_endpoint"],
            project=parsed_params["args"]["project"],
            location=parsed_params["args"]["location"],
        )
        bq_client = graph_server.create_bq_client(client_args)
        table_ref = bigquery.TableReference.from_api_repr(
            parsed_params["destination_table"]
        )
        query_result = json.loads(
            bq_client.list_rows(table_ref).to_dataframe().to_json()
        )

    return IPython.core.display.JSON(
        graph_server.convert_graph_data(query_results=query_result)
    )
639603

640604

@@ -663,7 +627,51 @@ def _colab_node_expansion_callback(request: dict, params_str: str):
663627
singleton_server_thread: threading.Thread = None
664628

665629

666-
def _add_graph_widget(query_result):
630+
MAX_GRAPH_VISUALIZATION_SIZE = 5000000
631+
MAX_GRAPH_VISUALIZATION_QUERY_RESULT_SIZE = 100000
632+
633+
634+
def _estimate_json_size(df: pandas.DataFrame) -> int:
635+
"""Approximates the length of df.to_json(orient='records')
636+
without materializing the string.
637+
"""
638+
num_rows, num_cols = df.shape
639+
if num_rows == 0:
640+
return 2 # "[]"
641+
642+
# 1. Key overhead: "column_name": (repeated for every row)
643+
# Includes quotes, colon, and comma separator per field
644+
key_overhead = sum(len(f'"{col}":') + 1 for col in df.columns) * num_rows
645+
646+
# 2. Row structural overhead: { } per row and [ ] for the list
647+
# Plus commas between rows (num_rows - 1)
648+
structural_overhead = (2 * num_rows) + 2 + (num_rows - 1)
649+
650+
# 3. Value lengths
651+
total_val_len = 0
652+
for col in df.columns:
653+
series = df[col]
654+
655+
if pandas.api.types.is_numeric_dtype(series):
656+
# Numeric values (no quotes). Sample for average length to save memory.
657+
sample_size = min(len(series), 1000)
658+
avg_len = series.sample(sample_size).astype(str).str.len().mean()
659+
total_val_len += avg_len * num_rows
660+
661+
elif pandas.api.types.is_bool_dtype(series):
662+
# true (4) or false (5)
663+
total_val_len += series.map({True: 4, False: 5}).sum()
664+
665+
else:
666+
# Strings/Objects: "value" + quotes (2) + rough escaping factor
667+
# .str.len() is relatively memory-efficient
668+
val_chars = series.astype(str).str.len().sum()
669+
total_val_len += val_chars + (2 * num_rows)
670+
671+
return int(key_overhead + structural_overhead + total_val_len)
672+
673+
674+
def _add_graph_widget(query_result: pandas.DataFrame, query_job: Any, args: Any):
667675
try:
668676
from spanner_graphs.graph_visualization import generate_visualization_html
669677
except ImportError as err:
@@ -698,10 +706,36 @@ def _add_graph_widget(query_result):
698706
port = graph_server.graph_server.port
699707

700708
# Create html to invoke the graph server
709+
args_dict = {
710+
"bigquery_api_endpoint": args.bigquery_api_endpoint,
711+
"project": args.project,
712+
"location": args.location,
713+
}
714+
715+
estimated_size = _estimate_json_size(query_result)
716+
if estimated_size > MAX_GRAPH_VISUALIZATION_SIZE:
717+
IPython.display.display(
718+
IPython.core.display.HTML(
719+
"<big><b>Error:</b> The query result is too large for graph visualization.</big>"
720+
)
721+
)
722+
return
723+
724+
table_dict = {
725+
"projectId": query_job.configuration.destination.project,
726+
"datasetId": query_job.configuration.destination.dataset_id,
727+
"tableId": query_job.configuration.destination.table_id,
728+
}
729+
730+
params_dict = {"destination_table": table_dict, "args": args_dict}
731+
if estimated_size < MAX_GRAPH_VISUALIZATION_QUERY_RESULT_SIZE:
732+
params_dict["query_result"] = json.loads(query_result.to_json())
733+
734+
params_str = json.dumps(params_dict)
701735
html_content = generate_visualization_html(
702736
query="placeholder query",
703737
port=port,
704-
params=query_result.to_json().replace("\\", "\\\\").replace('"', '\\"'),
738+
params=params_str.replace("\\", "\\\\").replace('"', '\\"'),
705739
)
706740
IPython.display.display(IPython.core.display.HTML(html_content))
707741

@@ -810,7 +844,7 @@ def _make_bq_query(
810844
result = result.to_dataframe(**dataframe_kwargs)
811845

812846
if args.graph and _supports_graph_widget(result):
813-
_add_graph_widget(result)
847+
_add_graph_widget(result, query_job, args)
814848
return _handle_result(result, args)
815849

816850

@@ -904,7 +938,9 @@ def _make_bqstorage_client(client, client_options):
904938

905939
return client._ensure_bqstorage_client(
906940
client_options=client_options,
907-
client_info=gapic_client_info.ClientInfo(user_agent=_get_user_agent()),
941+
client_info=gapic_client_info.ClientInfo(
942+
user_agent=graph_server._get_user_agent()
943+
),
908944
)
909945

910946

bigquery_magics/graph_server.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,72 @@
1313
# limitations under the License.
1414

1515
import atexit
16+
import copy
1617
import http.server
1718
import json
1819
import socketserver
1920
import threading
2021
from typing import Any, Dict, List
2122

23+
import IPython # type: ignore
24+
from google.api_core import client_info
25+
from google.cloud import bigquery
26+
27+
from bigquery_magics import environment
28+
import bigquery_magics.config
29+
import bigquery_magics.version
30+
31+
32+
context = bigquery_magics.config.context
33+
34+
35+
def _get_user_agent():
    """Build the space-separated user-agent string for API clients.

    The string always names the IPython and bigquery-magics versions. It then
    adds a marker for the hosting environment (VS Code or Jupyter) and, when
    the matching extension or plugin is installed, that plugin's name.
    """
    tokens = [
        f"ipython-{IPython.__version__}",
        f"bigquery-magics/{bigquery_magics.version.__version__}",
    ]

    # VS Code takes precedence; Jupyter is only probed when not in VS Code.
    if environment.is_vscode():
        tokens += ["vscode"]
        if environment.is_vscode_google_cloud_code_extension_installed():
            tokens += [environment.GOOGLE_CLOUD_CODE_EXTENSION_NAME]
    elif environment.is_jupyter():
        tokens += ["jupyter"]
        if environment.is_jupyter_bigquery_plugin_installed():
            tokens += [environment.BIGQUERY_JUPYTER_PLUGIN_NAME]

    user_agent = " ".join(tokens)
    return user_agent
51+
52+
53+
def create_bq_client(args: Any):
    """Create a BigQuery client configured from the magic's arguments.

    Args:
        args (argparse.Namespace): The arguments passed to the cell magic.
            Only ``bigquery_api_endpoint``, ``project`` and ``location``
            are read.

    Returns:
        google.cloud.bigquery.client.Client: The BigQuery client.
    """
    # Work on a copy so the shared context options are never mutated.
    options = copy.deepcopy(context.bigquery_client_options)
    endpoint = args.bigquery_api_endpoint
    if endpoint:
        if not isinstance(options, dict):
            options.api_endpoint = endpoint
        else:
            options["api_endpoint"] = endpoint

    client_kwargs = {
        "project": args.project or context.project,
        "credentials": context.credentials,
        "default_query_job_config": context.default_query_job_config,
        "client_info": client_info.ClientInfo(user_agent=_get_user_agent()),
        "client_options": options,
        "location": args.location,
    }
    client = bigquery.Client(**client_kwargs)

    # Reuse the connection configured on the context when one is set.
    if context._connection:
        client._connection = context._connection

    return client
81+
2282

2383
def execute_node_expansion(params, request):
2484
return {"error": "Node expansion not yet implemented"}
@@ -251,7 +311,23 @@ def handle_post_ping(self):
251311

252312
def handle_post_query(self):
    """Answer a query POST with the converted graph data.

    The posted ``params`` JSON either carries the rows inline under
    ``query_result`` or names a ``destination_table`` (with client ``args``)
    from which the rows are read back through a BigQuery client.
    """
    post_data = self.parse_post_data()
    params = json.loads(post_data["params"])

    if "query_result" in params:
        query_results = params["query_result"]
    else:
        # No inline rows: rebuild the client arguments and read the table.
        class BqArgs:
            pass

        client_args = BqArgs()
        for field in ("bigquery_api_endpoint", "project", "location"):
            setattr(client_args, field, params["args"][field])
        client = create_bq_client(client_args)

        destination = bigquery.TableReference.from_api_repr(
            params["destination_table"]
        )
        frame = client.list_rows(destination).to_dataframe()
        query_results = json.loads(frame.to_json())

    self.do_data_response(convert_graph_data(query_results=query_results))
256332

257333
def handle_post_node_expansion(self):

0 commit comments

Comments
 (0)