|
117 | 117 | import IPython # type: ignore |
118 | 118 | from IPython.core import magic_arguments # type: ignore |
119 | 119 | from IPython.core.getipython import get_ipython |
120 | | -from google.api_core import client_info |
121 | 120 | from google.api_core.exceptions import NotFound |
122 | 121 | from google.cloud import bigquery |
123 | 122 | from google.cloud.bigquery import exceptions |
|
126 | 125 | from google.cloud.bigquery.job import QueryJobConfig |
127 | 126 | import pandas |
128 | 127 |
|
129 | | -from bigquery_magics import environment |
130 | 128 | from bigquery_magics import line_arg_parser as lap |
131 | 129 | import bigquery_magics._versions_helpers |
132 | 130 | import bigquery_magics.config |
133 | 131 | import bigquery_magics.graph_server as graph_server |
| 132 | +from bigquery_magics import core |
134 | 133 | import bigquery_magics.pyformat |
135 | | -import bigquery_magics.version |
136 | 134 |
|
137 | 135 | try: |
138 | 136 | from google.cloud import bigquery_storage # type: ignore |
|
147 | 145 | context = bigquery_magics.config.context |
148 | 146 |
|
149 | 147 |
|
150 | | -def _get_user_agent(): |
151 | | - identities = [ |
152 | | - f"ipython-{IPython.__version__}", |
153 | | - f"bigquery-magics/{bigquery_magics.version.__version__}", |
154 | | - ] |
155 | | - |
156 | | - if environment.is_vscode(): |
157 | | - identities.append("vscode") |
158 | | - if environment.is_vscode_google_cloud_code_extension_installed(): |
159 | | - identities.append(environment.GOOGLE_CLOUD_CODE_EXTENSION_NAME) |
160 | | - elif environment.is_jupyter(): |
161 | | - identities.append("jupyter") |
162 | | - if environment.is_jupyter_bigquery_plugin_installed(): |
163 | | - identities.append(environment.BIGQUERY_JUPYTER_PLUGIN_NAME) |
164 | | - |
165 | | - return " ".join(identities) |
166 | | - |
167 | | - |
168 | 148 | def _handle_error(error, destination_var=None): |
169 | 149 | """Process a query execution error. |
170 | 150 |
|
@@ -565,23 +545,9 @@ def _query_with_pandas(query: str, params: List[Any], args: Any): |
565 | 545 |
|
566 | 546 |
|
567 | 547 | def _create_clients(args: Any) -> Tuple[bigquery.Client, Any]: |
568 | | - bigquery_client_options = copy.deepcopy(context.bigquery_client_options) |
569 | | - if args.bigquery_api_endpoint: |
570 | | - if isinstance(bigquery_client_options, dict): |
571 | | - bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint |
572 | | - else: |
573 | | - bigquery_client_options.api_endpoint = args.bigquery_api_endpoint |
574 | | - |
575 | | - bq_client = bigquery.Client( |
576 | | - project=args.project or context.project, |
577 | | - credentials=context.credentials, |
578 | | - default_query_job_config=context.default_query_job_config, |
579 | | - client_info=client_info.ClientInfo(user_agent=_get_user_agent()), |
580 | | - client_options=bigquery_client_options, |
581 | | - location=args.location, |
| 548 | + bq_client = core.create_bq_client( |
| 549 | + args.project, args.bigquery_api_endpoint, args.location |
582 | 550 | ) |
583 | | - if context._connection: |
584 | | - bq_client._connection = context._connection |
585 | 551 |
|
586 | 552 | # Check and instantiate bq storage client |
587 | 553 | if args.use_bqstorage_api is not None: |
@@ -633,8 +599,9 @@ def _handle_result(result, args): |
633 | 599 |
|
634 | 600 |
|
635 | 601 | def _colab_query_callback(query: str, params: str): |
| 602 | + parsed_params = json.loads(params) |
636 | 603 | return IPython.core.display.JSON( |
637 | | - graph_server.convert_graph_data(query_results=json.loads(params)) |
| 604 | + graph_server.convert_graph_data(query_results=parsed_params["query_result"]) |
638 | 605 | ) |
639 | 606 |
|
640 | 607 |
|
@@ -663,7 +630,50 @@ def _colab_node_expansion_callback(request: dict, params_str: str): |
663 | 630 | singleton_server_thread: threading.Thread = None |
664 | 631 |
|
665 | 632 |
|
666 | | -def _add_graph_widget(query_result): |
| 633 | +MAX_GRAPH_VISUALIZATION_SIZE = 5000000 |
| 634 | +MAX_GRAPH_VISUALIZATION_QUERY_RESULT_SIZE = 100000 |
| 635 | + |
| 636 | + |
| 637 | +def _estimate_json_size(df: pandas.DataFrame) -> int: |
| 638 | + """Approximates the length of df.to_json(orient='records') |
| 639 | + without materializing the string. |
| 640 | + """ |
| 641 | + num_rows, num_cols = df.shape |
| 642 | + if num_rows == 0: |
| 643 | + return 2 # "[]" |
| 644 | + |
| 645 | + # 1. Key overhead: "column_name": (repeated for every row) |
| 646 | + # Includes quotes, colon, and comma separator per field |
| 647 | + key_overhead = sum(len(f'"{col}":') + 1 for col in df.columns) * num_rows |
| 648 | + |
| 649 | + # 2. Row structural overhead: { } per row and [ ] for the list |
| 650 | + # Plus commas between rows (num_rows - 1) |
| 651 | + structural_overhead = (2 * num_rows) + 2 + (num_rows - 1) |
| 652 | + |
| 653 | + # 3. Value lengths |
| 654 | + total_val_len = 0 |
| 655 | + for col in df.columns: |
| 656 | + series = df[col] |
| 657 | + |
| 658 | + if pandas.api.types.is_bool_dtype(series): |
| 659 | + # true (4) or false (5) |
| 660 | + total_val_len += series.map({True: 4, False: 5}).sum() |
| 661 | + elif pandas.api.types.is_numeric_dtype(series): |
| 662 | + # Numeric values (no quotes). Sample for average length to save memory. |
| 663 | + sample_size = min(len(series), 1000) |
| 664 | + avg_len = series.sample(sample_size).astype(str).str.len().mean() |
| 665 | + total_val_len += avg_len * num_rows |
| 666 | + else: |
| 667 | + raise ValueError('Got here #102') |
| 668 | + # Strings/Objects: "value" + quotes (2) + rough escaping factor |
| 669 | + # .str.len() is relatively memory-efficient |
| 670 | + val_chars = series.astype(str).str.len().sum() |
| 671 | + total_val_len += val_chars + (2 * num_rows) |
| 672 | + |
| 673 | + return int(key_overhead + structural_overhead + total_val_len) |
| 674 | + |
| 675 | + |
| 676 | +def _add_graph_widget(query_result: pandas.DataFrame, query_job: Any, args: Any): |
667 | 677 | try: |
668 | 678 | from spanner_graphs.graph_visualization import generate_visualization_html |
669 | 679 | except ImportError as err: |
@@ -698,10 +708,36 @@ def _add_graph_widget(query_result): |
698 | 708 | port = graph_server.graph_server.port |
699 | 709 |
|
700 | 710 | # Create html to invoke the graph server |
| 711 | + args_dict = { |
| 712 | + "bigquery_api_endpoint": args.bigquery_api_endpoint, |
| 713 | + "project": args.project, |
| 714 | + "location": args.location, |
| 715 | + } |
| 716 | + |
| 717 | + estimated_size = _estimate_json_size(query_result) |
| 718 | + if estimated_size > MAX_GRAPH_VISUALIZATION_SIZE: |
| 719 | + IPython.display.display( |
| 720 | + IPython.core.display.HTML( |
| 721 | + "<big><b>Error:</b> The query result is too large for graph visualization.</big>" |
| 722 | + ) |
| 723 | + ) |
| 724 | + return |
| 725 | + |
| 726 | + table_dict = { |
| 727 | + "projectId": query_job.configuration.destination.project, |
| 728 | + "datasetId": query_job.configuration.destination.dataset_id, |
| 729 | + "tableId": query_job.configuration.destination.table_id, |
| 730 | + } |
| 731 | + |
| 732 | + params_dict = {"destination_table": table_dict, "args": args_dict} |
| 733 | + if estimated_size < MAX_GRAPH_VISUALIZATION_QUERY_RESULT_SIZE: |
| 734 | + params_dict["query_result"] = json.loads(query_result.to_json()) |
| 735 | + |
| 736 | + params_str = json.dumps(params_dict) |
701 | 737 | html_content = generate_visualization_html( |
702 | 738 | query="placeholder query", |
703 | 739 | port=port, |
704 | | - params=query_result.to_json().replace("\\", "\\\\").replace('"', '\\"'), |
| 740 | + params=params_str.replace("\\", "\\\\").replace('"', '\\"'), |
705 | 741 | ) |
706 | 742 | IPython.display.display(IPython.core.display.HTML(html_content)) |
707 | 743 |
|
@@ -810,7 +846,7 @@ def _make_bq_query( |
810 | 846 | result = result.to_dataframe(**dataframe_kwargs) |
811 | 847 |
|
812 | 848 | if args.graph and _supports_graph_widget(result): |
813 | | - _add_graph_widget(result) |
| 849 | + _add_graph_widget(result, query_job, args) |
814 | 850 | return _handle_result(result, args) |
815 | 851 |
|
816 | 852 |
|
@@ -904,7 +940,7 @@ def _make_bqstorage_client(client, client_options): |
904 | 940 |
|
905 | 941 | return client._ensure_bqstorage_client( |
906 | 942 | client_options=client_options, |
907 | | - client_info=gapic_client_info.ClientInfo(user_agent=_get_user_agent()), |
| 943 | + client_info=gapic_client_info.ClientInfo(user_agent=core._get_user_agent()), |
908 | 944 | ) |
909 | 945 |
|
910 | 946 |
|
|
0 commit comments