Skip to content
This repository was archived by the owner on Mar 13, 2026. It is now read-only.

Commit cc2be7c

Browse files
committed
enrich dry run result with more stats
1 parent 78edab9 commit cc2be7c

6 files changed

Lines changed: 191 additions & 18 deletions

File tree

pandas_gbq/dry_runs.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
from __future__ import annotations
6+
7+
import copy
8+
from typing import Any, List
9+
10+
from google.cloud import bigquery
11+
import pandas
12+
13+
14+
def get_query_stats(
    query_job: bigquery.QueryJob,
) -> pandas.Series:
    """Return important stats from the query job as a pandas Series.

    The Series has object dtype and contains, in this order:

    * ``bigquerySchema``: ``query_job.schema`` as-is.
    * one entry per key found in the job's ``jobReference`` section
      (none when the section is absent).
    * ``jobType`` and ``dispatchedSql`` (the ``configuration.query.query``
      text).
    * ``destinationTable`` and ``useLegacySql`` from ``configuration.query``.
    * ``referencedTables``, ``totalBytesProcessed``, ``cacheHit`` and
      ``statementType`` from ``statistics.query``.
    * ``creationTime``: ``statistics.creationTime`` interpreted as
      milliseconds since the epoch, as a UTC ``pandas.Timestamp``.

    Fields missing from the job representation are reported as ``None``,
    except ``totalBytesProcessed`` which is always an ``int`` (``0`` when
    missing).

    Args:
        query_job: Job whose ``schema`` and ``_properties`` are read.

    Returns:
        A ``pandas.Series`` of job statistics indexed by field name.
    """
    # Deep copy so nested values stored in the result (e.g. the
    # destinationTable mapping) never alias the job's internal state.
    job_api_repr = copy.deepcopy(query_job._properties)

    # `or {}` also covers sections that are present but explicitly null.
    # jobReference might not be populated for "job optional" queries.
    job_ref = job_api_repr.get("jobReference") or {}
    configuration = job_api_repr.get("configuration") or {}
    query_config = configuration.get("query") or {}
    statistics = job_api_repr.get("statistics") or {}
    query_stats = statistics.get("query") or {}

    # Normalise before building the Series: once pandas has inferred a dtype,
    # a missing value may have been coerced to NaN/NaT and no longer be None.
    raw_bytes_processed = query_stats.get("totalBytesProcessed")
    total_bytes_processed = (
        0 if raw_bytes_processed is None else int(raw_bytes_processed)
    )

    creation_time = statistics.get("creationTime")
    created = (
        pandas.Timestamp(creation_time, unit="ms", tz="UTC")
        if creation_time is not None
        else None
    )

    # Add raw BQ schema first, then the remaining fields in a fixed order.
    entries = [("bigquerySchema", query_job.schema)]
    entries.extend(job_ref.items())
    entries.append(("jobType", configuration.get("jobType")))
    entries.append(("dispatchedSql", query_config.get("query")))
    entries.extend(
        (key, query_config.get(key)) for key in ("destinationTable", "useLegacySql")
    )
    entries.append(("referencedTables", query_stats.get("referencedTables")))
    entries.append(("totalBytesProcessed", total_bytes_processed))
    entries.extend(
        (key, query_stats.get(key)) for key in ("cacheHit", "statementType")
    )
    entries.append(("creationTime", created))

    index = [key for key, _ in entries]
    values = [val for _, val in entries]

    # dtype=object keeps every value exactly as stored (None stays None)
    # regardless of what the other entries happen to be.
    return pandas.Series(values, index=index, dtype=object)

pandas_gbq/gbq.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -269,9 +269,9 @@ def read_gbq(
269269
If True, run a dry run query.
270270
Returns
271271
-------
272-
df: DataFrame or int
272+
df: DataFrame or Series
273273
DataFrame representing results of query. If ``dry_run=True``, returns
274-
aan integer representing the amount of data that would be processed (in bytes).
274+
a Pandas series that contains job statistics.
275275
"""
276276
if dialect is None:
277277
dialect = context.dialect
@@ -328,7 +328,7 @@ def read_gbq(
328328
dtypes=dtypes,
329329
dry_run=dry_run,
330330
)
331-
# When dry_run=True, run_query returns a float (cost in GB), not a DataFrame
331+
# When dry_run=True, run_query returns a Pandas series
332332
if dry_run:
333333
return final_df
334334
else:

pandas_gbq/gbq_connector.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
if typing.TYPE_CHECKING: # pragma: NO COVER
1717
import pandas
1818

19+
from pandas_gbq import dry_runs
1920
import pandas_gbq.constants
2021
from pandas_gbq.contexts import context
2122
import pandas_gbq.core.read
@@ -249,15 +250,13 @@ def run_query(
249250
if dry_run:
250251
# Build the dry-run statistics from the QueryJob exposed via RowIterator.job
251252
# RowIterator has a job attribute that references the QueryJob
252-
query_job = (
253-
rows_iter.job if hasattr(rows_iter, "job") and rows_iter.job else None
254-
)
255-
if query_job is None:
256-
# Fallback: if query_and_wait_via_client_library doesn't set job,
257-
# we need to get it from the query result
258-
# For query_and_wait_via_client_library, the RowIterator should have job set
259-
raise ValueError("Cannot access QueryJob from RowIterator for dry_run")
260-
return query_job.total_bytes_processed
253+
if hasattr(rows_iter, "job") and rows_iter.job:
254+
return dry_runs.get_query_stats(rows_iter.job)
255+
256+
# Fallback: if query_and_wait_via_client_library doesn't set job,
257+
# we need to get it from the query result
258+
# For query_and_wait_via_client_library, the RowIterator should have job set
259+
raise ValueError("Cannot access QueryJob from RowIterator for dry_run")
261260

262261
return self._download_results(
263262
rows_iter,

tests/system/test_gbq.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -656,15 +656,15 @@ def test_columns_and_col_order_raises_error(self, project_id):
656656

657657
def test_read_gbq_with_dry_run(self, project_id):
658658
query = "SELECT 1"
659-
cost = gbq.read_gbq(
659+
result = gbq.read_gbq(
660660
query,
661661
project_id=project_id,
662662
credentials=self.credentials,
663663
dialect="standard",
664664
dry_run=True,
665665
)
666-
assert isinstance(cost, float)
667-
assert cost > 0
666+
assert isinstance(result, pandas.Series)
667+
assert result["totalBytesProcessed"] >= 0
668668

669669

670670
class TestToGBQIntegration(object):

tests/unit/test_dry_runs.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
from unittest import mock
6+
7+
from google.cloud import bigquery
8+
import pandas
9+
import pandas.testing
10+
11+
from pandas_gbq import dry_runs
12+
13+
14+
def test_get_query_stats():
    """get_query_stats extracts the documented fields, in order, with their values."""
    mock_query_job = mock.create_autospec(bigquery.QueryJob)
    total_bytes_processed = 15
    sql = "SELECT * FROM `test_table`"
    destination_table = {
        "projectId": "project-id",
        "datasetId": "dataset-id",
        "tableId": "table-id",
    }
    creation_time_ms = 1767037135155.0
    mock_query_job._properties = {
        "kind": "bigquery#job",
        "etag": "e-tag",
        "id": "id",
        "selfLink": "self-link",
        "user_email": "user-emial",
        "configuration": {
            "query": {
                "query": sql,
                "destinationTable": dict(destination_table),
                "writeDisposition": "WRITE_TRUNCATE",
                "priority": "INTERACTIVE",
                "useLegacySql": False,
            },
            "jobType": "QUERY",
        },
        "jobReference": {
            "projectId": "project-id",
            "jobId": "job-id",
            "location": "US",
        },
        "statistics": {
            "creationTime": creation_time_ms,
            "startTime": 1767037135238.0,
            "endTime": 1767037135353.0,
            "totalBytesProcessed": f"{total_bytes_processed}",
            "query": {
                "totalBytesProcessed": f"{total_bytes_processed}",
                "totalBytesBilled": "0",
                "cacheHit": True,
                "statementType": "SELECT",
            },
            "reservation_id": "reservation_id",
            "edition": "ENTERPRISE",
            "reservationGroupPath": [""],
        },
        "status": {"state": "DONE"},
        "principal_subject": "principal_subject",
        "jobCreationReason": {"code": "REQUESTED"},
    }
    expected_index = pandas.Index(
        [
            "bigquerySchema",
            "projectId",
            "jobId",
            "location",
            "jobType",
            "dispatchedSql",
            "destinationTable",
            "useLegacySql",
            "referencedTables",
            "totalBytesProcessed",
            "cacheHit",
            "statementType",
            "creationTime",
        ]
    )

    result = dry_runs.get_query_stats(mock_query_job)

    assert isinstance(result, pandas.Series)
    pandas.testing.assert_index_equal(expected_index, result.index)

    # Pin the extracted values too, not only the labels, so a field read
    # from the wrong section of the job representation is caught.
    assert result["projectId"] == "project-id"
    assert result["jobId"] == "job-id"
    assert result["location"] == "US"
    assert result["jobType"] == "QUERY"
    assert result["dispatchedSql"] == sql
    assert result["destinationTable"] == destination_table
    assert result["useLegacySql"] is False
    # The fixture has no referencedTables entry, so it is reported as None.
    assert result["referencedTables"] is None
    assert result["totalBytesProcessed"] == total_bytes_processed
    assert result["cacheHit"] is True
    assert result["statementType"] == "SELECT"
    assert result["creationTime"] == pandas.Timestamp(
        creation_time_ms, unit="ms", tz="UTC"
    )

tests/unit/test_gbq.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -943,8 +943,29 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job):
943943

944944

945945
def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job):
946-
type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345)
947-
cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True)
946+
total_bytes_processed = 15
947+
type(mock_query_job)._properties = mock.PropertyMock(
948+
return_value={
949+
"statistics": {
950+
"creationTime": 1767037135155.0,
951+
"startTime": 1767037135238.0,
952+
"endTime": 1767037135353.0,
953+
"totalBytesProcessed": f"{total_bytes_processed}",
954+
"query": {
955+
"totalBytesProcessed": f"{total_bytes_processed}",
956+
"totalBytesBilled": "0",
957+
"cacheHit": True,
958+
"statementType": "SELECT",
959+
},
960+
"reservation_id": "reservation_id",
961+
"edition": "ENTERPRISE",
962+
"reservationGroupPath": [""],
963+
},
964+
}
965+
)
966+
967+
dry_run_result = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True)
968+
948969
# Check which method was called based on BigQuery version
949970
if (
950971
hasattr(mock_bigquery_client, "query_and_wait")
@@ -956,4 +977,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job):
956977
_, kwargs = mock_bigquery_client.query.call_args
957978
job_config = kwargs["job_config"]
958979
assert job_config.dry_run is True
959-
assert cost >= 0
980+
assert dry_run_result["totalBytesProcessed"] == total_bytes_processed

0 commit comments

Comments
 (0)