enrich dry run result with more stats

sycai · sycai · commit cc2be7c18277 · 2025-12-29T20:22:42.000Z
diff --git a/pandas_gbq/dry_runs.py b/pandas_gbq/dry_runs.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+from __future__ import annotations
+
+import copy
+from typing import Any, List
+
+from google.cloud import bigquery
+import pandas
+
+
+def get_query_stats(
+    query_job: bigquery.QueryJob,
+) -> pandas.Series:
+    """Summarize a query job's API representation as a pandas Series.
+
+    Args:
+        query_job: Job whose ``schema`` and ``_properties`` are read.
+
+    Returns:
+        A Series holding the schema, the ``jobReference`` fields, and
+        selected configuration and statistics entries. Entries missing
+        from the job are ``None``, except ``totalBytesProcessed``, which
+        is always an ``int`` (0 when missing).
+    """
+    # (label, value) pairs, starting with the raw BigQuery schema.
+    entries: List[Any] = [("bigquerySchema", query_job.schema)]
+
+    # Read from a deep copy so values placed in the result are not shared
+    # with the job's own property dict.
+    properties = copy.deepcopy(query_job._properties)
+    config = properties.get("configuration", {})
+    query_config = config.get("query", {})
+    stats = properties.get("statistics", {})
+    per_query_stats = stats.get("query", {})
+
+    # jobReference might not be populated for "job optional" queries.
+    entries.extend(properties.get("jobReference", {}).items())
+
+    entries.append(("jobType", config.get("jobType")))
+    entries.append(("dispatchedSql", query_config.get("query")))
+    entries.extend(
+        (name, query_config.get(name))
+        for name in ("destinationTable", "useLegacySql")
+    )
+    entries.extend(
+        (name, per_query_stats.get(name))
+        for name in (
+            "referencedTables",
+            "totalBytesProcessed",
+            "cacheHit",
+            "statementType",
+        )
+    )
+
+    # creationTime is interpreted as milliseconds since the epoch, in UTC.
+    created_ms = stats.get("creationTime")
+    created = None
+    if created_ms is not None:
+        created = pandas.Timestamp(created_ms, unit="ms", tz="UTC")
+    entries.append(("creationTime", created))
+
+    labels = [label for label, _ in entries]
+    result = pandas.Series([value for _, value in entries], index=labels)
+    # Normalize the byte count to an int; a missing count is reported as 0.
+    raw_bytes = result["totalBytesProcessed"]
+    result["totalBytesProcessed"] = 0 if raw_bytes is None else int(raw_bytes)
+    return result
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
@@ -269,9 +269,9 @@ def read_gbq(
         If True, run a dry run query.
     Returns
     -------
-    df: DataFrame or int
+    df: DataFrame or Series
         DataFrame representing results of query. If ``dry_run=True``, returns
-        aan integer representing the amount of data that would be processed (in bytes).
+        a pandas Series of job statistics, including ``totalBytesProcessed``.
     """
     if dialect is None:
         dialect = context.dialect
@@ -328,7 +328,7 @@ def read_gbq(
             dtypes=dtypes,
             dry_run=dry_run,
         )
-        # When dry_run=True, run_query returns a float (cost in GB), not a DataFrame
+        # When dry_run=True, run_query returns a pandas Series, not a DataFrame
         if dry_run:
             return final_df
     else:
diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py
@@ -16,6 +16,7 @@
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     import pandas
 
+from pandas_gbq import dry_runs
 import pandas_gbq.constants
 from pandas_gbq.contexts import context
 import pandas_gbq.core.read
@@ -249,15 +250,13 @@ def run_query(
         if dry_run:
             # Access total_bytes_processed from the QueryJob via RowIterator.job
             # RowIterator has a job attribute that references the QueryJob
-            query_job = (
-                rows_iter.job if hasattr(rows_iter, "job") and rows_iter.job else None
-            )
-            if query_job is None:
-                # Fallback: if query_and_wait_via_client_library doesn't set job,
-                # we need to get it from the query result
-                # For query_and_wait_via_client_library, the RowIterator should have job set
-                raise ValueError("Cannot access QueryJob from RowIterator for dry_run")
-            return query_job.total_bytes_processed
+            if hasattr(rows_iter, "job") and rows_iter.job:
+                return dry_runs.get_query_stats(rows_iter.job)
+
+            # No fallback is implemented: dry-run stats are read from the
+            # QueryJob attached to the RowIterator, so when no job is
+            # attached there is nothing to report and we fail loudly.
+            raise ValueError("Cannot access QueryJob from RowIterator for dry_run")
 
         return self._download_results(
             rows_iter,
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
@@ -656,15 +656,15 @@ def test_columns_and_col_order_raises_error(self, project_id):
 
     def test_read_gbq_with_dry_run(self, project_id):
         query = "SELECT 1"
-        cost = gbq.read_gbq(
+        result = gbq.read_gbq(
             query,
             project_id=project_id,
             credentials=self.credentials,
             dialect="standard",
             dry_run=True,
         )
-        assert isinstance(cost, float)
-        assert cost > 0
+        assert isinstance(result, pandas.Series)
+        assert result["totalBytesProcessed"] >= 0
 
 
 class TestToGBQIntegration(object):
diff --git a/tests/unit/test_dry_runs.py b/tests/unit/test_dry_runs.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+from unittest import mock
+
+from google.cloud import bigquery
+import pandas
+import pandas.testing
+
+from pandas_gbq import dry_runs
+
+
+def test_get_query_stats():
+    """get_query_stats returns a Series with the expected labels and byte count."""
+    bytes_processed = 15
+    job_properties = {
+        "kind": "bigquery#job",
+        "etag": "e-tag",
+        "id": "id",
+        "selfLink": "self-link",
+        "user_email": "user-emial",
+        "configuration": {
+            "query": {
+                "query": "SELECT * FROM `test_table`",
+                "destinationTable": {
+                    "projectId": "project-id",
+                    "datasetId": "dataset-id",
+                    "tableId": "table-id",
+                },
+                "writeDisposition": "WRITE_TRUNCATE",
+                "priority": "INTERACTIVE",
+                "useLegacySql": False,
+            },
+            "jobType": "QUERY",
+        },
+        "jobReference": {
+            "projectId": "project-id",
+            "jobId": "job-id",
+            "location": "US",
+        },
+        "statistics": {
+            "creationTime": 1767037135155.0,
+            "startTime": 1767037135238.0,
+            "endTime": 1767037135353.0,
+            "totalBytesProcessed": f"{bytes_processed}",
+            "query": {
+                "totalBytesProcessed": f"{bytes_processed}",
+                "totalBytesBilled": "0",
+                "cacheHit": True,
+                "statementType": "SELECT",
+            },
+            "reservation_id": "reservation_id",
+            "edition": "ENTERPRISE",
+            "reservationGroupPath": [""],
+        },
+        "status": {"state": "DONE"},
+        "principal_subject": "principal_subject",
+        "jobCreationReason": {"code": "REQUESTED"},
+    }
+    fake_job = mock.create_autospec(bigquery.QueryJob)
+    fake_job._properties = job_properties
+    expected_labels = [
+        "bigquerySchema",
+        "projectId",
+        "jobId",
+        "location",
+        "jobType",
+        "dispatchedSql",
+        "destinationTable",
+        "useLegacySql",
+        "referencedTables",
+        "totalBytesProcessed",
+        "cacheHit",
+        "statementType",
+        "creationTime",
+    ]
+
+    stats = dry_runs.get_query_stats(fake_job)
+
+    assert isinstance(stats, pandas.Series)
+    pandas.testing.assert_index_equal(pandas.Index(expected_labels), stats.index)
+    assert stats["totalBytesProcessed"] == bytes_processed
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
@@ -943,8 +943,29 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job):
 
 
 def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job):
-    type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345)
-    cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True)
+    total_bytes_processed = 15
+    type(mock_query_job)._properties = mock.PropertyMock(
+        return_value={
+            "statistics": {
+                "creationTime": 1767037135155.0,
+                "startTime": 1767037135238.0,
+                "endTime": 1767037135353.0,
+                "totalBytesProcessed": f"{total_bytes_processed}",
+                "query": {
+                    "totalBytesProcessed": f"{total_bytes_processed}",
+                    "totalBytesBilled": "0",
+                    "cacheHit": True,
+                    "statementType": "SELECT",
+                },
+                "reservation_id": "reservation_id",
+                "edition": "ENTERPRISE",
+                "reservationGroupPath": [""],
+            },
+        }
+    )
+
+    dry_run_result = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True)
+
     # Check which method was called based on BigQuery version
     if (
         hasattr(mock_bigquery_client, "query_and_wait")
@@ -956,4 +977,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job):
         _, kwargs = mock_bigquery_client.query.call_args
         job_config = kwargs["job_config"]
     assert job_config.dry_run is True
-    assert cost >= 0
+    assert dry_run_result["totalBytesProcessed"] == total_bytes_processed