Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit aa3622d

Browse files
committed
feat: add StreamingDataFrame.to_bigtable and .to_pubsub start_timestamp parameter
1 parent 8804ada commit aa3622d

File tree

3 files changed

+49
-11
lines changed

3 files changed

+49
-11
lines changed

bigframes/pandas/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from __future__ import annotations
1818

1919
from collections import namedtuple
20-
from datetime import datetime
20+
from datetime import date, datetime
2121
import inspect
2222
import sys
2323
import typing
@@ -194,7 +194,7 @@ def to_datetime(
194194

195195
@typing.overload
196196
def to_datetime(
197-
arg: Union[int, float, str, datetime],
197+
arg: Union[int, float, str, datetime, date],
198198
*,
199199
utc: bool = False,
200200
format: Optional[str] = None,
@@ -205,7 +205,7 @@ def to_datetime(
205205

206206
def to_datetime(
207207
arg: Union[
208-
Union[int, float, str, datetime],
208+
Union[int, float, str, datetime, date],
209209
vendored_pandas_datetimes.local_iterables,
210210
bigframes.series.Series,
211211
bigframes.dataframe.DataFrame,

bigframes/streaming/dataframe.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,16 @@
1515
"""Module for bigquery continuous queries"""
1616
from __future__ import annotations
1717

18+
from abc import abstractmethod
19+
from datetime import date, datetime
1820
import functools
1921
import inspect
2022
import json
21-
from typing import Optional
23+
from typing import Optional, Union
2224
import warnings
2325

2426
from google.cloud import bigquery
27+
import pandas as pd
2528

2629
from bigframes import dataframe
2730
from bigframes.core import log_adapter, nodes
@@ -54,9 +57,14 @@ def _curate_df_doc(doc: Optional[str]):
5457

5558

5659
class StreamingBase:
57-
_appends_sql: str
5860
_session: bigframes.session.Session
5961

62+
@abstractmethod
63+
def _appends_sql(
64+
self, start_timestamp: Optional[Union[int, float, str, datetime, date]]
65+
) -> str:
66+
pass
67+
6068
def to_bigtable(
6169
self,
6270
*,
@@ -70,6 +78,8 @@ def to_bigtable(
7078
bigtable_options: Optional[dict] = None,
7179
job_id: Optional[str] = None,
7280
job_id_prefix: Optional[str] = None,
81+
start_timestamp: Optional[Union[int, float, str, datetime, date]] = None,
82+
end_timestamp: Optional[Union[int, float, str, datetime, date]] = None,
7383
) -> bigquery.QueryJob:
7484
"""
7585
Export the StreamingDataFrame as a continuous job and returns a
@@ -115,6 +125,8 @@ def to_bigtable(
115125
If specified, a job id prefix for the query, see
116126
job_id_prefix parameter of
117127
https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query
128+
start_timestamp (int, float, str, datetime, date, default None):
129+
The start timestamp of the query. Possible values must be within the most recent 7 days. If None, starts from the maximum allowed value, 7 days ago. Time zone naive values are interpreted as UTC.
118130
119131
Returns:
120132
google.cloud.bigquery.QueryJob:
@@ -123,8 +135,15 @@ def to_bigtable(
123135
For example, the job can be cancelled or its error status
124136
can be examined.
125137
"""
138+
if not isinstance(
139+
start_timestamp, (int, float, str, datetime, date, type(None))
140+
):
141+
raise ValueError(
142+
f"Unsupported start_timestamp type {type(start_timestamp)}"
143+
)
144+
126145
return _to_bigtable(
127-
self._appends_sql,
146+
self._appends_sql(start_timestamp),
128147
instance=instance,
129148
table=table,
130149
service_account_email=service_account_email,
@@ -145,6 +164,7 @@ def to_pubsub(
145164
service_account_email: str,
146165
job_id: Optional[str] = None,
147166
job_id_prefix: Optional[str] = None,
167+
start_timestamp: Optional[Union[int, float, str, datetime, date]] = None,
148168
) -> bigquery.QueryJob:
149169
"""
150170
Export the StreamingDataFrame as a continuous job and returns a
@@ -172,6 +192,8 @@ def to_pubsub(
172192
If specified, a job id prefix for the query, see
173193
job_id_prefix parameter of
174194
https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query
195+
start_timestamp (int, float, str, datetime, date, default None):
196+
The start timestamp of the query. Possible values must be within the most recent 7 days. If None, starts from the maximum allowed value, 7 days ago. Time zone naive values are interpreted as UTC.
175197
176198
Returns:
177199
google.cloud.bigquery.QueryJob:
@@ -180,8 +202,15 @@ def to_pubsub(
180202
For example, the job can be cancelled or its error status
181203
can be examined.
182204
"""
205+
if not isinstance(
206+
start_timestamp, (int, float, str, datetime, date, type(None))
207+
):
208+
raise ValueError(
209+
f"Unsupported start_timestamp type {type(start_timestamp)}"
210+
)
211+
183212
return _to_pubsub(
184-
self._appends_sql,
213+
self._appends_sql(start_timestamp),
185214
topic=topic,
186215
service_account_email=service_account_email,
187216
session=self._session,
@@ -280,14 +309,21 @@ def sql(self):
280309
sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql))
281310

282311
# Patch for the required APPENDS clause
283-
@property
284-
def _appends_sql(self):
312+
def _appends_sql(
313+
self, start_timestamp: Optional[Union[int, float, str, datetime, date]]
314+
) -> str:
285315
sql_str = self.sql
286316
original_table = self._original_table
287317
assert original_table is not None
288318

289319
# TODO(b/405691193): set start time back to NULL. Now set it slightly after 7 days max interval to avoid the bug.
290-
appends_clause = f"APPENDS(TABLE `{original_table}`, CURRENT_TIMESTAMP() - (INTERVAL 7 DAY - INTERVAL 5 MINUTE))"
320+
start_ts_str = (
321+
str(pd.to_datetime(start_timestamp))
322+
if start_timestamp
323+
else "CURRENT_TIMESTAMP() - (INTERVAL 7 DAY - INTERVAL 5 MINUTE)"
324+
)
325+
326+
appends_clause = f"APPENDS(TABLE `{original_table}`, {start_ts_str})"
291327
sql_str = sql_str.replace(f"`{original_table}`", appends_clause)
292328
return sql_str
293329

tests/system/large/streaming/test_bigtable.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from datetime import datetime, timedelta
1516
import time
1617
from typing import Generator
1718
import uuid
@@ -67,7 +68,7 @@ def bigtable_table(
6768
bt_table.delete()
6869

6970

70-
@pytest.mark.flaky(retries=3, delay=10)
71+
# @pytest.mark.flaky(retries=3, delay=10)
7172
def test_streaming_df_to_bigtable(
7273
session_load: bigframes.Session, bigtable_table: table.Table
7374
):
@@ -91,6 +92,7 @@ def test_streaming_df_to_bigtable(
9192
bigtable_options={},
9293
job_id=None,
9394
job_id_prefix=job_id_prefix,
95+
start_timestamp=datetime.now() - timedelta(days=1),
9496
)
9597

9698
# wait 100 seconds in order to ensure the query doesn't stop

0 commit comments

Comments
 (0)