fix: align process_data timestamps to UTC to match ingest_data cutoff filter (#927)

lspataroG · web-flow · commit 847e2f3c256b · 2026-04-15T12:50:16.000+02:00
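process_data used naive datetime.now() for creation_timestamp while
ingest_data uses datetime.now(timezone.utc) for the look_back_days
cutoff. On non-UTC containers the naive local time differs from UTC by
the container's offset, so chunks could fall outside the window.
The schedule_time fallback and the run_ts stamp in process_data are
switched to datetime.now(timezone.utc) as well, so these timestamps
share the same clock.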
diff --git a/agent_starter_pack/agents/agentic_rag/data_ingestion/data_ingestion_pipeline/components/process_data.py b/agent_starter_pack/agents/agentic_rag/data_ingestion/data_ingestion_pipeline/components/process_data.py
@@ -62,7 +62,7 @@ def process_data(
         location: BigQuery location
     """
     import logging
-    from datetime import datetime, timedelta
+    from datetime import datetime, timedelta, timezone
 
     import bigframes.pandas as bpd
     import swifter
@@ -89,7 +89,7 @@ def process_data(
         logging.warning(
             "Pipeline schedule not set. Setting schedule_time to current date."
         )
-        schedule_time_dt = datetime.now()
+        schedule_time_dt = datetime.now(timezone.utc)
 
     # Note: The following line sets the schedule time 5 years back to allow sample data to be present.
     # For your use case, please comment out the following line to use the actual schedule time.
@@ -215,7 +215,7 @@ def create_table_if_not_exist(
     # This allows create-before-delete ingestion: new chunks never collide
     # with old ones, so we can safely create first, then delete stale data.
     logging.info("Creating chunk IDs and exploding chunks into rows...")
-    run_ts = datetime.now().strftime("%Y%m%d%H%M%S")
+    run_ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
     chunk_ids = [
         str(idx) for text_chunk in df["text_chunk"] for idx in range(len(text_chunk))
     ]
@@ -226,7 +226,7 @@ def create_table_if_not_exist(
     logging.info("Chunk IDs created and chunks exploded.")
 
     # No embedding generation needed — Vector Search 2.0 auto-generates embeddings
-    df = df.assign(creation_timestamp=datetime.now())
+    df = df.assign(creation_timestamp=datetime.now(timezone.utc))
 
     # Store results in BigQuery
     PARTITION_DATE_COLUMN = "creation_timestamp"