fix: speed up traces sync by skipping partitions that exist

dbirman · dbirman · commit 43a91774ff4d · 2026-06-30T15:23:51.000-07:00
diff --git a/src/biodata_cache/backend.py b/src/biodata_cache/backend.py
@@ -70,6 +70,10 @@ def get_versions_index(self) -> list[str]:
         """Return the list of all available version folders from cache_versions.json."""
         pass  # pragma: no cover
 
+    def partition_exists(self, table_name: str) -> bool:
+        """Return True if data already exists for the given partition."""
+        return False  # default when a backend does not override: report the partition as absent
+
 
 class S3Backend(Backend):
     """Stores and retrieves caches using AWS S3 with parquet files."""
@@ -149,6 +153,16 @@ def clear_partition(self, table_name: str) -> None:
                 Delete={"Objects": to_delete[i : i + 1000]},
             )
 
+    def partition_exists(self, table_name: str) -> bool:
+        """Return True if any object exists under the S3 prefix of a "base/value" hive partition."""
+        if "/" not in table_name:
+            return False  # no "base/value" form, so there is no partition prefix to look up
+        base, value = table_name.split("/", 1)
+        partition_key = HIVE_PARTITION_KEYS[base]  # NOTE(review): unknown base likely raises KeyError
+        prefix = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/"
+        resp = self.s3_client.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=1)
+        return resp.get("KeyCount", 0) > 0
+
     def write_chunk(self, table_name: str, data: pd.DataFrame, chunk_idx: int) -> None:
         """Append one numbered parquet chunk to a hive partition."""
         base, value = table_name.split("/", 1)
@@ -363,6 +377,11 @@ def get_versions_index(self) -> list[str]:
         """Return the list of all available version folders from the in-memory index."""
         return json.loads(self._json_store.get("cache_versions.json", "[]"))
 
+    def partition_exists(self, table_name: str) -> bool:
+        """Return True if the in-memory store holds a non-empty entry for table_name."""
+        df = self._store.get(table_name)
+        return df is not None and not df.empty  # a missing or empty entry counts as absent
+
     def clear_partition(self, table_name: str) -> None:
         """Remove all chunks stored for a partitioned table."""
         self._store.pop(table_name, None)
diff --git a/src/biodata_cache/sync.py b/src/biodata_cache/sync.py
@@ -279,6 +279,11 @@ def update_all_tables(fast: bool = True, slow: bool = True) -> None:
             fib_subject_ids = (
                 df_basics[fib_mask & (df_basics["data_level"] == "derived")]["subject_id"].dropna().unique()
             )
+        fib_subject_ids = [  # keep only subjects whose fib_traces partition is not in the backend yet
+            subject_id
+            for subject_id in fib_subject_ids
+            if not BACKEND.partition_exists(f"{NAMES['fib_traces']}/{subject_id}")
+        ]
         if len(fib_subject_ids) > 0:
             fib_traces_fn = TABLE_REGISTRY[NAMES["fib_traces"]]
             try: