Skip scratch pad eviction data in enrichment mode to avoid cudaFree overhead (pytorch#5645)

EddyLXJ · meta-codesync[bot] · commit 9374e3fc1205 · 2026-04-19T14:45:33.000-07:00
Summary:
Pull Request resolved: pytorch#5645

X-link: https://github.com/facebookresearch/FBGEMM/pull/2593

CONTEXT: In KVZCH enrichment mode (_enrichment_enabled), the ssd_scratch_pad_eviction_data list accumulates UVA tensors every forward pass via _prefetch. The backward hook _evict_from_scratch_pad pops entries but does nothing useful (evict() is skipped in embedding_cache_mode, RES is disabled). The .clear() call in enrichment_query_id then triggers expensive cudaFree calls when releasing those UVA tensors, causing GPU stalls visible in Perfetto traces.

WHAT: Skip appending to ssd_scratch_pad_eviction_data in _prefetch when _enrichment_enabled is True. Add early return in _evict_from_scratch_pad for enrichment mode. Remove the now-unnecessary .clear() in enrichment_query_id since the list is always empty.

Reviewed By: emlin

Differential Revision: D101102800

fbshipit-source-id: 16dcd8d32d55f77478235f4a27a3be10f692e288
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -1652,6 +1652,9 @@ def _evict_from_scratch_pad(self, grad: Tensor) -> None:
         (`inserted_rows`) on the `ssd_eviction_stream`. This is a hook
         that is invoked right after TBE backward.
 
+        In enrichment mode, scratch pad eviction data is not populated
+        (skipped in _prefetch), so this hook returns early.
+
         Conflict missed indices are specified in
         `post_bwd_evicted_indices_cpu`. Indices that are not -1 and
         their positions < `actions_count_cpu` (i.e., rows
@@ -1665,6 +1668,11 @@ def _evict_from_scratch_pad(self, grad: Tensor) -> None:
             None
         """
         with record_function("## ssd_evict_from_scratch_pad_pipeline ##"):
+            # In enrichment mode, scratch pad eviction data is not populated
+            # (_prefetch skips the append), so nothing to do here.
+            if self._enrichment_enabled:
+                return
+
             current_stream = torch.cuda.current_stream()
             current_stream.record_event(self.ssd_event_backward)
 
@@ -2421,7 +2429,10 @@ def _prefetch(  # noqa C901
 
             # Store scratch pad info for post backward eviction only for training
             # for eval job, no backward pass, so no need to store this info
-            if self.training:
+            # Skip for enrichment mode: the backward hook would only pop without
+            # evicting (embedding_cache_mode skips evict), and releasing the stored
+            # UVA tensors afterwards triggers expensive cudaFree calls.
+            if self.training and not self._enrichment_enabled:
                 self.ssd_scratch_pad_eviction_data.append(
                     (
                         inserted_rows,
@@ -5228,13 +5239,6 @@ def enrichment_query_id(
                     dedup_linear_indices = sorted_linear_indices[mask]
                     dedup_weights = sorted_weights[mask]
 
-                    if len(self.ssd_scratch_pad_eviction_data) > 0:
-                        # IMPORTANT: Clear ALL accumulated scratch pad data, not just one!
-                        # _prefetch appends one element per forward, but enrichment_query_id
-                        # may not be called every forward. This prevents memory leak from
-                        # accumulated GPU tensors (inserted_rows is a UVA tensor).
-                        self.ssd_scratch_pad_eviction_data.clear()
-
                     # D2H copy on the same stream (already on enrichment_query_stream)
                     linear_cache_indices_cpu = self.to_pinned_cpu(dedup_linear_indices)
                     dedup_weights_cpu = self.to_pinned_cpu(dedup_weights)