add overall L1→DRAM hit rate metric (#5777)

Xinyi Wang · facebook-github-bot · commit 9fed2a57e2bf · 2026-06-01T10:42:25.000-07:00
Summary: X-link: facebookresearch/FBGEMM#2706 `SSDTableBatchedEmbeddingBags` already emits per-tier hit rates — `ssd_tbe.prefetch.l1_hit_rate_pct`, `l2_cache.hit_rate_pct`, and `dram_kv.hit_rate_pct` — but each is conditional on requests that reached that tier. As `l1_cache_size` grows, L1 absorbs more keys and only the long-tail keys fall through to DRAM, so the L1-conditional DRAM hit rate drops mechanically even though the system is doing more — not less — work in the cheaper tier. None of the existing per-tier metrics gives an at-a-glance answer to "what fraction of unique requests were served from cache (L1 or DRAM), without paying SSD cost?". This diff adds an `ssd_tbe.overall_hit_rate_pct` aggregate metric (per-TBE: `ssd_tbe.tbe_id{N}.overall_hit_rate_pct`) defined as: `overall_hit_rate_pct = 100.0 * (num_unique - dram_read_miss_count) / num_unique`, i.e. the fraction of unique requests that did not miss at DRAM. The value stays stable as cache sizes shift between L1 and DRAM. This is algebraically equivalent to the expanded form `L1_hit + (1 - L1_hit) * DRAM_hit_conditional` under the assumption that every L1 miss reaches DRAM (the only path today). A code comment documents this caveat in case a future DRAM-bypass path is added. The existing per-tier metrics (`l1_hit_rate_pct`, `l2_cache.hit_rate_pct`, `dram_kv.hit_rate_pct`) are left unchanged — they remain useful for diagnosing per-tier behavior. Implementation: - `_report_ssd_l1_cache_stats` stashes `num_unique` into `_last_l1_num_unique` so `_report_dram_kv_perf_stats` can use it as the normalization denominator without re-reading L1 counters. Both reporters fire from the same `should_report(self.step)` cadence, so the stashed value corresponds to the same reporting window. Reviewed By: kausv Differential Revision: D105727013
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -1233,6 +1233,10 @@ def __init__(
         # 4: N_conflict_unique_misses, 5: N_conflict_misses
         self.last_reported_ssd_stats: list[float] = []
         self.last_reported_step = 0
+        # Stashed by _report_ssd_l1_cache_stats so _report_dram_kv_perf_stats
+        # can compute the overall L1 → DRAM hit rate against total unique
+        # indices. See T272139146.
+        self._last_l1_num_unique: float = 0.0
 
         self.register_buffer(
             "ssd_cache_stats",
@@ -1313,6 +1317,9 @@ def __init__(
         self.l1_hit_rate_stats_name: str = (
             f"ssd_tbe.prefetch.tbe_id{tbe_unique_id}.l1_hit_rate_pct"
         )
+        self.overall_hit_rate_stats_name: str = (
+            f"ssd_tbe.tbe_id{tbe_unique_id}.overall_hit_rate_pct"
+        )
 
         self.eviction_sum_evicted_counts_stats_name: str = (
             f"eviction.tbe_id.{tbe_unique_id}.sum_evicted_counts"
@@ -1366,6 +1373,7 @@ def __init__(
             self.stats_reporter.register_stats(self.enrichment_empty_count_stats_name)
             self.stats_reporter.register_stats(self.enrichment_success_rate_stats_name)
             self.stats_reporter.register_stats(self.l1_hit_rate_stats_name)
+            self.stats_reporter.register_stats(self.overall_hit_rate_stats_name)
             for t in self.feature_table_map:
                 self.stats_reporter.register_stats(
                     f"eviction.feature_table.{t}.evicted_counts"
@@ -4185,6 +4193,7 @@ def _report_ssd_l1_cache_stats(self) -> None:
         # L1 cache hit rate
         num_unique = ssd_cache_stats_delta[UVMCacheStatsIndex.num_unique_indices]
         num_misses = ssd_cache_stats_delta[UVMCacheStatsIndex.num_unique_misses]
+        self._last_l1_num_unique = num_unique
         if num_unique > 0:
             l1_hit_rate_pct = 100.0 * (num_unique - num_misses) / num_unique
             # Per-TBE L1 hit rate
@@ -4864,6 +4873,31 @@ def _report_dram_kv_perf_stats(self) -> None:
                     enable_tb_metrics=True,
                 )
 
+            # Overall hit rate across the L1 → DRAM path, normalized by total
+            # unique requests so it stays stable as l1_cache_size changes
+            # (the per-tier rates each shift mechanically with cache sizing).
+            # Assumes every L1 miss reaches DRAM, i.e.
+            #     num_misses == dram_read_hit_count + dram_read_miss_count.
+            # If a future path lets L1 misses bypass DRAM, this needs to
+            # account for the additional tier.
+            num_unique = self._last_l1_num_unique
+            if num_unique > 0:
+                overall_hit_rate_pct = (
+                    100.0 * (num_unique - dram_read_miss_count) / num_unique
+                )
+                stats_reporter.report_data_amount(
+                    iteration_step=self.step,
+                    event_name=self.overall_hit_rate_stats_name,
+                    data_bytes=overall_hit_rate_pct,
+                    enable_tb_metrics=True,
+                )
+                stats_reporter.report_data_amount(
+                    iteration_step=self.step,
+                    event_name="ssd_tbe.overall_hit_rate_pct",
+                    data_bytes=overall_hit_rate_pct,
+                    enable_tb_metrics=True,
+                )
+
         # Enrichment query metrics
         if (
             DramKvPerfStat.ENRICHMENT_QUERY_COUNT in stats