Skip to content

Commit 9fed2a5

Browse files
Xinyi Wang authored and facebook-github-bot committed
add overall L1→DRAM hit rate metric (#5777)
Summary:

X-link: facebookresearch/FBGEMM#2706

`SSDTableBatchedEmbeddingBags` already emits per-tier hit rates — `ssd_tbe.prefetch.l1_hit_rate_pct`, `l2_cache.hit_rate_pct`, and `dram_kv.hit_rate_pct` — but each is conditional on the requests that reached that tier. As `l1_cache_size` grows, L1 absorbs more keys and only the long-tail keys fall through to DRAM, so the L1-conditional DRAM hit rate drops mechanically even though the system is doing more — not less — work in the cheaper tier. None of the existing per-tier metrics gives an at-a-glance answer to "what fraction of unique requests were served from cache (L1 or DRAM), without paying SSD cost?".

This diff adds an `ssd_tbe.overall_hit_rate_pct` aggregate metric (per-TBE: `ssd_tbe.tbe_id{N}.overall_hit_rate_pct`) defined as:

    overall_hit_rate_pct = 100.0 * (num_unique - dram_read_miss_count) / num_unique

i.e. the fraction of unique requests that did not miss at DRAM. The value stays stable as cache sizes shift between L1 and DRAM. It is algebraically equivalent to the expanded form `L1_hit + (1 - L1_hit) * DRAM_hit_conditional` under the assumption that every L1 miss reaches DRAM (the only path today). A code comment documents this caveat in case a future SSD-bypass path is added.

The existing per-tier metrics (`l1_hit_rate_pct`, `l2_cache.hit_rate_pct`, `dram_kv.hit_rate_pct`) are left unchanged — they remain useful for diagnosing per-tier behavior.

Implementation:
- `_report_ssd_l1_cache_stats` stashes `num_unique` into `_last_l1_num_unique` so `_report_dram_kv_perf_stats` can use it as the normalization denominator without re-reading L1 counters. Both reporters fire from the same `should_report(self.step)` cadence, so the stashed value corresponds to the same reporting window.

Reviewed By: kausv

Differential Revision: D105727013
1 parent 07767a8 commit 9fed2a5

1 file changed

Lines changed: 34 additions & 0 deletions

File tree

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,6 +1233,10 @@ def __init__(
12331233
# 4: N_conflict_unique_misses, 5: N_conflict_misses
12341234
self.last_reported_ssd_stats: list[float] = []
12351235
self.last_reported_step = 0
1236+
# Stashed by _report_ssd_l1_cache_stats so _report_dram_kv_perf_stats
1237+
# can compute the overall L1 → DRAM hit rate against total unique
1238+
# indices. See T272139146.
1239+
self._last_l1_num_unique: float = 0.0
12361240

12371241
self.register_buffer(
12381242
"ssd_cache_stats",
@@ -1313,6 +1317,9 @@ def __init__(
13131317
self.l1_hit_rate_stats_name: str = (
13141318
f"ssd_tbe.prefetch.tbe_id{tbe_unique_id}.l1_hit_rate_pct"
13151319
)
1320+
self.overall_hit_rate_stats_name: str = (
1321+
f"ssd_tbe.tbe_id{tbe_unique_id}.overall_hit_rate_pct"
1322+
)
13161323

13171324
self.eviction_sum_evicted_counts_stats_name: str = (
13181325
f"eviction.tbe_id.{tbe_unique_id}.sum_evicted_counts"
@@ -1366,6 +1373,7 @@ def __init__(
13661373
self.stats_reporter.register_stats(self.enrichment_empty_count_stats_name)
13671374
self.stats_reporter.register_stats(self.enrichment_success_rate_stats_name)
13681375
self.stats_reporter.register_stats(self.l1_hit_rate_stats_name)
1376+
self.stats_reporter.register_stats(self.overall_hit_rate_stats_name)
13691377
for t in self.feature_table_map:
13701378
self.stats_reporter.register_stats(
13711379
f"eviction.feature_table.{t}.evicted_counts"
@@ -4185,6 +4193,7 @@ def _report_ssd_l1_cache_stats(self) -> None:
41854193
# L1 cache hit rate
41864194
num_unique = ssd_cache_stats_delta[UVMCacheStatsIndex.num_unique_indices]
41874195
num_misses = ssd_cache_stats_delta[UVMCacheStatsIndex.num_unique_misses]
4196+
self._last_l1_num_unique = num_unique
41884197
if num_unique > 0:
41894198
l1_hit_rate_pct = 100.0 * (num_unique - num_misses) / num_unique
41904199
# Per-TBE L1 hit rate
@@ -4864,6 +4873,31 @@ def _report_dram_kv_perf_stats(self) -> None:
48644873
enable_tb_metrics=True,
48654874
)
48664875

4876+
# Overall hit rate across the L1 → DRAM path, normalized by total
4877+
# unique requests so it stays stable as l1_cache_size changes
4878+
# (the per-tier rates each shift mechanically with cache sizing).
4879+
# Assumes every L1 miss reaches DRAM, i.e.
4880+
# num_misses == dram_read_hit_count + dram_read_miss_count.
4881+
# If a future path lets L1 misses bypass DRAM, this needs to
4882+
# account for the additional tier.
4883+
num_unique = self._last_l1_num_unique
4884+
if num_unique > 0:
4885+
overall_hit_rate_pct = (
4886+
100.0 * (num_unique - dram_read_miss_count) / num_unique
4887+
)
4888+
stats_reporter.report_data_amount(
4889+
iteration_step=self.step,
4890+
event_name=self.overall_hit_rate_stats_name,
4891+
data_bytes=overall_hit_rate_pct,
4892+
enable_tb_metrics=True,
4893+
)
4894+
stats_reporter.report_data_amount(
4895+
iteration_step=self.step,
4896+
event_name="ssd_tbe.overall_hit_rate_pct",
4897+
data_bytes=overall_hit_rate_pct,
4898+
enable_tb_metrics=True,
4899+
)
4900+
48674901
# Enrichment query metrics
48684902
if (
48694903
DramKvPerfStat.ENRICHMENT_QUERY_COUNT in stats

0 commit comments

Comments
 (0)