Add OOB benchmarking and V2 bounds check mode support to bounds_check_indices benchmark (#5797)

spcyppt · meta-codesync[bot] · commit 4871b4565f49 · 2026-05-29T18:47:12.000-07:00
Summary:

X-link: https://github.com/facebookresearch/FBGEMM/pull/2725

Pull Request resolved: #5797

Add an `--oob` flag (0-100) to the `bounds_check_indices` benchmark that sets the given percentage of indices out of bounds, enabling measurement of the atomic contention overhead in WARNING/IGNORE modes across the v1 and v2 kernels.

Add V2 bounds check mode support (`V2_IGNORE=4`, `V2_WARNING=5`, `V2_FATAL=6`) to the benchmark. V2 modes are decomposed into their base mode plus `bounds_check_version=2`, mirroring the logic in `SplitTableBatchedEmbeddingBagsCodegen`. The `bounds_check_version` is now passed through to `torch.ops.fbgemm.bounds_check_indices`.

Update `run_bounds_check_benchmark.sh` to accept `--oob` and to loop over multiple `--bounds-check-mode` values (e.g., `"1 2 5 4"` for v1/v2 x WARNING/IGNORE). Trace URLs now include the mode and the OOB percentage.

Add a convenience sweep script for the OOB experiment.

Reviewed By: q10

Differential Revision: D106606582

fbshipit-source-id: 58abe82d6793fbfdda130490563bc7f6d13e9818
diff --git a/fbgemm_gpu/bench/tbe/tbe_utils_benchmark.py b/fbgemm_gpu/bench/tbe/tbe_utils_benchmark.py
@@ -282,7 +282,10 @@ def pruned_array_lookup(  # noqa C901
     help=f"Available modes: FATAL={BoundsCheckMode.FATAL.value}, "
     f"WARNING={BoundsCheckMode.WARNING.value}, "
     f"IGNORE={BoundsCheckMode.IGNORE.value}, "
-    f"NONE={BoundsCheckMode.NONE.value}",
+    f"NONE={BoundsCheckMode.NONE.value}, "
+    f"V2_IGNORE={BoundsCheckMode.V2_IGNORE.value}, "
+    f"V2_WARNING={BoundsCheckMode.V2_WARNING.value}, "
+    f"V2_FATAL={BoundsCheckMode.V2_FATAL.value}",
 )
 @click.option("--requests_data_file", type=str, default=None)
 @click.option("--tables", type=str, default=None)
@@ -299,6 +302,13 @@ def pruned_array_lookup(  # noqa C901
     type=str,
     default="bounds_check_indices_trace_{ospid}.json",
 )
+@click.option(
+    "--oob",
+    type=int,
+    default=0,
+    help="Percentage of indices to set out of bounds (0 to 100). "
+    "Use with WARNING or IGNORE mode (FATAL will crash).",
+)
 def bounds_check_indices(  # noqa C901
     bag_size: int,
     batch_size: int,
@@ -312,6 +322,7 @@ def bounds_check_indices(  # noqa C901
     batch_sizes: str,
     export_trace: bool,
     trace_url: str,
+    oob: int,
 ) -> None:
     np.random.seed(42)
     torch.manual_seed(42)
@@ -358,9 +369,27 @@ def bounds_check_indices(  # noqa C901
             offset_dtype=torch.long,
         )
 
+    if oob > 0:
+        for req in requests:
+            num_indices = req.indices.numel()
+            num_oob = int(num_indices * oob / 100)
+            oob_positions = torch.randperm(num_indices)[:num_oob]
+            req.indices[oob_positions] = E
+
     warning = torch.tensor([0]).long().to(get_device())
     rows_per_table = torch.tensor([E for _ in range(T)]).long().to(get_device())
 
+    bc_mode = BoundsCheckMode(bounds_check_mode)
+    bounds_check_version = 1
+    if bc_mode.name.startswith("V2_"):
+        bounds_check_version = 2
+        if bc_mode == BoundsCheckMode.V2_IGNORE:
+            bc_mode = BoundsCheckMode.IGNORE
+        elif bc_mode == BoundsCheckMode.V2_WARNING:
+            bc_mode = BoundsCheckMode.WARNING
+        elif bc_mode == BoundsCheckMode.V2_FATAL:
+            bc_mode = BoundsCheckMode.FATAL
+
     def _kineto_trace_handler(p: profile) -> None:
         p.export_chrome_trace(trace_url.format(ospid=os.getpid()))
 
@@ -411,20 +440,22 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
                 rows_per_table,
                 indices,
                 offsets,
-                BoundsCheckMode(bounds_check_mode),
+                bc_mode,
                 warning,
                 B_offsets=B_offsets,
                 max_B=max_B,
                 b_t_map=b_t_map,
                 info_B_num_bits=info_B_num_bits,
                 info_B_mask=info_B_mask,
+                bounds_check_version=bounds_check_version,
             ),
             num_warmups=warmup_runs,
         )
 
     logging.info(
         f"Bounds Check Indices:  Bs: {Bs}, "
         f"E: {E}, T: {T}, L: {L}, "
+        f"mode: {bc_mode.name}, v: {bounds_check_version}, "
         f"BW: {(8 * total_B * L + 8 * (total_B + 1)) / time_per_iter / 1.0e9: .2f} GB/s, "  # noqa: B950
         f"T: {time_per_iter * 1.0e6:.0f}us"
     )
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py b/fbgemm_gpu/fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py
@@ -28,7 +28,7 @@
 )
 from fbgemm_gpu.utils.loader import load_torch_module
 
-from .cache_config import CacheAlgorithm
+from .cache_config import CacheAlgorithm  # usort:skip
 
 try:
     load_torch_module(
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -54,7 +54,7 @@
     TBEStatsReporterConfig,
 )
 
-from .ssd_config import BackendType, EvictionPolicy, KVZCHParams
+from .ssd_config import BackendType, EvictionPolicy, KVZCHParams  # usort:skip
 from torch import distributed as dist, nn, Tensor  # usort:skip
 import sys
 from dataclasses import dataclass

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -28,7 +28,7 @@` |
| `28` | `28` | `)` |
| `29` | `29` | `from fbgemm_gpu.utils.loader import load_torch_module` |
| `30` | `30` | |
| `31` | | `-from .cache_config import CacheAlgorithm` |
| | `31` | `+from .cache_config import CacheAlgorithm  # usort:skip` |
| `32` | `32` | |
| `33` | `33` | `try:` |
| `34` | `34` | `    load_torch_module(` |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -54,7 +54,7 @@` |
| `54` | `54` | `    TBEStatsReporterConfig,` |
| `55` | `55` | `)` |
| `56` | `56` | |
| `57` | | `-from .ssd_config import BackendType, EvictionPolicy, KVZCHParams` |
| | `57` | `+from .ssd_config import BackendType, EvictionPolicy, KVZCHParams  # usort:skip` |
| `58` | `58` | `from torch import distributed as dist, nn, Tensor  # usort:skip` |
| `59` | `59` | `import sys` |
| `60` | `60` | `from dataclasses import dataclass` |