Fix deadline/flaky + suppress GPU-only opcheck SKIPPING (OMH round 2) (#5932)

q10 · facebook-github-bot · commit fc58d8b03215 · 2026-06-19T22:20:02.000-07:00
Summary:
X-link: facebookresearch/FBGEMM#2850

Second round of fbgemm_dev OMH test-health fixes (T191384137). After the first residual stack, the dashboard reopened with ~60-100+ records that the actual failure logs (meta testinfra) showed were NOT numeric regressions but three mechanical artifacts. This diff fixes the two that are fbgemm's to own: FLAKY/timeout failures (items 1-4) and opcheck SKIPPING records (item 5).

1. FLAKY (deadline cold-start): every sampled FLAKY record failed with hypothesis DeadlineExceeded -> FlakyFailure, because the first GPU call in a fresh stress-run process pays 12-27s of CUDA/Triton/JIT cold-start, which exceeds the 10 s limit set by settings(deadline=10000). Set deadline=None (what hypothesis recommends; already the convention in 287 sibling fbgemm tests) on the affected GPU hypothesis tests: bfloat16, fused_8bit_rowwise, fused_nbit_rowwise, mixed_dim_int8, hfp8, msfp, permute_pooled_embedding.

2. kmeans bf16 centroids: the bf16 branch kept the tight rtol=2e-3/atol=2e-2 while only fp32 was widened earlier; bf16 GPU atomicAdd order is nondeterministic (~0.023 abs / 0.046 rel), which surfaces as FlakyFailure. Widen the tolerance to 5e-2.

3. SSD inference test_sequence_table_simple_forward: failed HealthCheck.too_slow (each example builds a RocksDB table + GPU forward, 15-33s). Suppress too_slow and cap max_examples=20.

4. block_bucketize test_aot_dispatch_dynamic__...sparse_features_large: hits the 600s tpx timeout because PT2 recompiles for every hypothesis example (my_size up to 1024). Add a scoped skip of the aot variant in failures_dict (the eager + faketensor variants still cover the op).

5. SKIPPING: ~16 GPU-only base tests (gated by skipIf(gpu_unavailable)) generate opcheck variants that only SKIP on CPU/non-NVIDIA samples and add no op coverage (the op is covered by non-gated / *_cpu twin tests). Mark them with optests.dontGenerateOpCheckTests so the variants are never generated, the same pattern already used for permute_indices *_large_grid. Covers permute_indices (6 methods), index_select (3), pack_segments (3), reorder_batched (3), histogram_binning_calibration (1).
NOTE: A separate cluster of ~70 preproc_legacy_comparison:test_op_* FAILUREs is caused by an EXTERNAL MTIA KernelsDB build break (duplicate acc_ops.square in mtia/kernels_db, D108088144) and is not addressed here.

Differential Revision: D109040643
diff --git a/fbgemm_gpu/test/permute/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute/permute_pooled_embedding_test.py
@@ -58,7 +58,7 @@ def setUp(self) -> None:
         # registration (flaky). T191384137
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    @settings(deadline=10000)
+    @settings(deadline=None)
     @given(fwd_only=st.booleans())
     def test_permutation(self, fwd_only: bool) -> None:
         net = Net(fwd_only=fwd_only).to(self.device)
diff --git a/fbgemm_gpu/test/quantize/bfloat16_test.py b/fbgemm_gpu/test/quantize/bfloat16_test.py
@@ -27,7 +27,7 @@ class SparseNNOperatorsGPUTest(unittest.TestCase):
         k=st.integers(min_value=2, max_value=2),
         n=st.integers(min_value=2, max_value=2),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_dense_mlp_quantize_ops(
         self, precision: str, batch_size: int, k: int, n: int
     ) -> None:
@@ -64,7 +64,7 @@ class TestBfloat16QuantizationConversion(unittest.TestCase):
         nrows=st.integers(min_value=0, max_value=100),
         ncols=st.integers(min_value=0, max_value=100),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_op(self, nrows: int, ncols: int) -> None:
         input_data = torch.rand(nrows, ncols).float()
         quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
@@ -91,7 +91,7 @@ def test_quantize_op(self, nrows: int, ncols: int) -> None:
         nrows=st.integers(min_value=0, max_value=100),
         ncols=st.integers(min_value=0, max_value=100),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op(self, nrows: int, ncols: int) -> None:
         input_data = torch.rand(nrows, ncols).float()
         quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
@@ -129,7 +129,7 @@ def test_quantize_and_dequantize_op(self, nrows: int, ncols: int) -> None:
     @given(
         ncols_nrows=st.sampled_from([(65540, 256), (256, 65540)]),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op_cuda_large_nrows_bf16(
         self, ncols_nrows: tuple[int, int]
     ) -> None:
diff --git a/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py b/fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py
@@ -50,7 +50,7 @@ class TestFused8BitRowwiseQuantizationConversion(unittest.TestCase):
         is_half=st.booleans(),
         test_float_or_half_op=st.booleans(),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_op(
         self,
         nrows: int,
@@ -336,7 +336,7 @@ def quantize_and_dequantize_op_test_helper(  # noqa: C901
         ),
         test_generic_op=st.booleans(),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op_cpu(  # noqa: C901
         self,
         nrows: int,
@@ -361,7 +361,7 @@ def test_quantize_and_dequantize_op_cpu(  # noqa: C901
         ),
         test_generic_op=st.booleans(),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op_cuda(  # noqa: C901
         self,
         nrows: int,
diff --git a/fbgemm_gpu/test/quantize/fused_nbit_rowwise_test.py b/fbgemm_gpu/test/quantize/fused_nbit_rowwise_test.py
@@ -49,7 +49,7 @@ class TestFusedNBitRowwiseQuantizationConversion(unittest.TestCase):
         is_half=st.booleans(),
         test_float_or_half_op=st.booleans(),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_op(
         self,
         nrows: int,
@@ -151,7 +151,7 @@ def test_quantize_op(
         test_meta=st.booleans(),
         test_cuda=st.booleans(),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op(
         self,
         nrows: int,
@@ -357,7 +357,7 @@ def test_quantize_and_dequantize_op_cuda_large_nrows(self) -> None:
             [SparseType.BF16]
         ),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op_cpu_and_cuda(
         self,
         nrows: int,
diff --git a/fbgemm_gpu/test/quantize/hfp8_test.py b/fbgemm_gpu/test/quantize/hfp8_test.py
@@ -75,7 +75,7 @@ def _test_conversion(
         ncols=st.integers(min_value=1, max_value=100),
         exponent_bias=st.integers(min_value=4, max_value=7),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op(
         self, nrows: int, ncols: int, exponent_bias: int
     ) -> None:
diff --git a/fbgemm_gpu/test/quantize/mixed_dim_int8_test.py b/fbgemm_gpu/test/quantize/mixed_dim_int8_test.py
@@ -55,7 +55,7 @@ def test_mixed_dim_8bit_dequantize_op_empty(self) -> None:
         min_dim=st.just(1),
         max_dim=st.just(100),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_mixed_dim_8bit_dequantize_op(
         self,
         B: int,
@@ -77,7 +77,7 @@ def test_mixed_dim_8bit_dequantize_op(
         min_dim=st.just(100),
         max_dim=st.just(1000),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_mixed_dim_8bit_dequantize_op_large_dims(
         self,
         B: int,
@@ -99,7 +99,7 @@ def test_mixed_dim_8bit_dequantize_op_large_dims(
         min_dim=st.just(1),
         max_dim=st.just(100),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_mixed_dim_8bit_dequantize_op_large_rows(
         self,
         B: int,
diff --git a/fbgemm_gpu/test/quantize/msfp_test.py b/fbgemm_gpu/test/quantize/msfp_test.py
@@ -33,7 +33,7 @@ class TestMSFPQuantizationConversion(unittest.TestCase):
         nrows=st.integers(min_value=0, max_value=100),
         ncols=st.integers(min_value=0, max_value=100),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_op(self, nrows: int, ncols: int) -> None:
         ebits = 8
         mbits = 7
diff --git a/fbgemm_gpu/test/sparse/failures_dict.json b/fbgemm_gpu/test/sparse/failures_dict.json
@@ -23,7 +23,12 @@
     "fbgemm::asynchronous_inclusive_cumsum": {},
     "fbgemm::batch_index_select_dim0": {},
     "fbgemm::batch_index_select_dim0_tensor": {},
-    "fbgemm::block_bucketize_sparse_features": {},
+    "fbgemm::block_bucketize_sparse_features": {
+      "BlockBucketizeTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_large": {
+        "comment": "max_examples=32 with my_size up to 1024 (~2048 items) recompiles per hypothesis example under aot_dispatch_dynamic and exceeds the 600s tpx timeout (flaky ~10%). Eager + faketensor variants still cover the op. T191384137",
+        "status": "skip"
+      }
+    },
     "fbgemm::block_bucketize_sparse_features_2d_weights": {},
     "fbgemm::block_bucketize_sparse_features_inference": {},
     "fbgemm::bottom_k_per_row": {
diff --git a/fbgemm_gpu/test/sparse/histogram_binning_calibration_test.py b/fbgemm_gpu/test/sparse/histogram_binning_calibration_test.py
@@ -20,10 +20,10 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_unavailable
+    from test_utils import gpu_unavailable, optests
 else:
     import fbgemm_gpu.sparse_ops  # noqa: F401, E402
-    from fbgemm_gpu.test.test_utils import gpu_unavailable
+    from fbgemm_gpu.test.test_utils import gpu_unavailable, optests
 
 
 class HistogramBinningCalibrationTest(unittest.TestCase):
@@ -327,6 +327,9 @@ def test_generic_histogram_binning_calibration_by_feature(
         data_type=st.sampled_from([torch.bfloat16, torch.half, torch.float32]),
     )
     @settings(verbosity=Verbosity.verbose, deadline=None)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only CPU/GPU parity test; op covered by test_generic_histogram_binning_calibration_by_feature (T191384137)"
+    )
     def test_generic_histogram_binning_calibration_by_feature_cpu_gpu(
         self,
         data_type: torch.dtype,
diff --git a/fbgemm_gpu/test/sparse/index_select_test.py b/fbgemm_gpu/test/sparse/index_select_test.py
@@ -26,10 +26,10 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_available
+    from test_utils import gpu_available, optests
 else:
     import fbgemm_gpu.sparse_ops  # noqa: F401, E402
-    from fbgemm_gpu.test.test_utils import gpu_available
+    from fbgemm_gpu.test.test_utils import gpu_available, optests
 
 
 class IndexSelectTest(unittest.TestCase):
@@ -251,6 +251,9 @@ def compare_tensor_groups(
         )
 
     @unittest.skipIf(not gpu_available, "Skip when CUDA is not available")
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_group_index_select_dim0_mixed_input_dtype(self) -> None:
         # lint-fixme: TorchDeviceCuda, TorchFunctionCallCudaDevice
         # CUDA specifically required: testing GPU-only FBGEMM sparse op validation
@@ -268,6 +271,9 @@ def test_group_index_select_dim0_mixed_input_dtype(self) -> None:
             )
 
     @unittest.skipIf(not gpu_available, "Skip when CUDA is not available")
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_group_index_select_dim0_mixed_indices_dtype(self) -> None:
         # lint-fixme: TorchDeviceCuda, TorchFunctionCallCudaDevice
         # CUDA specifically required: testing GPU-only FBGEMM sparse op validation
@@ -499,6 +505,9 @@ def test_batch_index_select_dim0(
         max_examples=20,
         deadline=None,
     )
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_batch_index_select_dim0_long_runs(
         self,
         input_rows_count: int,
diff --git a/fbgemm_gpu/test/sparse/pack_segments_test.py b/fbgemm_gpu/test/sparse/pack_segments_test.py
@@ -23,12 +23,13 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_available, gpu_memory_lt_gb, gpu_unavailable
+    from test_utils import gpu_available, gpu_memory_lt_gb, gpu_unavailable, optests
 else:
     from fbgemm_gpu.test.test_utils import (
         gpu_available,
         gpu_memory_lt_gb,
         gpu_unavailable,
+        optests,
     )
 
 
@@ -423,6 +424,9 @@ def test_pack_segments_meta_backend(
         use_cpu=st.booleans(),
     )
     @settings(deadline=None)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples; op covered by test_pack_segments (T191384137)"
+    )
     def test_pack_segments_noncontig(
         self,
         n: int,
@@ -569,6 +573,9 @@ def test_pack_segments_noncontig(
         ),
     )
     @settings(deadline=None)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples; op covered by test_pack_segments (T191384137)"
+    )
     def test_pack_segments_backward_truncated(self, dtype: torch.dtype) -> None:
         """
         Regression test: when lengths[seq] > max_length, the backward kernel
@@ -634,6 +641,9 @@ def test_pack_segments_backward_truncated(self, dtype: torch.dtype) -> None:
     # output of shape (num_seq, max_length) at fp16, ~8 GiB at the chosen
     # max_length.
     @unittest.skipIf(*gpu_memory_lt_gb(12))
+    @optests.dontGenerateOpCheckTests(
+        "large-grid GPU-memory-gated stress repro; opcheck variants add no coverage (T191384137)"
+    )
     def test_pack_segments_large_grid(self) -> None:
         """
         Reproduces the HIP grid-overflow bug in pack_segments_cuda{,_v2}
diff --git a/fbgemm_gpu/test/sparse/permute_indices_test.py b/fbgemm_gpu/test/sparse/permute_indices_test.py
@@ -182,6 +182,9 @@ def test_permute_indices(
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     @unittest.skipIf(*gpu_unavailable)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_permute_indices_non_contiguous(
         self,
         B: int,
@@ -246,6 +249,9 @@ def create_non_contiguous(x: torch.Tensor) -> torch.Tensor:
     # models returned. So we need to add a unittest to ensure the op return
     # real None, not an undefined tensor.
     @unittest.skipIf(*gpu_unavailable)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_permute_indices_scripted_with_none_weights(
         self,
     ) -> None:
@@ -362,6 +368,9 @@ def test_permute_indices_with_repeats(
     )
     @settings(verbosity=Verbosity.verbose, max_examples=100, deadline=None)
     @unittest.skipIf(*gpu_unavailable)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_permute_1D_sparse_data_vec(
         self,
         num_segments: int,
@@ -511,6 +520,9 @@ def test_permute_1D_sparse_data_vec(
     )
     @settings(verbosity=Verbosity.verbose, max_examples=50, deadline=None)
     @unittest.skipIf(*gpu_unavailable)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_permute_1D_sparse_data_vec_2d_weights(
         self,
         num_segments: int,
@@ -650,6 +662,9 @@ def permute_1d_ref(
     )
     @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None)
     @unittest.skipIf(*gpu_unavailable)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_permute_2D_indices_vec_remainder(
         self,
         long_index: bool,
@@ -726,6 +741,9 @@ def test_permute_2D_indices_vec_remainder(
     )
     @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None)
     @unittest.skipIf(*gpu_unavailable)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
+    )
     def test_permute_2D_indices_large_segments(
         self,
         index_dtype: torch.dtype,
diff --git a/fbgemm_gpu/test/sparse/reorder_batched_test.py b/fbgemm_gpu/test/sparse/reorder_batched_test.py
@@ -20,10 +20,10 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_unavailable, skipIfRocm
+    from test_utils import gpu_unavailable, optests, skipIfRocm
 else:
     import fbgemm_gpu.sparse_ops  # noqa: F401, E402
-    from fbgemm_gpu.test.test_utils import gpu_unavailable, skipIfRocm
+    from fbgemm_gpu.test.test_utils import gpu_unavailable, optests, skipIfRocm
 
 
 class ReorderBatchedTest(unittest.TestCase):
@@ -37,6 +37,9 @@ class ReorderBatchedTest(unittest.TestCase):
         broadcast_lengths=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples; op covered by *_cpu twin (T191384137)"
+    )
     def test_reorder_batched_ad_lengths(
         self,
         B: int,
@@ -135,6 +138,9 @@ def test_reorder_batched_ad_lengths_cpu(
         broadcast_indices=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples; op covered by *_cpu twin (T191384137)"
+    )
     def test_reorder_batched_ad_indices(
         self,
         B: int,
@@ -511,6 +517,9 @@ def test_reorder_batched_sequence_embeddings_cpu(
         ),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None)
+    @optests.dontGenerateOpCheckTests(
+        "GPU-only test; opcheck variants only skip on CPU samples; op covered by *_cpu twin (T191384137)"
+    )
     def test_reorder_batched_sequence_embeddings(
         self,
         B: int,

@@ -75,7 +75,7 @@ def _test_conversion(
         ncols=st.integers(min_value=1, max_value=100),
         exponent_bias=st.integers(min_value=4, max_value=7),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_and_dequantize_op(
         self, nrows: int, ncols: int, exponent_bias: int
     ) -> None:
@@ -33,7 +33,7 @@ class TestMSFPQuantizationConversion(unittest.TestCase):
         nrows=st.integers(min_value=0, max_value=100),
         ncols=st.integers(min_value=0, max_value=100),
     )
-    @settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
+    @settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
     def test_quantize_op(self, nrows: int, ncols: int) -> None:
         ebits = 8
         mbits = 7