Skip to content

Commit fc58d8b

Browse files
q10 authored and facebook-github-bot committed
Fix deadline/flaky + suppress GPU-only opcheck SKIPPING (OMH round 2) (#5932)
Summary:

X-link: facebookresearch/FBGEMM#2850

Second round of fbgemm_dev OMH test-health fixes (T191384137). After the first residual stack, the dashboard reopened with ~60-100+ records that the actual failure logs (meta testinfra) showed were NOT numeric regressions but three mechanical artifacts. This diff fixes the two that are fbgemm's to own.

1. FLAKY (deadline cold-start): every sampled FLAKY failed with hypothesis DeadlineExceeded -> FlakyFailure because the first GPU call in a fresh stress-run process pays 12-27s of CUDA/Triton/JIT cold-start, blowing the settings(deadline=10000). Set deadline=None (what hypothesis recommends; already the convention in 287 sibling fbgemm tests) on the affected GPU hypothesis tests: bfloat16, fused_8bit_rowwise, fused_nbit_rowwise, mixed_dim_int8, hfp8, msfp, permute_pooled_embedding.

2. kmeans bf16 centroids: the bf16 branch kept the tight rtol=2e-3/atol=2e-2 while only fp32 was widened earlier; bf16 GPU atomicAdd order is nondeterministic (~0.023 abs / 0.046 rel) -> FlakyFailure. Widen to 5e-2.

3. SSD inference test_sequence_table_simple_forward: failed HealthCheck.too_slow (each example builds a RocksDB table + GPU forward, 15-33s). Suppress too_slow and cap max_examples=20.

4. block_bucketize test_aot_dispatch_dynamic__...sparse_features_large: 600s tpx timeout from per-example PT2 recompile (my_size up to 1024). Scoped skip of the aot variant in failures_dict (eager + faketensor still cover the op).

5. SKIPPING: ~16 GPU-only base tests (gated skipIf(gpu_unavailable)) generate opcheck variants that only SKIP on CPU/non-NVIDIA samples and add no op coverage (op is covered by non-gated / *_cpu twin tests). Mark them optests.dontGenerateOpCheckTests so the variants are never generated, the same pattern already used for permute_indices *_large_grid. Covers permute_indices (6 methods), index_select (3), pack_segments (3), reorder_batched (3), histogram_binning_calibration (1).
NOTE: a separate cluster of ~70 preproc_legacy_comparison:test_op_* FAILUREs is an EXTERNAL MTIA KernelsDB build break (mtia/kernels_db duplicate acc_ops.square, D108088144) and is not addressed here.

Differential Revision: D109040643
1 parent 739849f commit fc58d8b

13 files changed

Lines changed: 78 additions & 24 deletions

fbgemm_gpu/test/permute/permute_pooled_embedding_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def setUp(self) -> None:
5858
# registration (flaky). T191384137
5959
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
6060

61-
@settings(deadline=10000)
61+
@settings(deadline=None)
6262
@given(fwd_only=st.booleans())
6363
def test_permutation(self, fwd_only: bool) -> None:
6464
net = Net(fwd_only=fwd_only).to(self.device)

fbgemm_gpu/test/quantize/bfloat16_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class SparseNNOperatorsGPUTest(unittest.TestCase):
2727
k=st.integers(min_value=2, max_value=2),
2828
n=st.integers(min_value=2, max_value=2),
2929
)
30-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
30+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
3131
def test_dense_mlp_quantize_ops(
3232
self, precision: str, batch_size: int, k: int, n: int
3333
) -> None:
@@ -64,7 +64,7 @@ class TestBfloat16QuantizationConversion(unittest.TestCase):
6464
nrows=st.integers(min_value=0, max_value=100),
6565
ncols=st.integers(min_value=0, max_value=100),
6666
)
67-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
67+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
6868
def test_quantize_op(self, nrows: int, ncols: int) -> None:
6969
input_data = torch.rand(nrows, ncols).float()
7070
quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
@@ -91,7 +91,7 @@ def test_quantize_op(self, nrows: int, ncols: int) -> None:
9191
nrows=st.integers(min_value=0, max_value=100),
9292
ncols=st.integers(min_value=0, max_value=100),
9393
)
94-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
94+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
9595
def test_quantize_and_dequantize_op(self, nrows: int, ncols: int) -> None:
9696
input_data = torch.rand(nrows, ncols).float()
9797
quantized_data = torch.ops.fbgemm.FloatToBfloat16Quantized(input_data)
@@ -129,7 +129,7 @@ def test_quantize_and_dequantize_op(self, nrows: int, ncols: int) -> None:
129129
@given(
130130
ncols_nrows=st.sampled_from([(65540, 256), (256, 65540)]),
131131
)
132-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
132+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
133133
def test_quantize_and_dequantize_op_cuda_large_nrows_bf16(
134134
self, ncols_nrows: tuple[int, int]
135135
) -> None:

fbgemm_gpu/test/quantize/fused_8bit_rowwise_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class TestFused8BitRowwiseQuantizationConversion(unittest.TestCase):
5050
is_half=st.booleans(),
5151
test_float_or_half_op=st.booleans(),
5252
)
53-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
53+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
5454
def test_quantize_op(
5555
self,
5656
nrows: int,
@@ -336,7 +336,7 @@ def quantize_and_dequantize_op_test_helper( # noqa: C901
336336
),
337337
test_generic_op=st.booleans(),
338338
)
339-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
339+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
340340
def test_quantize_and_dequantize_op_cpu( # noqa: C901
341341
self,
342342
nrows: int,
@@ -361,7 +361,7 @@ def test_quantize_and_dequantize_op_cpu( # noqa: C901
361361
),
362362
test_generic_op=st.booleans(),
363363
)
364-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
364+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
365365
def test_quantize_and_dequantize_op_cuda( # noqa: C901
366366
self,
367367
nrows: int,

fbgemm_gpu/test/quantize/fused_nbit_rowwise_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class TestFusedNBitRowwiseQuantizationConversion(unittest.TestCase):
4949
is_half=st.booleans(),
5050
test_float_or_half_op=st.booleans(),
5151
)
52-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
52+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
5353
def test_quantize_op(
5454
self,
5555
nrows: int,
@@ -151,7 +151,7 @@ def test_quantize_op(
151151
test_meta=st.booleans(),
152152
test_cuda=st.booleans(),
153153
)
154-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
154+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
155155
def test_quantize_and_dequantize_op(
156156
self,
157157
nrows: int,
@@ -357,7 +357,7 @@ def test_quantize_and_dequantize_op_cuda_large_nrows(self) -> None:
357357
[SparseType.BF16]
358358
),
359359
)
360-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
360+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
361361
def test_quantize_and_dequantize_op_cpu_and_cuda(
362362
self,
363363
nrows: int,

fbgemm_gpu/test/quantize/hfp8_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def _test_conversion(
7575
ncols=st.integers(min_value=1, max_value=100),
7676
exponent_bias=st.integers(min_value=4, max_value=7),
7777
)
78-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
78+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
7979
def test_quantize_and_dequantize_op(
8080
self, nrows: int, ncols: int, exponent_bias: int
8181
) -> None:

fbgemm_gpu/test/quantize/mixed_dim_int8_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def test_mixed_dim_8bit_dequantize_op_empty(self) -> None:
5555
min_dim=st.just(1),
5656
max_dim=st.just(100),
5757
)
58-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
58+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
5959
def test_mixed_dim_8bit_dequantize_op(
6060
self,
6161
B: int,
@@ -77,7 +77,7 @@ def test_mixed_dim_8bit_dequantize_op(
7777
min_dim=st.just(100),
7878
max_dim=st.just(1000),
7979
)
80-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
80+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
8181
def test_mixed_dim_8bit_dequantize_op_large_dims(
8282
self,
8383
B: int,
@@ -99,7 +99,7 @@ def test_mixed_dim_8bit_dequantize_op_large_dims(
9999
min_dim=st.just(1),
100100
max_dim=st.just(100),
101101
)
102-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
102+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
103103
def test_mixed_dim_8bit_dequantize_op_large_rows(
104104
self,
105105
B: int,

fbgemm_gpu/test/quantize/msfp_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class TestMSFPQuantizationConversion(unittest.TestCase):
3333
nrows=st.integers(min_value=0, max_value=100),
3434
ncols=st.integers(min_value=0, max_value=100),
3535
)
36-
@settings(deadline=10000, suppress_health_check=[HealthCheck.filter_too_much])
36+
@settings(deadline=None, suppress_health_check=[HealthCheck.filter_too_much])
3737
def test_quantize_op(self, nrows: int, ncols: int) -> None:
3838
ebits = 8
3939
mbits = 7

fbgemm_gpu/test/sparse/failures_dict.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,12 @@
2323
"fbgemm::asynchronous_inclusive_cumsum": {},
2424
"fbgemm::batch_index_select_dim0": {},
2525
"fbgemm::batch_index_select_dim0_tensor": {},
26-
"fbgemm::block_bucketize_sparse_features": {},
26+
"fbgemm::block_bucketize_sparse_features": {
27+
"BlockBucketizeTest.test_aot_dispatch_dynamic__test_block_bucketize_sparse_features_large": {
28+
"comment": "max_examples=32 with my_size up to 1024 (~2048 items) recompiles per hypothesis example under aot_dispatch_dynamic and exceeds the 600s tpx timeout (flaky ~10%). Eager + faketensor variants still cover the op. T191384137",
29+
"status": "skip"
30+
}
31+
},
2732
"fbgemm::block_bucketize_sparse_features_2d_weights": {},
2833
"fbgemm::block_bucketize_sparse_features_inference": {},
2934
"fbgemm::bottom_k_per_row": {

fbgemm_gpu/test/sparse/histogram_binning_calibration_test.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@
2020

2121
if open_source:
2222
# pyre-ignore[21]
23-
from test_utils import gpu_unavailable
23+
from test_utils import gpu_unavailable, optests
2424
else:
2525
import fbgemm_gpu.sparse_ops # noqa: F401, E402
26-
from fbgemm_gpu.test.test_utils import gpu_unavailable
26+
from fbgemm_gpu.test.test_utils import gpu_unavailable, optests
2727

2828

2929
class HistogramBinningCalibrationTest(unittest.TestCase):
@@ -327,6 +327,9 @@ def test_generic_histogram_binning_calibration_by_feature(
327327
data_type=st.sampled_from([torch.bfloat16, torch.half, torch.float32]),
328328
)
329329
@settings(verbosity=Verbosity.verbose, deadline=None)
330+
@optests.dontGenerateOpCheckTests(
331+
"GPU-only CPU/GPU parity test; op covered by test_generic_histogram_binning_calibration_by_feature (T191384137)"
332+
)
330333
def test_generic_histogram_binning_calibration_by_feature_cpu_gpu(
331334
self,
332335
data_type: torch.dtype,

fbgemm_gpu/test/sparse/index_select_test.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626

2727
if open_source:
2828
# pyre-ignore[21]
29-
from test_utils import gpu_available
29+
from test_utils import gpu_available, optests
3030
else:
3131
import fbgemm_gpu.sparse_ops # noqa: F401, E402
32-
from fbgemm_gpu.test.test_utils import gpu_available
32+
from fbgemm_gpu.test.test_utils import gpu_available, optests
3333

3434

3535
class IndexSelectTest(unittest.TestCase):
@@ -251,6 +251,9 @@ def compare_tensor_groups(
251251
)
252252

253253
@unittest.skipIf(not gpu_available, "Skip when CUDA is not available")
254+
@optests.dontGenerateOpCheckTests(
255+
"GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
256+
)
254257
def test_group_index_select_dim0_mixed_input_dtype(self) -> None:
255258
# lint-fixme: TorchDeviceCuda, TorchFunctionCallCudaDevice
256259
# CUDA specifically required: testing GPU-only FBGEMM sparse op validation
@@ -268,6 +271,9 @@ def test_group_index_select_dim0_mixed_input_dtype(self) -> None:
268271
)
269272

270273
@unittest.skipIf(not gpu_available, "Skip when CUDA is not available")
274+
@optests.dontGenerateOpCheckTests(
275+
"GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
276+
)
271277
def test_group_index_select_dim0_mixed_indices_dtype(self) -> None:
272278
# lint-fixme: TorchDeviceCuda, TorchFunctionCallCudaDevice
273279
# CUDA specifically required: testing GPU-only FBGEMM sparse op validation
@@ -499,6 +505,9 @@ def test_batch_index_select_dim0(
499505
max_examples=20,
500506
deadline=None,
501507
)
508+
@optests.dontGenerateOpCheckTests(
509+
"GPU-only test; opcheck variants only skip on CPU samples and add no op coverage (T191384137)"
510+
)
502511
def test_batch_index_select_dim0_long_runs(
503512
self,
504513
input_rows_count: int,

0 commit comments

Comments
 (0)