Skip to content

Commit 44fad6c

Browse files
committed
Enable previously skipped tests on CDNA (warp size 64)
Remove ROCM_WARP_SIZE_64 guards from all test files now that blocksize-32/64 quantization and GEMV kernels work on warp-64 hardware.
1 parent e3b9c46 commit 44fad6c

File tree

5 files changed

+13
-29
lines changed

5 files changed

+13
-29
lines changed

tests/test_functional.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import bitsandbytes as bnb
1212
from bitsandbytes import functional as F
13-
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
1413
from tests.helpers import (
1514
BOOLEAN_TUPLES,
1615
TRUE_FALSE,
@@ -96,7 +95,7 @@ class Test8BitBlockwiseQuantizeFunctional:
9695
@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested"))
9796
@pytest.mark.parametrize(
9897
"blocksize",
99-
[4096, 2048, 1024, 512, 256, 128, 64] if not ROCM_WARP_SIZE_64 else [4096, 2048, 1024, 512, 256, 128],
98+
[4096, 2048, 1024, 512, 256, 128, 64],
10099
)
101100
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
102101
def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
@@ -509,7 +508,6 @@ def test_dim3_igemm(self, seq_dim, hidden_dim, batch_dim):
509508
@pytest.mark.parametrize("hidden_dim", [32, 1024 * 4], ids=id_formatter("hidden_dim"))
510509
@pytest.mark.parametrize("batch_dim", [2, 16], ids=id_formatter("batch_dim"))
511510
@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose"))
512-
@pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
513511
def test_minmax_igemm(self, seq_dim, hidden_dim, batch_dim, transpose):
514512
def min_max(x):
515513
maxA = torch.amax(x, dim=2, keepdim=True)
@@ -844,7 +842,7 @@ class TestQuantize4BitFunctional:
844842
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
845843
@pytest.mark.parametrize(
846844
"blocksize",
847-
[32, 64, 128, 256, 512, 1024, 2048, 4096] if not ROCM_WARP_SIZE_64 else [64, 128, 256, 512, 1024, 2048, 4096],
845+
[32, 64, 128, 256, 512, 1024, 2048, 4096],
848846
)
849847
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
850848
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
@@ -927,9 +925,7 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
927925

928926
@pytest.mark.parametrize("device", get_available_devices())
929927
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
930-
@pytest.mark.parametrize(
931-
"blocksize", [32, 64, 128] if not ROCM_WARP_SIZE_64 else [64, 128], ids=id_formatter("blocksize")
932-
)
928+
@pytest.mark.parametrize("blocksize", [32, 64, 128], ids=id_formatter("blocksize"))
933929
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
934930
def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
935931
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
@@ -966,9 +962,7 @@ def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
966962
@pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No accelerator device")
967963
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
968964
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
969-
@pytest.mark.parametrize(
970-
"blocksize", [32, 64, 128] if not ROCM_WARP_SIZE_64 else [64, 128], ids=id_formatter("blocksize")
971-
)
965+
@pytest.mark.parametrize("blocksize", [32, 64, 128], ids=id_formatter("blocksize"))
972966
def test_4bit_quant_large(self, device, dtype, quant_type, blocksize):
973967
"""
974968
Test that we can successfully quantize a large tensor. Note that the following limitations apply:
@@ -1028,9 +1022,6 @@ def test_bench_4bit_dequant(self, quant_type):
10281022
# torch.cuda.synchronize()
10291023
# print((time.time()-t0)/iters*1e6)
10301024

1031-
@pytest.mark.skipif(
1032-
ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
1033-
)
10341025
@pytest.mark.parametrize("device", get_available_devices())
10351026
@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
10361027
@pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
@@ -1185,7 +1176,6 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind):
11851176
@pytest.mark.parametrize("device", get_available_devices())
11861177
@pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
11871178
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
1188-
@pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
11891179
def test_gemv_eye_4bit(self, device, storage_type, dtype):
11901180
if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
11911181
pytest.skip("This configuration is not supported on HPU.")

tests/test_linear4bit.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import torch
1212

1313
import bitsandbytes as bnb
14-
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
1514
from tests.helpers import (
1615
TRUE_FALSE,
1716
describe_dtype,
@@ -195,7 +194,7 @@ def test_linear_serialization(
195194

196195
@pytest.mark.parametrize("device", get_available_devices())
197196
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
198-
@pytest.mark.parametrize("blocksize", [32, 64, 128] if not ROCM_WARP_SIZE_64 else [64, 128])
197+
@pytest.mark.parametrize("blocksize", [32, 64, 128])
199198
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
200199
def test_copy_param(device, quant_type, blocksize, compress_statistics):
201200
if device == "hpu" and not is_supported_on_hpu(quant_type):
@@ -286,7 +285,7 @@ def test_quant_storage_shard_roundtrip(device, quant_type, quant_storage):
286285

287286
@pytest.mark.parametrize("device", get_available_devices())
288287
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
289-
@pytest.mark.parametrize("blocksize", [32, 64, 128] if not ROCM_WARP_SIZE_64 else [64, 128])
288+
@pytest.mark.parametrize("blocksize", [32, 64, 128])
290289
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
291290
def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
292291
if device == "hpu" and not is_supported_on_hpu(quant_type):
@@ -315,7 +314,7 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
315314

316315
@pytest.mark.parametrize("device", get_available_devices())
317316
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
318-
@pytest.mark.parametrize("blocksize", [32, 64, 128] if not ROCM_WARP_SIZE_64 else [64, 128])
317+
@pytest.mark.parametrize("blocksize", [32, 64, 128])
319318
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
320319
def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
321320
if device == "hpu" and not is_supported_on_hpu(quant_type):

tests/test_linear8bitlt.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import torch
1111

1212
import bitsandbytes as bnb
13-
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
1413
from bitsandbytes.nn.modules import Linear8bitLt
1514
from tests.helpers import (
1615
TRUE_FALSE,
@@ -238,7 +237,6 @@ def test_linear8bit_serialization(linear8bit):
238237
@pytest.mark.skipif(
239238
torch.__version__ < (2, 10) and sys.version_info >= (3, 14), reason="Not supported in Python 3.14 until torch 2.10"
240239
)
241-
@pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
242240
def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
243241
if device == "cuda" and platform.system() == "Windows":
244242
pytest.skip("Triton is not officially supported on Windows")

tests/test_ops.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import torch
55

66
import bitsandbytes
7-
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
87
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu
98

109
# torch.library.opcheck is only available in torch 2.4 and later.
@@ -102,7 +101,7 @@ def test_int8_scaled_mm(self, device, dtype, has_bias):
102101
class TestInt8BlockwiseQuantOps:
103102
@pytest.mark.parametrize("device", get_available_devices())
104103
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
105-
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512])
104+
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
106105
def test_quantize_blockwise(self, device, dtype, blocksize):
107106
if device == "cpu":
108107
if dtype != torch.float32:
@@ -126,7 +125,7 @@ def test_quantize_blockwise(self, device, dtype, blocksize):
126125

127126
@pytest.mark.parametrize("device", get_available_devices())
128127
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
129-
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512])
128+
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
130129
def test_dequantize_blockwise(self, device, dtype, blocksize):
131130
if device == "cpu" and dtype != torch.float32:
132131
pytest.skip("CPU implementation is only available for float32")
@@ -152,7 +151,7 @@ class Test4bitBlockwiseQuantOps:
152151
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
153152
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
154153
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
155-
@pytest.mark.parametrize("blocksize", [32, 64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [64, 128, 256, 512])
154+
@pytest.mark.parametrize("blocksize", [32, 64, 128, 256, 512])
156155
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
157156
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
158157
pytest.skip("This configuration is not supported on HPU.")
@@ -202,7 +201,7 @@ def test_quantize_4bit_not_divisible_by_blocksize(self, device, dtype, quant_typ
202201
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
203202
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
204203
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
205-
@pytest.mark.parametrize("blocksize", [32, 64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [64, 128, 256, 512])
204+
@pytest.mark.parametrize("blocksize", [32, 64, 128, 256, 512])
206205
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
207206
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
208207
pytest.skip("This configuration is not supported on HPU.")
@@ -236,8 +235,7 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
236235
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
237236
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
238237
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
239-
@pytest.mark.parametrize("blocksize", [32, 64, 128, 256, 512] if not ROCM_WARP_SIZE_64 else [128, 256, 512])
240-
@pytest.mark.skipif(ROCM_WARP_SIZE_64, reason="this test is not supported on ROCm yet")
238+
@pytest.mark.parametrize("blocksize", [32, 64, 128, 256, 512])
241239
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
242240
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
243241
pytest.skip("This configuration is not supported on HPU.")

tests/test_parametrize.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import torch.nn as nn
44

55
from bitsandbytes import functional as F
6-
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
76
from bitsandbytes.nn.parametrize import (
87
Bnb4bitParametrization,
98
replace_parameter_4bit,
@@ -336,7 +335,7 @@ def test_multiple_parameters(device, dtype):
336335
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
337336
@pytest.mark.parametrize(
338337
"blocksize",
339-
[64, 128, 256] if not ROCM_WARP_SIZE_64 else [128, 256],
338+
[64, 128, 256],
340339
)
341340
def test_different_blocksizes(device, dtype, blocksize):
342341
"""Test parametrization with different block sizes to verify flexibility."""

0 commit comments

Comments (0)