Skip to content

Commit 9e0d798

Browse files
Merge pull request #30 from stackav-oss/feature/jmanning/cleanup-mp-gemm
Tune MP GEMM kernel
2 parents: 2c95868 + 389c0c8 — commit 9e0d798

2 files changed

Lines changed: 75 additions & 85 deletions

File tree

conch/kernels/quantization/gemm.py

Lines changed: 5 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -135,33 +135,13 @@ def _get_metadata_eviction_policy() -> str:
135135

136136
def _get_tuning_parameters() -> dict[str, int]:
137137
"""Get block sizes/tuning parameters for current device."""
138-
device_name = current_platform.get_device_name()
139-
140-
if "H100" in device_name:
141-
return {
142-
"cxpr_block_size_m": 128,
143-
"cxpr_block_size_n": 128,
144-
"cxpr_block_size_k": 128,
145-
"cxpr_group_size_m": 8,
146-
"num_warps": 8,
147-
"num_stages": 2,
148-
}
149-
150-
if "MI300X" in device_name:
151-
return {
152-
"cxpr_block_size_m": 128,
153-
"cxpr_block_size_n": 64,
154-
"cxpr_block_size_k": 128,
155-
"cxpr_group_size_m": 16,
156-
"num_warps": 8,
157-
"num_stages": 2,
158-
}
159-
160138
return {
161-
"cxpr_block_size_m": 64,
139+
"cxpr_block_size_m": 128,
162140
"cxpr_block_size_n": 64,
163-
"cxpr_block_size_k": 32,
164-
"cxpr_group_size_m": 8,
141+
"cxpr_block_size_k": 64,
142+
"cxpr_group_size_m": 16,
143+
"num_warps": 8,
144+
"num_stages": 2,
165145
}
166146

167147

conch/ops/quantization/gemm.py

Lines changed: 70 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -51,31 +51,33 @@ def create_mixed_precision_metadata(
5151
acc_dtype: torch.dtype | None = None,
5252
meta_dtype: torch.dtype | None = None,
5353
scaled_activations: bool = False,
54+
strict: bool = False,
5455
) -> MixedPrecisionMatmulMetadata:
5556
"""Verify sizes and dtypes of tensors and deduce metadata parameters."""
56-
expected_input_matrix_rank: Final = 2
57+
if strict:
58+
expected_input_matrix_rank: Final = 2
5759

58-
if (x_rank := len(x.shape)) != expected_input_matrix_rank:
59-
error_msg = f"Unexpected number of dimensions of input tensor x: {x_rank}"
60-
raise ValueError(error_msg)
60+
if (x_rank := len(x.shape)) != expected_input_matrix_rank:
61+
error_msg = f"Unexpected number of dimensions of input tensor x: {x_rank}"
62+
raise ValueError(error_msg)
6163

62-
if (w_q_packed_rank := len(w_q_packed.shape)) != expected_input_matrix_rank:
63-
error_msg = f"Unexpected number of dimensions of input tensor w_q_packed: {w_q_packed_rank}"
64-
raise ValueError(error_msg)
64+
if (w_q_packed_rank := len(w_q_packed.shape)) != expected_input_matrix_rank:
65+
error_msg = f"Unexpected number of dimensions of input tensor w_q_packed: {w_q_packed_rank}"
66+
raise ValueError(error_msg)
6567

66-
if (w_s_rank := len(w_s.shape)) != expected_input_matrix_rank:
67-
error_msg = f"Unexpected number of dimensions of input tensor w_s: {w_s_rank}"
68-
raise ValueError(error_msg)
68+
if (w_s_rank := len(w_s.shape)) != expected_input_matrix_rank:
69+
error_msg = f"Unexpected number of dimensions of input tensor w_s: {w_s_rank}"
70+
raise ValueError(error_msg)
6971

70-
if w_zp is not None and (w_zp_rank := len(w_zp.shape)) != expected_input_matrix_rank:
71-
error_msg = f"Unexpected number of dimensions of input tensor w_zp: {w_zp_rank}"
72-
raise ValueError(error_msg)
72+
if w_zp is not None and (w_zp_rank := len(w_zp.shape)) != expected_input_matrix_rank:
73+
error_msg = f"Unexpected number of dimensions of input tensor w_zp: {w_zp_rank}"
74+
raise ValueError(error_msg)
7375

74-
# Expecting some form of 32-bit packing
75-
expected_packed_dtypes: Final = [torch.uint32, torch.int32]
76-
if (packed_dtype := w_q_packed.dtype) not in expected_packed_dtypes:
77-
error_msg = f"Invalid datatype for packed weights: {packed_dtype}"
78-
raise ValueError(error_msg)
76+
# Expecting some form of 32-bit packing
77+
expected_packed_dtypes: Final = [torch.uint32, torch.int32]
78+
if (packed_dtype := w_q_packed.dtype) not in expected_packed_dtypes:
79+
error_msg = f"Invalid datatype for packed weights: {packed_dtype}"
80+
raise ValueError(error_msg)
7981

8082
# Assume 32-bit packing
8183
packed_bitwidth: Final = 32
@@ -86,25 +88,27 @@ def create_mixed_precision_metadata(
8688

8789
unpack_mask = 2**weight_size_bits - 1
8890

89-
# Verify shape of w_s
90-
expected_scales_shape: Final = (k_dim // group_size, n_dim)
91-
if (scales_shape := w_s.shape) != expected_scales_shape:
92-
error_msg = f"Invalid w_s shape (expected: {expected_scales_shape}, actual: {scales_shape})"
93-
raise ValueError(error_msg)
94-
9591
# Check if zeros is a scalar value
9692
zero_is_scalar = False if w_zp is None else w_zp.numel() == 1
97-
# Expected shape of zeros tensor if 1) it is not scalar 2) it is not None
98-
expected_zeros_shape: Final = (k_dim // group_size, n_dim)
99-
# Verify shape of w_zp
100-
if not zero_is_scalar and w_zp is not None and (zeros_shape := w_zp.shape) != expected_zeros_shape:
101-
error_msg = f"Invalid w_zp shape (expected: {expected_zeros_shape}, actual: {zeros_shape})"
102-
raise ValueError(error_msg)
103-
104-
# Not supporting scaled activations right now, but we can add support later if needed. This simplifies the interface
105-
if scaled_activations:
106-
error_msg = "Scaled activations not yet implemented (need to deduce correct channel_scale_mode)"
107-
raise NotImplementedError(error_msg)
93+
94+
if strict:
95+
# Verify shape of w_s
96+
expected_scales_shape: Final = (k_dim // group_size, n_dim)
97+
if (scales_shape := w_s.shape) != expected_scales_shape:
98+
error_msg = f"Invalid w_s shape (expected: {expected_scales_shape}, actual: {scales_shape})"
99+
raise ValueError(error_msg)
100+
101+
# Expected shape of zeros tensor if 1) it is not scalar 2) it is not None
102+
expected_zeros_shape: Final = (k_dim // group_size, n_dim)
103+
# Verify shape of w_zp
104+
if not zero_is_scalar and w_zp is not None and (zeros_shape := w_zp.shape) != expected_zeros_shape:
105+
error_msg = f"Invalid w_zp shape (expected: {expected_zeros_shape}, actual: {zeros_shape})"
106+
raise ValueError(error_msg)
107+
108+
# Not supporting scaled activations right now, but we can add support later if needed. This simplifies the interface
109+
if scaled_activations:
110+
error_msg = "Scaled activations not yet implemented (need to deduce correct channel_scale_mode)"
111+
raise NotImplementedError(error_msg)
108112

109113
return MixedPrecisionMatmulMetadata(
110114
m_dim=m_dim,
@@ -139,6 +143,7 @@ def mixed_precision_gemm(
139143
acc_dtype: torch.dtype | None = None,
140144
meta_dtype: torch.dtype | None = None,
141145
scaled_activations: bool = False,
146+
strict: bool = False,
142147
) -> torch.Tensor:
143148
"""Mixed precision GEMM operation."""
144149
metadata = create_mixed_precision_metadata(
@@ -153,6 +158,7 @@ def mixed_precision_gemm(
153158
acc_dtype=acc_dtype,
154159
meta_dtype=meta_dtype,
155160
scaled_activations=scaled_activations,
161+
strict=strict,
156162
)
157163

158164
output = torch.zeros((metadata.m_dim, metadata.n_dim), device=x.device, dtype=metadata.output_dtype)
@@ -168,42 +174,45 @@ def create_scaled_metadata(
168174
scale_a: torch.Tensor,
169175
scale_b: torch.Tensor,
170176
output_dtype: torch.dtype,
177+
strict: bool = False,
171178
) -> ScaledMatmulMetadata:
172179
"""Verify sizes and dtypes of tensors and deduce metadata parameters."""
173-
expected_input_matrix_rank: Final = 2
180+
if strict:
181+
expected_input_matrix_rank: Final = 2
174182

175-
if (a_rank := len(a.shape)) != expected_input_matrix_rank:
176-
error_msg = f"Unexpected number of dimensions of input tensor a: {a_rank}"
177-
raise ValueError(error_msg)
183+
if (a_rank := len(a.shape)) != expected_input_matrix_rank:
184+
error_msg = f"Unexpected number of dimensions of input tensor a: {a_rank}"
185+
raise ValueError(error_msg)
178186

179-
if (b_rank := len(b.shape)) != expected_input_matrix_rank:
180-
error_msg = f"Unexpected number of dimensions of input tensor b: {b_rank}"
181-
raise ValueError(error_msg)
187+
if (b_rank := len(b.shape)) != expected_input_matrix_rank:
188+
error_msg = f"Unexpected number of dimensions of input tensor b: {b_rank}"
189+
raise ValueError(error_msg)
182190

183-
if a.dtype != b.dtype:
184-
error_msg = f"Input tensors a and b must have the same datatype (a: {a.dtype}, b: {b.dtype})"
185-
raise ValueError(error_msg)
191+
if a.dtype != b.dtype:
192+
error_msg = f"Input tensors a and b must have the same datatype (a: {a.dtype}, b: {b.dtype})"
193+
raise ValueError(error_msg)
186194

187195
m_dim, k_dim = a.shape
188196
_, n_dim = b.shape
189197

190-
if scale_a.numel() != 1:
191-
if (scale_a_rank := len(scale_a.shape)) != expected_input_matrix_rank:
192-
error_msg = f"Unexpected number of dimensions of input tensor scale_a: {scale_a_rank}"
193-
raise ValueError(error_msg)
198+
if strict:
199+
if scale_a.numel() != 1:
200+
if (scale_a_rank := len(scale_a.shape)) != expected_input_matrix_rank:
201+
error_msg = f"Unexpected number of dimensions of input tensor scale_a: {scale_a_rank}"
202+
raise ValueError(error_msg)
194203

195-
if scale_a.shape[0] != m_dim:
196-
error_msg = f"Invalid scale_a shape (expected: ({m_dim},), actual: {scale_a.shape})"
197-
raise ValueError(error_msg)
204+
if scale_a.shape[0] != m_dim:
205+
error_msg = f"Invalid scale_a shape (expected: ({m_dim},), actual: {scale_a.shape})"
206+
raise ValueError(error_msg)
198207

199-
if scale_b.numel() != 1:
200-
if (scale_b_rank := len(scale_b.shape)) != expected_input_matrix_rank:
201-
error_msg = f"Unexpected number of dimensions of input tensor scale_b: {scale_b_rank}"
202-
raise ValueError(error_msg)
208+
if scale_b.numel() != 1:
209+
if (scale_b_rank := len(scale_b.shape)) != expected_input_matrix_rank:
210+
error_msg = f"Unexpected number of dimensions of input tensor scale_b: {scale_b_rank}"
211+
raise ValueError(error_msg)
203212

204-
if scale_b.shape[0] != n_dim:
205-
error_msg = f"Invalid scale_b shape (expected: ({n_dim},), actual: {scale_b.shape})"
206-
raise ValueError(error_msg)
213+
if scale_b.shape[0] != n_dim:
214+
error_msg = f"Invalid scale_b shape (expected: ({n_dim},), actual: {scale_b.shape})"
215+
raise ValueError(error_msg)
207216

208217
return ScaledMatmulMetadata(
209218
m_dim=m_dim,
@@ -228,9 +237,10 @@ def scaled_gemm(
228237
scale_b: torch.Tensor,
229238
output_dtype: torch.dtype,
230239
bias: torch.Tensor | None = None,
240+
strict: bool = False,
231241
) -> torch.Tensor:
232242
"""Scaled GEMM operation."""
233-
metadata = create_scaled_metadata(a, b, scale_a, scale_b, output_dtype)
243+
metadata = create_scaled_metadata(a, b, scale_a, scale_b, output_dtype, strict=strict)
234244

235245
output = torch.zeros((metadata.m_dim, metadata.n_dim), device=a.device, dtype=output_dtype)
236246

0 commit comments

Comments (0)