Merge pull request #18 from stackav-oss/feature/jmanning/wheel-fixing

jmanning-stackav · web-flow · commit 55254f31e497 · 2025-06-04T10:50:32.000-04:00
Platform-agnostic wheel fixes
diff --git a/benchmarks/bnb_dequantize_blockwise_benchmark.py b/benchmarks/bnb_dequantize_blockwise_benchmark.py
@@ -187,9 +187,7 @@ def main(  # noqa: PLR0913
             error_msg = "bitsandbytes must be installed and enabled via CONCH_ENABLE_BNB=1"
             raise NotImplementedError(error_msg)
 
-        from bitsandbytes.functional import (  # type: ignore[import-not-found, import-untyped, unused-ignore]  # isort:skip
-            dequantize_4bit as bnb_dequantize_4bit,
-        )
+        from bitsandbytes.functional import dequantize_4bit as bnb_dequantize_4bit
         from bitsandbytes.functional import quantize_4bit as bnb_quantize_4bit
 
         bnb_quantized, bnb_state = bnb_quantize_4bit(
diff --git a/benchmarks/bnb_quantize_blockwise_benchmark.py b/benchmarks/bnb_quantize_blockwise_benchmark.py
@@ -177,9 +177,7 @@ def main(  # noqa: PLR0913
             error_msg = "bitsandbytes must be installed and enabled via CONCH_ENABLE_BNB=1"
             raise NotImplementedError(error_msg)
 
-        from bitsandbytes.functional import (  # type: ignore[import-not-found, import-untyped, unused-ignore]  # isort:skip
-            quantize_4bit as bnb_quantize_4bit,
-        )
+        from bitsandbytes.functional import quantize_4bit as bnb_quantize_4bit
 
         bnb_output, bnb_state = bnb_quantize_4bit(
             x,
diff --git a/benchmarks/mixed_precision_gemm_benchmark.py b/benchmarks/mixed_precision_gemm_benchmark.py
@@ -18,7 +18,7 @@
 from conch.utils.benchmark import BenchmarkMetadata, benchmark_it
 
 if envs.CONCH_ENABLE_VLLM and current_platform.has_cuda():
-    from vllm import _custom_ops as vllm_custom_ops  # type: ignore[import-not-found, unused-ignore]
+    from vllm import _custom_ops as vllm_custom_ops
 else:
     vllm_custom_ops = None  # type: ignore[assignment, unused-ignore]
 
diff --git a/benchmarks/paged_attention_benchmark.py b/benchmarks/paged_attention_benchmark.py
@@ -15,9 +15,7 @@
 from conch.utils.benchmark import BenchmarkMetadata, benchmark_it
 
 if envs.CONCH_ENABLE_VLLM and current_platform.has_cuda():
-    from vllm._custom_ops import (
-        paged_attention_v2 as vllm_paged_attention_v2,  # type: ignore[import-not-found, import-untyped, unused-ignore]
-    )
+    from vllm._custom_ops import paged_attention_v2 as vllm_paged_attention_v2
 else:
     vllm_paged_attention_v2 = None  # type: ignore[assignment, unused-ignore]
 
diff --git a/benchmarks/paged_attention_vs_flash_benchmark.py b/benchmarks/paged_attention_vs_flash_benchmark.py
@@ -15,9 +15,7 @@
 from conch.utils.benchmark import BenchmarkMetadata, benchmark_it
 
 if envs.CONCH_ENABLE_VLLM and current_platform.is_nvidia():
-    from vllm.vllm_flash_attn import (  # type: ignore[attr-defined, import-not-found, import-untyped, unused-ignore]  # isort:skip
-        flash_attn_with_kvcache,
-    )
+    from vllm.vllm_flash_attn import flash_attn_with_kvcache  # type: ignore[attr-defined, unused-ignore]
 else:
     flash_attn_with_kvcache = None  # type: ignore[assignment, unused-ignore]
 
diff --git a/benchmarks/varlen_attention_benchmark.py b/benchmarks/varlen_attention_benchmark.py
@@ -16,9 +16,7 @@
 from conch.utils.benchmark import BenchmarkMetadata, benchmark_it
 
 if envs.CONCH_ENABLE_VLLM and current_platform.is_nvidia():
-    from vllm.vllm_flash_attn import (  # type: ignore[attr-defined, import-not-found, unused-ignore]  # isort:skip
-        flash_attn_varlen_func,
-    )
+    from vllm.vllm_flash_attn import flash_attn_varlen_func  # type: ignore[attr-defined, unused-ignore]
 else:
     flash_attn_varlen_func = None  # type: ignore[assignment, unused-ignore]
 
diff --git a/conch/reference/activation/gelu_tanh_and_mul.py b/conch/reference/activation/gelu_tanh_and_mul.py
@@ -17,7 +17,7 @@ def _gelu_tanh_and_mul_pytorch_ref(x: torch.Tensor) -> torch.Tensor:
 
 def _gelu_tanh_and_mul_vllm_ref(x: torch.Tensor) -> torch.Tensor:
     """vLLM reference gelu_tanh_and_mul impl."""
-    from vllm.model_executor.layers.activation import GeluAndMul  # type: ignore[import-not-found, unused-ignore]
+    from vllm.model_executor.layers.activation import GeluAndMul
 
     gelu_layer = GeluAndMul("tanh")
     return gelu_layer.forward_cuda(x)  # type: ignore[no-any-return, unused-ignore]
diff --git a/conch/reference/activation/silu_and_mul.py b/conch/reference/activation/silu_and_mul.py
@@ -17,7 +17,7 @@ def _silu_and_mul_pytorch_ref(x: torch.Tensor) -> torch.Tensor:
 
 def _silu_and_mul_vllm_ref(x: torch.Tensor) -> torch.Tensor:
     """vLLM reference silu and mul implementation."""
-    from vllm.model_executor.layers.activation import SiluAndMul  # type: ignore[import-not-found, unused-ignore]
+    from vllm.model_executor.layers.activation import SiluAndMul
 
     silu_layer = SiluAndMul()  # type: ignore[no-untyped-call, unused-ignore]
     return silu_layer.forward_cuda(x)  # type: ignore[no-any-return, unused-ignore]
diff --git a/conch/reference/embedding/rotary_embedding.py b/conch/reference/embedding/rotary_embedding.py
@@ -99,18 +99,18 @@ def _rotary_embedding_vllm_ref(
     offsets: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """vLLM reference rotary_embedding impl."""
-    from vllm import _custom_ops as ops  # type: ignore[import-not-found, unused-ignore]
+    from vllm import _custom_ops as vllm_custom_ops
 
     cos_sin_cache = cos_sin_cache.to(query.device, dtype=query.dtype)
 
-    # ops.rotary_embedding()/batched_rotary_embedding()
+    # vllm_custom_ops.rotary_embedding()/batched_rotary_embedding()
     # are in-place operations that update the query and key tensors.
     if offsets is not None:
-        ops.batched_rotary_embedding(
+        vllm_custom_ops.batched_rotary_embedding(
             positions, query, key, head_size, cos_sin_cache, is_neox_style, rotary_dim, offsets
         )
     else:
-        ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_style)
+        vllm_custom_ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_style)
 
     return query, key
 
diff --git a/conch/reference/normalization/gemma_rms_norm.py b/conch/reference/normalization/gemma_rms_norm.py
@@ -37,7 +37,7 @@ def _gemma_rms_norm_vllm_ref(
     residual: torch.Tensor | None,
 ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
     """vLLM reference gemma_rms_norm impl."""
-    from vllm.model_executor.layers.layernorm import GemmaRMSNorm  # type: ignore[import-not-found, unused-ignore]
+    from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 
     layer = GemmaRMSNorm(hidden_size=weight.size(0), eps=variance_epsilon)
     layer.weight = torch.nn.Parameter(weight)
diff --git a/conch/reference/normalization/rms_norm.py b/conch/reference/normalization/rms_norm.py
@@ -22,7 +22,7 @@ def _rms_norm_pytorch_ref(x: torch.Tensor, weight: torch.Tensor, epsilon: float)
 
 def _rms_norm_vllm_ref(x: torch.Tensor, weight: torch.Tensor, epsilon: float) -> torch.Tensor:
     """vLLM reference rms_norm impl."""
-    from vllm._custom_ops import rms_norm as rms_norm_cuda  # type: ignore[import-not-found, unused-ignore]
+    from vllm._custom_ops import rms_norm as rms_norm_cuda
 
     out = torch.empty_like(x, dtype=x.dtype, device=x.device)
     rms_norm_cuda(out, x, weight, epsilon)
@@ -57,9 +57,7 @@ def _fused_add_rms_norm_vllm_ref(
     epsilon: float,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """vLLM reference fused_add_rms_norm impl."""
-    from vllm._custom_ops import (
-        fused_add_rms_norm as fused_add_rms_norm_cuda,  # type: ignore[import-not-found, unused-ignore]
-    )
+    from vllm._custom_ops import fused_add_rms_norm as fused_add_rms_norm_cuda
 
     fused_add_rms_norm_cuda(x, residual, weight, epsilon)
 
diff --git a/conch/reference/quantization/fp8.py b/conch/reference/quantization/fp8.py
@@ -19,9 +19,7 @@ def _scaled_fp8_quant_pytorch_ref(x: torch.Tensor, scale: torch.Tensor) -> torch
 
 def _scaled_fp8_quant_vllm_ref(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     """vLLM reference fp8 quant impl."""
-    from vllm._custom_ops import (
-        scaled_fp8_quant as scaled_fp8_quant_vllm,  # type: ignore[import-not-found, unused-ignore]
-    )
+    from vllm._custom_ops import scaled_fp8_quant as scaled_fp8_quant_vllm
 
     output, _ = scaled_fp8_quant_vllm(x, scale)
     return output  # type: ignore[no-any-return, unused-ignore]
diff --git a/conch/reference/quantization/int8.py b/conch/reference/quantization/int8.py
@@ -19,9 +19,7 @@ def _scaled_int8_quant_pytorch_ref(x: torch.Tensor, scale: torch.Tensor) -> torc
 
 def _scaled_int8_quant_vllm_ref(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     """vLLM reference int8 quant impl."""
-    from vllm._custom_ops import (
-        scaled_int8_quant as scaled_int8_quant_vllm,  # type: ignore[import-not-found, unused-ignore]
-    )
+    from vllm._custom_ops import scaled_int8_quant as scaled_int8_quant_vllm
 
     output, _, _ = scaled_int8_quant_vllm(x, scale)
     return output  # type: ignore[no-any-return, unused-ignore]
diff --git a/conch/reference/quantization/scaled_gemm.py b/conch/reference/quantization/scaled_gemm.py
@@ -34,7 +34,7 @@ def _scaled_gemm_vllm_ref(
     out_dtype: torch.dtype,
     bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    from vllm._custom_ops import cutlass_scaled_mm  # type: ignore[import-not-found, unused-ignore]
+    from vllm._custom_ops import cutlass_scaled_mm
 
     return cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)  # type: ignore[no-any-return, unused-ignore]
 
diff --git a/conch/reference/vllm/copy_blocks.py b/conch/reference/vllm/copy_blocks.py
@@ -23,7 +23,7 @@ def _copy_blocks_vllm_ref(
     key_caches: list[torch.Tensor], value_caches: list[torch.Tensor], block_mapping: list[tuple[int, int]]
 ) -> None:
     """Reference vLLM implementation of copy_blocks."""
-    from vllm._custom_ops import copy_blocks as copy_blocks_vllm  # type: ignore[import-not-found, unused-ignore]
+    from vllm._custom_ops import copy_blocks as copy_blocks_vllm
 
     block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64, device=key_caches[0].device).view(-1, 2)
     copy_blocks_vllm(key_caches, value_caches, block_mapping_tensor)
diff --git a/conch/reference/vllm/reshape_and_cache.py b/conch/reference/vllm/reshape_and_cache.py
@@ -56,9 +56,7 @@ def _reshape_and_cache_vllm_ref(
     v_scale: torch.Tensor,
 ) -> None:
     """Reference vLLM implementation of reshape_and_cache."""
-    from vllm._custom_ops import (
-        reshape_and_cache as reshape_and_cache_vllm,  # type: ignore[import-not-found, unused-ignore]
-    )
+    from vllm._custom_ops import reshape_and_cache as reshape_and_cache_vllm
 
     reshape_and_cache_vllm(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, k_scale, v_scale)
 
diff --git a/docs/distribution/wheel.md b/docs/distribution/wheel.md
@@ -11,7 +11,3 @@ pip install build wheel twine
 ```bash
 ./scripts/wheel/build.sh
 ```
-
-By default, this builds a wheel for your current platform (detected via `nvidia-smi`, `rocm-smi`, or similar).
-Optionally, you can override the platform to build the wheel for by specifying it as an argument to the script.
-Acceptable platforms are: `cuda` or `rocm`.
diff --git a/docs/getting_started/developer_environment.md b/docs/getting_started/developer_environment.md
@@ -40,7 +40,17 @@ To install the project as an editable, clone this repository and run this comman
 pip install -e ".[dev]"
 ```
 
-**Note**: For ROCm/AMD support, you'll need to add `--extra-index-url https://download.pytorch.org/whl/rocm6.2`.
+By default, this does not install `torch` or `triton`.
+You can specify an extra for your platform (either `cuda` or `rocm`) to install the appropriate versions of those packages for your accelerator.
+For ROCm/AMD support, you'll also need to add `--extra-index-url https://download.pytorch.org/whl/rocm6.2.4`.
+
+```bash
+pip install -e ".[dev, cuda]"
+```
+
+```bash
+pip install -e ".[dev, rocm]" --extra-index-url https://download.pytorch.org/whl/rocm6.2.4
+```
 
 ## Testing
 
diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md
@@ -15,27 +15,44 @@ then install:
 pip install -e .
 ```
 
-## As wheel
+### Torch/Triton installation
 
-### Nvidia/CUDA
+To install `torch`/`triton`, specify either the `cuda` or `rocm` extra (depending on your platform).
 
-For Nvidia/CUDA platforms you can install `conch` from PyPi via:
+```bash
+# For Nvidia/CUDA
+pip install -e ".[cuda]"
+```
+
+```bash
+# For AMD/ROCm
+pip install -e ".[rocm]" --extra-index-url https://download.pytorch.org/whl/rocm6.2.4
+```
+
+## As wheel
+
+You can install `conch` from PyPI via:
 
 ```bash
 pip install conch-triton-kernels
 ```
 
-### AMD/ROCm
+**Note**: By default, without any extras specified, **this will not install `torch` or `triton`**.
+Conch will still work as long as `torch` and `triton` are already installed in your environment.
 
-For AMD/ROCm, we do not currently have a wheel on PyPi, but you can easily build one.
-After cloning the Conch repo, run this command from the repository root:
+### Nvidia/CUDA
+
+On Nvidia/CUDA platforms, you can specify the `[cuda]` extra to install the matching versions of `torch` and `triton`.
 
 ```bash
-./scripts/wheel/build.sh rocm
+pip install "conch-triton-kernels[cuda]"
 ```
 
-The resulting wheel file will be generated under `dist/rocm/`.
+### AMD/ROCm
+
+On AMD/ROCm platforms, you can specify the `[rocm]` extra to install the matching versions of `torch` and `triton`.
+You must also specify the appropriate `--extra-index-url`.
 
 ```bash
-pip install dist/rocm/conch_triton_kernels-{version}-py3-none-any.whl
+pip install "conch-triton-kernels[rocm]" --extra-index-url https://download.pytorch.org/whl/rocm6.2.4
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,15 @@ module = [
 ]
 follow_untyped_imports = true
 
+[[tool.mypy.overrides]]
+module = [
+  "bitsandbytes.*",
+  "vllm.*",
+]
+allow_untyped_calls = true
+ignore_missing_imports = true
+disable_error_code = "attr-defined"
+
 [tool.setuptools.packages.find]
 include = ["conch*"]
 exclude = [
diff --git a/scripts/wheel/build.sh b/scripts/wheel/build.sh
@@ -1,12 +1,3 @@
 #!/bin/bash
 
-wheel_platform=$1
-
-if [ -z "$wheel_platform" ]; then
-    output_dir="dist/"
-else
-    output_dir="dist/$wheel_platform"
-    export CONCH_WHEEL_BUILD_PLATFORM=$wheel_platform
-fi
-
-python -m build --outdir $output_dir
+python -m build
diff --git a/setup.py b/setup.py
diff --git a/tests/paged_attention_test.py b/tests/paged_attention_test.py
diff --git a/tests/quantize_blockwise_test.py b/tests/quantize_blockwise_test.py
diff --git a/tests/varlen_attention_test.py b/tests/varlen_attention_test.py