Skip to content

Commit 7e00248

Browse files
committed
Reorg the kernel dir
Signed-off-by: Jingyu Xin <jingyux@nvidia.com>
1 parent 2fef374 commit 7e00248

49 files changed

Lines changed: 201 additions & 108 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

examples/deepseek/ptq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@
5555
import modelopt.torch.quantization as mtq
5656
from modelopt.torch.export.model_config import KV_CACHE_FP8
5757
from modelopt.torch.export.quant_utils import get_quant_config
58+
from modelopt.torch.kernels.quantization.gemm import weight_dequant
5859
from modelopt.torch.quantization.nn import TensorQuantizer
59-
from modelopt.torch.quantization.triton import weight_dequant
6060
from modelopt.torch.quantization.utils import (
6161
is_quantized_column_parallel_linear,
6262
is_quantized_parallel_linear,

examples/deepseek/quantize_to_nvfp4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@
4747
from safetensors.torch import load_file, save_file
4848
from tqdm import tqdm
4949

50+
from modelopt.torch.kernels.quantization.gemm import weight_dequant
5051
from modelopt.torch.quantization.qtensor import NVFP4QTensor
51-
from modelopt.torch.quantization.triton import weight_dequant
5252

5353

5454
def _remap_key(key_dict: dict[str, Any]):

modelopt/torch/kernels/__init__.py

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,38 +13,4 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
"""Shared Triton kernels for modelopt (attention, quantization, etc.)."""
17-
18-
import torch
19-
20-
from modelopt.torch.utils import import_plugin
21-
22-
IS_AVAILABLE = False
23-
attention = None
24-
attention_calibrate = None
25-
register_triton_attention = None
26-
27-
if torch.cuda.is_available():
28-
with import_plugin(
29-
"triton",
30-
msg_if_missing=(
31-
"Your device is potentially capable of using the triton attention "
32-
"kernel. Try to install triton with `pip install triton`."
33-
),
34-
):
35-
from .triton_fa import attention as _attention
36-
from .triton_fa import attention_calibrate as _attention_calibrate
37-
38-
attention = _attention
39-
attention_calibrate = _attention_calibrate
40-
IS_AVAILABLE = True
41-
from .hf_triton_attention import register_triton_attention as _register_triton_attention
42-
43-
register_triton_attention = _register_triton_attention
44-
45-
__all__ = [
46-
"IS_AVAILABLE",
47-
"attention",
48-
"attention_calibrate",
49-
"register_triton_attention",
50-
]
16+
"""ModelOpt kernel library: common, quantization (conv, gemm), sparsity (attention, gemm)."""
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Shared Triton kernels for modelopt (attention, quantization, etc.)."""
17+
18+
import torch
19+
20+
from modelopt.torch.utils import import_plugin
21+
22+
IS_AVAILABLE = False
23+
attention = None
24+
attention_calibrate = None
25+
register_triton_attention = None
26+
27+
if torch.cuda.is_available():
28+
with import_plugin(
29+
"triton",
30+
msg_if_missing=(
31+
"Your device is potentially capable of using the triton attention "
32+
"kernel. Try to install triton with `pip install triton`."
33+
),
34+
):
35+
from .triton_fa import attention as _attention
36+
from .triton_fa import attention_calibrate as _attention_calibrate
37+
38+
attention = _attention
39+
attention_calibrate = _attention_calibrate
40+
IS_AVAILABLE = True
41+
from .hf_triton_attention import register_triton_attention as _register_triton_attention
42+
43+
register_triton_attention = _register_triton_attention
44+
45+
__all__ = [
46+
"IS_AVAILABLE",
47+
"attention",
48+
"attention_calibrate",
49+
"register_triton_attention",
50+
]

modelopt/torch/kernels/hf_triton_attention.py renamed to modelopt/torch/kernels/common/hf_triton_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import torch
2626
import torch.nn as nn
2727

28-
from modelopt.torch.kernels.triton_fa import attention
28+
from modelopt.torch.kernels.common.triton_fa import attention
2929

3030

3131
def _seq_lens_from_mask(
File renamed without changes.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Quantization kernels: conv (implicit GEMM) and gemm (tensor_quant + Triton FP4/FP8)."""

modelopt/torch/quantization/src/conv/README.md renamed to modelopt/torch/kernels/quantization/conv/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ When NVFP4 quantization is configured on a `Conv3d` layer via ModelOpt PTQ, the
3232
```python
3333
import torch
3434

35-
from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda
35+
from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda
3636
from modelopt.torch.quantization.tensor_quant import dynamic_block_quantize_op
3737

3838
x = torch.randn(1, 128, 21, 60, 106, device="cuda")
@@ -75,7 +75,7 @@ out_q = conv3d_implicit_gemm_cuda(
7575

7676
### `conv3d_implicit_gemm_cuda`
7777

78-
`from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda`
78+
`from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda`
7979

8080
| Parameter | Description |
8181
|-----------|-------------|
@@ -91,7 +91,7 @@ out_q = conv3d_implicit_gemm_cuda(
9191

9292
### `fp4_fake_quant`
9393

94-
`from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import fp4_fake_quant`
94+
`from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import fp4_fake_quant`
9595

9696
Standalone FP4 (E2M1) blockwise fake quantization with FP8 E4M3 scale quantization. Uses the same CUDA device functions as the fused path inside the GEMM kernel.
9797

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Implicit-GEMM CUDA kernel for quantized 3D convolution."""

modelopt/torch/quantization/src/conv/bench_implicit_gemm.py renamed to modelopt/torch/kernels/quantization/conv/bench_implicit_gemm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,9 @@ def bench_fn(fn, warmup: int, iters: int) -> float:
9494

9595
def run_benchmark(shapes_name: str, warmup: int, iters: int, fp4_block_size: int):
9696
"""Run latency benchmark for the given shapes."""
97-
from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda
97+
from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import (
98+
conv3d_implicit_gemm_cuda,
99+
)
98100

99101
shapes = get_shapes(shapes_name)
100102

0 commit comments

Comments (0)