Skip to content

Commit 7e00248

Browse files
committed
Reorg the kernel dir
Signed-off-by: Jingyu Xin <jingyux@nvidia.com>
1 parent 2fef374 commit 7e00248

49 files changed

Lines changed: 201 additions & 108 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

examples/deepseek/ptq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@
5555
import modelopt.torch.quantization as mtq
5656
from modelopt.torch.export.model_config import KV_CACHE_FP8
5757
from modelopt.torch.export.quant_utils import get_quant_config
58+
from modelopt.torch.kernels.quantization.gemm import weight_dequant
5859
from modelopt.torch.quantization.nn import TensorQuantizer
59-
from modelopt.torch.quantization.triton import weight_dequant
6060
from modelopt.torch.quantization.utils import (
6161
is_quantized_column_parallel_linear,
6262
is_quantized_parallel_linear,

examples/deepseek/quantize_to_nvfp4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@
4747
from safetensors.torch import load_file, save_file
4848
from tqdm import tqdm
4949

50+
from modelopt.torch.kernels.quantization.gemm import weight_dequant
5051
from modelopt.torch.quantization.qtensor import NVFP4QTensor
51-
from modelopt.torch.quantization.triton import weight_dequant
5252

5353

5454
def _remap_key(key_dict: dict[str, Any]):

modelopt/torch/kernels/__init__.py

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,38 +13,4 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
"""Shared Triton kernels for modelopt (attention, quantization, etc.)."""
17-
18-
import torch
19-
20-
from modelopt.torch.utils import import_plugin
21-
22-
IS_AVAILABLE = False
23-
attention = None
24-
attention_calibrate = None
25-
register_triton_attention = None
26-
27-
if torch.cuda.is_available():
28-
with import_plugin(
29-
"triton",
30-
msg_if_missing=(
31-
"Your device is potentially capable of using the triton attention "
32-
"kernel. Try to install triton with `pip install triton`."
33-
),
34-
):
35-
from .triton_fa import attention as _attention
36-
from .triton_fa import attention_calibrate as _attention_calibrate
37-
38-
attention = _attention
39-
attention_calibrate = _attention_calibrate
40-
IS_AVAILABLE = True
41-
from .hf_triton_attention import register_triton_attention as _register_triton_attention
42-
43-
register_triton_attention = _register_triton_attention
44-
45-
__all__ = [
46-
"IS_AVAILABLE",
47-
"attention",
48-
"attention_calibrate",
49-
"register_triton_attention",
50-
]
16+
"""ModelOpt kernel library: common, quantization (conv, gemm), sparsity (attention, gemm)."""
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Shared Triton kernels for modelopt (attention, quantization, etc.)."""
17+
18+
import torch
19+
20+
from modelopt.torch.utils import import_plugin
21+
22+
IS_AVAILABLE = False
23+
attention = None
24+
attention_calibrate = None
25+
register_triton_attention = None
26+
27+
if torch.cuda.is_available():
28+
with import_plugin(
29+
"triton",
30+
msg_if_missing=(
31+
"Your device is potentially capable of using the triton attention "
32+
"kernel. Try to install triton with `pip install triton`."
33+
),
34+
):
35+
from .triton_fa import attention as _attention
36+
from .triton_fa import attention_calibrate as _attention_calibrate
37+
38+
attention = _attention
39+
attention_calibrate = _attention_calibrate
40+
IS_AVAILABLE = True
41+
from .hf_triton_attention import register_triton_attention as _register_triton_attention
42+
43+
register_triton_attention = _register_triton_attention
44+
45+
__all__ = [
46+
"IS_AVAILABLE",
47+
"attention",
48+
"attention_calibrate",
49+
"register_triton_attention",
50+
]

modelopt/torch/kernels/hf_triton_attention.py renamed to modelopt/torch/kernels/common/hf_triton_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import torch
2626
import torch.nn as nn
2727

28-
from modelopt.torch.kernels.triton_fa import attention
28+
from modelopt.torch.kernels.common.triton_fa import attention
2929

3030

3131
def _seq_lens_from_mask(
File renamed without changes.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Quantization kernels: conv (implicit GEMM) and gemm (tensor_quant + Triton FP4/FP8)."""

modelopt/torch/quantization/src/conv/README.md renamed to modelopt/torch/kernels/quantization/conv/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ When NVFP4 quantization is configured on a `Conv3d` layer via ModelOpt PTQ, the
3232
```python
3333
import torch
3434

35-
from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda
35+
from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda
3636
from modelopt.torch.quantization.tensor_quant import dynamic_block_quantize_op
3737

3838
x = torch.randn(1, 128, 21, 60, 106, device="cuda")
@@ -75,7 +75,7 @@ out_q = conv3d_implicit_gemm_cuda(
7575

7676
### `conv3d_implicit_gemm_cuda`
7777

78-
`from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda`
78+
`from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda`
7979

8080
| Parameter | Description |
8181
|-----------|-------------|
@@ -91,7 +91,7 @@ out_q = conv3d_implicit_gemm_cuda(
9191

9292
### `fp4_fake_quant`
9393

94-
`from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import fp4_fake_quant`
94+
`from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import fp4_fake_quant`
9595

9696
Standalone FP4 (E2M1) blockwise fake quantization with FP8 E4M3 scale quantization. Uses the same CUDA device functions as the fused path inside the GEMM kernel.
9797

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Implicit-GEMM CUDA kernel for quantized 3D convolution."""

modelopt/torch/quantization/src/conv/bench_implicit_gemm.py renamed to modelopt/torch/kernels/quantization/conv/bench_implicit_gemm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,9 @@ def bench_fn(fn, warmup: int, iters: int) -> float:
9494

9595
def run_benchmark(shapes_name: str, warmup: int, iters: int, fp4_block_size: int):
9696
"""Run latency benchmark for the given shapes."""
97-
from modelopt.torch.quantization.src.conv.implicit_gemm_cuda import conv3d_implicit_gemm_cuda
97+
from modelopt.torch.kernels.quantization.conv.implicit_gemm_cuda import (
98+
conv3d_implicit_gemm_cuda,
99+
)
98100

99101
shapes = get_shapes(shapes_name)
100102

0 commit comments

Comments (0)