Skip to content

Commit 7f7fca0

Browse files
committed
kernel-builder: reject empty capabilities/archs list
We compute a kernel component's capabilities by intersecting the capabilities that are specified for the kernel and the capabilities that are supported by CUDA/ROCm. Before this change, we would silently set an empty list if this intersection was empty. This resulted in CMake falling back to an old capability. This change fixes that by erroring out when when the capability list is empty.
1 parent c358366 commit 7f7fca0

9 files changed

Lines changed: 215 additions & 1 deletion

File tree

examples/kernels/flake.nix

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,13 @@
102102
path = ./relu-compiler-flags;
103103
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
104104
}
105+
{
106+
name = "relu-invalid-capability";
107+
path = ./relu-invalid-capability;
108+
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
109+
assertFail = true;
110+
assertFailLogs = [ "empty set of capabilities" ];
111+
}
105112
{
106113
# Check that we can build an arch dev shell.
107114
name = "relu-dev-shell";
@@ -164,7 +171,17 @@
164171

165172
resolvedKernels = map (kernel: {
166173
inherit (kernel) name;
167-
drv = kernel.drv system kernel.outputs;
174+
drv =
175+
let
176+
baseDrv = kernel.drv system kernel.outputs;
177+
in
178+
if kernel.assertFail or false then
179+
pkgs.testers.testBuildFailure' {
180+
drv = baseDrv;
181+
expectedBuilderLogEntries = kernel.assertFailLogs or [ ];
182+
}
183+
else
184+
baseDrv;
168185
}) ciKernelOutputs;
169186

170187
ci-build = pkgs.linkFarm "ci-kernels" (
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
---
2+
library_name: kernels
3+
{% if license %}license: {{ license }}
4+
{% endif %}---
5+
6+
This is the repository card of {{ repo_id }} that has been pushed on the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated.
7+
8+
## How to use
9+
{% if functions %}
10+
11+
```python
12+
# make sure `kernels` is installed: `pip install -U kernels`
13+
from kernels import get_kernel
14+
15+
kernel_module = get_kernel("{{ repo_id }}", version={{ version }})
16+
{{ functions[0] }} = kernel_module.{{ functions[0] }}
17+
18+
{{ functions[0] }}(...)
19+
```
20+
{% else %}
21+
22+
Usage example not available.
23+
{% endif %}
24+
25+
## Available functions
26+
{% if functions %}
27+
{% for func in functions %}
28+
- `{{ func }}`
29+
{% endfor %}
30+
{% else %}
31+
32+
Function list not available.
33+
{% endif %}
34+
{% if layers %}
35+
36+
## Available layers
37+
{% for layer in layers %}
38+
- `{{ layer }}`
39+
{% endfor %}
40+
{% endif %}
41+
42+
## Benchmarks
43+
{% if has_benchmark %}
44+
45+
Benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`.
46+
{% else %}
47+
48+
No benchmark available yet.
49+
{% endif %}
50+
{% if upstream %}
51+
52+
## Source code
53+
54+
Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`.
55+
{% endif %}
56+
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[general]
2+
name = "relu-invalid-capability"
3+
version = 1
4+
license = "Apache-2.0"
5+
backends = [
6+
"cpu",
7+
"cuda",
8+
"metal",
9+
"rocm",
10+
"xpu",
11+
]
12+
13+
[general.hub]
14+
repo-id = "kernels-test/relu-invalid-capability"
15+
16+
[torch]
17+
src = [
18+
"torch-ext/torch_binding.cpp",
19+
"torch-ext/torch_binding.h",
20+
]
21+
22+
[kernel.relu_rocm]
23+
backend = "rocm"
24+
depends = ["torch"]
25+
rocm-archs = [ "gfx99999" ]
26+
src = ["relu_cuda/relu.cu"]
27+
28+
[kernel.relu]
29+
backend = "cuda"
30+
depends = ["torch"]
31+
cuda-capabilities = [ "99999.0" ]
32+
src = ["relu_cuda/relu.cu"]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
description = "Flake for ReLU kernel";
3+
4+
inputs = {
5+
kernel-builder.url = "path:../../..";
6+
};
7+
8+
outputs =
9+
{
10+
self,
11+
kernel-builder,
12+
}:
13+
kernel-builder.lib.genKernelFlakeOutputs {
14+
inherit self;
15+
path = ./.;
16+
};
17+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include <ATen/cuda/CUDAContext.h>
2+
#include <c10/cuda/CUDAGuard.h>
3+
#include <torch/all.h>
4+
5+
#include <cmath>
6+
7+
__global__ void relu_kernel(float *__restrict__ out,
8+
float const *__restrict__ input, const int d) {
9+
const int64_t token_idx = blockIdx.x;
10+
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
11+
auto x = input[token_idx * d + idx];
12+
out[token_idx * d + idx] = x > 0.0f ? x : 0.0f;
13+
}
14+
}
15+
16+
void relu(torch::Tensor &out, torch::Tensor const &input) {
17+
TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor");
18+
TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
19+
TORCH_CHECK(input.scalar_type() == at::ScalarType::Float &&
20+
input.scalar_type() == at::ScalarType::Float,
21+
"relu_kernel only supports float32");
22+
23+
TORCH_CHECK(input.sizes() == out.sizes(),
24+
"Tensors must have the same shape. Got input shape: ",
25+
input.sizes(), " and output shape: ", out.sizes());
26+
27+
TORCH_CHECK(input.scalar_type() == out.scalar_type(),
28+
"Tensors must have the same data type. Got input dtype: ",
29+
input.scalar_type(), " and output dtype: ", out.scalar_type());
30+
31+
TORCH_CHECK(input.device() == out.device(),
32+
"Tensors must be on the same device. Got input device: ",
33+
input.device(), " and output device: ", out.device());
34+
35+
if (input.numel() == 0) {
36+
return;
37+
}
38+
39+
int d = input.size(-1);
40+
int64_t num_tokens = input.numel() / d;
41+
dim3 grid(num_tokens);
42+
dim3 block(std::min(d, 1024));
43+
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
44+
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
45+
relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(),
46+
input.data_ptr<float>(), d);
47+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from typing import Optional
2+
3+
import torch
4+
5+
from ._ops import ops
6+
7+
8+
def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
9+
if out is None:
10+
out = torch.empty_like(x)
11+
ops.relu(out, x)
12+
return out
13+
14+
15+
__all__ = ["relu"]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#include <torch/library.h>
2+
3+
#include "registration.h"
4+
#include "torch_binding.h"
5+
6+
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
7+
ops.def("relu(Tensor! out, Tensor input) -> ()");
8+
#if defined(CPU_KERNEL)
9+
ops.impl("relu", torch::kCPU, &relu);
10+
#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
11+
ops.impl("relu", torch::kCUDA, &relu);
12+
#elif defined(METAL_KERNEL)
13+
ops.impl("relu", torch::kMPS, relu);
14+
#elif defined(XPU_KERNEL)
15+
ops.impl("relu", torch::kXPU, &relu);
16+
#endif
17+
}
18+
19+
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#pragma once
2+
3+
#include <torch/torch.h>
4+
5+
void relu(torch::Tensor &out, torch::Tensor const &input);

kernel-builder/src/pyproject/templates/kernel.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ function(cuda_kernel_component SRC_VAR)
3434
# Determine CUDA architectures
3535
if(KERNEL_CUDA_CAPABILITIES)
3636
cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}")
37+
if(NOT _KERNEL_ARCHS)
38+
message(FATAL_ERROR "CUDA kernel: ${KERNEL_NAME}, empty set of capabilities after intersection (kernel: ${KERNEL_CUDA_CAPABILITIES}, supported: ${CUDA_ARCHS})")
39+
endif()
3740
else()
3841
set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}")
3942
endif()
@@ -115,6 +118,9 @@ function(hip_kernel_component SRC_VAR)
115118
# Determine ROCm architectures
116119
if(KERNEL_ROCM_ARCHS)
117120
hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}")
121+
if(NOT _KERNEL_ARCHS)
122+
message(FATAL_ERROR "ROCm kernel: ${KERNEL_NAME}, empty set of architectures after intersection (kernel: ${KERNEL_ROCM_ARCHS}, supported: ${ROCM_ARCHS})")
123+
endif()
118124
else()
119125
set(_KERNEL_ARCHS "${ROCM_ARCHS}")
120126
endif()

0 commit comments

Comments
 (0)