kernel-builder: reject empty capabilities/archs list (#590)

danieldk · web-flow · commit 9cfa6423a20d · 2026-05-27T13:13:16.000+02:00
* kernel-builder: reject empty capabilities/archs list

We compute a kernel component's capabilities by intersecting the
capabilities that are specified for the kernel and the capabilities that
are supported by CUDA/ROCm.

Before this change, we would silently set an empty list if this
intersection was empty. This resulted in CMake falling back to an old
capability.

This change fixes that by erroring out when when the capability list is
empty.

* Hook up ROCm test and make it concurrent
diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml
@@ -40,7 +40,7 @@ jobs:
         run: nix-shell -p nix-info --run "nix-info -m"
 
       - name: Build all example kernels
-        run: nix build -L ./examples/kernels#ci-build
+        run: nix build -L ./examples/kernels#ci-build-cuda
       - name: Copy kernel artifacts
         run: cp -rL result/* .
 
diff --git a/.github/workflows/build_kernel_rocm.yaml b/.github/workflows/build_kernel_rocm.yaml
@@ -12,7 +12,7 @@ on:
 
 jobs:
   build:
-    name: Build kernel
+    name: Build kernels (ROCm)
     runs-on:
       group: aws-highmemory-32-plus-nix
     steps:
@@ -33,8 +33,5 @@ jobs:
         run: nix-shell -p nix-info --run "nix-info -m"
       # For now we only test that there are no regressions in building ROCm
       # kernels. Also run tests once we have a ROCm runner.
-      - name: Build relu kernel
-        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-rocm71-x86_64-linux -L )
-
-      - name: Build relu kernel (compiler flags)
-        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-rocm71-x86_64-linux )
+      - name: Build all ROCm example kernels
+        run: nix build -L ./examples/kernels#ci-build-rocm
diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix
@@ -15,6 +15,7 @@
       inherit (kernel-builder.inputs.nixpkgs) lib;
 
       cudaVersion = "cu126";
+      rocmVersion = "rocm71";
       torchVersion = "211";
       tvmFfiVersion = "01";
 
@@ -102,6 +103,13 @@
           path = ./relu-compiler-flags;
           drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
+        {
+          name = "relu-invalid-capability";
+          path = ./relu-invalid-capability;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
+          assertFail = true;
+          assertFailLogs = [ "empty set of capabilities" ];
+        }
         {
           # Check that we can build an arch dev shell.
           name = "relu-dev-shell";
@@ -129,6 +137,27 @@
         }
       ];
 
+      # ROCm kernels to build in CI.
+      ciRocmKernels = [
+        {
+          name = "relu-invalid-capability";
+          path = ./relu-invalid-capability;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
+          assertFail = true;
+          assertFailLogs = [ "empty set of architectures" ];
+        }
+        {
+          name = "relu-kernel";
+          path = ./relu;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
+        }
+        {
+          name = "relu-compiler-flags";
+          path = ./relu-compiler-flags;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
+        }
+      ];
+
       mkKernelOutputs =
         {
           path,
@@ -141,16 +170,21 @@
           // lib.optionalAttrs (torchVersions != null) { inherit torchVersions; }
         );
 
-      ciKernelOutputs = map (
-        kernel:
-        kernel
-        // {
-          outputs = mkKernelOutputs {
-            inherit (kernel) path;
-            torchVersions = kernel.torchVersions or null;
-          };
-        }
-      ) ciKernels;
+      mkKernelOutputs' =
+        kernels:
+        map (
+          kernel:
+          kernel
+          // {
+            outputs = mkKernelOutputs {
+              inherit (kernel) path;
+              torchVersions = kernel.torchVersions or null;
+            };
+          }
+        ) kernels;
+
+      ciKernelOutputs = mkKernelOutputs' ciKernels;
+      ciRocmKernelOutputs = mkKernelOutputs' ciRocmKernels;
     in
     flake-utils.lib.eachSystem
       [
@@ -162,22 +196,39 @@
         let
           pkgs = nixpkgs.legacyPackages.${system};
 
-          resolvedKernels = map (kernel: {
-            inherit (kernel) name;
-            drv = kernel.drv system kernel.outputs;
-          }) ciKernelOutputs;
-
-          ci-build = pkgs.linkFarm "ci-kernels" (
+          resolveKernels =
+            kernelOutputsList:
             map (kernel: {
               inherit (kernel) name;
-              path = kernel.drv;
-            }) resolvedKernels
-          );
+              drv =
+                let
+                  baseDrv = kernel.drv system kernel.outputs;
+                in
+                if kernel.assertFail or false then
+                  pkgs.testers.testBuildFailure' {
+                    drv = baseDrv;
+                    expectedBuilderLogEntries = kernel.assertFailLogs or [ ];
+                  }
+                else
+                  baseDrv;
+            }) kernelOutputsList;
+
+          mkCiBuild =
+            name: kernelOutputsList:
+            pkgs.linkFarm name (
+              map (kernel: {
+                inherit (kernel) name;
+                path = kernel.drv;
+              }) (resolveKernels kernelOutputsList)
+            );
+
+          ci-build-cuda = mkCiBuild "ci-kernels-cuda" ciKernelOutputs;
+          ci-build-rocm = mkCiBuild "ci-kernels-rocm" ciRocmKernelOutputs;
         in
         {
           packages = {
-            inherit ci-build;
-            default = ci-build;
+            inherit ci-build-cuda ci-build-rocm;
+            default = ci-build-cuda;
           };
         }
       );
diff --git a/examples/kernels/relu-invalid-capability/CARD.md b/examples/kernels/relu-invalid-capability/CARD.md
@@ -0,0 +1,56 @@
+---
+library_name: kernels
+{% if license %}license: {{ license }}
+{% endif %}---
+
+This is the repository card of {{ repo_id }} that has been pushed on the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated.
+
+## How to use
+{% if functions %}
+
+```python
+# make sure `kernels` is installed: `pip install -U kernels`
+from kernels import get_kernel
+
+kernel_module = get_kernel("{{ repo_id }}", version={{ version }})
+{{ functions[0] }} = kernel_module.{{ functions[0] }}
+
+{{ functions[0] }}(...)
+```
+{% else %}
+
+Usage example not available.
+{% endif %}
+
+## Available functions
+{% if functions %}
+{% for func in functions %}
+- `{{ func }}`
+{% endfor %}
+{% else %}
+
+Function list not available.
+{% endif %}
+{% if layers %}
+
+## Available layers
+{% for layer in layers %}
+- `{{ layer }}`
+{% endfor %}
+{% endif %}
+
+## Benchmarks
+{% if has_benchmark %}
+
+Benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`.
+{% else %}
+
+No benchmark available yet.
+{% endif %}
+{% if upstream %}
+
+## Source code
+
+Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`.
+{% endif %}
+
diff --git a/examples/kernels/relu-invalid-capability/build.toml b/examples/kernels/relu-invalid-capability/build.toml
@@ -0,0 +1,32 @@
+[general]
+name = "relu-invalid-capability"
+version = 1
+license = "Apache-2.0"
+backends = [
+    "cpu",
+    "cuda",
+    "metal",
+    "rocm",
+    "xpu",
+]
+
+[general.hub]
+repo-id = "kernels-test/relu-invalid-capability"
+
+[torch]
+src = [
+    "torch-ext/torch_binding.cpp",
+    "torch-ext/torch_binding.h",
+]
+
+[kernel.relu_rocm]
+backend = "rocm"
+depends = ["torch"]
+rocm-archs = [ "gfx99999" ]
+src = ["relu_cuda/relu.cu"]
+
+[kernel.relu]
+backend = "cuda"
+depends = ["torch"]
+cuda-capabilities = [ "99999.0" ]
+src = ["relu_cuda/relu.cu"]
diff --git a/examples/kernels/relu-invalid-capability/flake.nix b/examples/kernels/relu-invalid-capability/flake.nix
@@ -0,0 +1,17 @@
+{
+  description = "Flake for ReLU kernel";
+
+  inputs = {
+    kernel-builder.url = "path:../../..";
+  };
+
+  outputs =
+    {
+      self,
+      kernel-builder,
+    }:
+    kernel-builder.lib.genKernelFlakeOutputs {
+      inherit self;
+      path = ./.;
+    };
+}
diff --git a/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu
@@ -0,0 +1,47 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include <cmath>
+
+__global__ void relu_kernel(float *__restrict__ out,
+                            float const *__restrict__ input, const int d) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    auto x = input[token_idx * d + idx];
+    out[token_idx * d + idx] = x > 0.0f ? x : 0.0f;
+  }
+}
+
+void relu(torch::Tensor &out, torch::Tensor const &input) {
+  TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor");
+  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Float &&
+                  input.scalar_type() == at::ScalarType::Float,
+              "relu_kernel only supports float32");
+
+  TORCH_CHECK(input.sizes() == out.sizes(),
+              "Tensors must have the same shape. Got input shape: ",
+              input.sizes(), " and output shape: ", out.sizes());
+
+  TORCH_CHECK(input.scalar_type() == out.scalar_type(),
+              "Tensors must have the same data type. Got input dtype: ",
+              input.scalar_type(), " and output dtype: ", out.scalar_type());
+
+  TORCH_CHECK(input.device() == out.device(),
+              "Tensors must be on the same device. Got input device: ",
+              input.device(), " and output device: ", out.device());
+
+  if (input.numel() == 0) {
+    return;
+  }
+
+  int d = input.size(-1);
+  int64_t num_tokens = input.numel() / d;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(d, 1024));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(),
+                                          input.data_ptr<float>(), d);
+}
diff --git a/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py
@@ -0,0 +1,15 @@
+from typing import Optional
+
+import torch
+
+from ._ops import ops
+
+
+def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    if out is None:
+        out = torch.empty_like(x)
+    ops.relu(out, x)
+    return out
+
+
+__all__ = ["relu"]
diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp
@@ -0,0 +1,19 @@
+#include <torch/library.h>
+
+#include "registration.h"
+#include "torch_binding.h"
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  ops.def("relu(Tensor! out, Tensor input) -> ()");
+#if defined(CPU_KERNEL)
+  ops.impl("relu", torch::kCPU, &relu);
+#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
+  ops.impl("relu", torch::kCUDA, &relu);
+#elif defined(METAL_KERNEL)
+  ops.impl("relu", torch::kMPS, relu);
+#elif defined(XPU_KERNEL)
+  ops.impl("relu", torch::kXPU, &relu);
+#endif
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include <torch/torch.h>
+
+void relu(torch::Tensor &out, torch::Tensor const &input);
diff --git a/kernel-builder/src/pyproject/templates/kernel.cmake b/kernel-builder/src/pyproject/templates/kernel.cmake
@@ -34,6 +34,9 @@ function(cuda_kernel_component SRC_VAR)
     # Determine CUDA architectures
     if(KERNEL_CUDA_CAPABILITIES)
         cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}")
+        if(NOT _KERNEL_ARCHS)
+            message(FATAL_ERROR "CUDA kernel: ${KERNEL_NAME}, empty set of capabilities after intersection (kernel: ${KERNEL_CUDA_CAPABILITIES}, supported: ${CUDA_ARCHS})")
+        endif()
     else()
         set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}")
     endif()
@@ -115,6 +118,9 @@ function(hip_kernel_component SRC_VAR)
     # Determine ROCm architectures
     if(KERNEL_ROCM_ARCHS)
         hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}")
+        if(NOT _KERNEL_ARCHS)
+            message(FATAL_ERROR "ROCm kernel: ${KERNEL_NAME}, empty set of architectures after intersection (kernel: ${KERNEL_ROCM_ARCHS}, supported: ${ROCM_ARCHS})")
+        endif()
     else()
         set(_KERNEL_ARCHS "${ROCM_ARCHS}")
     endif()