huggingface
diff --git a/‎.github/workflows/build_kernel.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/build_kernel.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/build_kernel_rocm.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/build_kernel_rocm.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/build_kernel_xpu.yaml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/build_kernel_xpu.yaml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎docs/source/builder/writing-kernels.md‎
Lines changed: 6 additions & 1 deletion b/‎docs/source/builder/writing-kernels.md‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎examples/kernels/flake.nix‎
Lines changed: 16 additions & 14 deletions b/‎examples/kernels/flake.nix‎
Lines changed: 16 additions & 14 deletions
diff --git a/‎examples/kernels/relu-torch-stable-abi/CARD.md‎
Lines changed: 56 additions & 0 deletions b/‎examples/kernels/relu-torch-stable-abi/CARD.md‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎examples/kernels/relu-torch-stable-abi/build.toml‎
Lines changed: 65 additions & 0 deletions b/‎examples/kernels/relu-torch-stable-abi/build.toml‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎examples/kernels/relu-torch-stable-abi/flake.nix‎
Lines changed: 17 additions & 0 deletions b/‎examples/kernels/relu-torch-stable-abi/flake.nix‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎examples/kernels/relu-torch-stable-abi/relu_cpu/relu_cpu.cpp‎
Lines changed: 59 additions & 0 deletions b/‎examples/kernels/relu-torch-stable-abi/relu_cpu/relu_cpu.cpp‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎examples/kernels/relu-torch-stable-abi/relu_cuda/relu.cu‎
Lines changed: 48 additions & 0 deletions b/‎examples/kernels/relu-torch-stable-abi/relu_cuda/relu.cu‎
Lines changed: 48 additions & 0 deletions
@@ -55,6 +55,7 @@ jobs:
             cutlass-gemm-tvm-ffi-kernel
             extra-data
             relu-kernel
+            relu-torch-stable-abi-kernel
             relu-tvm-ffi-kernel
             relu-kernel-cpu
             relu-backprop-compile-kernel
 
@@ -34,7 +34,7 @@ jobs:
       # For now we only test that there are no regressions in building ROCm
       # kernels. Also run tests once we have a ROCm runner.
       - name: Build relu kernel
-        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-cxx11-rocm71-x86_64-linux -L )
+        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-rocm71-x86_64-linux -L )
 
       - name: Build relu kernel (compiler flags)
-        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-cxx11-rocm71-x86_64-linux )
+        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-rocm71-x86_64-linux )
@@ -34,13 +34,13 @@ jobs:
       # For now we only test that there are no regressions in building XPU
       # kernels. Also run tests once we have an XPU runner.
       - name: Build relu kernel
-        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-cxx11-xpu20253-x86_64-linux -L )
+        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-xpu20253-x86_64-linux -L )
 
       - name: Build relu tvm-ffi kernel
         run: ( cd examples/kernels/relu-tvm-ffi && nix build .\#redistributable.tvm-ffi01-xpu20253-x86_64-linux -L )
 
       - name: Build relu kernel (compiler flags)
-        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-cxx11-xpu20253-x86_64-linux )
+        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-xpu20253-x86_64-linux )
 
       - name: Build cutlass-gemm kernel
-        run: ( cd examples/kernels/cutlass-gemm && nix build .\#redistributable.torch211-cxx11-xpu20253-x86_64-linux -L )
+        run: ( cd examples/kernels/cutlass-gemm && nix build .\#redistributable.torch211-xpu20253-x86_64-linux -L )
@@ -227,6 +227,12 @@ options:
   non-compliant kernels if the version range does not correspond to the [required variants](build-variants.md).
 - `minver` (optional): only build for this Torch version and later. Use cautiously, since this option produces
   non-compliant kernels if the version range does not correspond to the [required variants](build-variants.md).
+- `stable-abi` (**experimental**): when set to a Torch version (e.g.
+  `"2.11"`), the kernel is built using the Torch stable ABI. This
+  requires that the kernel itself only use
+  [stable ABI headers](https://docs.pytorch.org/docs/2.12/notes/libtorch_stable_abi.html).
+  For an example, see the [`relu-torch-stable-abi`](https://github.com/huggingface/kernels/tree/main/examples/kernels/relu-torch-stable-abi)
+  example kernel.
 
 ### `kernel.<name>`
 
@@ -277,7 +283,6 @@ are available:
 - `cxx-flags`: a list of additional flags to be passed to the C++
   compiler.
 
-
 ## Torch bindings
 
 ### Defining bindings
 
@@ -26,16 +26,22 @@
       #        system and flake outputs.
       # - torchVersions: optional override for the torchVersions argument
       ciKernels = [
+        {
+          name = "cpp20-symbols-kernel";
+          path = ./cpp20-symbols;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cpu-${sys}"};
+        }
         {
           name = "relu-kernel";
           path = ./relu;
-          drv =
-            sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
         {
-          name = "cpp20-symbols-kernel";
-          path = ./cpp20-symbols;
-          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"};
+          name = "relu-torch-stable-abi-kernel";
+          path = ./relu-torch-stable-abi;
+          drv =
+            sys: out:
+            out.packages.${sys}.redistributable.${"torch-stable-abi${torchVersion}-${cudaVersion}-${sys}"};
         }
         {
           name = "relu-tvm-ffi-kernel";
@@ -46,19 +52,17 @@
         {
           name = "extra-data";
           path = ./extra-data;
-          drv =
-            sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
         {
           name = "relu-kernel-cpu";
           path = ./relu;
-          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"};
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cpu-${sys}"};
         }
         {
           name = "cutlass-gemm-kernel";
           path = ./cutlass-gemm;
-          drv =
-            sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
         {
           name = "cutlass-gemm-tvm-ffi-kernel";
@@ -69,8 +73,7 @@
         {
           name = "relu-backprop-compile-kernel";
           path = ./relu-backprop-compile;
-          drv =
-            sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
         {
           name = "silu-and-mul-kernel";
@@ -97,8 +100,7 @@
         {
           name = "relu-compiler-flags";
           path = ./relu-compiler-flags;
-          drv =
-            sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
         {
           # Check that we can build an arch dev shell.
 
@@ -0,0 +1,56 @@
+---
+library_name: kernels
+{% if license %}license: {{ license }}
+{% endif %}---
+
+This is the repository card of {{ repo_id }} that has been pushed to the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated.
+
+## How to use
+{% if functions %}
+
+```python
+# make sure `kernels` is installed: `pip install -U kernels`
+from kernels import get_kernel
+
+kernel_module = get_kernel("{{ repo_id }}", version={{ version }})
+{{ functions[0] }} = kernel_module.{{ functions[0] }}
+
+{{ functions[0] }}(...)
+```
+{% else %}
+
+Usage example not available.
+{% endif %}
+
+## Available functions
+{% if functions %}
+{% for func in functions %}
+- `{{ func }}`
+{% endfor %}
+{% else %}
+
+Function list not available.
+{% endif %}
+{% if layers %}
+
+## Available layers
+{% for layer in layers %}
+- `{{ layer }}`
+{% endfor %}
+{% endif %}
+
+## Benchmarks
+{% if has_benchmark %}
+
+A benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`.
+{% else %}
+
+No benchmark available yet.
+{% endif %}
+{% if upstream %}
+
+## Source code
+
+Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`.
+{% endif %}
+
@@ -0,0 +1,65 @@
+# Kernel-wide metadata: name, version, license and the backends to build for.
+[general]
+name = "relu-torch-stable-abi"
+version = 1
+license = "Apache-2.0"
+backends = [
+    "cpu",
+    "cuda",
+    # NOTE(review): "metal" is listed here, but the only section in this file
+    # with backend = "metal" (kernel.relu_metal) is commented out -- confirm
+    # that building this backend without a kernel section is intended.
+    "metal",
+    "rocm",
+    "xpu",
+]
+
+[general.hub]
+repo-id = "kernels-test/relu-torch-stable-abi"
+
+# Torch extension settings and the binding sources.
+[torch]
+# Experimental: build the kernel with the Torch stable ABI for this Torch
+# version. The kernel sources must then only use stable ABI headers.
+stable-abi = "2.11"
+src = [
+    "torch-ext/torch_binding.cpp",
+    "torch-ext/torch_binding.h",
+]
+
+[kernel.relu_xpu]
+backend = "xpu"
+depends = ["torch"]
+src = ["relu_xpu/relu.cpp"]
+
+
+# The Metal kernel is disabled for now: converting it to the Torch stable ABI
+# requires additional APIs to get the command buffer and dispatch queue.
+#
+# [kernel.relu_metal]
+# backend = "metal"
+# depends = ["torch"]
+# src = [
+#    "relu_metal/relu.mm",
+#    "relu_metal/relu.metal",
+#    "relu_metal/common.h",
+#]
+
+# ROCm build of the ReLU kernel, restricted to the listed GPU architectures.
+[kernel.relu_rocm]
+backend = "rocm"
+depends = ["torch"]
+rocm-archs = [
+    "gfx906",
+    "gfx908",
+    "gfx90a",
+    "gfx940",
+    "gfx941",
+    "gfx942",
+    "gfx1030",
+    "gfx1100",
+    "gfx1101",
+]
+# Same source file as the CUDA kernel section ([kernel.relu]) in this file.
+src = ["relu_cuda/relu.cu"]
+
+[kernel.relu_cpu]
+backend = "cpu"
+depends = ["torch"]
+src = ["relu_cpu/relu_cpu.cpp"]
+
+[kernel.relu]
+backend = "cuda"
+depends = ["torch"]
+src = ["relu_cuda/relu.cu"]
@@ -0,0 +1,17 @@
+# Flake for this example kernel; all outputs are produced by kernel-builder's
+# genKernelFlakeOutputs helper.
+{
+  description = "Flake for ReLU kernel";
+
+  inputs = {
+    # kernel-builder is taken from a local path three directories up.
+    kernel-builder.url = "path:../../..";
+  };
+
+  outputs =
+    {
+      self,
+      kernel-builder,
+    }:
+    kernel-builder.lib.genKernelFlakeOutputs {
+      inherit self;
+      # The directory containing this flake is passed as the kernel path.
+      path = ./.;
+    };
+}
@@ -0,0 +1,59 @@
+#include <torch/csrc/stable/tensor.h>
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+// NOTE: This is a minimal example kernel that is not optimized for
+//       performance, so we do not care about unaligned loads/stores.
+
+#ifdef __SSE__
+// Applies ReLU to `size` floats read from `input` and writes the results to
+// `out`. Elements are handled four at a time with SSE intrinsics; whatever
+// is left over (fewer than four elements) is handled by a scalar loop.
+void relu_forward_sse(float* out, const float* input, size_t size) {
+    size_t i = 0;
+
+    // Vector body: unaligned load, lane-wise max against zero, unaligned store.
+    for (; i + 4 <= size; i += 4) {
+        __m128 vec_input = _mm_loadu_ps(input + i);
+        __m128 vec_zero = _mm_setzero_ps();
+        __m128 vec_output = _mm_max_ps(vec_input, vec_zero);
+        _mm_storeu_ps(out + i, vec_output);
+    }
+
+    // Scalar tail: any value that is not greater than zero becomes 0.
+    for (; i < size; ++i) {
+        out[i] = input[i] > 0 ? input[i] : 0;
+    }
+}
+#endif
+
+#ifdef __ARM_NEON
+// Applies ReLU to `size` floats read from `input` and writes the results to
+// `out`. Elements are handled four at a time with NEON intrinsics; whatever
+// is left over (fewer than four elements) is handled by a scalar loop.
+void relu_forward_neon(float* out, const float* input, size_t size) {
+    size_t i = 0;
+
+    // Vector body: load four floats, lane-wise max against zero, store.
+    // NOTE(review): vmaxq_f32 may treat NaN inputs differently from the
+    // scalar tail below (which maps NaN to 0) -- confirm whether that matters.
+    for (; i + 4 <= size; i += 4) {
+        float32x4_t vec_input = vld1q_f32(input + i);
+        float32x4_t vec_output = vmaxq_f32(vec_input, vdupq_n_f32(0));
+        vst1q_f32(out + i, vec_output);
+    }
+
+    // Scalar tail: any value that is not greater than zero becomes 0.
+    for (; i < size; ++i) {
+        out[i] = input[i] > 0 ? input[i] : 0;
+    }
+}
+#endif
+
+// CPU entry point for the ReLU kernel, written against the Torch stable ABI.
+//
+// Reads every element of `input`, applies ReLU, and writes the result to the
+// corresponding element of `out`. Both tensors must be float32, contiguous,
+// and hold the same number of elements; a violated precondition is reported
+// through STD_TORCH_CHECK.
+void relu(torch::stable::Tensor &out, torch::stable::Tensor const &input) {
+    STD_TORCH_CHECK(out.scalar_type() == torch::headeronly::ScalarType::Float, "Output tensor must be of dtype float");
+    STD_TORCH_CHECK(input.scalar_type() == torch::headeronly::ScalarType::Float, "Input tensor must be of dtype float");
+    STD_TORCH_CHECK(out.numel() == input.numel(), "Input and output tensors must have the same number of elements");
+    // The SIMD helpers walk data_ptr() as a flat array of numel() floats, so a
+    // strided (non-contiguous) tensor would be read or written at the wrong
+    // offsets. Reject it instead of silently producing wrong results.
+    STD_TORCH_CHECK(out.is_contiguous(), "Output tensor must be contiguous");
+    STD_TORCH_CHECK(input.is_contiguous(), "Input tensor must be contiguous");
+
+#if defined(__SSE__)
+    relu_forward_sse(static_cast<float*>(out.data_ptr()), static_cast<const float*>(input.data_ptr()), input.numel());
+#elif defined(__ARM_NEON)
+    relu_forward_neon(static_cast<float*>(out.data_ptr()), static_cast<const float*>(input.data_ptr()), input.numel());
+#else
+    #error "Unsupported architecture; please use a CPU with SSE or ARM NEON support."
+#endif
+}
@@ -0,0 +1,48 @@
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/csrc/stable/tensor.h>
+
+// The shim guards this function behind USE_CUDA, so declare it here ourselves.
+extern "C" AOTITorchError aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream);
+
+#include <cmath>
+
+// ReLU over a buffer addressed as consecutive rows of `d` floats.
+//
+// Block `blockIdx.x` handles row `blockIdx.x`; its threads start at
+// `threadIdx.x` and step through the row in increments of `blockDim.x`, so a
+// row longer than the block size is still fully covered.
+__global__ void relu_kernel(float *__restrict__ out,
+                            float const *__restrict__ input, const int d) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    // Row offset is computed in 64-bit arithmetic (token_idx is int64_t).
+    auto x = input[token_idx * d + idx];
+    out[token_idx * d + idx] = x > 0.0f ? x : 0.0f;
+  }
+}
+
+// CUDA entry point for the ReLU kernel, written against the Torch stable ABI.
+//
+// Launches relu_kernel on the current CUDA stream of `input`'s device with
+// one block per row of the last dimension. `input` and `out` must be float32,
+// contiguous, of identical shape and on the same CUDA device; a violated
+// precondition is reported through STD_TORCH_CHECK. Empty inputs return
+// without launching anything.
+void relu(torch::stable::Tensor &out, torch::stable::Tensor const &input) {
+  STD_TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
+  STD_TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
+  // The kernel writes `out` with the same linear indexing it uses to read
+  // `input`, so a strided `out` would be written at the wrong offsets.
+  STD_TORCH_CHECK(out.is_contiguous(), "out must be contiguous");
+  // Requiring float32 on both sides also guarantees the dtypes match.
+  STD_TORCH_CHECK(input.scalar_type() == torch::headeronly::ScalarType::Float &&
+                      out.scalar_type() == torch::headeronly::ScalarType::Float,
+                  "relu_kernel only supports float32");
+
+  STD_TORCH_CHECK(input.sizes().equals(out.sizes()),
+                  "Tensors must have the same shape.");
+
+  STD_TORCH_CHECK(input.device() == out.device(),
+                  "Tensors must be on the same device.");
+
+  // Nothing to do, and avoids dividing by a zero-sized last dimension below.
+  if (input.numel() == 0) {
+    return;
+  }
+
+  // One block per row of the last dimension, at most 1024 threads per block.
+  const int d = static_cast<int>(input.size(-1));
+  int64_t num_tokens = input.numel() / d;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(d, 1024));
+  const torch::stable::accelerator::DeviceGuard device_guard(input.get_device_index());
+  // Fetch the current stream for the input's device through the C shim.
+  void* stream_ptr = nullptr;
+  TORCH_ERROR_CODE_CHECK(aoti_torch_get_current_cuda_stream(input.get_device_index(), &stream_ptr));
+  const cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr);
+  relu_kernel<<<grid, block, 0, stream>>>(static_cast<float*>(out.data_ptr()),
+                                          static_cast<const float*>(input.data_ptr()), d);
+}
Original file line number	Diff line number	Diff line change
`@@ -26,16 +26,22 @@`
`26`	`26`	`# system and flake outputs.`
`27`	`27`	`# - torchVersions: optional override for the torchVersions argument`
`28`	`28`	`ciKernels = [`
	`29`	`+ {`
	`30`	`+ name = "cpp20-symbols-kernel";`
	`31`	`+ path = ./cpp20-symbols;`
	`32`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cpu-${sys}"};`
	`33`	`+ }`
`29`	`34`	`{`
`30`	`35`	`name = "relu-kernel";`
`31`	`36`	`path = ./relu;`
`32`		`- drv =`
`33`		`- sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};`
	`37`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};`
`34`	`38`	`}`
`35`	`39`	`{`
`36`		`- name = "cpp20-symbols-kernel";`
`37`		`- path = ./cpp20-symbols;`
`38`		`- drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"};`
	`40`	`+ name = "relu-torch-stable-abi-kernel";`
	`41`	`+ path = ./relu-torch-stable-abi;`
	`42`	`+ drv =`
	`43`	`+ sys: out:`
	`44`	`+ out.packages.${sys}.redistributable.${"torch-stable-abi${torchVersion}-${cudaVersion}-${sys}"};`
`39`	`45`	`}`
`40`	`46`	`{`
`41`	`47`	`name = "relu-tvm-ffi-kernel";`
`@@ -46,19 +52,17 @@`
`46`	`52`	`{`
`47`	`53`	`name = "extra-data";`
`48`	`54`	`path = ./extra-data;`
`49`		`- drv =`
`50`		`- sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};`
	`55`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};`
`51`	`56`	`}`
`52`	`57`	`{`
`53`	`58`	`name = "relu-kernel-cpu";`
`54`	`59`	`path = ./relu;`
`55`		`- drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"};`
	`60`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cpu-${sys}"};`
`56`	`61`	`}`
`57`	`62`	`{`
`58`	`63`	`name = "cutlass-gemm-kernel";`
`59`	`64`	`path = ./cutlass-gemm;`
`60`		`- drv =`
`61`		`- sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};`
	`65`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};`
`62`	`66`	`}`
`63`	`67`	`{`
`64`	`68`	`name = "cutlass-gemm-tvm-ffi-kernel";`
`@@ -69,8 +73,7 @@`
`69`	`73`	`{`
`70`	`74`	`name = "relu-backprop-compile-kernel";`
`71`	`75`	`path = ./relu-backprop-compile;`
`72`		`- drv =`
`73`		`- sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};`
	`76`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};`
`74`	`77`	`}`
`75`	`78`	`{`
`76`	`79`	`name = "silu-and-mul-kernel";`
`@@ -97,8 +100,7 @@`
`97`	`100`	`{`
`98`	`101`	`name = "relu-compiler-flags";`
`99`	`102`	`path = ./relu-compiler-flags;`
`100`		`- drv =`
`101`		`- sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};`
	`103`	`+ drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};`
`102`	`104`	`}`
`103`	`105`	`{`
`104`	`106`	`# Check that we can build an arch dev shell.`