diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index f5302178..2cbfdf39 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -28,11 +28,17 @@ jobs: USER: runner - name: Nix info run: nix-shell -p nix-info --run "nix-info -m" + - name: Build relu kernel run: ( cd builder/examples/relu && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy relu kernel run: cp -rL builder/examples/relu/result relu-kernel + - name: Build extra-data kernel + run: ( cd builder/examples/extra-data && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) + - name: Copy extra-data kernel + run: cp -rL builder/examples/extra-data/result extra-data + - name: Build relu kernel (CPU) run: ( cd builder/examples/relu && nix build .\#redistributable.torch29-cxx11-cpu-x86_64-linux ) - name: Copy relu kernel (CPU) @@ -70,6 +76,7 @@ jobs: path: | activation-kernel cutlass-gemm-kernel + extra-data relu-kernel relu-kernel-cpu relu-backprop-compile-kernel diff --git a/build2cmake/src/config/mod.rs b/build2cmake/src/config/mod.rs index 1078a604..d39342c2 100644 --- a/build2cmake/src/config/mod.rs +++ b/build2cmake/src/config/mod.rs @@ -27,10 +27,6 @@ impl Build { pub fn is_noarch(&self) -> bool { self.kernels.is_empty() } - - pub fn supports_backend(&self, backend: &Backend) -> bool { - self.general.backends.contains(backend) - } } pub struct General { @@ -126,18 +122,18 @@ pub struct Torch { } impl Torch { - pub fn data_globs(&self) -> Option<Vec<String>> { + pub fn data_extensions(&self) -> Option<Vec<String>> { match self.pyext.as_ref() { Some(exts) => { - let globs = exts + let extensions = exts .iter() .filter(|&ext| ext != "py" && ext != "pyi") - .map(|ext| format!("\"**/*.{ext}\"")) + .cloned() .collect_vec(); - if globs.is_empty() { + if extensions.is_empty() { None } else { - Some(globs) + Some(extensions) } } @@ -247,6 +243,18 @@ pub enum Backend { Xpu, } +impl Backend { + pub const fn all() -> [Backend; 5] { + [ + Backend::Cpu, + Backend::Cuda, + Backend::Metal, + Backend::Rocm, + Backend::Xpu, + ] + } +} + impl Display for Backend { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { diff --git a/build2cmake/src/main.rs b/build2cmake/src/main.rs index 60a869d2..1b17ab76 100644 --- a/build2cmake/src/main.rs +++ b/build2cmake/src/main.rs @@ -12,7 +12,7 @@ mod torch; use torch::{write_torch_ext, write_torch_ext_noarch}; mod config; -use config::{v3, Backend, Build, BuildCompat}; +use config::{v3, Build, BuildCompat}; mod fileset; use fileset::FileSet; @@ -48,9 +48,6 @@ enum Commands { /// kernel name to avoid name collisions. (e.g. Git SHA) #[arg(long)] ops_id: Option<String>, - - #[arg(long)] - backend: Option<Backend>, }, /// Update a `build.toml` to the current format.
@@ -93,12 +90,11 @@ fn main() -> Result<()> { let args = Cli::parse(); match args.command { Commands::GenerateTorch { - backend, build_toml, force, target_dir, ops_id, - } => generate_torch(backend, build_toml, target_dir, force, ops_id), + } => generate_torch(build_toml, target_dir, force, ops_id), Commands::UpdateBuild { build_toml } => update_build(build_toml), Commands::Validate { build_toml } => { parse_and_validate(build_toml)?; @@ -115,7 +111,6 @@ } fn generate_torch( - backend: Option<Backend>, build_toml: PathBuf, target_dir: Option<PathBuf>, force: bool, @@ -139,41 +134,10 @@ fn generate_torch( env.set_trim_blocks(true); minijinja_embed::load_templates!(&mut env); - let backend = match backend { - Some(backend) => { - if !build.supports_backend(&backend) { - bail!("Kernel does not support backend: {}", backend); - } - - backend - } - None => { - let kernel_backends = &build.general.backends; - - if kernel_backends.len() > 1 { - let mut kernel_backends = kernel_backends - .iter() - .map(ToString::to_string) - .collect::<Vec<_>>(); - kernel_backends.sort(); - bail!( - "Multiple supported backends found in build.toml: {}. Please specify one with --backend.", - kernel_backends.join(", ") - ); - } - - if let Some(backend) = kernel_backends.first() { - *backend - } else { - bail!("No backends are specified in build.toml"); - } - } - }; - let file_set = if build.is_noarch() { - write_torch_ext_noarch(&env, backend, &build, target_dir.clone(), ops_id)? + write_torch_ext_noarch(&env, &build, target_dir.clone(), ops_id)? } else { - write_torch_ext(&env, backend, &build, target_dir.clone(), ops_id)? + write_torch_ext(&env, &build, target_dir.clone(), ops_id)? }; file_set.write(&target_dir, force)?; @@ -368,14 +332,12 @@ fn get_generated_files( ) -> Result<Vec<String>> { let mut all_set = FileSet::new(); - for backend in &build.general.backends { - let set = if build.is_noarch() { - write_torch_ext_noarch(env, *backend, build, target_dir.clone(), ops_id.clone())? - } else { - write_torch_ext(env, *backend, build, target_dir.clone(), ops_id.clone())? - }; - all_set.extend(set); - } + let set = if build.is_noarch() { + write_torch_ext_noarch(env, build, target_dir.clone(), ops_id.clone())? + } else { + write_torch_ext(env, build, target_dir.clone(), ops_id.clone())?
+ }; + all_set.extend(set); Ok(all_set.into_names()) } diff --git a/build2cmake/src/templates/windows.cmake b/build2cmake/src/templates/build-variants.cmake similarity index 50% rename from build2cmake/src/templates/windows.cmake rename to build2cmake/src/templates/build-variants.cmake index 123ccef3..b12c88da 100644 --- a/build2cmake/src/templates/windows.cmake +++ b/build2cmake/src/templates/build-variants.cmake @@ -1,12 +1,15 @@ # Generate a standardized build variant name following the pattern: -# torch<version>-<compute>-<arch>-windows +# torch<version>-[cxx11-]<compute>-<arch>-<os> # # Arguments: # OUT_BUILD_NAME - Output variable name # TORCH_VERSION - PyTorch version (e.g., "2.7.1") -# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu +# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu, cpu # COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm) -# Example output: torch271-cxx11-cu124-x86_64-windows +# Optional for CPU-only builds (pass empty string or omit) +# Example output: torch271-cxx11-cu124-x86_64-linux (Linux) +# torch271-cu124-x86_64-windows (Windows) +# torch271-metal-aarch64-darwin (macOS) # function(generate_build_name OUT_BUILD_NAME TORCH_VERSION COMPUTE_FRAMEWORK COMPUTE_VERSION) # Flatten version by removing dots and padding to 2 components @@ -60,20 +63,45 @@ function(generate_build_name OUT_BUILD_NAME TORCH_VERSION COMPUTE_FRAMEWORK COMP list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}0") endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "metal") + set(COMPUTE_STRING "metal") + elseif(COMPUTE_FRAMEWORK STREQUAL "cpu") + set(COMPUTE_STRING "cpu") else() message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}") endif() - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") + # Detect from target system (CMAKE_SYSTEM_* variables refer to target, not host) + # Normalize architecture name + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64|AMD64)$") set(CPU_ARCH "x86_64") - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64)$") set(CPU_ARCH "aarch64") else() - message(ERROR "Unsupported Windows platform ${CMAKE_SYSTEM_PROCESSOR}") + message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}") endif() + # Normalize OS name + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(OS_NAME "windows") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(OS_NAME "linux") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OS_NAME "darwin") + else() + message(WARNING "Unknown OS ${CMAKE_SYSTEM_NAME}, using as-is") + string(TOLOWER "${CMAKE_SYSTEM_NAME}" OS_NAME) + endif() + + set(ARCH_OS_STRING "${CPU_ARCH}-${OS_NAME}") + # Assemble the final build name - set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-${CPU_ARCH}-windows") + # For Linux, include cxx11 ABI indicator for compatibility + if(ARCH_OS_STRING MATCHES "-linux$") + set(BUILD_NAME "torch${FLATTENED_TORCH}-cxx11-${COMPUTE_STRING}-${ARCH_OS_STRING}") + else() + set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-${ARCH_OS_STRING}") + endif() set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE) message(STATUS "Generated build name: ${BUILD_NAME}") @@ -82,7 +110,7 @@ endfunction() # # Create a custom install target for the huggingface/kernels library layout.
# This installs the extension into a directory structure suitable for kernel hub discovery: -# <install prefix>/<build variant>/<package name>/ +# <install prefix>/<build variant> # # Arguments: # TARGET_NAME - Name of the target to create the install rule for @@ -92,14 +120,29 @@ endfunction() # function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) set(oneValueArgs INSTALL_PREFIX) - cmake_parse_arguments(ARG "" "${oneValueArgs}" "" ${ARGN}) + set(multiValueArgs DATA_EXTENSIONS) + cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(NOT ARG_INSTALL_PREFIX) set(ARG_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") endif() + if (${GPU_LANG} STREQUAL "CPU") + set(_BACKEND "cpu") + elseif (${GPU_LANG} STREQUAL "CUDA") + set(_BACKEND "cuda") + elseif (${GPU_LANG} STREQUAL "HIP") + set(_BACKEND "rocm") + elseif (${GPU_LANG} STREQUAL "METAL") + set(_BACKEND "metal") + elseif (${GPU_LANG} STREQUAL "SYCL") + set(_BACKEND "xpu") + else() + message(FATAL_ERROR "Unsupported GPU_LANG: ${GPU_LANG}") + endif() + # Set the installation directory - set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}") message(STATUS "Using PACKAGE_NAME: ${PACKAGE_NAME}") @@ -110,21 +153,44 @@ function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) RUNTIME DESTINATION "${KERNEL_INSTALL_DIR}" COMPONENT ${TARGET_NAME}) - # Glob Python files to install - file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") + # Glob Python files to install recursively. + file(GLOB_RECURSE PYTHON_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") + foreach(python_file IN LISTS PYTHON_FILES) + get_filename_component(python_file_dir "${python_file}" DIRECTORY) + install(FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${python_file}" + DESTINATION "${KERNEL_INSTALL_DIR}/${python_file_dir}" + COMPONENT ${TARGET_NAME}) + endforeach() - # Install Python files (__init__.py and _ops.py) - install(FILES ${PYTHON_FILES} + install(FILES ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json DESTINATION "${KERNEL_INSTALL_DIR}" + RENAME "metadata.json" + COMPONENT ${TARGET_NAME}) + + # Compatibility with older kernels and direct Python imports. + install(FILES ${CMAKE_SOURCE_DIR}/compat.py + DESTINATION "${KERNEL_INSTALL_DIR}/${PACKAGE_NAME}" + RENAME "__init__.py" COMPONENT ${TARGET_NAME}) - message(STATUS "Added install rules for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + # Install data files with specified extensions + foreach(ext IN LISTS ARG_DATA_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + install(FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file}" + DESTINATION "${KERNEL_INSTALL_DIR}/${data_file_dir}" + COMPONENT ${TARGET_NAME}) + endforeach() + endforeach() + + message(STATUS "Added install rules for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}") endfunction() # # Add install rules for local development with huggingface/kernels. # This installs the extension into the layout expected by get_local_kernel(): -# ${CMAKE_SOURCE_DIR}/build/<build variant>/<package name>/ +# ${CMAKE_SOURCE_DIR}/build/<build variant>/ # # This allows developers to use get_local_kernel() from the kernels library to load # locally built kernels without needing to publish to the hub.
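For reference, the naming rule that generate_build_name above implements can be sketched in Python (a hypothetical helper, not part of this change; the ROCm compute string is assumed to flatten major.minor the same way the CUDA one does):

    def build_variant(torch_version: str, framework: str,
                      compute_version: str, arch: str, os_name: str) -> str:
        # torch<version>-[cxx11-]<compute>-<arch>-<os>
        flattened = torch_version.replace(".", "")  # "2.7.1" -> "271"
        if framework == "cuda":
            major, minor = compute_version.split(".")[:2]
            compute = f"cu{major}{minor}"  # "12.4" -> "cu124"
        elif framework == "rocm":
            major, minor = compute_version.split(".")[:2]
            compute = f"rocm{major}{minor}"  # assumed: "6.0" -> "rocm60"
        elif framework == "xpu":
            compute = f"xpu{compute_version.split('.')[0]}0"  # major plus a trailing "0"
        else:
            compute = framework  # "metal" and "cpu" carry no version component
        cxx11 = "cxx11-" if os_name == "linux" else ""
        return f"torch{flattened}-{cxx11}{compute}-{arch}-{os_name}"

    assert (build_variant("2.7.1", "cuda", "12.4", "x86_64", "linux")
            == "torch271-cxx11-cu124-x86_64-linux")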
@@ -138,19 +204,36 @@ endfunction() # BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux") # function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) + set(multiValueArgs DATA_EXTENSIONS) + cmake_parse_arguments(ARG "" "" "${multiValueArgs}" ${ARGN}) + # Define your local, folder based, installation directory - set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") # Variant directory is where metadata.json should go (for kernels upload discovery) set(VARIANT_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") - # Glob Python files at configure time - file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") + # Glob Python files to install recursively. + file(GLOB_RECURSE PYTHON_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") # Create a custom target for local installation add_custom_target(local_install COMMENT "Installing files to local directory..." ) + if (${GPU_LANG} STREQUAL "CPU") + set(_BACKEND "cpu") + elseif (${GPU_LANG} STREQUAL "CUDA") + set(_BACKEND "cuda") + elseif (${GPU_LANG} STREQUAL "HIP") + set(_BACKEND "rocm") + elseif (${GPU_LANG} STREQUAL "METAL") + set(_BACKEND "metal") + elseif (${GPU_LANG} STREQUAL "SYCL") + set(_BACKEND "xpu") + else() + message(FATAL_ERROR "Unsupported GPU_LANG: ${GPU_LANG}") + endif() + # Add custom commands to copy files add_custom_command(TARGET local_install POST_BUILD # Copy the shared library @@ -158,22 +241,51 @@ function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) $<TARGET_FILE:${TARGET_NAME}> ${LOCAL_INSTALL_DIR}/ - # Copy each Python file + # Copy metadata.json if it exists COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${PYTHON_FILES} - ${LOCAL_INSTALL_DIR}/ + ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + ${VARIANT_DIR}/metadata.json - # Copy metadata.json if it exists + # Compatibility with older kernels and direct Python imports.
COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${CMAKE_SOURCE_DIR}/metadata.json - ${VARIANT_DIR}/ + ${CMAKE_SOURCE_DIR}/compat.py + ${VARIANT_DIR}/${PACKAGE_NAME}/__init__.py COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}" COMMAND_EXPAND_LISTS ) + # Copy each Python file preserving directory structure + foreach(python_file IN LISTS PYTHON_FILES) + get_filename_component(python_file_dir "${python_file}" DIRECTORY) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${LOCAL_INSTALL_DIR}/${python_file_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${python_file} + ${LOCAL_INSTALL_DIR}/${python_file_dir}/ + COMMENT "Copying ${python_file} to ${LOCAL_INSTALL_DIR}/${python_file_dir}" + ) + endforeach() + + # Copy data files with specified extensions + foreach(ext IN LISTS ARG_DATA_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${LOCAL_INSTALL_DIR}/${data_file_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file} + ${LOCAL_INSTALL_DIR}/${data_file_dir}/ + COMMENT "Copying ${data_file} to ${LOCAL_INSTALL_DIR}/${data_file_dir}" + ) + endforeach() + endforeach() + # Create both directories: variant dir for metadata.json, package dir for binaries file(MAKE_DIRECTORY ${VARIANT_DIR}) file(MAKE_DIRECTORY ${LOCAL_INSTALL_DIR}) - message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}") endfunction() diff --git a/builder/lib/torch-extension/compat.py b/build2cmake/src/templates/compat.py similarity index 100% rename from builder/lib/torch-extension/compat.py rename to build2cmake/src/templates/compat.py diff --git a/build2cmake/src/templates/cuda/dep-cutlass.cmake b/build2cmake/src/templates/cuda/dep-cutlass.cmake index 8316ae51..d9a00b8b 100644 --- a/build2cmake/src/templates/cuda/dep-cutlass.cmake +++ b/build2cmake/src/templates/cuda/dep-cutlass.cmake @@ -1,3 +1,6 @@ +if(GPU_LANG STREQUAL "CUDA") + message(STATUS "Including CUTLASS dependency") + find_package(NvidiaCutlass) if (NOT NvidiaCutlass_FOUND) @@ -37,3 +40,5 @@ if (NOT NvidiaCutlass_FOUND) else() message(STATUS "Using system cutlass with version: ${NvidiaCutlass_VERSION}") endif(NOT NvidiaCutlass_FOUND) + +endif(GPU_LANG STREQUAL "CUDA") diff --git a/build2cmake/src/templates/noarch/pyproject.toml b/build2cmake/src/templates/noarch/pyproject.toml index e60657b9..348b6cc8 100644 --- a/build2cmake/src/templates/noarch/pyproject.toml +++ b/build2cmake/src/templates/noarch/pyproject.toml @@ -7,6 +7,15 @@ dependencies = [ {{python_dependencies}} ] +[project.optional-dependencies] +{% for backend_name, deps in backend_dependencies %} +{{backend_name}} = [ + {% for dep in deps %} + {{dep}}, + {% endfor %} +] +{% endfor %} + [tool.setuptools] package-dir = { "" = "torch-ext" } diff --git a/build2cmake/src/templates/preamble.cmake b/build2cmake/src/templates/preamble.cmake index 0be9e078..c5bfc704 100644 --- a/build2cmake/src/templates/preamble.cmake +++ b/build2cmake/src/templates/preamble.cmake @@ -196,24 +196,20 @@ 
endif() # Initialize SRC list for kernel and binding sources set(SRC "") -message(STATUS "Rendered for platform {{ platform }}") - -{% if platform == 'windows' %} -include(${CMAKE_CURRENT_LIST_DIR}/cmake/windows.cmake) - -# Generate standardized build name -cmake_host_system_information(RESULT HOST_ARCH QUERY OS_PLATFORM) - -set(SYSTEM_STRING "${HOST_ARCH}-windows") +include(${CMAKE_CURRENT_LIST_DIR}/cmake/build-variants.cmake) +# Generate build variant name. if(GPU_LANG STREQUAL "CUDA") - generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cuda" "${CUDA_VERSION}" "${SYSTEM_STRING}") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cuda" "${CUDA_VERSION}") elseif(GPU_LANG STREQUAL "HIP") run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version") - generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "rocm" "${ROCM_VERSION}" "${SYSTEM_STRING}") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "rocm" "${ROCM_VERSION}") elseif(GPU_LANG STREQUAL "SYCL") generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "xpu" "${DPCPP_VERSION}") +elseif(GPU_LANG STREQUAL "METAL") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "metal" "") +elseif(GPU_LANG STREQUAL "CPU") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cpu" "") else() - generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cpu" "${SYSTEM_STRING}") + message(FATAL_ERROR "Cannot generate build name for unknown GPU_LANG: ${GPU_LANG}") endif() -{% endif %} diff --git a/build2cmake/src/templates/pyproject.toml b/build2cmake/src/templates/pyproject.toml index 5785e836..d016dda3 100644 --- a/build2cmake/src/templates/pyproject.toml +++ b/build2cmake/src/templates/pyproject.toml @@ -1,3 +1,8 @@ +[project] +name = "{{ python_name }}" +version = "0.1.0" +requires-python = ">=3.9" + [build-system] requires = [ "cmake>=3.26", @@ -9,3 +14,12 @@ requires = [ {{python_dependencies}} ] build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +{% for backend_name, deps in backend_dependencies %} +{{backend_name}} = [ + {% for dep in deps %} + {{dep}}, + {% endfor %} +] +{% endfor %} diff --git a/build2cmake/src/templates/setup.py b/build2cmake/src/templates/setup.py index b629da3f..abc9f5dd 100644 --- a/build2cmake/src/templates/setup.py +++ b/build2cmake/src/templates/setup.py @@ -124,7 +124,6 @@ def build_extension(self, ext: CMakeExtension) -> None: move(extdir / cfg / filename, extdir / filename) -{% set python_name = name | replace('-', '_') %} setup( name="{{ python_name }}", # The version is just a stub, it's not used by the final build artefact. 
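With the [project.optional-dependencies] loop in the templates above, backend-specific Python dependencies from build.toml become installable extras rather than unconditional dependencies. For a kernel that (hypothetically) declares triton as a CUDA-only dependency, the rendered section would look roughly like:

    [project.optional-dependencies]
    cuda = [
        "triton",
    ]

so a consumer opts in with pip install "<kernel>[cuda]" instead of pulling CUDA-only packages into CPU installs.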
diff --git a/build2cmake/src/templates/torch-extension.cmake b/build2cmake/src/templates/torch-extension.cmake index 2fdd220c..805e5c5d 100644 --- a/build2cmake/src/templates/torch-extension.cmake +++ b/build2cmake/src/templates/torch-extension.cmake @@ -1,6 +1,6 @@ # Include Metal shader compilation utilities if needed if(GPU_LANG STREQUAL "METAL") - include(${CMAKE_CURRENT_LIST_DIR}/cmake/compile-metal.cmake) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/compile-metal.cmake) endif() # Define the extension target with unified parameters @@ -15,25 +15,25 @@ define_gpu_extension_target( WITH_SOABI) if(NOT (MSVC OR GPU_LANG STREQUAL "SYCL")) - target_link_options({{ ops_name }} PRIVATE -static-libstdc++) + target_link_options({{ ops_name }} PRIVATE -static-libstdc++) endif() if(GPU_LANG STREQUAL "SYCL") - target_link_options({{ ops_name }} PRIVATE ${sycl_link_flags}) - target_link_libraries({{ ops_name }} PRIVATE dnnl) + target_link_options({{ ops_name }} PRIVATE ${sycl_link_flags}) + target_link_libraries({{ ops_name }} PRIVATE dnnl) endif() # Compile Metal shaders if any were found if(GPU_LANG STREQUAL "METAL") - if(ALL_METAL_SOURCES) - compile_metal_shaders({{ ops_name }} "${ALL_METAL_SOURCES}" "${METAL_INCLUDE_DIRS}") - endif() + if(ALL_METAL_SOURCES) + compile_metal_shaders({{ ops_name }} "${ALL_METAL_SOURCES}" "${METAL_INCLUDE_DIRS}") + endif() endif() -{% if platform == 'windows' %} # Add kernels_install target for huggingface/kernels library layout -add_kernels_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}") +add_kernels_install_target({{ ops_name }} "{{ python_name }}" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "{{ data_extensions | join(';') }}") # Add local_install target for local development with get_local_kernel() -add_local_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}") -{% endif %} +add_local_install_target({{ ops_name }} "{{ python_name }}" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "{{ data_extensions | join(';') }}") diff --git a/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake b/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake index a034f8f8..f905e173 100644 --- a/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake +++ b/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake @@ -1,3 +1,5 @@ +if(GPU_LANG STREQUAL "SYCL") + find_package(CutlassSycl) if(DPCPP_VERSION STREQUAL "2025.3") @@ -78,3 +80,4 @@ if(DPCPP_VERSION STREQUAL "2025.2" OR DPCPP_VERSION STREQUAL "2025.3" OR CUTLASS endif() string(REPLACE "-fsycl-targets=spir64_gen,spir64" "-fsycl-targets=spir64" sycl_flags "${sycl_flags}") +endif(GPU_LANG STREQUAL "SYCL") diff --git a/build2cmake/src/torch/common.rs b/build2cmake/src/torch/common.rs index 6d07d1d7..8203ab13 100644 --- a/build2cmake/src/torch/common.rs +++ b/build2cmake/src/torch/common.rs @@ -12,26 +12,29 @@ use crate::torch::kernel::render_kernel_components; use crate::version::Version; use crate::FileSet; -static REGISTRATION_H: &str = include_str!("../templates/registration.h"); -static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake"); +static BUILD_VARIANTS_UTILS: &str = include_str!("../templates/build-variants.cmake"); static CMAKE_KERNEL: &str = include_str!("../templates/kernel.cmake"); -static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake"); -static HIPIFY: &str = include_str!("../templates/cuda/hipify.py"); +static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake"); +static COMPAT_PY: &str = include_str!("../templates/compat.py"); static COMPILE_METAL_CMAKE: &str = 
include_str!("../templates/metal/compile-metal.cmake"); -static METALLIB_TO_HEADER_PY: &str = include_str!("../templates/metal/metallib_to_header.py"); static GET_GPU_LANG: &str = include_str!("../templates/get_gpu_lang.cmake"); static GET_GPU_LANG_PY: &str = include_str!("../templates/get_gpu_lang.py"); +static HIPIFY: &str = include_str!("../templates/cuda/hipify.py"); +static METALLIB_TO_HEADER_PY: &str = include_str!("../templates/metal/metallib_to_header.py"); +static REGISTRATION_H: &str = include_str!("../templates/registration.h"); pub fn write_setup_py( env: &Environment, + general: &General, torch: &crate::config::Torch, - name: &str, ops_name: &str, file_set: &mut FileSet, ) -> Result<()> { let writer = file_set.entry("setup.py"); - let data_globs = torch.data_globs().map(|globs| globs.join(", ")); + let data_globs = torch + .data_extensions() + .map(|exts| exts.iter().map(|ext| format!("\"**/*.{ext}\"")).join(", ")); env.get_template("setup.py") .wrap_err("Cannot get setup.py template")? @@ -39,7 +42,7 @@ write_setup_py( context! { data_globs => data_globs, ops_name => ops_name, - name => name, + python_name => general.python_name(), version => "0.1.0", }, writer, ) @@ -49,26 +52,45 @@ write_setup_py( Ok(()) } +pub fn write_compat_py(file_set: &mut FileSet) -> Result<()> { + let mut path = PathBuf::new(); + path.push("compat.py"); + file_set.entry(path).extend_from_slice(COMPAT_PY.as_bytes()); + + Ok(()) +} + pub fn write_pyproject_toml( env: &Environment, - backend: Backend, general: &General, file_set: &mut FileSet, ) -> Result<()> { let writer = file_set.entry("pyproject.toml"); - let python_dependencies = itertools::process_results( - general - .python_depends() - .chain(general.backend_python_depends(backend)), - |iter| iter.map(|d| format!("\"{d}\"")).join(", "), - )?; + // Common python dependencies (no backend-specific ones) + let python_dependencies = itertools::process_results(general.python_depends(), |iter| { + iter.map(|d| format!("\"{d}\"")).join(", ") + })?; + + // Collect backend-specific dependencies for all backends + let mut backend_dependencies = Vec::new(); + for backend in &Backend::all() { + let deps = itertools::process_results(general.backend_python_depends(*backend), |iter| { + iter.map(|d| format!("\"{d}\"")).collect::<Vec<_>>() + })?; + + if !deps.is_empty() { + backend_dependencies.push((backend.to_string(), deps)); + } + } env.get_template("pyproject.toml") .wrap_err("Cannot get pyproject.toml template")? .render_to_write( context!
{ + python_name => general.python_name(), python_dependencies => python_dependencies, + backend_dependencies => backend_dependencies, }, writer, ) @@ -77,21 +99,23 @@ Ok(()) } -pub fn write_metadata(backend: Backend, general: &General, file_set: &mut FileSet) -> Result<()> { - let writer = file_set.entry("metadata.json"); +pub fn write_metadata(general: &General, file_set: &mut FileSet) -> Result<()> { + for backend in &Backend::all() { + let writer = file_set.entry(format!("metadata-{}.json", backend)); - let python_depends = general - .python_depends() - .chain(general.backend_python_depends(backend)) - .collect::<Result<Vec<_>>>()?; + let python_depends = general + .python_depends() + .chain(general.backend_python_depends(*backend)) + .collect::<Result<Vec<_>>>()?; - let metadata = Metadata { - version: general.version, - license: general.license.clone(), - python_depends, - }; + let metadata = Metadata { + version: general.version, + license: general.license.clone(), + python_depends, + }; - serde_json::to_writer_pretty(writer, &metadata)?; + serde_json::to_writer_pretty(writer, &metadata)?; + } Ok(()) } @@ -180,7 +204,11 @@ pub fn write_cmake_file(file_set: &mut FileSet, filename: &str, content: &[u8]) pub fn write_cmake_helpers(file_set: &mut FileSet) { write_cmake_file(file_set, "utils.cmake", CMAKE_UTILS.as_bytes()); write_cmake_file(file_set, "kernel.cmake", CMAKE_KERNEL.as_bytes()); - write_cmake_file(file_set, "windows.cmake", WINDOWS_UTILS.as_bytes()); + write_cmake_file( + file_set, + "build-variants.cmake", + BUILD_VARIANTS_UTILS.as_bytes(), + ); write_cmake_file(file_set, "hipify.py", HIPIFY.as_bytes()); write_cmake_file( file_set, @@ -198,7 +226,8 @@ pub fn write_cmake_helpers(file_set: &mut FileSet) { pub fn render_extension( env: &Environment, - name: &str, + general: &General, + torch: &Torch, ops_name: &str, write: &mut impl Write, ) -> Result<()> { .wrap_err("Cannot get Torch extension template")? .render_to_write( context!
{ - name => name, + python_name => general.python_name(), ops_name => ops_name, - platform => std::env::consts::OS, + data_extensions => torch.data_extensions(), }, &mut *write, ) @@ -237,7 +266,6 @@ pub fn render_preamble( cuda_maxver => cuda_maxver.map(|v| v.to_string()), torch_minver => torch_minver.map(|v| v.to_string()), torch_maxver => torch_maxver.map(|v| v.to_string()), - platform => std::env::consts::OS }, &mut *write, ) @@ -250,7 +278,6 @@ pub fn write_cmake( env: &Environment, - backend: Backend, build: &Build, torch: &Torch, name: &str, @@ -261,38 +288,29 @@ let cmake_writer = file_set.entry("CMakeLists.txt"); - let (cuda_minver, cuda_maxver) = match backend { - Backend::Cuda => ( - build.general.cuda.as_ref().and_then(|c| c.minver.as_ref()), - build.general.cuda.as_ref().and_then(|c| c.maxver.as_ref()), - ), - _ => (None, None), - }; - render_preamble( env, name, - cuda_minver, - cuda_maxver, + build.general.cuda.as_ref().and_then(|c| c.minver.as_ref()), + build.general.cuda.as_ref().and_then(|c| c.maxver.as_ref()), torch.minver.as_ref(), torch.maxver.as_ref(), cmake_writer, )?; - render_deps(env, backend, build, cmake_writer)?; + render_deps(env, build, cmake_writer)?; render_binding(env, torch, name, cmake_writer)?; render_kernel_components(env, build, cmake_writer)?; - render_extension(env, name, ops_name, cmake_writer)?; + render_extension(env, &build.general, torch, ops_name, cmake_writer)?; Ok(()) } pub fn write_torch_ext( env: &Environment, - backend: Backend, build: &Build, target_dir: PathBuf, ops_id: Option<String>, @@ -312,7 +330,6 @@ pub fn write_torch_ext( write_cmake( env, - backend, build, torch_ext, &build.general.name, &ops_name, &mut file_set, )?; - write_setup_py( - env, - torch_ext, - &build.general.name, - &ops_name, - &mut file_set, - )?; + write_setup_py(env, &build.general, torch_ext, &ops_name, &mut file_set)?; + + write_compat_py(&mut file_set)?; write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?; - write_pyproject_toml(env, backend, &build.general, &mut file_set)?; + write_pyproject_toml(env, &build.general, &mut file_set)?; write_torch_registration_macros(&mut file_set)?; - write_metadata(backend, &build.general, &mut file_set)?; + write_metadata(&build.general, &mut file_set)?; Ok(file_set) } diff --git a/build2cmake/src/torch/deps.rs b/build2cmake/src/torch/deps.rs index 5a333460..fcb2e8b7 100644 --- a/build2cmake/src/torch/deps.rs +++ b/build2cmake/src/torch/deps.rs @@ -4,22 +4,13 @@ use std::io::Write; use eyre::{Context, Result}; use minijinja::{context, Environment}; -use crate::config::{Backend, Build, Dependency}; +use crate::config::{Build, Dependency}; -pub fn render_deps( - env: &Environment, - backend: Backend, - build: &Build, - write: &mut impl Write, -) -> Result<()> { +pub fn render_deps(env: &Environment, build: &Build, write: &mut impl Write) -> Result<()> { + // Collect all dependencies. let mut deps = HashSet::new(); - - for kernel in build - .kernels - .values() - .filter(|kernel| kernel.backend() == backend) - { - deps.extend(kernel.depends()); + for kernel in build.kernels.values() { + deps.extend(kernel.depends()) } for dep in deps { @@ -94,10 +85,10 @@ env.get_template("xpu/dep-cutlass-sycl.cmake")? .render_to_write(context!
{}, &mut *write)?; } - Dependency::Torch => (), - _ => { - eprintln!("Warning: {backend:?} backend doesn't need/support dependency: {dep:?}"); + Dependency::MetalCpp => { + // TODO: add CMake dependency. } + Dependency::Torch => (), } write.write_all(b"\n")?; } diff --git a/build2cmake/src/torch/noarch.rs b/build2cmake/src/torch/noarch.rs index 85caf080..2e1327c4 100644 --- a/build2cmake/src/torch/noarch.rs +++ b/build2cmake/src/torch/noarch.rs @@ -7,12 +7,14 @@ use minijinja::{context, Environment}; use crate::{ config::{Backend, Build, General, Torch}, fileset::FileSet, - torch::{common::write_metadata, kernel_ops_identifier}, + torch::{ + common::{write_compat_py, write_metadata}, + kernel_ops_identifier, + }, }; pub fn write_torch_ext_noarch( env: &Environment, - backend: Backend, build: &Build, target_dir: PathBuf, ops_id: Option<String>, @@ -21,16 +23,10 @@ let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id); + write_compat_py(&mut file_set)?; write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?; - write_pyproject_toml( - env, - backend, - build.torch.as_ref(), - &build.general, - &mut file_set, - )?; - - write_metadata(backend, &build.general, &mut file_set)?; + write_pyproject_toml(env, build.torch.as_ref(), &build.general, &mut file_set)?; + write_metadata(&build.general, &mut file_set)?; Ok(file_set) } @@ -62,7 +58,6 @@ fn write_ops_py( fn write_pyproject_toml( env: &Environment, - backend: Backend, torch: Option<&Torch>, general: &General, file_set: &mut FileSet, @@ -70,13 +65,28 @@ let writer = file_set.entry("pyproject.toml"); let name = &general.name; - let data_globs = torch.and_then(|torch| torch.data_globs().map(|globs| globs.join(", "))); - let python_dependencies = itertools::process_results( - general - .python_depends() - .chain(general.backend_python_depends(backend)), - |iter| iter.map(|d| format!("\"{d}\"")).join(", "), - )?; + let data_globs = torch.and_then(|torch| { + torch + .data_extensions() + .map(|exts| exts.iter().map(|ext| format!("\"**/*.{ext}\"")).join(", ")) + }); + + // Common python dependencies (no backend-specific ones) + let python_dependencies = itertools::process_results(general.python_depends(), |iter| { + iter.map(|d| format!("\"{d}\"")).join(", ") + })?; + + // Collect backend-specific dependencies for all backends + let mut backend_dependencies = Vec::new(); + for backend in &Backend::all() { + let deps = itertools::process_results(general.backend_python_depends(*backend), |iter| { + iter.map(|d| format!("\"{d}\"")).collect::<Vec<_>>() + })?; + + if !deps.is_empty() { + backend_dependencies.push((backend.to_string(), deps)); + } + } env.get_template("noarch/pyproject.toml") .wrap_err("Cannot get noarch pyproject.toml template")? .render_to_write( context!
{ data_globs => data_globs, python_dependencies => python_dependencies, + backend_dependencies => backend_dependencies, name => name, }, writer, diff --git a/builder/examples/extra-data/build.toml b/builder/examples/extra-data/build.toml new file mode 100644 index 00000000..391b3043 --- /dev/null +++ b/builder/examples/extra-data/build.toml @@ -0,0 +1,56 @@ +[general] +name = "extra-data" +backends = [ + "cpu", + "cuda", + "metal", + "rocm", + "xpu", +] + +[torch] +src = [ + "torch-ext/torch_binding.cpp", + "torch-ext/torch_binding.h", +] +pyext = ["json", "py"] + +[kernel.relu] +backend = "cuda" +depends = ["torch"] +src = ["relu_cuda/relu.cu"] + +[kernel.relu_metal] +backend = "metal" +src = [ + "relu_metal/relu.mm", + "relu_metal/relu.metal", + "relu_metal/common.h", +] +depends = [ "torch" ] + +[kernel.relu_rocm] +backend = "rocm" +rocm-archs = [ + "gfx906", + "gfx908", + "gfx90a", + "gfx940", + "gfx941", + "gfx942", + "gfx1030", + "gfx1100", + "gfx1101", +] +depends = ["torch"] +src = ["relu_cuda/relu.cu"] + +[kernel.relu_xpu] +backend = "xpu" +depends = ["torch"] +src = ["relu_xpu/relu.cpp"] + +[kernel.relu_cpu] +backend = "cpu" +depends = ["torch"] +src = ["relu_cpu/relu_cpu.cpp"] diff --git a/builder/examples/extra-data/flake.nix b/builder/examples/extra-data/flake.nix new file mode 100644 index 00000000..41e51eef --- /dev/null +++ b/builder/examples/extra-data/flake.nix @@ -0,0 +1,17 @@ +{ + description = "Flake for extra-data kernel"; + + inputs = { + kernel-builder.url = "path:../../.."; + }; + + outputs = + { + self, + kernel-builder, + }: + kernel-builder.lib.genKernelFlakeOutputs { + inherit self; + path = ./.; + }; +} diff --git a/builder/examples/extra-data/relu_cpu/relu_cpu.cpp b/builder/examples/extra-data/relu_cpu/relu_cpu.cpp new file mode 100644 index 00000000..6197a9f1 --- /dev/null +++ b/builder/examples/extra-data/relu_cpu/relu_cpu.cpp @@ -0,0 +1,56 @@ +#include <torch/all.h> + +#ifdef __SSE__ +#include <xmmintrin.h> +#endif + +#ifdef __ARM_NEON +#include <arm_neon.h> +#endif + +#ifdef __SSE__ +void relu_forward_sse(float* out, const float* input, size_t size) { + size_t i = 0; + + for (; i + 4 <= size; i += 4) { + __m128 vec_input = _mm_load_ps(input + i); + __m128 vec_zero = _mm_setzero_ps(); + __m128 vec_output = _mm_max_ps(vec_input, vec_zero); + _mm_store_ps(out + i, vec_output); + } + + for (; i < size; ++i) { + out[i] = input[i] > 0 ? input[i] : 0; + } +} +#endif + +#ifdef __ARM_NEON +void relu_forward_neon(float* out, const float* input, size_t size) { + size_t i = 0; + + for (; i + 4 <= size; i += 4) { + float32x4_t vec_input = vld1q_f32(input + i); + float32x4_t vec_output = vmaxq_f32(vec_input, vdupq_n_f32(0)); + vst1q_f32(out + i, vec_output); + } + + for (; i < size; ++i) { + out[i] = input[i] > 0 ? input[i] : 0; + } +} +#endif + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(out.dtype() == torch::kFloat32, "Output tensor must be of dtype float"); + TORCH_CHECK(input.dtype() == torch::kFloat32, "Input tensor must be of dtype float"); + TORCH_CHECK(out.numel() == input.numel(), "Input and output tensors must have the same number of elements"); + +#if defined(__SSE__) + relu_forward_sse(out.data_ptr<float>(), input.data_ptr<float>(), input.numel()); +#elif defined(__ARM_NEON) + relu_forward_neon(out.data_ptr<float>(), input.data_ptr<float>(), input.numel()); +#else + #error "Unsupported architecture; please use a CPU with SSE or ARM NEON support."
#endif +} diff --git a/builder/examples/extra-data/relu_cuda/relu.cu b/builder/examples/extra-data/relu_cuda/relu.cu new file mode 100644 index 00000000..6bbe3160 --- /dev/null +++ b/builder/examples/extra-data/relu_cuda/relu.cu @@ -0,0 +1,43 @@ +#include <ATen/cuda/CUDAContext.h> +#include <c10/cuda/CUDAGuard.h> +#include <torch/all.h> + +#include <algorithm> + +__global__ void relu_kernel(float *__restrict__ out, + float const *__restrict__ input, const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + auto x = input[token_idx * d + idx]; + out[token_idx * d + idx] = x > 0.0f ? x : 0.0f; + } +} + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float && + out.scalar_type() == at::ScalarType::Float, + "relu_kernel only supports float32"); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device. Got input device: ", + input.device(), " and output device: ", out.device()); + + int d = input.size(-1); + int64_t num_tokens = input.numel() / d; + dim3 grid(num_tokens); + dim3 block(std::min(d, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(), + input.data_ptr<float>(), d); +} diff --git a/builder/examples/extra-data/relu_metal/common.h b/builder/examples/extra-data/relu_metal/common.h new file mode 100644 index 00000000..1b891fad --- /dev/null +++ b/builder/examples/extra-data/relu_metal/common.h @@ -0,0 +1,10 @@ +#ifndef COMMON_H +#define COMMON_H + +#include <metal_stdlib> +using namespace metal; + +// Common constants and utilities for Metal kernels +constant float RELU_THRESHOLD = 0.0f; + +#endif // COMMON_H \ No newline at end of file diff --git a/builder/examples/extra-data/relu_metal/relu.metal b/builder/examples/extra-data/relu_metal/relu.metal new file mode 100644 index 00000000..286b46fe --- /dev/null +++ b/builder/examples/extra-data/relu_metal/relu.metal @@ -0,0 +1,17 @@ +#include <metal_stdlib> +#include "common.h" +using namespace metal; + +kernel void relu_forward_kernel_float(device const float *inA [[buffer(0)]], + device float *outC [[buffer(1)]], + uint index [[thread_position_in_grid]]) { + // Explicitly write to output + outC[index] = max(RELU_THRESHOLD, inA[index]); +} + +kernel void relu_forward_kernel_half(device const half *inA [[buffer(0)]], + device half *outC [[buffer(1)]], + uint index [[thread_position_in_grid]]) { + // Explicitly write to output + outC[index] = max(static_cast<half>(0.0), inA[index]); +} \ No newline at end of file diff --git a/builder/examples/extra-data/relu_metal/relu.mm b/builder/examples/extra-data/relu_metal/relu.mm new file mode 100644 index 00000000..7636737b --- /dev/null +++ b/builder/examples/extra-data/relu_metal/relu.mm @@ -0,0 +1,105 @@ +#include <torch/torch.h> + +#import <Foundation/Foundation.h> +#import <Metal/Metal.h> + +// Include the auto-generated header with embedded metallib +#ifdef EMBEDDED_METALLIB_HEADER +#include EMBEDDED_METALLIB_HEADER +#else +#error "EMBEDDED_METALLIB_HEADER not defined" +#endif + +static inline id<MTLBuffer>
getMTLBufferStorage(const torch::Tensor &tensor) { + return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data()); +} + + +torch::Tensor &dispatchReluKernel(torch::Tensor const &input, + torch::Tensor &output) { + @autoreleasepool { + id<MTLDevice> device = MTLCreateSystemDefaultDevice(); + + int numThreads = input.numel(); + + // Load the embedded Metal library from memory + NSError *error = nil; + id<MTLLibrary> customKernelLibrary = EMBEDDED_METALLIB_NAMESPACE::createLibrary(device, &error); + TORCH_CHECK(customKernelLibrary, + "Failed to create Metal library from embedded data: ", + error.localizedDescription.UTF8String); + + std::string kernel_name = + std::string("relu_forward_kernel_") + + (input.scalar_type() == torch::kFloat ? "float" : "half"); + id<MTLFunction> customReluFunction = [customKernelLibrary + newFunctionWithName:[NSString + stringWithUTF8String:kernel_name.c_str()]]; + TORCH_CHECK(customReluFunction, + "Failed to create function state object for ", + kernel_name.c_str()); + + id<MTLComputePipelineState> reluPSO = + [device newComputePipelineStateWithFunction:customReluFunction + error:&error]; + TORCH_CHECK(reluPSO, error.localizedDescription.UTF8String); + + id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer(); + TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference"); + + dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue(); + + dispatch_sync(serialQueue, ^() { + id<MTLComputeCommandEncoder> computeEncoder = + [commandBuffer computeCommandEncoder]; + TORCH_CHECK(computeEncoder, "Failed to create compute command encoder"); + + [computeEncoder setComputePipelineState:reluPSO]; + [computeEncoder setBuffer:getMTLBufferStorage(input) + offset:input.storage_offset() * input.element_size() + atIndex:0]; + [computeEncoder setBuffer:getMTLBufferStorage(output) + offset:output.storage_offset() * output.element_size() + atIndex:1]; + + MTLSize gridSize = MTLSizeMake(numThreads, 1, 1); + + NSUInteger threadGroupSize = reluPSO.maxTotalThreadsPerThreadgroup; + if (threadGroupSize > numThreads) { + threadGroupSize = numThreads; + } + MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1); + + [computeEncoder dispatchThreads:gridSize + threadsPerThreadgroup:threadgroupSize]; + + [computeEncoder endEncoding]; + + torch::mps::commit(); + }); + } + + return output; +} + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(input.device().is_mps(), "input must be a MPS tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == torch::kFloat || + input.scalar_type() == torch::kHalf, + "Unsupported data type: ", input.scalar_type()); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device.
Got input device: ", + input.device(), " and output device: ", out.device()); + + dispatchReluKernel(input, out); +} diff --git a/builder/examples/extra-data/relu_xpu/relu.cpp b/builder/examples/extra-data/relu_xpu/relu.cpp new file mode 100644 index 00000000..1809de08 --- /dev/null +++ b/builder/examples/extra-data/relu_xpu/relu.cpp @@ -0,0 +1,40 @@ +#include <torch/all.h> +#include <sycl/sycl.hpp> + +using namespace sycl; + +void relu_xpu_impl(torch::Tensor& output, const torch::Tensor& input) { + // Create SYCL queue directly + sycl::queue queue; + + auto input_ptr = input.data_ptr<float>(); + auto output_ptr = output.data_ptr<float>(); + auto numel = input.numel(); + + // Launch SYCL kernel + queue.parallel_for(range<1>(numel), [=](id<1> idx) { + auto i = idx[0]; + output_ptr[i] = input_ptr[i] > 0.0f ? input_ptr[i] : 0.0f; + }).wait(); +} + +void relu(torch::Tensor& out, const torch::Tensor& input) { + TORCH_CHECK(input.device().is_xpu(), "input must be a XPU tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == torch::kFloat, + "Unsupported data type: ", input.scalar_type()); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device. Got input device: ", + input.device(), " and output device: ", out.device()); + + relu_xpu_impl(out, input); +} diff --git a/builder/examples/extra-data/tests/__init__.py b/builder/examples/extra-data/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/builder/examples/extra-data/tests/test_relu.py b/builder/examples/extra-data/tests/test_relu.py new file mode 100644 index 00000000..d4426a0c --- /dev/null +++ b/builder/examples/extra-data/tests/test_relu.py @@ -0,0 +1,37 @@ +import platform + +import torch +import torch.nn.functional as F + +import extra_data + + +def test_relu(): + if platform.system() == "Darwin": + device = torch.device("mps") + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + elif torch.version.cuda is not None and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + x = torch.randn(1024, 1024, dtype=torch.float32, device=device) + torch.testing.assert_allclose(F.relu(x), extra_data.relu(x)) + + +def test_relu_layer(): + if platform.system() == "Darwin": + device = torch.device("mps") + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + elif torch.version.cuda is not None and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + x = torch.randn(1024, 1024, dtype=torch.float32, device=device) + layer = extra_data.layers.ReLU() + torch.testing.assert_allclose(F.relu(x), layer(x)) + + +def test_data(): + assert extra_data.EASTER_EGG == 42 diff --git a/builder/examples/extra-data/torch-ext/extra_data/__init__.py b/builder/examples/extra-data/torch-ext/extra_data/__init__.py new file mode 100644 index 00000000..684f3558 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/extra_data/__init__.py @@ -0,0 +1,32 @@ +import json +from pathlib import Path +from typing import Optional + +import torch + +from ._ops import ops + +from .
import layers + + +# This is the regular ReLU, but this example also shows how to embed some +# non-Python data. This can be used for e.g. Triton tuning data. + + +def _read_json() -> int: + json_path = Path(__file__).parent / "data.json" + with open(json_path, "r") as f: + return json.load(f) + + +EASTER_EGG = _read_json() + + +def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + if out is None: + out = torch.empty_like(x) + ops.relu(out, x) + return out + + +__all__ = ["EASTER_EGG", "relu", "layers"] diff --git a/builder/examples/extra-data/torch-ext/extra_data/data.json b/builder/examples/extra-data/torch-ext/extra_data/data.json new file mode 100644 index 00000000..d81cc071 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/extra_data/data.json @@ -0,0 +1 @@ +42 diff --git a/builder/examples/extra-data/torch-ext/extra_data/layers/__init__.py b/builder/examples/extra-data/torch-ext/extra_data/layers/__init__.py new file mode 100644 index 00000000..6105a191 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/extra_data/layers/__init__.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +from .._ops import ops + + +class ReLU(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.relu(out, x) + return out diff --git a/builder/examples/extra-data/torch-ext/torch_binding.cpp b/builder/examples/extra-data/torch-ext/torch_binding.cpp new file mode 100644 index 00000000..1765d92d --- /dev/null +++ b/builder/examples/extra-data/torch-ext/torch_binding.cpp @@ -0,0 +1,19 @@ +#include <torch/library.h> + +#include "registration.h" +#include "torch_binding.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + ops.def("relu(Tensor! out, Tensor input) -> ()"); +#if defined(CPU_KERNEL) + ops.impl("relu", torch::kCPU, &relu); +#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL) + ops.impl("relu", torch::kCUDA, &relu); +#elif defined(METAL_KERNEL) + ops.impl("relu", torch::kMPS, relu); +#elif defined(XPU_KERNEL) + ops.impl("relu", torch::kXPU, &relu); +#endif +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/builder/examples/extra-data/torch-ext/torch_binding.h b/builder/examples/extra-data/torch-ext/torch_binding.h new file mode 100644 index 00000000..3bcf2904 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/torch_binding.h @@ -0,0 +1,5 @@ +#pragma once + +#include <torch/torch.h> + +void relu(torch::Tensor &out, torch::Tensor const &input); \ No newline at end of file diff --git a/builder/examples/relu/tests/test_relu.py b/builder/examples/relu/tests/test_relu.py index 65544aa4..7c2a495b 100644 --- a/builder/examples/relu/tests/test_relu.py +++ b/builder/examples/relu/tests/test_relu.py @@ -17,3 +17,17 @@ def test_relu(): device = torch.device("cpu") x = torch.randn(1024, 1024, dtype=torch.float32, device=device) torch.testing.assert_allclose(F.relu(x), relu.relu(x)) + + +def test_relu_layer(): + if platform.system() == "Darwin": + device = torch.device("mps") + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + elif torch.version.cuda is not None and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + x = torch.randn(1024, 1024, dtype=torch.float32, device=device) + layer = relu.layers.ReLU() + torch.testing.assert_allclose(F.relu(x), layer(x)) diff --git a/builder/examples/relu/torch-ext/relu/__init__.py b/builder/examples/relu/torch-ext/relu/__init__.py index 8050dfd7..6890c993 ---
a/builder/examples/relu/torch-ext/relu/__init__.py +++ b/builder/examples/relu/torch-ext/relu/__init__.py @@ -4,9 +4,15 @@ from ._ops import ops +from . import layers + def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: if out is None: out = torch.empty_like(x) ops.relu(out, x) - return out \ No newline at end of file + return out + + +__all__ = ["relu", "layers"] + diff --git a/builder/examples/relu/torch-ext/relu/layers/__init__.py b/builder/examples/relu/torch-ext/relu/layers/__init__.py new file mode 100644 index 00000000..6105a191 --- /dev/null +++ b/builder/examples/relu/torch-ext/relu/layers/__init__.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +from .._ops import ops + + +class ReLU(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.relu(out, x) + return out diff --git a/builder/lib/build.nix b/builder/lib/build.nix index 63c2f182..c187aaf5 100644 --- a/builder/lib/build.nix +++ b/builder/lib/build.nix @@ -217,22 +217,20 @@ rec { bundleOnly = true; }; buildToml = readBuildConfig path; - namePaths = - # TODO: treat kernels without compiled parts differently. - lib.mapAttrs (name: pkg: toString pkg) extensions; - - # Include benchmarks directory if it exists in the source benchmarksPath = path + "/benchmarks"; hasBenchmarks = builtins.pathExists benchmarksPath; - allPaths = - namePaths - // lib.optionalAttrs hasBenchmarks { - benchmarks = benchmarksPath; + benchmarks = + with lib.fileset; + toSource { + root = path; + fileset = maybeMissing benchmarksPath; }; + contents = + builtins.map (pkg: toString pkg) (builtins.attrValues extensions) + ++ lib.optionals hasBenchmarks [ (toString benchmarks) ]; in import ./join-paths { - inherit pkgs; - namePaths = allPaths; + inherit pkgs contents; name = "torch-ext-bundle"; }; @@ -276,8 +274,11 @@ rec { )) ]; shellHook = '' - export PYTHONPATH=''${PYTHONPATH}:${extension} + # This is run from `nix develop`, which provides the existing + # environment. We clear the LD_LIBRARY_PATH and PYTHONPATH to + # make testing as pure as possible. unset LD_LIBRARY_PATH + export PYTHONPATH=${extension}/${buildSet.torch.variant} ''; }; }; diff --git a/builder/lib/join-paths/default.nix b/builder/lib/join-paths/default.nix index a339a4a5..3fcad0d9 100644 --- a/builder/lib/join-paths/default.nix +++ b/builder/lib/join-paths/default.nix @@ -3,8 +3,8 @@ args@{ name, - # Attribute set with names to paths. - namePaths, + # Package paths. + contents, preferLocalBuild ? true, allowSubstitutes ? false, @@ -16,16 +16,11 @@ let "pkgs" "namePaths" ]; - # Iterating over pairs in bash sucks, so let's generate - # the commands in Nix instead. - copyPath = path: pkg: '' - mkdir -p ${placeholder "out"}/${path} - cp -r ${pkg}/* ${placeholder "out"}/${path} + copyPkg = pkg: '' + cp -r ${pkg}/* ${placeholder "out"}/ ''; prelude = '' mkdir -p ${placeholder "out"} ''; in -pkgs.runCommand name args_ ( - prelude + lib.concatStringsSep "\n" (lib.mapAttrsToList copyPath namePaths) -) +pkgs.runCommand name args_ (prelude + lib.concatStringsSep "\n" (builtins.map copyPkg contents)) diff --git a/builder/lib/torch-extension/arch.nix b/builder/lib/torch-extension/arch.nix index b78909de..769d33de 100644 --- a/builder/lib/torch-extension/arch.nix +++ b/builder/lib/torch-extension/arch.nix @@ -108,7 +108,6 @@ stdenv.mkDerivation (prevAttrs: { # Generate build files. 
postPatch = '' build2cmake generate-torch \ - --backend ${buildConfig.backend} \ --ops-id ${rev} build.toml ''; @@ -237,33 +236,25 @@ stdenv.mkDerivation (prevAttrs: { #(lib.cmakeFeature "METAL_COMPILER" "${xcrunHost}/bin/xcrunHost") ]; - postInstall = '' - ( - cd .. - cp -r torch-ext/${moduleName}/* $out/ - ) - mv $out/_${moduleName}_*/* $out/ - rm -d $out/_${moduleName}_${rev} - - # Set up a compatibility module for older kernels versions, remove when - # the updated kernels has been around for a while. - mkdir $out/${moduleName} - cp ${./compat.py} $out/${moduleName}/__init__.py - - cp ../metadata.json $out/ - '' - + (lib.optionalString (stripRPath && stdenv.hostPlatform.isLinux)) '' - find $out/ -name '*.so' \ - -exec patchelf --set-rpath "" {} \; - '' - + (lib.optionalString (stripRPath && stdenv.hostPlatform.isDarwin)) '' - find $out/ -name '*.so' \ - -exec rewrite-nix-paths-macho {} \; - - # Stub some rpath. - find $out/ -name '*.so' \ - -exec install_name_tool -add_rpath "@loader_path/lib" {} \; - ''; + postInstall = + let + buildVariant = torch.variant; + in + '' + rm -rf $out/_${moduleName}_${rev} + '' + + (lib.optionalString (stripRPath && stdenv.hostPlatform.isLinux)) '' + find $out/ -name '*.so' \ + -exec patchelf --set-rpath "" {} \; + '' + + (lib.optionalString (stripRPath && stdenv.hostPlatform.isDarwin)) '' + find $out/ -name '*.so' \ + -exec rewrite-nix-paths-macho {} \; + + # Stub some rpath. + find $out/ -name '*.so' \ + -exec install_name_tool -add_rpath "@loader_path/lib" {} \; + ''; doInstallCheck = true; diff --git a/builder/lib/torch-extension/no-arch.nix b/builder/lib/torch-extension/no-arch.nix index 83ebf7b6..2ef038dc 100644 --- a/builder/lib/torch-extension/no-arch.nix +++ b/builder/lib/torch-extension/no-arch.nix @@ -81,17 +81,19 @@ stdenv.mkDerivation (prevAttrs: { # we run it anyway. 
postPatch = '' build2cmake generate-torch \ - --backend ${buildConfig.backend} \ --ops-id ${rev} build.toml ''; - installPhase = '' - mkdir -p $out - cp -r torch-ext/${moduleName}/* $out/ - mkdir $out/${moduleName} - cp ${./compat.py} $out/${moduleName}/__init__.py - cp metadata.json $out/ - ''; + installPhase = + let + noarchVariant = torch.noarchVariant; + in + '' + mkdir -p $out/${noarchVariant}/${moduleName} + cp -r torch-ext/${moduleName}/* $out/${noarchVariant} + cp compat.py $out/${noarchVariant}/${moduleName}/__init__.py + cp metadata-${buildConfig.backend}.json $out/${noarchVariant} + ''; doInstallCheck = true; diff --git a/builder/scripts/windows/builder.ps1 b/builder/scripts/windows/builder.ps1 index 311f978e..69f14c2c 100644 --- a/builder/scripts/windows/builder.ps1 +++ b/builder/scripts/windows/builder.ps1 @@ -483,7 +483,6 @@ function Invoke-Backend { if ($Target) { $kwargs += $Target } if ($Options.Force) { $kwargs += '--force' } if ($Options.OpsId) { $kwargs += '--ops-id', $Options.OpsId } - if ($Backend -and $Backend -ne 'universal') { $kwargs += '--backend', $Backend } Invoke-Build2Cmake -Build2CmakeExe $Build2CmakeExe -Arguments $kwargs } diff --git a/builder/tests/Dockerfile.test-kernel b/builder/tests/Dockerfile.test-kernel index 2c3ab50a..f6b852b8 100644 --- a/builder/tests/Dockerfile.test-kernel +++ b/builder/tests/Dockerfile.test-kernel @@ -67,6 +67,7 @@ COPY relu-kernel ./relu-kernel COPY relu-kernel-cpu ./relu-kernel-cpu COPY cutlass-gemm-kernel ./cutlass-gemm-kernel COPY silu-and-mul-kernel ./silu-and-mul-kernel +COPY builder/examples/extra-data/tests ./extra_data_tests COPY builder/examples/relu/tests ./relu_tests COPY builder/examples/cutlass-gemm/tests ./cutlass_gemm_tests diff --git a/builder/tests/run-tests.sh b/builder/tests/run-tests.sh index 3498a8c1..a776b89f 100644 --- a/builder/tests/run-tests.sh +++ b/builder/tests/run-tests.sh @@ -1,12 +1,19 @@ #!/bin/bash -PYTHONPATH="relu-kernel:cutlass-gemm-kernel:$PYTHONPATH" \ - .venv/bin/pytest relu_tests cutlass_gemm_tests +# Expand to build variant directories. +EXTRA_DATA_PATH=$(echo extra-data/torch*) +RELU_PATH=$(echo relu-kernel/torch*) +CUTLASS_PATH=$(echo cutlass-gemm-kernel/torch*) +SILU_MUL_PATH=$(echo silu-and-mul-kernel/torch*) +RELU_CPU_PATH=$(echo relu-kernel-cpu/torch*) + +PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$CUTLASS_PATH:$PYTHONPATH" \ + .venv/bin/pytest extra_data_tests relu_tests cutlass_gemm_tests # We only care about importing, the kernel is trivial. -PYTHONPATH="silu-and-mul-kernel:$PYTHONPATH" \ +PYTHONPATH="$SILU_MUL_PATH:$PYTHONPATH" \ .venv/bin/python -c "import silu_and_mul" -PYTHONPATH="relu-kernel-cpu:$PYTHONPATH" \ +PYTHONPATH="$RELU_CPU_PATH:$PYTHONPATH" \ CUDA_VISIBLE_DEVICES="" \ .venv/bin/pytest relu_tests diff --git a/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh b/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh index 381ed073..4a845ec9 100755 --- a/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh +++ b/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh @@ -21,23 +21,12 @@ _getKernelCheckHook() { export DYLD_LIBRARY_PATH="${TORCH_DIR}/lib:${DYLD_LIBRARY_PATH}" fi - TMPDIR=$(mktemp -d -t test.XXXXXX) || exit 1 - trap "rm -rf '$TMPDIR'" EXIT - # Some kernels want to write stuff (especially when they use Triton). HOME=$(mktemp -d -t test.XXXXXX) || exit 1 trap "rm -rf '$HOME'" EXIT - # Emulate the bundle layout that kernels expects. This even works - # for universal kernels, since kernels checks the non-universal - # path first. 
- PYTHONPATH="@kernels@" \ - BUILD_VARIANT=$(@python3@ -c "from kernels.utils import build_variant; print(build_variant())") - mkdir -p "${TMPDIR}/build" - ln -s "$out" "${TMPDIR}/build/${BUILD_VARIANT}" - PYTHONPATH="@kernels@" \ - @python3@ -c "from pathlib import Path; import kernels; kernels.get_local_kernel(Path('${TMPDIR}'), '${moduleName}')" + @python3@ -c "from pathlib import Path; import kernels; kernels.get_local_kernel(Path('${out}'), '${moduleName}')" } postInstallCheckHooks+=(_getKernelCheckHook)
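The simplified hook works because a built kernel now carries the build-variant directory at its root (<out>/<build-variant>/<package>/...), which is the layout get_local_kernel() resolves. A consumer-side sketch (paths hypothetical, assuming a kernels release that accepts the variant layout directly under the given path, as the updated hook relies on):

    from pathlib import Path

    import kernels

    # A Nix build output such as:
    #   result/torch29-cxx11-cu126-x86_64-linux/extra_data/...
    #   result/torch29-cxx11-cu126-x86_64-linux/metadata.json
    extra_data = kernels.get_local_kernel(Path("result"), "extra_data")

    # Non-Python data bundled via pyext = ["json", "py"] travels with the package.
    assert extra_data.EASTER_EGG == 42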