diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index f5302178..2cbfdf39 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -28,11 +28,17 @@ jobs: USER: runner - name: Nix info run: nix-shell -p nix-info --run "nix-info -m" + - name: Build relu kernel run: ( cd builder/examples/relu && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) - name: Copy relu kernel run: cp -rL builder/examples/relu/result relu-kernel + - name: Build extra-data kernel + run: ( cd builder/examples/extra-data && nix build .\#redistributable.torch29-cxx11-cu126-x86_64-linux ) + - name: Copy extra-data kernel + run: cp -rL builder/examples/extra-data/result extra-data + - name: Build relu kernel (CPU) run: ( cd builder/examples/relu && nix build .\#redistributable.torch29-cxx11-cpu-x86_64-linux ) - name: Copy relu kernel (CPU) @@ -70,6 +76,7 @@ jobs: path: | activation-kernel cutlass-gemm-kernel + extra-data relu-kernel relu-kernel-cpu relu-backprop-compile-kernel diff --git a/build2cmake/src/config/mod.rs b/build2cmake/src/config/mod.rs index 1078a604..d39342c2 100644 --- a/build2cmake/src/config/mod.rs +++ b/build2cmake/src/config/mod.rs @@ -27,10 +27,6 @@ impl Build { pub fn is_noarch(&self) -> bool { self.kernels.is_empty() } - - pub fn supports_backend(&self, backend: &Backend) -> bool { - self.general.backends.contains(backend) - } } pub struct General { @@ -126,18 +122,18 @@ pub struct Torch { } impl Torch { - pub fn data_globs(&self) -> Option<Vec<String>> { + pub fn data_extensions(&self) -> Option<Vec<String>> { match self.pyext.as_ref() { Some(exts) => { - let globs = exts + let extensions = exts .iter() .filter(|&ext| ext != "py" && ext != "pyi") - .map(|ext| format!("\"**/*.{ext}\"")) + .cloned() .collect_vec(); - if globs.is_empty() { + if extensions.is_empty() { None } else { - Some(globs) + Some(extensions) } } @@ -247,6 +243,18 @@ pub enum Backend { Xpu, } +impl Backend { + pub const fn all() -> [Backend; 5] { + [ + Backend::Cpu, + Backend::Cuda, + Backend::Metal, + Backend::Rocm, + Backend::Xpu, + ] + } +} + impl Display for Backend { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { diff --git a/build2cmake/src/main.rs b/build2cmake/src/main.rs index 60a869d2..1b17ab76 100644 --- a/build2cmake/src/main.rs +++ b/build2cmake/src/main.rs @@ -12,7 +12,7 @@ mod torch; use torch::{write_torch_ext, write_torch_ext_noarch}; mod config; -use config::{v3, Backend, Build, BuildCompat}; +use config::{v3, Build, BuildCompat}; mod fileset; use fileset::FileSet; @@ -48,9 +48,6 @@ enum Commands { /// kernel name to avoid name collisions. (e.g. Git SHA) #[arg(long)] ops_id: Option<String>, - - #[arg(long)] - backend: Option<Backend>, }, /// Update a `build.toml` to the current format.
@@ -93,12 +90,11 @@ fn main() -> Result<()> { let args = Cli::parse(); match args.command { Commands::GenerateTorch { - backend, build_toml, force, target_dir, ops_id, - } => generate_torch(backend, build_toml, target_dir, force, ops_id), + } => generate_torch(build_toml, target_dir, force, ops_id), Commands::UpdateBuild { build_toml } => update_build(build_toml), Commands::Validate { build_toml } => { parse_and_validate(build_toml)?; @@ -115,7 +111,6 @@ } fn generate_torch( - backend: Option<Backend>, build_toml: PathBuf, target_dir: Option<PathBuf>, force: bool, @@ -139,41 +134,10 @@ fn generate_torch( env.set_trim_blocks(true); minijinja_embed::load_templates!(&mut env); - let backend = match backend { - Some(backend) => { - if !build.supports_backend(&backend) { - bail!("Kernel does not support backend: {}", backend); - } - - backend - } - None => { - let kernel_backends = &build.general.backends; - - if kernel_backends.len() > 1 { - let mut kernel_backends = kernel_backends - .iter() - .map(ToString::to_string) - .collect::<Vec<_>>(); - kernel_backends.sort(); - bail!( - "Multiple supported backends found in build.toml: {}. Please specify one with --backend.", - kernel_backends.join(", ") - ); - } - - if let Some(backend) = kernel_backends.first() { - *backend - } else { - bail!("No backends are specified in build.toml"); - } - } - }; - let file_set = if build.is_noarch() { - write_torch_ext_noarch(&env, backend, &build, target_dir.clone(), ops_id)? + write_torch_ext_noarch(&env, &build, target_dir.clone(), ops_id)? } else { - write_torch_ext(&env, backend, &build, target_dir.clone(), ops_id)? + write_torch_ext(&env, &build, target_dir.clone(), ops_id)? }; file_set.write(&target_dir, force)?; @@ -368,14 +332,12 @@ fn get_generated_files( ) -> Result<Vec<String>> { let mut all_set = FileSet::new(); - for backend in &build.general.backends { - let set = if build.is_noarch() { - write_torch_ext_noarch(env, *backend, build, target_dir.clone(), ops_id.clone())? - } else { - write_torch_ext(env, *backend, build, target_dir.clone(), ops_id.clone())? - }; - all_set.extend(set); - } + let set = if build.is_noarch() { + write_torch_ext_noarch(env, build, target_dir.clone(), ops_id.clone())? + } else { + write_torch_ext(env, build, target_dir.clone(), ops_id.clone())?
+ }; + all_set.extend(set); Ok(all_set.into_names()) } diff --git a/build2cmake/src/templates/windows.cmake b/build2cmake/src/templates/build-variants.cmake similarity index 50% rename from build2cmake/src/templates/windows.cmake rename to build2cmake/src/templates/build-variants.cmake index 123ccef3..b12c88da 100644 --- a/build2cmake/src/templates/windows.cmake +++ b/build2cmake/src/templates/build-variants.cmake @@ -1,12 +1,15 @@ # Generate a standardized build variant name following the pattern: -# torch<version>-<compute>-<arch>-windows +# torch<version>-[cxx11-]<compute>-<arch>-<os> # # Arguments: # OUT_BUILD_NAME - Output variable name # TORCH_VERSION - PyTorch version (e.g., "2.7.1") -# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu +# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu, cpu # COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm) -# Example output: torch271-cxx11-cu124-x86_64-windows +# Optional for CPU-only builds (pass empty string or omit) +# Example output: torch271-cxx11-cu124-x86_64-linux (Linux) +# torch271-cu124-x86_64-windows (Windows) +# torch271-metal-aarch64-darwin (macOS) # function(generate_build_name OUT_BUILD_NAME TORCH_VERSION COMPUTE_FRAMEWORK COMPUTE_VERSION) # Flatten version by removing dots and padding to 2 components @@ -60,20 +63,45 @@ function(generate_build_name OUT_BUILD_NAME TORCH_VERSION COMPUTE_FRAMEWORK COMP list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}0") endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "metal") + set(COMPUTE_STRING "metal") + elseif(COMPUTE_FRAMEWORK STREQUAL "cpu") + set(COMPUTE_STRING "cpu") else() message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}") endif() - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") + # Detect from target system (CMAKE_SYSTEM_* variables refer to target, not host) + # Normalize architecture name + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64|AMD64)$") set(CPU_ARCH "x86_64") - elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64)$") set(CPU_ARCH "aarch64") else() - message(ERROR "Unsupported Windows platform ${CMAKE_SYSTEM_PROCESSOR}") + message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}") endif() + # Normalize OS name + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(OS_NAME "windows") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(OS_NAME "linux") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OS_NAME "darwin") + else() + message(WARNING "Unknown OS ${CMAKE_SYSTEM_NAME}, using as-is") + string(TOLOWER "${CMAKE_SYSTEM_NAME}" OS_NAME) + endif() + + set(ARCH_OS_STRING "${CPU_ARCH}-${OS_NAME}") + # Assemble the final build name - set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-${CPU_ARCH}-windows") + # For Linux, include cxx11 ABI indicator for compatibility + if(ARCH_OS_STRING MATCHES "-linux$") + set(BUILD_NAME "torch${FLATTENED_TORCH}-cxx11-${COMPUTE_STRING}-${ARCH_OS_STRING}") + else() + set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-${ARCH_OS_STRING}") + endif() set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE) message(STATUS "Generated build name: ${BUILD_NAME}") @@ -82,7 +110,7 @@ endfunction() # # Create a custom install target for the huggingface/kernels library layout.
# This installs the extension into a directory structure suitable for kernel hub discovery: -# <install prefix>/<build variant>/<package name>/ +# <install prefix>/<build variant> # # Arguments: # TARGET_NAME - Name of the target to create the install rule for @@ -92,14 +120,29 @@ endfunction() # function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) set(oneValueArgs INSTALL_PREFIX) - cmake_parse_arguments(ARG "" "${oneValueArgs}" "" ${ARGN}) + set(multiValueArgs DATA_EXTENSIONS) + cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(NOT ARG_INSTALL_PREFIX) set(ARG_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") endif() + if (${GPU_LANG} STREQUAL "CPU") + set(_BACKEND "cpu") + elseif (${GPU_LANG} STREQUAL "CUDA") + set(_BACKEND "cuda") + elseif (${GPU_LANG} STREQUAL "HIP") + set(_BACKEND "rocm") + elseif (${GPU_LANG} STREQUAL "METAL") + set(_BACKEND "metal") + elseif (${GPU_LANG} STREQUAL "SYCL") + set(_BACKEND "xpu") + else() + message(FATAL_ERROR "Unsupported GPU_LANG: ${GPU_LANG}") + endif() + # Set the installation directory - set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}") message(STATUS "Using PACKAGE_NAME: ${PACKAGE_NAME}") @@ -110,21 +153,44 @@ function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) RUNTIME DESTINATION "${KERNEL_INSTALL_DIR}" COMPONENT ${TARGET_NAME}) - # Glob Python files to install - file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") + # Glob Python files to install recursively. + file(GLOB_RECURSE PYTHON_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") + foreach(python_file IN LISTS PYTHON_FILES) + get_filename_component(python_file_dir "${python_file}" DIRECTORY) + install(FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${python_file}" + DESTINATION "${KERNEL_INSTALL_DIR}/${python_file_dir}" + COMPONENT ${TARGET_NAME}) + endforeach() - # Install Python files (__init__.py and _ops.py) - install(FILES ${PYTHON_FILES} + install(FILES ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json DESTINATION "${KERNEL_INSTALL_DIR}" + RENAME "metadata.json" + COMPONENT ${TARGET_NAME}) + + # Compatibility with older kernels and direct Python imports. + install(FILES ${CMAKE_SOURCE_DIR}/compat.py + DESTINATION "${KERNEL_INSTALL_DIR}/${PACKAGE_NAME}" + RENAME "__init__.py" COMPONENT ${TARGET_NAME}) - message(STATUS "Added install rules for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + # Install data files with specified extensions + foreach(ext IN LISTS ARG_DATA_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + install(FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file}" + DESTINATION "${KERNEL_INSTALL_DIR}/${data_file_dir}" + COMPONENT ${TARGET_NAME}) + endforeach() + endforeach() + + message(STATUS "Added install rules for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}") endfunction() # # Add install rules for local development with huggingface/kernels. # This installs the extension into the layout expected by get_local_kernel(): -# ${CMAKE_SOURCE_DIR}/build/<build variant>/<package name>/ +# ${CMAKE_SOURCE_DIR}/build/<build variant>/ # # This allows developers to use get_local_kernel() from the kernels library to load # locally built kernels without needing to publish to the hub.
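For reference, the naming rule that generate_build_name above implements can be sketched in Python (a hypothetical helper, not part of this change; the ROCm compute string is assumed to flatten major.minor the same way the CUDA one does):

    def build_variant(torch_version: str, framework: str,
                      compute_version: str, arch: str, os_name: str) -> str:
        # torch<version>-[cxx11-]<compute>-<arch>-<os>
        flattened = torch_version.replace(".", "")  # "2.7.1" -> "271"
        if framework == "cuda":
            major, minor = compute_version.split(".")[:2]
            compute = f"cu{major}{minor}"  # "12.4" -> "cu124"
        elif framework == "rocm":
            major, minor = compute_version.split(".")[:2]
            compute = f"rocm{major}{minor}"  # assumed: "6.0" -> "rocm60"
        elif framework == "xpu":
            compute = f"xpu{compute_version.split('.')[0]}0"  # major plus a trailing "0"
        else:
            compute = framework  # "metal" and "cpu" carry no version component
        cxx11 = "cxx11-" if os_name == "linux" else ""
        return f"torch{flattened}-{cxx11}{compute}-{arch}-{os_name}"

    assert (build_variant("2.7.1", "cuda", "12.4", "x86_64", "linux")
            == "torch271-cxx11-cu124-x86_64-linux")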
@@ -138,19 +204,36 @@ endfunction() # BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux") # function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) + set(multiValueArgs DATA_EXTENSIONS) + cmake_parse_arguments(ARG "" "" "${multiValueArgs}" ${ARGN}) + # Define your local, folder based, installation directory - set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") # Variant directory is where metadata.json should go (for kernels upload discovery) set(VARIANT_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") - # Glob Python files at configure time - file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") + # Glob Python files to install recursively. + file(GLOB_RECURSE PYTHON_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py") # Create a custom target for local installation add_custom_target(local_install COMMENT "Installing files to local directory..." ) + if (${GPU_LANG} STREQUAL "CPU") + set(_BACKEND "cpu") + elseif (${GPU_LANG} STREQUAL "CUDA") + set(_BACKEND "cuda") + elseif (${GPU_LANG} STREQUAL "HIP") + set(_BACKEND "rocm") + elseif (${GPU_LANG} STREQUAL "METAL") + set(_BACKEND "metal") + elseif (${GPU_LANG} STREQUAL "SYCL") + set(_BACKEND "xpu") + else() + message(FATAL_ERROR "Unsupported GPU_LANG: ${GPU_LANG}") + endif() + # Add custom commands to copy files add_custom_command(TARGET local_install POST_BUILD # Copy the shared library @@ -158,22 +241,51 @@ function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) $<TARGET_FILE:${TARGET_NAME}> ${LOCAL_INSTALL_DIR}/ - # Copy each Python file + # Copy metadata.json if it exists COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${PYTHON_FILES} - ${LOCAL_INSTALL_DIR}/ + ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + ${VARIANT_DIR}/metadata.json - # Copy metadata.json if it exists + # Compatibility with older kernels and direct Python imports.
COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${CMAKE_SOURCE_DIR}/metadata.json - ${VARIANT_DIR}/ + ${CMAKE_SOURCE_DIR}/compat.py + ${VARIANT_DIR}/${PACKAGE_NAME}/__init__.py COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}" COMMAND_EXPAND_LISTS ) + # Copy each Python file preserving directory structure + foreach(python_file IN LISTS PYTHON_FILES) + get_filename_component(python_file_dir "${python_file}" DIRECTORY) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${LOCAL_INSTALL_DIR}/${python_file_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${python_file} + ${LOCAL_INSTALL_DIR}/${python_file_dir}/ + COMMENT "Copying ${python_file} to ${LOCAL_INSTALL_DIR}/${python_file_dir}" + ) + endforeach() + + # Copy data files with specified extensions + foreach(ext IN LISTS ARG_DATA_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${LOCAL_INSTALL_DIR}/${data_file_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file} + ${LOCAL_INSTALL_DIR}/${data_file_dir}/ + COMMENT "Copying ${data_file} to ${LOCAL_INSTALL_DIR}/${data_file_dir}" + ) + endforeach() + endforeach() + # Create both directories: variant dir for metadata.json, package dir for binaries file(MAKE_DIRECTORY ${VARIANT_DIR}) file(MAKE_DIRECTORY ${LOCAL_INSTALL_DIR}) - message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}") + message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}") endfunction() diff --git a/builder/lib/torch-extension/compat.py b/build2cmake/src/templates/compat.py similarity index 100% rename from builder/lib/torch-extension/compat.py rename to build2cmake/src/templates/compat.py diff --git a/build2cmake/src/templates/cuda/dep-cutlass.cmake b/build2cmake/src/templates/cuda/dep-cutlass.cmake index 8316ae51..d9a00b8b 100644 --- a/build2cmake/src/templates/cuda/dep-cutlass.cmake +++ b/build2cmake/src/templates/cuda/dep-cutlass.cmake @@ -1,3 +1,6 @@ +if(GPU_LANG STREQUAL "CUDA") + message(STATUS "Including CUTLASS dependency") + find_package(NvidiaCutlass) if (NOT NvidiaCutlass_FOUND) @@ -37,3 +40,5 @@ if (NOT NvidiaCutlass_FOUND) else() message(STATUS "Using system cutlass with version: ${NvidiaCutlass_VERSION}") endif(NOT NvidiaCutlass_FOUND) + +endif(GPU_LANG STREQUAL "CUDA") diff --git a/build2cmake/src/templates/noarch/pyproject.toml b/build2cmake/src/templates/noarch/pyproject.toml index e60657b9..348b6cc8 100644 --- a/build2cmake/src/templates/noarch/pyproject.toml +++ b/build2cmake/src/templates/noarch/pyproject.toml @@ -7,6 +7,15 @@ dependencies = [ {{python_dependencies}} ] +[project.optional-dependencies] +{% for backend_name, deps in backend_dependencies %} +{{backend_name}} = [ + {% for dep in deps %} + {{dep}}, + {% endfor %} +] +{% endfor %} + [tool.setuptools] package-dir = { "" = "torch-ext" } diff --git a/build2cmake/src/templates/preamble.cmake b/build2cmake/src/templates/preamble.cmake index 0be9e078..c5bfc704 100644 --- a/build2cmake/src/templates/preamble.cmake +++ b/build2cmake/src/templates/preamble.cmake @@ -196,24 +196,20 @@ 
endif() # Initialize SRC list for kernel and binding sources set(SRC "") -message(STATUS "Rendered for platform {{ platform }}") - -{% if platform == 'windows' %} -include(${CMAKE_CURRENT_LIST_DIR}/cmake/windows.cmake) - -# Generate standardized build name -cmake_host_system_information(RESULT HOST_ARCH QUERY OS_PLATFORM) - -set(SYSTEM_STRING "${HOST_ARCH}-windows") +include(${CMAKE_CURRENT_LIST_DIR}/cmake/build-variants.cmake) +# Generate build variant name. if(GPU_LANG STREQUAL "CUDA") - generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cuda" "${CUDA_VERSION}" "${SYSTEM_STRING}") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cuda" "${CUDA_VERSION}") elseif(GPU_LANG STREQUAL "HIP") run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version") - generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "rocm" "${ROCM_VERSION}" "${SYSTEM_STRING}") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "rocm" "${ROCM_VERSION}") elseif(GPU_LANG STREQUAL "SYCL") generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "xpu" "${DPCPP_VERSION}") +elseif(GPU_LANG STREQUAL "METAL") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "metal" "") +elseif(GPU_LANG STREQUAL "CPU") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cpu" "") else() - generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cpu" "${SYSTEM_STRING}") + message(FATAL_ERROR "Cannot generate build name for unknown GPU_LANG: ${GPU_LANG}") endif() -{% endif %} diff --git a/build2cmake/src/templates/pyproject.toml b/build2cmake/src/templates/pyproject.toml index 5785e836..d016dda3 100644 --- a/build2cmake/src/templates/pyproject.toml +++ b/build2cmake/src/templates/pyproject.toml @@ -1,3 +1,8 @@ +[project] +name = "{{ python_name }}" +version = "0.1.0" +requires-python = ">=3.9" + [build-system] requires = [ "cmake>=3.26", @@ -9,3 +14,12 @@ requires = [ {{python_dependencies}} ] build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +{% for backend_name, deps in backend_dependencies %} +{{backend_name}} = [ + {% for dep in deps %} + {{dep}}, + {% endfor %} +] +{% endfor %} diff --git a/build2cmake/src/templates/setup.py b/build2cmake/src/templates/setup.py index b629da3f..abc9f5dd 100644 --- a/build2cmake/src/templates/setup.py +++ b/build2cmake/src/templates/setup.py @@ -124,7 +124,6 @@ def build_extension(self, ext: CMakeExtension) -> None: move(extdir / cfg / filename, extdir / filename) -{% set python_name = name | replace('-', '_') %} setup( name="{{ python_name }}", # The version is just a stub, it's not used by the final build artefact. 
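With the [project.optional-dependencies] loop in the templates above, backend-specific Python dependencies from build.toml become installable extras rather than unconditional dependencies. For a kernel that (hypothetically) declares triton as a CUDA-only dependency, the rendered section would look roughly like:

    [project.optional-dependencies]
    cuda = [
        "triton",
    ]

so a consumer opts in with pip install "<kernel>[cuda]" instead of pulling CUDA-only packages into CPU installs.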
diff --git a/build2cmake/src/templates/torch-extension.cmake b/build2cmake/src/templates/torch-extension.cmake index 2fdd220c..805e5c5d 100644 --- a/build2cmake/src/templates/torch-extension.cmake +++ b/build2cmake/src/templates/torch-extension.cmake @@ -1,6 +1,6 @@ # Include Metal shader compilation utilities if needed if(GPU_LANG STREQUAL "METAL") - include(${CMAKE_CURRENT_LIST_DIR}/cmake/compile-metal.cmake) + include(${CMAKE_CURRENT_LIST_DIR}/cmake/compile-metal.cmake) endif() # Define the extension target with unified parameters @@ -15,25 +15,25 @@ define_gpu_extension_target( WITH_SOABI) if(NOT (MSVC OR GPU_LANG STREQUAL "SYCL")) - target_link_options({{ ops_name }} PRIVATE -static-libstdc++) + target_link_options({{ ops_name }} PRIVATE -static-libstdc++) endif() if(GPU_LANG STREQUAL "SYCL") - target_link_options({{ ops_name }} PRIVATE ${sycl_link_flags}) - target_link_libraries({{ ops_name }} PRIVATE dnnl) + target_link_options({{ ops_name }} PRIVATE ${sycl_link_flags}) + target_link_libraries({{ ops_name }} PRIVATE dnnl) endif() # Compile Metal shaders if any were found if(GPU_LANG STREQUAL "METAL") - if(ALL_METAL_SOURCES) - compile_metal_shaders({{ ops_name }} "${ALL_METAL_SOURCES}" "${METAL_INCLUDE_DIRS}") - endif() + if(ALL_METAL_SOURCES) + compile_metal_shaders({{ ops_name }} "${ALL_METAL_SOURCES}" "${METAL_INCLUDE_DIRS}") + endif() endif() -{% if platform == 'windows' %} # Add kernels_install target for huggingface/kernels library layout -add_kernels_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}") +add_kernels_install_target({{ ops_name }} "{{ python_name }}" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "{{ data_extensions | join(';') }}") # Add local_install target for local development with get_local_kernel() -add_local_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}") -{% endif %} +add_local_install_target({{ ops_name }} "{{ python_name }}" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "{{ data_extensions | join(';') }}") diff --git a/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake b/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake index a034f8f8..f905e173 100644 --- a/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake +++ b/build2cmake/src/templates/xpu/dep-cutlass-sycl.cmake @@ -1,3 +1,5 @@ +if(GPU_LANG STREQUAL "SYCL") + find_package(CutlassSycl) if(DPCPP_VERSION STREQUAL "2025.3") @@ -78,3 +80,4 @@ if(DPCPP_VERSION STREQUAL "2025.2" OR DPCPP_VERSION STREQUAL "2025.3" OR CUTLASS endif() string(REPLACE "-fsycl-targets=spir64_gen,spir64" "-fsycl-targets=spir64" sycl_flags "${sycl_flags}") +endif(GPU_LANG STREQUAL "SYCL") diff --git a/build2cmake/src/torch/common.rs b/build2cmake/src/torch/common.rs index 6d07d1d7..8203ab13 100644 --- a/build2cmake/src/torch/common.rs +++ b/build2cmake/src/torch/common.rs @@ -12,26 +12,29 @@ use crate::torch::kernel::render_kernel_components; use crate::version::Version; use crate::FileSet; -static REGISTRATION_H: &str = include_str!("../templates/registration.h"); -static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake"); +static BUILD_VARIANTS_UTILS: &str = include_str!("../templates/build-variants.cmake"); static CMAKE_KERNEL: &str = include_str!("../templates/kernel.cmake"); -static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake"); -static HIPIFY: &str = include_str!("../templates/cuda/hipify.py"); +static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake"); +static COMPAT_PY: &str = include_str!("../templates/compat.py"); static COMPILE_METAL_CMAKE: &str = 
include_str!("../templates/metal/compile-metal.cmake"); -static METALLIB_TO_HEADER_PY: &str = include_str!("../templates/metal/metallib_to_header.py"); static GET_GPU_LANG: &str = include_str!("../templates/get_gpu_lang.cmake"); static GET_GPU_LANG_PY: &str = include_str!("../templates/get_gpu_lang.py"); +static HIPIFY: &str = include_str!("../templates/cuda/hipify.py"); +static METALLIB_TO_HEADER_PY: &str = include_str!("../templates/metal/metallib_to_header.py"); +static REGISTRATION_H: &str = include_str!("../templates/registration.h"); pub fn write_setup_py( env: &Environment, + general: &General, torch: &crate::config::Torch, - name: &str, ops_name: &str, file_set: &mut FileSet, ) -> Result<()> { let writer = file_set.entry("setup.py"); - let data_globs = torch.data_globs().map(|globs| globs.join(", ")); + let data_globs = torch + .data_extensions() + .map(|exts| exts.iter().map(|ext| format!("\"**/*.{ext}\"")).join(", ")); env.get_template("setup.py") .wrap_err("Cannot get setup.py template")? @@ -39,7 +42,7 @@ write_setup_py( context! { data_globs => data_globs, ops_name => ops_name, - name => name, + python_name => general.python_name(), version => "0.1.0", }, writer, ) @@ -49,26 +52,45 @@ write_setup_py( Ok(()) } +pub fn write_compat_py(file_set: &mut FileSet) -> Result<()> { + let mut path = PathBuf::new(); + path.push("compat.py"); + file_set.entry(path).extend_from_slice(COMPAT_PY.as_bytes()); + + Ok(()) +} + pub fn write_pyproject_toml( env: &Environment, - backend: Backend, general: &General, file_set: &mut FileSet, ) -> Result<()> { let writer = file_set.entry("pyproject.toml"); - let python_dependencies = itertools::process_results( - general - .python_depends() - .chain(general.backend_python_depends(backend)), - |iter| iter.map(|d| format!("\"{d}\"")).join(", "), - )?; + // Common python dependencies (no backend-specific ones) + let python_dependencies = itertools::process_results(general.python_depends(), |iter| { + iter.map(|d| format!("\"{d}\"")).join(", ") + })?; + + // Collect backend-specific dependencies for all backends + let mut backend_dependencies = Vec::new(); + for backend in &Backend::all() { + let deps = itertools::process_results(general.backend_python_depends(*backend), |iter| { + iter.map(|d| format!("\"{d}\"")).collect::<Vec<_>>() + })?; + + if !deps.is_empty() { + backend_dependencies.push((backend.to_string(), deps)); + } + } env.get_template("pyproject.toml") .wrap_err("Cannot get pyproject.toml template")? .render_to_write( context!
{ + python_name => general.python_name(), python_dependencies => python_dependencies, + backend_dependencies => backend_dependencies, }, writer, ) @@ -77,21 +99,23 @@ Ok(()) } -pub fn write_metadata(backend: Backend, general: &General, file_set: &mut FileSet) -> Result<()> { - let writer = file_set.entry("metadata.json"); +pub fn write_metadata(general: &General, file_set: &mut FileSet) -> Result<()> { + for backend in &Backend::all() { + let writer = file_set.entry(format!("metadata-{}.json", backend)); - let python_depends = general - .python_depends() - .chain(general.backend_python_depends(backend)) - .collect::<Result<Vec<_>>>()?; + let python_depends = general + .python_depends() + .chain(general.backend_python_depends(*backend)) + .collect::<Result<Vec<_>>>()?; - let metadata = Metadata { - version: general.version, - license: general.license.clone(), - python_depends, - }; + let metadata = Metadata { + version: general.version, + license: general.license.clone(), + python_depends, + }; - serde_json::to_writer_pretty(writer, &metadata)?; + serde_json::to_writer_pretty(writer, &metadata)?; + } Ok(()) } @@ -180,7 +204,11 @@ pub fn write_cmake_file(file_set: &mut FileSet, filename: &str, content: &[u8]) pub fn write_cmake_helpers(file_set: &mut FileSet) { write_cmake_file(file_set, "utils.cmake", CMAKE_UTILS.as_bytes()); write_cmake_file(file_set, "kernel.cmake", CMAKE_KERNEL.as_bytes()); - write_cmake_file(file_set, "windows.cmake", WINDOWS_UTILS.as_bytes()); + write_cmake_file( + file_set, + "build-variants.cmake", + BUILD_VARIANTS_UTILS.as_bytes(), + ); write_cmake_file(file_set, "hipify.py", HIPIFY.as_bytes()); write_cmake_file( file_set, @@ -198,7 +226,8 @@ pub fn write_cmake_helpers(file_set: &mut FileSet) { pub fn render_extension( env: &Environment, - name: &str, + general: &General, + torch: &Torch, ops_name: &str, write: &mut impl Write, ) -> Result<()> { .wrap_err("Cannot get Torch extension template")? .render_to_write( context!
{ - name => name, + python_name => general.python_name(), ops_name => ops_name, - platform => std::env::consts::OS, + data_extensions => torch.data_extensions(), }, &mut *write, ) @@ -237,7 +266,6 @@ pub fn render_preamble( cuda_maxver => cuda_maxver.map(|v| v.to_string()), torch_minver => torch_minver.map(|v| v.to_string()), torch_maxver => torch_maxver.map(|v| v.to_string()), - platform => std::env::consts::OS }, &mut *write, ) @@ -250,7 +278,6 @@ pub fn write_cmake( env: &Environment, - backend: Backend, build: &Build, torch: &Torch, name: &str, @@ -261,38 +288,29 @@ let cmake_writer = file_set.entry("CMakeLists.txt"); - let (cuda_minver, cuda_maxver) = match backend { - Backend::Cuda => ( - build.general.cuda.as_ref().and_then(|c| c.minver.as_ref()), - build.general.cuda.as_ref().and_then(|c| c.maxver.as_ref()), - ), - _ => (None, None), - }; - render_preamble( env, name, - cuda_minver, - cuda_maxver, + build.general.cuda.as_ref().and_then(|c| c.minver.as_ref()), + build.general.cuda.as_ref().and_then(|c| c.maxver.as_ref()), torch.minver.as_ref(), torch.maxver.as_ref(), cmake_writer, )?; - render_deps(env, backend, build, cmake_writer)?; + render_deps(env, build, cmake_writer)?; render_binding(env, torch, name, cmake_writer)?; render_kernel_components(env, build, cmake_writer)?; - render_extension(env, name, ops_name, cmake_writer)?; + render_extension(env, &build.general, torch, ops_name, cmake_writer)?; Ok(()) } pub fn write_torch_ext( env: &Environment, - backend: Backend, build: &Build, target_dir: PathBuf, ops_id: Option<String>, @@ -312,7 +330,6 @@ pub fn write_torch_ext( write_cmake( env, - backend, build, torch_ext, &build.general.name, &ops_name, &mut file_set, )?; - write_setup_py( - env, - torch_ext, - &build.general.name, - &ops_name, - &mut file_set, - )?; + write_setup_py(env, &build.general, torch_ext, &ops_name, &mut file_set)?; + + write_compat_py(&mut file_set)?; write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?; - write_pyproject_toml(env, backend, &build.general, &mut file_set)?; + write_pyproject_toml(env, &build.general, &mut file_set)?; write_torch_registration_macros(&mut file_set)?; - write_metadata(backend, &build.general, &mut file_set)?; + write_metadata(&build.general, &mut file_set)?; Ok(file_set) } diff --git a/build2cmake/src/torch/deps.rs b/build2cmake/src/torch/deps.rs index 5a333460..fcb2e8b7 100644 --- a/build2cmake/src/torch/deps.rs +++ b/build2cmake/src/torch/deps.rs @@ -4,22 +4,13 @@ use std::io::Write; use eyre::{Context, Result}; use minijinja::{context, Environment}; -use crate::config::{Backend, Build, Dependency}; +use crate::config::{Build, Dependency}; -pub fn render_deps( - env: &Environment, - backend: Backend, - build: &Build, - write: &mut impl Write, -) -> Result<()> { +pub fn render_deps(env: &Environment, build: &Build, write: &mut impl Write) -> Result<()> { + // Collect all dependencies. let mut deps = HashSet::new(); - - for kernel in build - .kernels - .values() - .filter(|kernel| kernel.backend() == backend) - { - deps.extend(kernel.depends()); + for kernel in build.kernels.values() { + deps.extend(kernel.depends()) } for dep in deps { @@ -94,10 +85,10 @@ env.get_template("xpu/dep-cutlass-sycl.cmake")? .render_to_write(context!
{}, &mut *write)?; } - Dependency::Torch => (), - _ => { - eprintln!("Warning: {backend:?} backend doesn't need/support dependency: {dep:?}"); + Dependency::MetalCpp => { + // TODO: add CMake dependency. } + Dependency::Torch => (), } write.write_all(b"\n")?; } diff --git a/build2cmake/src/torch/noarch.rs b/build2cmake/src/torch/noarch.rs index 85caf080..2e1327c4 100644 --- a/build2cmake/src/torch/noarch.rs +++ b/build2cmake/src/torch/noarch.rs @@ -7,12 +7,14 @@ use minijinja::{context, Environment}; use crate::{ config::{Backend, Build, General, Torch}, fileset::FileSet, - torch::{common::write_metadata, kernel_ops_identifier}, + torch::{ + common::{write_compat_py, write_metadata}, + kernel_ops_identifier, + }, }; pub fn write_torch_ext_noarch( env: &Environment, - backend: Backend, build: &Build, target_dir: PathBuf, ops_id: Option<String>, @@ -21,16 +23,10 @@ let ops_name = kernel_ops_identifier(&target_dir, &build.general.python_name(), ops_id); + write_compat_py(&mut file_set)?; write_ops_py(env, &build.general.python_name(), &ops_name, &mut file_set)?; - write_pyproject_toml( - env, - backend, - build.torch.as_ref(), - &build.general, - &mut file_set, - )?; - - write_metadata(backend, &build.general, &mut file_set)?; + write_pyproject_toml(env, build.torch.as_ref(), &build.general, &mut file_set)?; + write_metadata(&build.general, &mut file_set)?; Ok(file_set) } @@ -62,7 +58,6 @@ fn write_ops_py( fn write_pyproject_toml( env: &Environment, - backend: Backend, torch: Option<&Torch>, general: &General, file_set: &mut FileSet, @@ -70,13 +65,28 @@ let writer = file_set.entry("pyproject.toml"); let name = &general.name; - let data_globs = torch.and_then(|torch| torch.data_globs().map(|globs| globs.join(", "))); - let python_dependencies = itertools::process_results( - general - .python_depends() - .chain(general.backend_python_depends(backend)), - |iter| iter.map(|d| format!("\"{d}\"")).join(", "), - )?; + let data_globs = torch.and_then(|torch| { + torch + .data_extensions() + .map(|exts| exts.iter().map(|ext| format!("\"**/*.{ext}\"")).join(", ")) + }); + + // Common python dependencies (no backend-specific ones) + let python_dependencies = itertools::process_results(general.python_depends(), |iter| { + iter.map(|d| format!("\"{d}\"")).join(", ") + })?; + + // Collect backend-specific dependencies for all backends + let mut backend_dependencies = Vec::new(); + for backend in &Backend::all() { + let deps = itertools::process_results(general.backend_python_depends(*backend), |iter| { + iter.map(|d| format!("\"{d}\"")).collect::<Vec<_>>() + })?; + + if !deps.is_empty() { + backend_dependencies.push((backend.to_string(), deps)); + } + } env.get_template("noarch/pyproject.toml") .wrap_err("Cannot get noarch pyproject.toml template")? .render_to_write( context!
{ data_globs => data_globs, python_dependencies => python_dependencies, + backend_dependencies => backend_dependencies, name => name, }, writer, diff --git a/builder/examples/extra-data/build.toml b/builder/examples/extra-data/build.toml new file mode 100644 index 00000000..391b3043 --- /dev/null +++ b/builder/examples/extra-data/build.toml @@ -0,0 +1,56 @@ +[general] +name = "extra-data" +backends = [ + "cpu", + "cuda", + "metal", + "rocm", + "xpu", +] + +[torch] +src = [ + "torch-ext/torch_binding.cpp", + "torch-ext/torch_binding.h", +] +pyext = ["json", "py"] + +[kernel.relu] +backend = "cuda" +depends = ["torch"] +src = ["relu_cuda/relu.cu"] + +[kernel.relu_metal] +backend = "metal" +src = [ + "relu_metal/relu.mm", + "relu_metal/relu.metal", + "relu_metal/common.h", +] +depends = [ "torch" ] + +[kernel.relu_rocm] +backend = "rocm" +rocm-archs = [ + "gfx906", + "gfx908", + "gfx90a", + "gfx940", + "gfx941", + "gfx942", + "gfx1030", + "gfx1100", + "gfx1101", +] +depends = ["torch"] +src = ["relu_cuda/relu.cu"] + +[kernel.relu_xpu] +backend = "xpu" +depends = ["torch"] +src = ["relu_xpu/relu.cpp"] + +[kernel.relu_cpu] +backend = "cpu" +depends = ["torch"] +src = ["relu_cpu/relu_cpu.cpp"] diff --git a/builder/examples/extra-data/flake.nix b/builder/examples/extra-data/flake.nix new file mode 100644 index 00000000..41e51eef --- /dev/null +++ b/builder/examples/extra-data/flake.nix @@ -0,0 +1,17 @@ +{ + description = "Flake for extra-data kernel"; + + inputs = { + kernel-builder.url = "path:../../.."; + }; + + outputs = + { + self, + kernel-builder, + }: + kernel-builder.lib.genKernelFlakeOutputs { + inherit self; + path = ./.; + }; +} diff --git a/builder/examples/extra-data/relu_cpu/relu_cpu.cpp b/builder/examples/extra-data/relu_cpu/relu_cpu.cpp new file mode 100644 index 00000000..6197a9f1 --- /dev/null +++ b/builder/examples/extra-data/relu_cpu/relu_cpu.cpp @@ -0,0 +1,56 @@ +#include <torch/all.h> + +#ifdef __SSE__ +#include <xmmintrin.h> +#endif + +#ifdef __ARM_NEON +#include <arm_neon.h> +#endif + +#ifdef __SSE__ +void relu_forward_sse(float* out, const float* input, size_t size) { + size_t i = 0; + + for (; i + 4 <= size; i += 4) { + __m128 vec_input = _mm_load_ps(input + i); + __m128 vec_zero = _mm_setzero_ps(); + __m128 vec_output = _mm_max_ps(vec_input, vec_zero); + _mm_store_ps(out + i, vec_output); + } + + for (; i < size; ++i) { + out[i] = input[i] > 0 ? input[i] : 0; + } +} +#endif + +#ifdef __ARM_NEON +void relu_forward_neon(float* out, const float* input, size_t size) { + size_t i = 0; + + for (; i + 4 <= size; i += 4) { + float32x4_t vec_input = vld1q_f32(input + i); + float32x4_t vec_output = vmaxq_f32(vec_input, vdupq_n_f32(0)); + vst1q_f32(out + i, vec_output); + } + + for (; i < size; ++i) { + out[i] = input[i] > 0 ? input[i] : 0; + } +} +#endif + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(out.dtype() == torch::kFloat32, "Output tensor must be of dtype float"); + TORCH_CHECK(input.dtype() == torch::kFloat32, "Input tensor must be of dtype float"); + TORCH_CHECK(out.numel() == input.numel(), "Input and output tensors must have the same number of elements"); + +#if defined(__SSE__) + relu_forward_sse(out.data_ptr<float>(), input.data_ptr<float>(), input.numel()); +#elif defined(__ARM_NEON) + relu_forward_neon(out.data_ptr<float>(), input.data_ptr<float>(), input.numel()); +#else + #error "Unsupported architecture; please use a CPU with SSE or ARM NEON support."
#endif +} diff --git a/builder/examples/extra-data/relu_cuda/relu.cu b/builder/examples/extra-data/relu_cuda/relu.cu new file mode 100644 index 00000000..6bbe3160 --- /dev/null +++ b/builder/examples/extra-data/relu_cuda/relu.cu @@ -0,0 +1,43 @@ +#include <ATen/cuda/CUDAContext.h> +#include <c10/cuda/CUDAGuard.h> +#include <torch/all.h> + +#include <algorithm> + +__global__ void relu_kernel(float *__restrict__ out, + float const *__restrict__ input, const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + auto x = input[token_idx * d + idx]; + out[token_idx * d + idx] = x > 0.0f ? x : 0.0f; + } +} + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float && + out.scalar_type() == at::ScalarType::Float, + "relu_kernel only supports float32"); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device. Got input device: ", + input.device(), " and output device: ", out.device()); + + int d = input.size(-1); + int64_t num_tokens = input.numel() / d; + dim3 grid(num_tokens); + dim3 block(std::min(d, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(), + input.data_ptr<float>(), d); +} diff --git a/builder/examples/extra-data/relu_metal/common.h b/builder/examples/extra-data/relu_metal/common.h new file mode 100644 index 00000000..1b891fad --- /dev/null +++ b/builder/examples/extra-data/relu_metal/common.h @@ -0,0 +1,10 @@ +#ifndef COMMON_H +#define COMMON_H + +#include <metal_stdlib> +using namespace metal; + +// Common constants and utilities for Metal kernels +constant float RELU_THRESHOLD = 0.0f; + +#endif // COMMON_H \ No newline at end of file diff --git a/builder/examples/extra-data/relu_metal/relu.metal b/builder/examples/extra-data/relu_metal/relu.metal new file mode 100644 index 00000000..286b46fe --- /dev/null +++ b/builder/examples/extra-data/relu_metal/relu.metal @@ -0,0 +1,17 @@ +#include <metal_stdlib> +#include "common.h" +using namespace metal; + +kernel void relu_forward_kernel_float(device const float *inA [[buffer(0)]], + device float *outC [[buffer(1)]], + uint index [[thread_position_in_grid]]) { + // Explicitly write to output + outC[index] = max(RELU_THRESHOLD, inA[index]); +} + +kernel void relu_forward_kernel_half(device const half *inA [[buffer(0)]], + device half *outC [[buffer(1)]], + uint index [[thread_position_in_grid]]) { + // Explicitly write to output + outC[index] = max(static_cast<half>(0.0), inA[index]); +} \ No newline at end of file diff --git a/builder/examples/extra-data/relu_metal/relu.mm b/builder/examples/extra-data/relu_metal/relu.mm new file mode 100644 index 00000000..7636737b --- /dev/null +++ b/builder/examples/extra-data/relu_metal/relu.mm @@ -0,0 +1,105 @@ +#include <torch/torch.h> + +#import <Foundation/Foundation.h> +#import <Metal/Metal.h> + +// Include the auto-generated header with embedded metallib +#ifdef EMBEDDED_METALLIB_HEADER +#include EMBEDDED_METALLIB_HEADER +#else +#error "EMBEDDED_METALLIB_HEADER not defined" +#endif + +static inline id<MTLBuffer>
getMTLBufferStorage(const torch::Tensor &tensor) { + return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data()); +} + + +torch::Tensor &dispatchReluKernel(torch::Tensor const &input, + torch::Tensor &output) { + @autoreleasepool { + id<MTLDevice> device = MTLCreateSystemDefaultDevice(); + + int numThreads = input.numel(); + + // Load the embedded Metal library from memory + NSError *error = nil; + id<MTLLibrary> customKernelLibrary = EMBEDDED_METALLIB_NAMESPACE::createLibrary(device, &error); + TORCH_CHECK(customKernelLibrary, + "Failed to create Metal library from embedded data: ", + error.localizedDescription.UTF8String); + + std::string kernel_name = + std::string("relu_forward_kernel_") + + (input.scalar_type() == torch::kFloat ? "float" : "half"); + id<MTLFunction> customReluFunction = [customKernelLibrary + newFunctionWithName:[NSString + stringWithUTF8String:kernel_name.c_str()]]; + TORCH_CHECK(customReluFunction, + "Failed to create function state object for ", + kernel_name.c_str()); + + id<MTLComputePipelineState> reluPSO = + [device newComputePipelineStateWithFunction:customReluFunction + error:&error]; + TORCH_CHECK(reluPSO, error.localizedDescription.UTF8String); + + id<MTLCommandBuffer> commandBuffer = torch::mps::get_command_buffer(); + TORCH_CHECK(commandBuffer, "Failed to retrieve command buffer reference"); + + dispatch_queue_t serialQueue = torch::mps::get_dispatch_queue(); + + dispatch_sync(serialQueue, ^() { + id<MTLComputeCommandEncoder> computeEncoder = + [commandBuffer computeCommandEncoder]; + TORCH_CHECK(computeEncoder, "Failed to create compute command encoder"); + + [computeEncoder setComputePipelineState:reluPSO]; + [computeEncoder setBuffer:getMTLBufferStorage(input) + offset:input.storage_offset() * input.element_size() + atIndex:0]; + [computeEncoder setBuffer:getMTLBufferStorage(output) + offset:output.storage_offset() * output.element_size() + atIndex:1]; + + MTLSize gridSize = MTLSizeMake(numThreads, 1, 1); + + NSUInteger threadGroupSize = reluPSO.maxTotalThreadsPerThreadgroup; + if (threadGroupSize > numThreads) { + threadGroupSize = numThreads; + } + MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1); + + [computeEncoder dispatchThreads:gridSize + threadsPerThreadgroup:threadgroupSize]; + + [computeEncoder endEncoding]; + + torch::mps::commit(); + }); + } + + return output; +} + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(input.device().is_mps(), "input must be a MPS tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == torch::kFloat || + input.scalar_type() == torch::kHalf, + "Unsupported data type: ", input.scalar_type()); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device.
Got input device: ", + input.device(), " and output device: ", out.device()); + + dispatchReluKernel(input, out); +} diff --git a/builder/examples/extra-data/relu_xpu/relu.cpp b/builder/examples/extra-data/relu_xpu/relu.cpp new file mode 100644 index 00000000..1809de08 --- /dev/null +++ b/builder/examples/extra-data/relu_xpu/relu.cpp @@ -0,0 +1,40 @@ +#include <torch/all.h> +#include <sycl/sycl.hpp> + +using namespace sycl; + +void relu_xpu_impl(torch::Tensor& output, const torch::Tensor& input) { + // Create SYCL queue directly + sycl::queue queue; + + auto input_ptr = input.data_ptr<float>(); + auto output_ptr = output.data_ptr<float>(); + auto numel = input.numel(); + + // Launch SYCL kernel + queue.parallel_for(range<1>(numel), [=](id<1> idx) { + auto i = idx[0]; + output_ptr[i] = input_ptr[i] > 0.0f ? input_ptr[i] : 0.0f; + }).wait(); +} + +void relu(torch::Tensor& out, const torch::Tensor& input) { + TORCH_CHECK(input.device().is_xpu(), "input must be a XPU tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == torch::kFloat, + "Unsupported data type: ", input.scalar_type()); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device. Got input device: ", + input.device(), " and output device: ", out.device()); + + relu_xpu_impl(out, input); +} diff --git a/builder/examples/extra-data/tests/__init__.py b/builder/examples/extra-data/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/builder/examples/extra-data/tests/test_relu.py b/builder/examples/extra-data/tests/test_relu.py new file mode 100644 index 00000000..d4426a0c --- /dev/null +++ b/builder/examples/extra-data/tests/test_relu.py @@ -0,0 +1,37 @@ +import platform + +import torch +import torch.nn.functional as F + +import extra_data + + +def test_relu(): + if platform.system() == "Darwin": + device = torch.device("mps") + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + elif torch.version.cuda is not None and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + x = torch.randn(1024, 1024, dtype=torch.float32, device=device) + torch.testing.assert_allclose(F.relu(x), extra_data.relu(x)) + + +def test_relu_layer(): + if platform.system() == "Darwin": + device = torch.device("mps") + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + elif torch.version.cuda is not None and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + x = torch.randn(1024, 1024, dtype=torch.float32, device=device) + layer = extra_data.layers.ReLU() + torch.testing.assert_allclose(F.relu(x), layer(x)) + + +def test_data(): + assert extra_data.EASTER_EGG == 42 diff --git a/builder/examples/extra-data/torch-ext/extra_data/__init__.py b/builder/examples/extra-data/torch-ext/extra_data/__init__.py new file mode 100644 index 00000000..684f3558 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/extra_data/__init__.py @@ -0,0 +1,32 @@ +import json +from pathlib import Path +from typing import Optional + +import torch + +from ._ops import ops + +from .
import layers + + +# This is the regular ReLU, but this example also shows how to embed some +# non-Python data. This can be used for e.g. Triton tuning data. + + +def _read_json() -> int: + json_path = Path(__file__).parent / "data.json" + with open(json_path, "r") as f: + return json.load(f) + + +EASTER_EGG = _read_json() + + +def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + if out is None: + out = torch.empty_like(x) + ops.relu(out, x) + return out + + +__all__ = ["EASTER_EGG", "relu", "layers"] diff --git a/builder/examples/extra-data/torch-ext/extra_data/data.json b/builder/examples/extra-data/torch-ext/extra_data/data.json new file mode 100644 index 00000000..d81cc071 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/extra_data/data.json @@ -0,0 +1 @@ +42 diff --git a/builder/examples/extra-data/torch-ext/extra_data/layers/__init__.py b/builder/examples/extra-data/torch-ext/extra_data/layers/__init__.py new file mode 100644 index 00000000..6105a191 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/extra_data/layers/__init__.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +from .._ops import ops + + +class ReLU(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.relu(out, x) + return out diff --git a/builder/examples/extra-data/torch-ext/torch_binding.cpp b/builder/examples/extra-data/torch-ext/torch_binding.cpp new file mode 100644 index 00000000..1765d92d --- /dev/null +++ b/builder/examples/extra-data/torch-ext/torch_binding.cpp @@ -0,0 +1,19 @@ +#include <torch/library.h> + +#include "registration.h" +#include "torch_binding.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + ops.def("relu(Tensor! out, Tensor input) -> ()"); +#if defined(CPU_KERNEL) + ops.impl("relu", torch::kCPU, &relu); +#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL) + ops.impl("relu", torch::kCUDA, &relu); +#elif defined(METAL_KERNEL) + ops.impl("relu", torch::kMPS, relu); +#elif defined(XPU_KERNEL) + ops.impl("relu", torch::kXPU, &relu); +#endif +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/builder/examples/extra-data/torch-ext/torch_binding.h b/builder/examples/extra-data/torch-ext/torch_binding.h new file mode 100644 index 00000000..3bcf2904 --- /dev/null +++ b/builder/examples/extra-data/torch-ext/torch_binding.h @@ -0,0 +1,5 @@ +#pragma once + +#include <torch/torch.h> + +void relu(torch::Tensor &out, torch::Tensor const &input); \ No newline at end of file diff --git a/builder/examples/relu/tests/test_relu.py b/builder/examples/relu/tests/test_relu.py index 65544aa4..7c2a495b 100644 --- a/builder/examples/relu/tests/test_relu.py +++ b/builder/examples/relu/tests/test_relu.py @@ -17,3 +17,17 @@ def test_relu(): device = torch.device("cpu") x = torch.randn(1024, 1024, dtype=torch.float32, device=device) torch.testing.assert_allclose(F.relu(x), relu.relu(x)) + + +def test_relu_layer(): + if platform.system() == "Darwin": + device = torch.device("mps") + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + elif torch.version.cuda is not None and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + x = torch.randn(1024, 1024, dtype=torch.float32, device=device) + layer = relu.layers.ReLU() + torch.testing.assert_allclose(F.relu(x), layer(x)) diff --git a/builder/examples/relu/torch-ext/relu/__init__.py b/builder/examples/relu/torch-ext/relu/__init__.py index 8050dfd7..6890c993 ---
a/builder/examples/relu/torch-ext/relu/__init__.py +++ b/builder/examples/relu/torch-ext/relu/__init__.py @@ -4,9 +4,15 @@ from ._ops import ops +from . import layers + def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: if out is None: out = torch.empty_like(x) ops.relu(out, x) - return out \ No newline at end of file + return out + + +__all__ = ["relu", "layers"] + diff --git a/builder/examples/relu/torch-ext/relu/layers/__init__.py b/builder/examples/relu/torch-ext/relu/layers/__init__.py new file mode 100644 index 00000000..6105a191 --- /dev/null +++ b/builder/examples/relu/torch-ext/relu/layers/__init__.py @@ -0,0 +1,11 @@ +import torch +import torch.nn as nn + +from .._ops import ops + + +class ReLU(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.relu(out, x) + return out diff --git a/builder/lib/build.nix b/builder/lib/build.nix index 63c2f182..c187aaf5 100644 --- a/builder/lib/build.nix +++ b/builder/lib/build.nix @@ -217,22 +217,20 @@ rec { bundleOnly = true; }; buildToml = readBuildConfig path; - namePaths = - # TODO: treat kernels without compiled parts differently. - lib.mapAttrs (name: pkg: toString pkg) extensions; - - # Include benchmarks directory if it exists in the source benchmarksPath = path + "/benchmarks"; hasBenchmarks = builtins.pathExists benchmarksPath; - allPaths = - namePaths - // lib.optionalAttrs hasBenchmarks { - benchmarks = benchmarksPath; + benchmarks = + with lib.fileset; + toSource { + root = path; + fileset = maybeMissing benchmarksPath; }; + contents = + builtins.map (pkg: toString pkg) (builtins.attrValues extensions) + ++ lib.optionals hasBenchmarks [ (toString benchmarks) ]; in import ./join-paths { - inherit pkgs; - namePaths = allPaths; + inherit pkgs contents; name = "torch-ext-bundle"; }; @@ -276,8 +274,11 @@ rec { )) ]; shellHook = '' - export PYTHONPATH=''${PYTHONPATH}:${extension} + # This is run from `nix develop`, which provides the existing + # environment. We clear the LD_LIBRARY_PATH and PYTHONPATH to + # make testing as pure as possible. unset LD_LIBRARY_PATH + export PYTHONPATH=${extension}/${buildSet.torch.variant} ''; }; }; diff --git a/builder/lib/join-paths/default.nix b/builder/lib/join-paths/default.nix index a339a4a5..3fcad0d9 100644 --- a/builder/lib/join-paths/default.nix +++ b/builder/lib/join-paths/default.nix @@ -3,8 +3,8 @@ args@{ name, - # Attribute set with names to paths. - namePaths, + # Package paths. + contents, preferLocalBuild ? true, allowSubstitutes ? false, @@ -16,16 +16,11 @@ let "pkgs" "namePaths" ]; - # Iterating over pairs in bash sucks, so let's generate - # the commands in Nix instead. - copyPath = path: pkg: '' - mkdir -p ${placeholder "out"}/${path} - cp -r ${pkg}/* ${placeholder "out"}/${path} + copyPkg = pkg: '' + cp -r ${pkg}/* ${placeholder "out"}/ ''; prelude = '' mkdir -p ${placeholder "out"} ''; in -pkgs.runCommand name args_ ( - prelude + lib.concatStringsSep "\n" (lib.mapAttrsToList copyPath namePaths) -) +pkgs.runCommand name args_ (prelude + lib.concatStringsSep "\n" (builtins.map copyPkg contents)) diff --git a/builder/lib/torch-extension/arch.nix b/builder/lib/torch-extension/arch.nix index b78909de..769d33de 100644 --- a/builder/lib/torch-extension/arch.nix +++ b/builder/lib/torch-extension/arch.nix @@ -108,7 +108,6 @@ stdenv.mkDerivation (prevAttrs: { # Generate build files. 
postPatch = '' build2cmake generate-torch \ - --backend ${buildConfig.backend} \ --ops-id ${rev} build.toml ''; @@ -237,33 +236,25 @@ stdenv.mkDerivation (prevAttrs: { #(lib.cmakeFeature "METAL_COMPILER" "${xcrunHost}/bin/xcrunHost") ]; - postInstall = '' - ( - cd .. - cp -r torch-ext/${moduleName}/* $out/ - ) - mv $out/_${moduleName}_*/* $out/ - rm -d $out/_${moduleName}_${rev} - - # Set up a compatibility module for older kernels versions, remove when - # the updated kernels has been around for a while. - mkdir $out/${moduleName} - cp ${./compat.py} $out/${moduleName}/__init__.py - - cp ../metadata.json $out/ - '' - + (lib.optionalString (stripRPath && stdenv.hostPlatform.isLinux)) '' - find $out/ -name '*.so' \ - -exec patchelf --set-rpath "" {} \; - '' - + (lib.optionalString (stripRPath && stdenv.hostPlatform.isDarwin)) '' - find $out/ -name '*.so' \ - -exec rewrite-nix-paths-macho {} \; - - # Stub some rpath. - find $out/ -name '*.so' \ - -exec install_name_tool -add_rpath "@loader_path/lib" {} \; - ''; + postInstall = + let + buildVariant = torch.variant; + in + '' + rm -rf $out/_${moduleName}_${rev} + '' + + (lib.optionalString (stripRPath && stdenv.hostPlatform.isLinux)) '' + find $out/ -name '*.so' \ + -exec patchelf --set-rpath "" {} \; + '' + + (lib.optionalString (stripRPath && stdenv.hostPlatform.isDarwin)) '' + find $out/ -name '*.so' \ + -exec rewrite-nix-paths-macho {} \; + + # Stub some rpath. + find $out/ -name '*.so' \ + -exec install_name_tool -add_rpath "@loader_path/lib" {} \; + ''; doInstallCheck = true; diff --git a/builder/lib/torch-extension/no-arch.nix b/builder/lib/torch-extension/no-arch.nix index 83ebf7b6..2ef038dc 100644 --- a/builder/lib/torch-extension/no-arch.nix +++ b/builder/lib/torch-extension/no-arch.nix @@ -81,17 +81,19 @@ stdenv.mkDerivation (prevAttrs: { # we run it anyway. 
postPatch = '' build2cmake generate-torch \ - --backend ${buildConfig.backend} \ --ops-id ${rev} build.toml ''; - installPhase = '' - mkdir -p $out - cp -r torch-ext/${moduleName}/* $out/ - mkdir $out/${moduleName} - cp ${./compat.py} $out/${moduleName}/__init__.py - cp metadata.json $out/ - ''; + installPhase = + let + noarchVariant = torch.noarchVariant; + in + '' + mkdir -p $out/${noarchVariant}/${moduleName} + cp -r torch-ext/${moduleName}/* $out/${noarchVariant} + cp compat.py $out/${noarchVariant}/${moduleName}/__init__.py + cp metadata-${buildConfig.backend}.json $out/${noarchVariant} + ''; doInstallCheck = true; diff --git a/builder/scripts/windows/builder.ps1 b/builder/scripts/windows/builder.ps1 index 311f978e..69f14c2c 100644 --- a/builder/scripts/windows/builder.ps1 +++ b/builder/scripts/windows/builder.ps1 @@ -483,7 +483,6 @@ function Invoke-Backend { if ($Target) { $kwargs += $Target } if ($Options.Force) { $kwargs += '--force' } if ($Options.OpsId) { $kwargs += '--ops-id', $Options.OpsId } - if ($Backend -and $Backend -ne 'universal') { $kwargs += '--backend', $Backend } Invoke-Build2Cmake -Build2CmakeExe $Build2CmakeExe -Arguments $kwargs } diff --git a/builder/tests/Dockerfile.test-kernel b/builder/tests/Dockerfile.test-kernel index 2c3ab50a..f6b852b8 100644 --- a/builder/tests/Dockerfile.test-kernel +++ b/builder/tests/Dockerfile.test-kernel @@ -67,6 +67,7 @@ COPY relu-kernel ./relu-kernel COPY relu-kernel-cpu ./relu-kernel-cpu COPY cutlass-gemm-kernel ./cutlass-gemm-kernel COPY silu-and-mul-kernel ./silu-and-mul-kernel +COPY builder/examples/extra-data/tests ./extra_data_tests COPY builder/examples/relu/tests ./relu_tests COPY builder/examples/cutlass-gemm/tests ./cutlass_gemm_tests diff --git a/builder/tests/run-tests.sh b/builder/tests/run-tests.sh index 3498a8c1..a776b89f 100644 --- a/builder/tests/run-tests.sh +++ b/builder/tests/run-tests.sh @@ -1,12 +1,19 @@ #!/bin/bash -PYTHONPATH="relu-kernel:cutlass-gemm-kernel:$PYTHONPATH" \ - .venv/bin/pytest relu_tests cutlass_gemm_tests +# Expand to build variant directories. +EXTRA_DATA_PATH=$(echo extra-data/torch*) +RELU_PATH=$(echo relu-kernel/torch*) +CUTLASS_PATH=$(echo cutlass-gemm-kernel/torch*) +SILU_MUL_PATH=$(echo silu-and-mul-kernel/torch*) +RELU_CPU_PATH=$(echo relu-kernel-cpu/torch*) + +PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$CUTLASS_PATH:$PYTHONPATH" \ + .venv/bin/pytest extra_data_tests relu_tests cutlass_gemm_tests # We only care about importing, the kernel is trivial. -PYTHONPATH="silu-and-mul-kernel:$PYTHONPATH" \ +PYTHONPATH="$SILU_MUL_PATH:$PYTHONPATH" \ .venv/bin/python -c "import silu_and_mul" -PYTHONPATH="relu-kernel-cpu:$PYTHONPATH" \ +PYTHONPATH="$RELU_CPU_PATH:$PYTHONPATH" \ CUDA_VISIBLE_DEVICES="" \ .venv/bin/pytest relu_tests diff --git a/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh b/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh index 381ed073..4a845ec9 100755 --- a/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh +++ b/nix/pkgs/get-kernel-check/get-kernel-check-hook.sh @@ -21,23 +21,12 @@ _getKernelCheckHook() { export DYLD_LIBRARY_PATH="${TORCH_DIR}/lib:${DYLD_LIBRARY_PATH}" fi - TMPDIR=$(mktemp -d -t test.XXXXXX) || exit 1 - trap "rm -rf '$TMPDIR'" EXIT - # Some kernels want to write stuff (especially when they use Triton). HOME=$(mktemp -d -t test.XXXXXX) || exit 1 trap "rm -rf '$HOME'" EXIT - # Emulate the bundle layout that kernels expects. This even works - # for universal kernels, since kernels checks the non-universal - # path first. 
- PYTHONPATH="@kernels@" \ - BUILD_VARIANT=$(@python3@ -c "from kernels.utils import build_variant; print(build_variant())") - mkdir -p "${TMPDIR}/build" - ln -s "$out" "${TMPDIR}/build/${BUILD_VARIANT}" - PYTHONPATH="@kernels@" \ - @python3@ -c "from pathlib import Path; import kernels; kernels.get_local_kernel(Path('${TMPDIR}'), '${moduleName}')" + @python3@ -c "from pathlib import Path; import kernels; kernels.get_local_kernel(Path('${out}'), '${moduleName}')" } postInstallCheckHooks+=(_getKernelCheckHook)
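The simplified hook works because a built kernel now carries the build-variant directory at its root (<out>/<build-variant>/<package>/...), which is the layout get_local_kernel() resolves. A consumer-side sketch (paths hypothetical, assuming a kernels release that accepts the variant layout directly under the given path, as the updated hook relies on):

    from pathlib import Path

    import kernels

    # A Nix build output such as:
    #   result/torch29-cxx11-cu126-x86_64-linux/extra_data/...
    #   result/torch29-cxx11-cu126-x86_64-linux/metadata.json
    extra_data = kernels.get_local_kernel(Path("result"), "extra_data")

    # Non-Python data bundled via pyext = ["json", "py"] travels with the package.
    assert extra_data.EASTER_EGG == 42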