diff --git a/CMakePresets.json b/CMakePresets.json index 610fdeca3a9..f2a5e45a9f3 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -406,6 +406,14 @@ "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true } }, + { + "name": "cccl-c-parallel-hostjit", + "displayName": "CCCL C Parallel Library (HostJIT)", + "inherits": "cccl-c-parallel", + "cacheVariables": { + "CCCL_C_Parallel_ENABLE_HOSTJIT": true + } + }, { "name": "cccl-c-stf", "displayName": "CCCL C CUDASTF Library", @@ -638,6 +646,10 @@ "name": "cccl-c-parallel", "configurePreset": "cccl-c-parallel" }, + { + "name": "cccl-c-parallel-hostjit", + "configurePreset": "cccl-c-parallel-hostjit" + }, { "name": "cccl-c-stf", "configurePreset": "cccl-c-stf" @@ -917,6 +929,11 @@ "configurePreset": "cccl-c-parallel", "inherits": "base" }, + { + "name": "cccl-c-parallel-hostjit", + "configurePreset": "cccl-c-parallel-hostjit", + "inherits": "base" + }, { "name": "cccl-c-stf", "configurePreset": "cccl-c-stf", diff --git a/c/parallel/CMakeLists.txt b/c/parallel/CMakeLists.txt index b1796e866ef..f29bdb11c9e 100644 --- a/c/parallel/CMakeLists.txt +++ b/c/parallel/CMakeLists.txt @@ -8,6 +8,11 @@ option( "Build cccl.c.parallel standalone headers." OFF ) +option( + CCCL_C_Parallel_ENABLE_HOSTJIT + "Build HostJIT testing infrastructure (requires LLVM fetch, ~20 min first build)." + OFF +) # FIXME Ideally this would be handled by presets and install rules, but for now # consumers may override this to control the target location of cccl.c.parallel. 
@@ -27,6 +32,9 @@ file( "src/*.cpp" ) +# hostjit sources are built as a separate library; exclude from cccl.c.parallel +list(FILTER srcs EXCLUDE REGEX "src/hostjit/") + add_library(cccl.c.parallel SHARED ${srcs}) set_property(TARGET cccl.c.parallel PROPERTY POSITION_INDEPENDENT_CODE ON) cccl_configure_target(cccl.c.parallel DIALECT 20) @@ -48,6 +56,10 @@ cccl_get_thrust() add_subdirectory(src/jit_templates) +if (CCCL_C_Parallel_ENABLE_HOSTJIT) + add_subdirectory(src/hostjit) +endif() + set_target_properties(cccl.c.parallel PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) target_link_libraries( cccl.c.parallel diff --git a/c/parallel/src/hostjit/CMakeLists.txt b/c/parallel/src/hostjit/CMakeLists.txt new file mode 100644 index 00000000000..6301e55f454 --- /dev/null +++ b/c/parallel/src/hostjit/CMakeLists.txt @@ -0,0 +1,211 @@ +cmake_minimum_required(VERSION 3.20) + +# -------------------------------------------------------------------------- +# LLVM/Clang/LLD — fetched via CPM as static libraries +# -------------------------------------------------------------------------- +# CPM.cmake is at the cccl repo root: cccl/cmake/CPM.cmake +# From c/parallel/src/hostjit/ that's ../../../../cmake/CPM.cmake +set(_cccl_cmake_dir "${CMAKE_CURRENT_SOURCE_DIR}/../../../../cmake") +if (EXISTS "${_cccl_cmake_dir}/CPM.cmake") + include("${_cccl_cmake_dir}/CPM.cmake") +else() + message(FATAL_ERROR "CPM.cmake not found at ${_cccl_cmake_dir}/CPM.cmake") +endif() + +if (MSVC AND CMAKE_BUILD_TYPE STREQUAL "Debug") + message( + FATAL_ERROR + "hostjit does not support Debug builds on Windows. " + "The statically-linked LLVM Debug build is too large and causes stack " + "overflows at runtime. Use MinSizeRel, Release, or RelWithDebInfo instead." 
+ ) +endif() + +set(HOSTJIT_LLVM_VERSION "llvmorg-22.1.1" CACHE STRING "LLVM git tag to fetch") + +# List options must be set before CPMAddPackage +set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "" FORCE) +set(LLVM_TARGETS_TO_BUILD "X86;NVPTX" CACHE STRING "" FORCE) + +CPMAddPackage( + NAME llvm_project + GIT_REPOSITORY https://github.com/llvm/llvm-project.git + GIT_TAG ${HOSTJIT_LLVM_VERSION} + GIT_SHALLOW ON + SOURCE_SUBDIR llvm + EXCLUDE_FROM_ALL YES + OPTIONS + "LLVM_BUILD_LLVM_C_DYLIB OFF" + "LLVM_BUILD_TOOLS OFF" + "LLVM_BUILD_UTILS OFF" + "LLVM_BUILD_RUNTIME OFF" + "LLVM_BUILD_RUNTIMES OFF" + "LLVM_INCLUDE_BENCHMARKS OFF" + "LLVM_INCLUDE_DOCS OFF" + "LLVM_INCLUDE_EXAMPLES OFF" + "LLVM_INCLUDE_RUNTIMES OFF" + "LLVM_INCLUDE_TESTS OFF" + "LLVM_INCLUDE_TOOLS ON" + "LLVM_INCLUDE_UTILS OFF" + "LLVM_ENABLE_ZLIB OFF" + "LLVM_ENABLE_ZSTD OFF" + "LLVM_ENABLE_TERMINFO OFF" + "LLVM_ENABLE_BINDINGS OFF" + "CLANG_BUILD_TOOLS OFF" + "CLANG_ENABLE_ARCMT OFF" + "CLANG_ENABLE_STATIC_ANALYZER OFF" +) + +# Ensure the clang resource directory exists +file( + MAKE_DIRECTORY "${llvm_project_BINARY_DIR}/lib/clang/${LLVM_VERSION_MAJOR}" +) + +# Find CUDA toolkit (may already be found by parent) +if (NOT CUDAToolkit_FOUND) + find_package(CUDAToolkit) +endif() + +# -------------------------------------------------------------------------- +# hostjit library +# -------------------------------------------------------------------------- +add_library(hostjit_lib compiler.cpp config.cpp loader.cpp jit_compiler.cpp) + +# CCCL_SOURCE_DIR points to the cccl repo root +# From c/parallel/src/hostjit -> c/parallel/src -> c/parallel -> c -> cccl +cmake_path(GET CMAKE_CURRENT_SOURCE_DIR PARENT_PATH _src_dir) # c/parallel/src +cmake_path(GET _src_dir PARENT_PATH _c_parallel_dir) # c/parallel +cmake_path(GET _c_parallel_dir PARENT_PATH _c_dir) # c +cmake_path(GET _c_dir PARENT_PATH _cccl_root) # cccl + +target_include_directories( + hostjit_lib + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + 
${_c_parallel_dir}/include + ${llvm_project_SOURCE_DIR}/llvm/include + ${llvm_project_BINARY_DIR}/include + ${llvm_project_SOURCE_DIR}/clang/include + ${llvm_project_BINARY_DIR}/tools/clang/include + ${llvm_project_SOURCE_DIR}/lld/include + ${llvm_project_BINARY_DIR}/tools/lld/include +) + +target_compile_definitions( + hostjit_lib + PRIVATE + CCCL_C_EXPERIMENTAL=1 + CCCL_SOURCE_DIR="${_cccl_root}" + CLANG_RESOURCE_DIR="${llvm_project_BINARY_DIR}/lib/clang/${LLVM_VERSION_MAJOR}" + CLANG_HEADERS_DIR="${llvm_project_SOURCE_DIR}/clang/lib/Headers" + HOSTJIT_INCLUDE_DIR="${CMAKE_CURRENT_SOURCE_DIR}/include" +) + +if (CUDAToolkit_FOUND) + target_include_directories(hostjit_lib PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH CUDA_TOOLKIT_ROOT_FROM_CMAKE) + target_compile_definitions( + hostjit_lib + PRIVATE + CUDA_TOOLKIT_PATH="${CUDA_TOOLKIT_ROOT_FROM_CMAKE}" + CUDA_SDK_VERSION="${CUDAToolkit_VERSION_MAJOR}.0" + ) +endif() + +# Link against LLVM/Clang/LLD +target_link_libraries( + hostjit_lib + PUBLIC + # LLVM + LLVMCore + LLVMSupport + LLVMIRReader + LLVMMC + LLVMObject + LLVMX86CodeGen + LLVMX86AsmParser + LLVMX86Desc + LLVMX86Info + LLVMNVPTXCodeGen + LLVMNVPTXDesc + LLVMNVPTXInfo + LLVMLinker + LLVMPasses + # Clang + clangAST + clangBasic + clangCodeGen + clangDriver + clangFrontend + clangFrontendTool + clangLex + clangParse + clangSema + clangEdit + clangAnalysis + clangRewrite + clangSerialization + # LLD + $,lldCOFF,lldELF> + lldCommon +) + +if (NOT WIN32) + target_link_libraries(hostjit_lib PUBLIC dl) +endif() + +if (CUDAToolkit_FOUND) + target_link_libraries(hostjit_lib PUBLIC CUDA::cuda_driver CUDA::cudart) + # nvJitLink and nvfatbin are required at link and runtime. + # nvptxcompiler is a transitive dep of libnvJitLink_static. + # Prefer static variants on non-Windows; fall back to the dynamic imported target; + # fall back further to find_library in case FindCUDAToolkit didn't create the target + # (e.g. 
partial CTK installs on Ubuntu or Windows). + foreach (_lib nvJitLink nvptxcompiler nvfatbin) + if (NOT WIN32 AND TARGET CUDA::${_lib}_static) + target_link_libraries(hostjit_lib PUBLIC CUDA::${_lib}_static) + elseif (TARGET CUDA::${_lib}) + target_link_libraries(hostjit_lib PUBLIC CUDA::${_lib}) + else() + find_library( + _hostjit_${_lib} + NAMES ${_lib} + HINTS + "${CUDAToolkit_LIBRARY_DIR}" + "${CUDAToolkit_ROOT}/lib/x64" + "${CUDAToolkit_ROOT}/lib64" + "${CUDAToolkit_ROOT}/lib" + ) + if (_hostjit_${_lib}) + message(STATUS "hostjit: linking ${_lib} from ${_hostjit_${_lib}}") + target_link_libraries(hostjit_lib PUBLIC "${_hostjit_${_lib}}") + else() + message( + FATAL_ERROR + "hostjit requires ${_lib} but it was not found.\n" + " Ubuntu: apt-get install libnvfatbin-- or libnvjitlink--\n" + " Windows: reinstall the CUDA toolkit and ensure the nvfatbin/nvjitlink " + "components are selected." + ) + endif() + endif() + endforeach() +endif() + +if (NOT MSVC) + target_compile_options(hostjit_lib PRIVATE -fno-rtti) +endif() + +set_target_properties( + hostjit_lib + PROPERTIES CXX_STANDARD 20 POSITION_INDEPENDENT_CODE ON +) + +# On Windows with multi-config generators (Visual Studio), exclude hostjit +# targets from Debug builds — the LLVM Debug build causes stack overflows. 
+if (MSVC) + set_target_properties( + hostjit_lib + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_DEBUG TRUE + ) +endif() diff --git a/c/parallel/src/hostjit/compiler.cpp b/c/parallel/src/hostjit/compiler.cpp new file mode 100644 index 00000000000..5881cc6ace0 --- /dev/null +++ b/c/parallel/src/hostjit/compiler.cpp @@ -0,0 +1,1534 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Selective target initialization (X86 for host, NVPTX for device) +extern "C" { +void LLVMInitializeX86TargetInfo(); +void LLVMInitializeX86Target(); +void LLVMInitializeX86TargetMC(); +void LLVMInitializeX86AsmPrinter(); +void LLVMInitializeX86AsmParser(); +void LLVMInitializeNVPTXTargetInfo(); +void LLVMInitializeNVPTXTarget(); +void LLVMInitializeNVPTXTargetMC(); +void LLVMInitializeNVPTXAsmPrinter(); +} + +#ifdef _WIN32 +LLD_HAS_DRIVER(coff) +#else +LLD_HAS_DRIVER(elf) +#endif + +#ifdef _WIN32 +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace hostjit +{ +static bool llvm_initialized = false; + +static void initialize_llvm() +{ + if (llvm_initialized) + { + return; + } + + LLVMInitializeX86TargetInfo(); + LLVMInitializeX86Target(); + LLVMInitializeX86TargetMC(); + LLVMInitializeX86AsmPrinter(); + LLVMInitializeX86AsmParser(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + + llvm_initialized = true; +} + +#ifdef _WIN32 +// Generate a minimal COFF import library for a given DLL. +// This allows linking without requiring the Windows SDK or MSVC .lib files. +// Symbols can be "name" or "name=dllexport" for aliasing. 
+static bool generateImportLib( + const std::string& dll_name, + const std::vector& symbols, + const std::string& output_path, + bool data_only = false) +{ + std::vector exports; + for (const auto& sym : symbols) + { + llvm::object::COFFShortExport exp; + auto eq = sym.find('='); + if (eq != std::string::npos) + { + // "atexit=_crt_atexit" means: linker sees "atexit", DLL exports "_crt_atexit" + exp.Name = sym.substr(0, eq); // symbol name the linker resolves + exp.ImportName = sym.substr(eq + 1); // actual DLL export name + } + else + { + exp.Name = sym; + } + exp.Data = data_only; + exports.push_back(exp); + } + auto err = llvm::object::writeImportLibrary( + dll_name, + output_path, + exports, + llvm::COFF::IMAGE_FILE_MACHINE_AMD64, + /*MinGW=*/false); + if (err) + { + llvm::consumeError(std::move(err)); + return false; + } + return true; +} + +// Find the actual DLL filename for cudart (e.g. "cudart64_13.dll") by +// scanning the CUDA toolkit bin directory. +static std::string findCudartDllName(const std::string& cuda_toolkit_path) +{ + namespace fs = std::filesystem; + for (const auto& subdir : {"bin/x64", "bin"}) + { + fs::path dir = fs::path(cuda_toolkit_path) / subdir; + if (!fs::exists(dir)) + { + continue; + } + for (const auto& entry : fs::directory_iterator(dir)) + { + auto name = entry.path().filename().string(); + if (name.starts_with("cudart64_") && name.ends_with(".dll")) + { + return name; + } + } + } + return "cudart64_12.dll"; // fallback +} +#endif + +// Headers precompiled into the PCH cache. Covers the algorithms exposed +// by the C parallel library so that a single pair of PCH files (device + +// host) is reused across reduce, adjacent-difference, etc. +static constexpr const char* pch_preamble_source = + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n"; + +class CUDACompiler::Impl +{ +public: + Impl() {} + + // Get the persistent PCH cache directory. 
+ static std::filesystem::path getPCHCacheDir() + { + auto dir = std::filesystem::temp_directory_path() / "hostjit_pch"; + std::filesystem::create_directories(dir); + return dir; + } + + // Get a persistent cache path for a PCH file. + static std::string getPCHPath(const std::string& kind, int sm_version) + { + return (getPCHCacheDir() / (kind + "_sm" + std::to_string(sm_version) + ".pch")).string(); + } + + // Get the persistent path for the PCH preamble source file. + // The PCH stores a reference to this path, so it must be stable across runs. + static std::string getPCHSourcePath(const std::string& kind, int sm_version) + { + return (getPCHCacheDir() / (kind + "_sm" + std::to_string(sm_version) + "_preamble.cu")).string(); + } + + // Write preamble to a persistent file and generate a PCH from it. + // arg_strings[0] will be replaced with the persistent preamble path. + bool generatePCH(const std::string& pch_source, + const std::string& pch_source_path, + const std::string& pch_output_path, + std::vector arg_strings, + std::string& diagnostics) + { + // Write preamble to the persistent source path + { + std::ofstream f(pch_source_path); + if (!f) + { + diagnostics += "Failed to write PCH preamble to " + pch_source_path; + return false; + } + f << pch_source; + } + + // Replace the source file arg with the persistent path + arg_strings[0] = pch_source_path; + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + auto* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if 
(!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + diagnostics += diag_output + "\nFailed to create PCH compiler invocation"; + return false; + } + + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.createFileManager(); + compiler.getFrontendOpts().OutputFile = pch_output_path; + + clang::GeneratePCHAction pch_action; + bool success = compiler.ExecuteAction(pch_action); + + diag_stream.flush(); + diagnostics += diag_output; + + return success; + } + + llvm::IntrusiveRefCntPtr + createVFSWithSource(const std::string& source_code, const std::string& virtual_path) + { + auto mem_fs = llvm::makeIntrusiveRefCnt(); + mem_fs->addFile(virtual_path, 0, llvm::MemoryBuffer::getMemBuffer(source_code)); + + auto overlay = llvm::makeIntrusiveRefCnt(llvm::vfs::getRealFileSystem()); + overlay->pushOverlay(mem_fs); + return overlay; + } + + bool compileDeviceToPTX( + const std::string& source_code, + const std::string& input_file, + const std::string& output_ptx, + const CompilerConfig& config, + std::string& diagnostics) + { + std::string temp_dir = std::filesystem::path(output_ptx).parent_path().string(); + std::string source_file = temp_dir + "/" + input_file; + + std::string resource_dir = CLANG_RESOURCE_DIR; + + int ptx_version = 70; + if (config.sm_version >= 120) + { + ptx_version = 87; + } + else if (config.sm_version >= 100) + { + ptx_version = 85; + } + else if (config.sm_version >= 90) + { + ptx_version = 80; + } + else if (config.sm_version >= 89) + { + ptx_version = 78; + } + else if (config.sm_version >= 80) + { + ptx_version = 75; + } + + std::vector arg_strings; + arg_strings.push_back(source_file); + arg_strings.push_back("-triple"); + arg_strings.push_back("nvptx64-nvidia-cuda"); + arg_strings.push_back("-aux-triple"); +#ifdef _WIN32 + arg_strings.push_back("x86_64-pc-windows-msvc"); +#else + arg_strings.push_back("x86_64-pc-linux-gnu"); +#endif + arg_strings.push_back("-S"); + 
arg_strings.push_back("-aux-target-cpu"); + arg_strings.push_back("x86-64"); + arg_strings.push_back("-fcuda-is-device"); + arg_strings.push_back("-fcuda-allow-variadic-functions"); +#ifdef _WIN32 + arg_strings.push_back("-fms-compatibility"); + arg_strings.push_back("-fms-compatibility-version=19.40"); +#else + arg_strings.push_back("-fgnuc-version=4.2.1"); +#endif + arg_strings.push_back("-mlink-builtin-bitcode"); + arg_strings.push_back(config.cuda_toolkit_path + "/nvvm/libdevice/libdevice.10.bc"); + arg_strings.push_back("-target-sdk-version=" CUDA_SDK_VERSION); + arg_strings.push_back("-target-cpu"); + arg_strings.push_back("sm_" + std::to_string(config.sm_version)); + arg_strings.push_back("-target-feature"); + arg_strings.push_back("+ptx" + std::to_string(ptx_version)); + arg_strings.push_back("-resource-dir"); + arg_strings.push_back(resource_dir); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/stubs"); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back( + config.clang_headers_path.empty() ? 
std::string(CLANG_HEADERS_DIR) : config.clang_headers_path); + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include/cuda/std"); + } + else + { + arg_strings.push_back(config.cccl_include_path + "/cuda/std"); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/cub"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/thrust"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.cuda_toolkit_path + "/include"); + arg_strings.push_back("-include"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h"); + + for (const auto& include_path : config.include_paths) + { + arg_strings.push_back("-I" + include_path); + } + + arg_strings.push_back("-D__HOSTJIT_DEVICE_COMPILATION__=1"); + arg_strings.push_back("-DNDEBUG"); + arg_strings.push_back("-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"); + arg_strings.push_back("-DCCCL_DISABLE_FP16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_BF16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_NVTX=1"); + arg_strings.push_back("-DCCCL_DISABLE_EXCEPTIONS=1"); + + std::vector bitcode_files_to_link = config.device_bitcode_files; + + for (const auto& [macro_name, macro_value] : config.macro_definitions) + { + if (macro_value.empty()) + { + arg_strings.push_back("-D" + macro_name); + 
} + else + { + arg_strings.push_back("-D" + macro_name + "=" + macro_value); + } + } + + arg_strings.push_back("-fdeprecated-macro"); + arg_strings.push_back("--offload-new-driver"); + arg_strings.push_back("-fskip-odr-check-in-gmf"); + arg_strings.push_back("-fcxx-exceptions"); + arg_strings.push_back("-fexceptions"); + arg_strings.push_back("-O" + std::to_string(config.optimization_level)); + arg_strings.push_back("-std=c++17"); + + if (config.trace_includes) + { + arg_strings.push_back("-H"); + } + + arg_strings.push_back("-x"); + arg_strings.push_back("cuda"); + + // --- PCH: ensure device PCH exists --- + std::string device_pch_path; + if (config.enable_pch) + { + device_pch_path = getPCHPath("device", config.sm_version); + if (!std::filesystem::exists(device_pch_path)) + { + auto pch_src_path = getPCHSourcePath("device", config.sm_version); + std::string pch_diag; + if (!generatePCH(pch_preamble_source, pch_src_path, device_pch_path, arg_strings, pch_diag)) + { + diagnostics += "Device PCH generation failed: " + pch_diag + "\n"; + device_pch_path.clear(); + } + else if (config.verbose) + { + diagnostics += "Generated device PCH: " + device_pch_path + "\n"; + } + } + } + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + if (config.verbose) + { + diagnostics += "Device args: "; + for (const auto& arg : arg_strings) + { + diagnostics += arg + " "; + } + diagnostics += "\n"; + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + clang::TextDiagnosticPrinter* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if 
(!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + diagnostics += diag_output; + diagnostics += "\nFailed to create device compiler invocation"; + return false; + } + + // --- PCH: load cached device PCH --- + if (!device_pch_path.empty() && std::filesystem::exists(device_pch_path)) + { + invocation.getPreprocessorOpts().ImplicitPCHInclude = device_pch_path; + } + + auto vfs = createVFSWithSource(source_code, source_file); + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.setVirtualFileSystem(vfs); + compiler.createFileManager(); + compiler.getFrontendOpts().OutputFile = output_ptx; + + if (config.trace_includes) + { + diagnostics += "\n=== Device Header Search Paths ===\n"; + const auto& hso = invocation.getHeaderSearchOpts(); + for (const auto& entry : hso.UserEntries) + { + diagnostics += " " + entry.Path + "\n"; + } + diagnostics += "=== End Header Search Paths ===\n\n"; + } + + llvm::LLVMContext llvm_context; + + clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); + bool success = compiler.ExecuteAction(emit_llvm_action); + + if (config.trace_includes && compiler.hasSourceManager()) + { + diagnostics += "\n=== Device Included Files ===\n"; + auto& sm = compiler.getSourceManager(); + for (auto it = sm.fileinfo_begin(); it != sm.fileinfo_end(); ++it) + { + diagnostics += " " + it->first.getName().str() + "\n"; + } + diagnostics += "=== End Included Files ===\n\n"; + } + + if (success) + { + std::unique_ptr mod = emit_llvm_action.takeModule(); + if (mod) + { + for (const auto& bc_file : bitcode_files_to_link) + { + llvm::SMDiagnostic err; + auto bc_mod = llvm::parseIRFile(bc_file, err, llvm_context); + if (bc_mod) + { + if (llvm::Linker::linkModules(*mod, std::move(bc_mod))) + { + diagnostics += "Failed to link bitcode: " + bc_file + "\n"; + success = false; + break; + } + } + else + { + diagnostics += "Failed to parse bitcode: " + bc_file + "\n"; + success = false; + break; + } + 
} + + // Re-link libdevice to resolve any new references (e.g. __nv_pow) + // introduced by the extra bitcode modules. + if (success && !bitcode_files_to_link.empty()) + { + std::string libdevice_path = config.cuda_toolkit_path + "/nvvm/libdevice/libdevice.10.bc"; + llvm::SMDiagnostic err; + auto libdevice = llvm::parseIRFile(libdevice_path, err, llvm_context); + if (libdevice) + { + // Use AppendToUsed to avoid internalization issues + llvm::Linker::linkModules(*mod, std::move(libdevice), llvm::Linker::LinkOnlyNeeded); + } + } + + if (success) + { + std::string err_str; + const llvm::Target* target = llvm::TargetRegistry::lookupTarget(mod->getTargetTriple(), err_str); + if (target) + { + llvm::TargetOptions opt; + auto tm = target->createTargetMachine( + mod->getTargetTriple(), + "sm_" + std::to_string(config.sm_version), + "+ptx" + std::to_string(ptx_version), + opt, + llvm::Reloc::PIC_); + if (tm) + { + mod->setDataLayout(tm->createDataLayout()); + + // Run optimization passes after linking to inline user-provided + // operations (from bitcode or embedded C++ source). + if (!config.entry_point_name.empty()) + { + // Internalize all functions except the entry point and + // GPU kernels, so the optimizer can inline the linked + // bitcode functions. 
+ for (auto& F : *mod) + { + if (!F.isDeclaration() && F.getLinkage() == llvm::GlobalValue::ExternalLinkage + && F.getName() != config.entry_point_name && F.getCallingConv() != llvm::CallingConv::PTX_Kernel) + { + F.setLinkage(llvm::GlobalValue::InternalLinkage); + // Remove attributes that conflict with inlining + F.removeFnAttr(llvm::Attribute::NoInline); + F.removeFnAttr(llvm::Attribute::OptimizeNone); + F.addFnAttr(llvm::Attribute::AlwaysInline); + } + } + + llvm::OptimizationLevel opt_level; + switch (config.optimization_level) + { + case 0: + opt_level = llvm::OptimizationLevel::O0; + break; + case 1: + opt_level = llvm::OptimizationLevel::O1; + break; + case 3: + opt_level = llvm::OptimizationLevel::O3; + break; + default: + opt_level = llvm::OptimizationLevel::O2; + break; + } + + llvm::LoopAnalysisManager LAM; + llvm::FunctionAnalysisManager FAM; + llvm::CGSCCAnalysisManager CGAM; + llvm::ModuleAnalysisManager MAM; + + llvm::PassBuilder PB(tm); + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + auto MPM = PB.buildPerModuleDefaultPipeline(opt_level); + MPM.run(*mod, MAM); + } + + std::error_code EC; + llvm::raw_fd_ostream dest(output_ptx, EC); + if (!EC) + { + llvm::legacy::PassManager pass; + tm->addPassesToEmitFile(pass, dest, nullptr, llvm::CodeGenFileType::AssemblyFile); + pass.run(*mod); + } + else + { + diagnostics += "Failed to open output file: " + output_ptx + "\n"; + success = false; + } + } + else + { + diagnostics += "Failed to create target machine\n"; + success = false; + } + } + else + { + diagnostics += "Failed to lookup target: " + err_str + "\n"; + success = false; + } + } + } + } + + diag_stream.flush(); + diagnostics += diag_output; + + return success; + } + + BitcodeResult compileToDeviceBitcode(const std::string& source_code, const CompilerConfig& config) + { + BitcodeResult result; + result.success = 
false; + + std::string error_msg; + if (!validateConfig(config, &error_msg)) + { + result.diagnostics = "Configuration error: " + error_msg; + return result; + } + + initialize_llvm(); + + std::string temp_dir = + (std::filesystem::temp_directory_path() / ("hostjit_bc_" + std::to_string(reinterpret_cast(this)))) + .string(); + std::filesystem::create_directories(temp_dir); + + std::string input_file = "input.cu"; + std::string source_file = temp_dir + "/" + input_file; + std::string resource_dir = CLANG_RESOURCE_DIR; + + int ptx_version = 70; + if (config.sm_version >= 120) + { + ptx_version = 87; + } + else if (config.sm_version >= 100) + { + ptx_version = 85; + } + else if (config.sm_version >= 90) + { + ptx_version = 80; + } + else if (config.sm_version >= 89) + { + ptx_version = 78; + } + else if (config.sm_version >= 80) + { + ptx_version = 75; + } + + std::vector arg_strings; + arg_strings.push_back(source_file); + arg_strings.push_back("-triple"); + arg_strings.push_back("nvptx64-nvidia-cuda"); + arg_strings.push_back("-aux-triple"); +#ifdef _WIN32 + arg_strings.push_back("x86_64-pc-windows-msvc"); +#else + arg_strings.push_back("x86_64-pc-linux-gnu"); +#endif + arg_strings.push_back("-S"); + arg_strings.push_back("-aux-target-cpu"); + arg_strings.push_back("x86-64"); + arg_strings.push_back("-fcuda-is-device"); + arg_strings.push_back("-fcuda-allow-variadic-functions"); +#ifdef _WIN32 + arg_strings.push_back("-fms-compatibility"); + arg_strings.push_back("-fms-compatibility-version=19.40"); +#else + arg_strings.push_back("-fgnuc-version=4.2.1"); +#endif + arg_strings.push_back("-mlink-builtin-bitcode"); + arg_strings.push_back(config.cuda_toolkit_path + "/nvvm/libdevice/libdevice.10.bc"); + arg_strings.push_back("-target-sdk-version=" CUDA_SDK_VERSION); + arg_strings.push_back("-target-cpu"); + arg_strings.push_back("sm_" + std::to_string(config.sm_version)); + arg_strings.push_back("-target-feature"); + arg_strings.push_back("+ptx" + 
std::to_string(ptx_version)); + arg_strings.push_back("-resource-dir"); + arg_strings.push_back(resource_dir); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/stubs"); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back( + config.clang_headers_path.empty() ? std::string(CLANG_HEADERS_DIR) : config.clang_headers_path); + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include/cuda/std"); + } + else + { + arg_strings.push_back(config.cccl_include_path + "/cuda/std"); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/cub"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/thrust"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.cuda_toolkit_path + "/include"); + arg_strings.push_back("-include"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h"); + arg_strings.push_back("-D__HOSTJIT_DEVICE_COMPILATION__=1"); + arg_strings.push_back("-DNDEBUG"); + arg_strings.push_back("-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"); + arg_strings.push_back("-DCCCL_DISABLE_FP16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_BF16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_NVTX=1"); + 
arg_strings.push_back("-DCCCL_DISABLE_EXCEPTIONS=1"); + arg_strings.push_back("-fdeprecated-macro"); + arg_strings.push_back("-fcxx-exceptions"); + arg_strings.push_back("-fexceptions"); + arg_strings.push_back("-O" + std::to_string(config.optimization_level)); + arg_strings.push_back("-Wno-c++11-narrowing"); + arg_strings.push_back("-std=c++17"); + arg_strings.push_back("-x"); + arg_strings.push_back("cuda"); + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + clang::TextDiagnosticPrinter* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if (!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + result.diagnostics = diag_output + "\nFailed to create compiler invocation"; + std::filesystem::remove_all(temp_dir); + return result; + } + + auto vfs = createVFSWithSource(source_code, source_file); + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.setVirtualFileSystem(vfs); + compiler.createFileManager(); + + llvm::LLVMContext llvm_context; + clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); + bool success = compiler.ExecuteAction(emit_llvm_action); + + if (success) + { + std::unique_ptr mod = emit_llvm_action.takeModule(); + if (mod) + { + llvm::SmallVector buffer; + llvm::raw_svector_ostream os(buffer); + llvm::WriteBitcodeToFile(*mod, os); + result.bitcode = std::string(buffer.begin(), buffer.end()); + result.success = true; + } + else + { + result.diagnostics = "Failed to get LLVM module"; + } + } + + diag_stream.flush(); + 
result.diagnostics += diag_output; + std::filesystem::remove_all(temp_dir); + return result; + } + + bool compileHostCode( + const std::string& source_code, + const std::string& input_file, + const std::string& fatbin_path, + const std::string& output_obj, + const CompilerConfig& config, + std::string& diagnostics) + { + std::string temp_dir = std::filesystem::path(output_obj).parent_path().string(); + std::string source_file = temp_dir + "/host_" + input_file; + + std::string resource_dir = CLANG_RESOURCE_DIR; + + std::vector arg_strings; + arg_strings.push_back(source_file); + arg_strings.push_back("-triple"); +#ifdef _WIN32 + arg_strings.push_back("x86_64-pc-windows-msvc"); +#else + arg_strings.push_back("x86_64-pc-linux-gnu"); +#endif + arg_strings.push_back("-aux-triple"); + arg_strings.push_back("nvptx64-nvidia-cuda"); + arg_strings.push_back("-target-sdk-version=" CUDA_SDK_VERSION); + arg_strings.push_back("-emit-obj"); + arg_strings.push_back("-target-cpu"); + arg_strings.push_back("x86-64"); + arg_strings.push_back("-fcuda-allow-variadic-functions"); +#ifdef _WIN32 + arg_strings.push_back("-fms-compatibility"); + arg_strings.push_back("-fms-compatibility-version=19.40"); +#else + arg_strings.push_back("-fgnuc-version=4.2.1"); +#endif + arg_strings.push_back("-mrelocation-model"); + arg_strings.push_back("pic"); + arg_strings.push_back("-pic-level"); + arg_strings.push_back("2"); + arg_strings.push_back("-resource-dir"); + arg_strings.push_back(resource_dir); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/stubs"); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back( + config.clang_headers_path.empty() ? 
std::string(CLANG_HEADERS_DIR) : config.clang_headers_path); + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include/cuda/std"); + } + else + { + arg_strings.push_back(config.cccl_include_path + "/cuda/std"); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/cub"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/thrust"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.cuda_toolkit_path + "/include"); + arg_strings.push_back("-include"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h"); + + for (const auto& include_path : config.include_paths) + { + arg_strings.push_back("-I" + include_path); + } + + arg_strings.push_back("-DNDEBUG"); + arg_strings.push_back("-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"); + arg_strings.push_back("-D_CCCL_ENABLE_FREESTANDING=1"); + arg_strings.push_back("-DCCCL_DISABLE_FP16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_BF16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_NVTX=1"); + arg_strings.push_back("-DCCCL_DISABLE_EXCEPTIONS=1"); + + for (const auto& [macro_name, macro_value] : config.macro_definitions) + { + if (macro_value.empty()) + { + arg_strings.push_back("-D" + macro_name); + } + else + { + arg_strings.push_back("-D" + macro_name + "=" + 
macro_value); + } + } + + arg_strings.push_back("-fdeprecated-macro"); + arg_strings.push_back("--offload-new-driver"); + arg_strings.push_back("-fskip-odr-check-in-gmf"); + arg_strings.push_back("-O" + std::to_string(config.optimization_level)); + arg_strings.push_back("-std=c++17"); + + if (config.trace_includes) + { + arg_strings.push_back("-H"); + } + + arg_strings.push_back("-x"); + arg_strings.push_back("cuda"); + + // --- PCH: ensure host PCH exists (before adding fatbin-specific args) --- + std::string host_pch_path; + if (config.enable_pch) + { + host_pch_path = getPCHPath("host", config.sm_version); + if (!std::filesystem::exists(host_pch_path)) + { + auto pch_src_path = getPCHSourcePath("host", config.sm_version); + std::string pch_diag; + if (!generatePCH(pch_preamble_source, pch_src_path, host_pch_path, arg_strings, pch_diag)) + { + diagnostics += "Host PCH generation failed: " + pch_diag + "\n"; + host_pch_path.clear(); + } + else if (config.verbose) + { + diagnostics += "Generated host PCH: " + host_pch_path + "\n"; + } + } + } + + // Add fatbin embedding (per-build, not part of PCH) + arg_strings.push_back("-fcuda-include-gpubinary"); + arg_strings.push_back(fatbin_path); + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + if (config.verbose) + { + diagnostics += "Host args: "; + for (const auto& arg : arg_strings) + { + diagnostics += arg + " "; + } + diagnostics += "\n"; + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + clang::TextDiagnosticPrinter* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if 
(!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + diagnostics += diag_output; + diagnostics += "\nFailed to create host compiler invocation"; + return false; + } + + // --- PCH: load cached host PCH --- + if (!host_pch_path.empty() && std::filesystem::exists(host_pch_path)) + { + invocation.getPreprocessorOpts().ImplicitPCHInclude = host_pch_path; + } + + auto vfs = createVFSWithSource(source_code, source_file); + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.setVirtualFileSystem(vfs); + compiler.createFileManager(); + compiler.getFrontendOpts().OutputFile = output_obj; + + if (config.trace_includes) + { + diagnostics += "\n=== Host Header Search Paths ===\n"; + const auto& hso = invocation.getHeaderSearchOpts(); + for (const auto& entry : hso.UserEntries) + { + diagnostics += " " + entry.Path + "\n"; + } + diagnostics += "=== End Header Search Paths ===\n\n"; + } + + clang::EmitObjAction emit_action; + bool success = compiler.ExecuteAction(emit_action); + + if (config.trace_includes && compiler.hasSourceManager()) + { + diagnostics += "\n=== Host Included Files ===\n"; + auto& sm = compiler.getSourceManager(); + for (auto it = sm.fileinfo_begin(); it != sm.fileinfo_end(); ++it) + { + diagnostics += " " + it->first.getName().str() + "\n"; + } + diagnostics += "=== End Included Files ===\n\n"; + } + + diag_stream.flush(); + diagnostics += diag_output; + + return success; + } + + CompilationResult + compileToObject(const std::string& source_code, const std::string& output_path, const CompilerConfig& config) + { + CompilationResult result; + result.success = false; + result.object_file_path = output_path; + + std::string error_msg; + if (!validateConfig(config, &error_msg)) + { + result.diagnostics = "Configuration error: " + error_msg; + return result; + } + + initialize_llvm(); + + std::string temp_dir = + (std::filesystem::temp_directory_path() / ("hostjit_" + 
std::to_string(reinterpret_cast(this)))) + .string(); + std::filesystem::create_directories(temp_dir); + + std::string input_file = "input.cu"; + std::string ptx_file = temp_dir + "/device.ptx"; + std::string fatbin_file = temp_dir + "/device.fatbin"; + + if (config.verbose) + { + result.diagnostics += "=== Device compilation ===\n"; + } + + if (!compileDeviceToPTX(source_code, input_file, ptx_file, config, result.diagnostics)) + { + result.diagnostics += "\nDevice compilation failed"; + std::filesystem::remove_all(temp_dir); + return result; + } + + if (config.verbose) + { + result.diagnostics += "\n=== nvJitLink + fatbinary ===\n"; + } + + { + std::vector ptx_data; + { + std::ifstream f(ptx_file, std::ios::binary); + ptx_data.assign(std::istreambuf_iterator(f), std::istreambuf_iterator()); + } + if (ptx_data.empty()) + { + result.diagnostics += "\nFailed to read ptx file"; + std::filesystem::remove_all(temp_dir); + return result; + } + if (ptx_data.back() != '\0') + { + ptx_data.push_back('\0'); + } + + std::string arch_opt = "-arch=sm_" + std::to_string(config.sm_version); + std::string opt_level = "-O" + std::to_string(config.optimization_level >= 1 ? 
3 : 0); + const char* jitlink_options[] = {arch_opt.c_str(), opt_level.c_str()}; + nvJitLinkHandle jitlink_handle = nullptr; + nvJitLinkResult jlr = nvJitLinkCreate(&jitlink_handle, 2, jitlink_options); + if (jlr != NVJITLINK_SUCCESS) + { + result.diagnostics += "\nnvJitLinkCreate failed (error " + std::to_string(static_cast(jlr)) + ")"; + std::filesystem::remove_all(temp_dir); + return result; + } + + jlr = nvJitLinkAddData(jitlink_handle, NVJITLINK_INPUT_PTX, ptx_data.data(), ptx_data.size(), "device.ptx"); + if (jlr != NVJITLINK_SUCCESS) + { + size_t log_size = 0; + nvJitLinkGetErrorLogSize(jitlink_handle, &log_size); + if (log_size > 1) + { + std::string log(log_size, '\0'); + nvJitLinkGetErrorLog(jitlink_handle, log.data()); + result.diagnostics += "\n" + log; + } + result.diagnostics += "\nnvJitLinkAddData failed"; + nvJitLinkDestroy(&jitlink_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + jlr = nvJitLinkComplete(jitlink_handle); + if (jlr != NVJITLINK_SUCCESS) + { + size_t log_size = 0; + nvJitLinkGetErrorLogSize(jitlink_handle, &log_size); + if (log_size > 1) + { + std::string log(log_size, '\0'); + nvJitLinkGetErrorLog(jitlink_handle, log.data()); + result.diagnostics += "\n" + log; + } + result.diagnostics += "\nnvJitLinkComplete failed"; + nvJitLinkDestroy(&jitlink_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + size_t cubin_size = 0; + nvJitLinkGetLinkedCubinSize(jitlink_handle, &cubin_size); + std::vector cubin_data(cubin_size); + nvJitLinkGetLinkedCubin(jitlink_handle, cubin_data.data()); + nvJitLinkDestroy(&jitlink_handle); + + // Store cubin in the result for inspection + result.cubin = cubin_data; + + std::string arch = std::to_string(config.sm_version); + const char* fatbin_options[] = {"-64", "-cuda"}; + nvFatbinHandle fatbin_handle = nullptr; + nvFatbinResult fbr = nvFatbinCreate(&fatbin_handle, fatbin_options, 2); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += 
std::string("\nnvFatbinCreate failed: ") + nvFatbinGetErrorString(fbr); + std::filesystem::remove_all(temp_dir); + return result; + } + + fbr = nvFatbinAddCubin(fatbin_handle, cubin_data.data(), cubin_data.size(), arch.c_str(), "device.cubin"); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinAddCubin failed: ") + nvFatbinGetErrorString(fbr); + nvFatbinDestroy(&fatbin_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + fbr = nvFatbinAddPTX(fatbin_handle, ptx_data.data(), ptx_data.size(), arch.c_str(), "device.ptx", nullptr); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinAddPTX failed: ") + nvFatbinGetErrorString(fbr); + nvFatbinDestroy(&fatbin_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + size_t fatbin_size = 0; + fbr = nvFatbinSize(fatbin_handle, &fatbin_size); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinSize failed: ") + nvFatbinGetErrorString(fbr); + nvFatbinDestroy(&fatbin_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + std::vector fatbin_data(fatbin_size); + fbr = nvFatbinGet(fatbin_handle, fatbin_data.data()); + nvFatbinDestroy(&fatbin_handle); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinGet failed: ") + nvFatbinGetErrorString(fbr); + std::filesystem::remove_all(temp_dir); + return result; + } + + std::ofstream out(fatbin_file, std::ios::binary); + out.write(fatbin_data.data(), static_cast(fatbin_data.size())); + if (!out) + { + result.diagnostics += "\nFailed to write fatbin file"; + std::filesystem::remove_all(temp_dir); + return result; + } + } + + if (config.verbose) + { + result.diagnostics += "\n=== Host compilation ===\n"; + } + + if (!compileHostCode(source_code, input_file, fatbin_file, output_path, config, result.diagnostics)) + { + result.diagnostics += "\nHost compilation failed"; + std::filesystem::remove_all(temp_dir); + 
return result; + } + + std::filesystem::remove_all(temp_dir); + result.success = true; + return result; + } + + LinkResult linkToSharedLibrary( + const std::vector& object_files, const std::string& output_path, const CompilerConfig& config) + { + LinkResult result; + result.success = false; + result.library_path = output_path; + + if (object_files.empty()) + { + result.diagnostics = "No object files provided"; + return result; + } + + std::vector arg_strings; + +#ifdef _WIN32 + arg_strings.push_back("lld-link"); + arg_strings.push_back("/DLL"); + arg_strings.push_back("/NOENTRY"); + arg_strings.push_back("/NODEFAULTLIB"); + arg_strings.push_back("/OUT:" + output_path); + + // Generate import libraries from DLLs present on the system, + // so we don't require the Windows SDK or MSVC .lib files. + std::string implib_dir = std::filesystem::path(output_path).parent_path().string(); + + std::string cudart_dll = findCudartDllName(config.cuda_toolkit_path); + generateImportLib( + cudart_dll, + {"cudaMalloc", + "cudaFree", + "cudaMemcpy", + "cudaMemcpyAsync", + "cudaMemset", + "cudaMemsetAsync", + "cudaDeviceSynchronize", + "cudaGetDevice", + "cudaGetDeviceProperties", + "cudaGetLastError", + "cudaPeekAtLastError", + "cudaGetErrorString", + "cudaStreamCreate", + "cudaStreamDestroy", + "cudaStreamSynchronize", + "cudaEventCreate", + "cudaEventDestroy", + "cudaEventRecord", + "cudaEventSynchronize", + "cudaEventElapsedTime", + "cudaMallocAsync", + "cudaFreeAsync", + "cudaDeviceGetAttribute", + "cudaOccupancyMaxActiveBlocksPerMultiprocessor", + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + "cudaFuncGetAttributes", + "cudaLaunchKernel", + "cudaLaunchKernelExC", + "__cudaRegisterFatBinary", + "__cudaRegisterFatBinaryEnd", + "__cudaUnregisterFatBinary", + "__cudaRegisterFunction", + "__cudaRegisterVar", + "__cudaPushCallConfiguration", + "__cudaPopCallConfiguration"}, + implib_dir + "/cudart.lib"); + + generateImportLib( + "ucrtbase.dll", + {"malloc", + "free", + 
"calloc", + "realloc", + "_callnewh", + "_errno", + "abort", + "exit", + "_exit", + "_register_onexit_function", + "_crt_atexit", + "_initterm", + "_initterm_e", + "memcpy", + "memset", + "memmove", + "memcmp", + "strlen", + "strcmp", + "strncmp", + "_initialize_onexit_table", + "_execute_onexit_table", + "_register_thread_local_exe_atexit_callback"}, + implib_dir + "/ucrt.lib"); + + generateImportLib( + "vcruntime140.dll", + {"__std_exception_copy", + "__std_exception_destroy", + "__CxxFrameHandler3", + "_CxxThrowException", + "memcpy", + "memset", + "memmove", + "memcmp", + "__std_type_info_destroy_list", + "_purecall"}, + implib_dir + "/vcruntime.lib"); + + generateImportLib( + "kernel32.dll", + {"InitializeCriticalSection", + "EnterCriticalSection", + "LeaveCriticalSection", + "DeleteCriticalSection", + "InitOnceExecuteOnce", + "LoadLibraryExA", + "LoadLibraryExW", + "GetProcAddress", + "FreeLibrary", + "GetModuleHandleA", + "GetLastError", + "SetLastError", + "GetCurrentProcess", + "GetCurrentThread", + "GetCurrentThreadId", + "VirtualProtect", + "FlushInstructionCache", + "QueryPerformanceCounter", + "QueryPerformanceFrequency"}, + implib_dir + "/kernel32.lib"); + + arg_strings.push_back("/LIBPATH:" + implib_dir); + + for (const auto& obj_file : object_files) + { + arg_strings.push_back(obj_file); + } + + arg_strings.push_back("cudart.lib"); + arg_strings.push_back("ucrt.lib"); + arg_strings.push_back("vcruntime.lib"); + arg_strings.push_back("kernel32.lib"); +#else + arg_strings.push_back("ld.lld"); + arg_strings.push_back("-shared"); + arg_strings.push_back("--build-id"); + arg_strings.push_back("--eh-frame-hdr"); + arg_strings.push_back("-m"); + arg_strings.push_back("elf_x86_64"); + // Allow unresolved symbols — they will be satisfied at dlopen() time + // by libraries already loaded in the host process (libc, libstdc++, + // cudart, etc.). This removes the need for system CRT objects and + // dev packages on the target machine. 
+ arg_strings.push_back("--allow-shlib-undefined"); + arg_strings.push_back("-o"); + arg_strings.push_back(output_path); + + for (const auto& lib_path : config.library_paths) + { + arg_strings.push_back("-L" + lib_path); + // Embed the library path as RPATH so the dynamic linker can find + // libcudart.so.XX at dlopen time without LD_LIBRARY_PATH. + arg_strings.push_back("-rpath"); + arg_strings.push_back(lib_path); + } + + for (const auto& obj_file : object_files) + { + arg_strings.push_back(obj_file); + } + + // pip packages ship libcudart.so.XX without an unversioned symlink, + // so -lcudart won't work. Find the actual .so by scanning library_paths. + { + bool found_cudart = false; + for (const auto& lib_path : config.library_paths) + { + namespace fs = std::filesystem; + if (!fs::exists(lib_path)) + { + continue; + } + for (const auto& entry : fs::directory_iterator(lib_path)) + { + auto fname = entry.path().filename().string(); + if (fname.starts_with("libcudart.so")) + { + arg_strings.push_back(entry.path().string()); + found_cudart = true; + break; + } + } + if (found_cudart) + { + break; + } + } + if (!found_cudart) + { + arg_strings.push_back("-lcudart"); + } + } +#endif + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + std::string stdout_str, stderr_str; + llvm::raw_string_ostream stdout_os(stdout_str); + llvm::raw_string_ostream stderr_os(stderr_str); + +#ifdef _WIN32 + bool link_success = lld::coff::link(args, stdout_os, stderr_os, false, false); +#else + bool link_success = lld::elf::link(args, stdout_os, stderr_os, false, false); +#endif + + stdout_os.flush(); + stderr_os.flush(); + + if (!stdout_str.empty()) + { + result.diagnostics += stdout_str; + } + if (!stderr_str.empty()) + { + result.diagnostics += stderr_str; + } + + if (!link_success) + { + result.diagnostics += "\nLinking failed"; + return result; + } + + result.success = true; + return result; + } +}; + +CUDACompiler::CUDACompiler() + : 
impl_(new Impl()) +{} +CUDACompiler::~CUDACompiler() +{ + delete impl_; +} + +BitcodeResult CUDACompiler::compileToDeviceBitcode(const std::string& source_code, const CompilerConfig& config) +{ + return impl_->compileToDeviceBitcode(source_code, config); +} + +CompilationResult CUDACompiler::compileToObject( + const std::string& source_code, const std::string& output_path, const CompilerConfig& config) +{ + return impl_->compileToObject(source_code, output_path, config); +} + +LinkResult CUDACompiler::linkToSharedLibrary( + const std::vector& object_files, const std::string& output_path, const CompilerConfig& config) +{ + return impl_->linkToSharedLibrary(object_files, output_path, config); +} +} // namespace hostjit diff --git a/c/parallel/src/hostjit/config.cpp b/c/parallel/src/hostjit/config.cpp new file mode 100644 index 00000000000..2bea7d807f0 --- /dev/null +++ b/c/parallel/src/hostjit/config.cpp @@ -0,0 +1,161 @@ +#include +#include +#include + +#include + +#include + +namespace hostjit +{ +CompilerConfig detectDefaultConfig() +{ + CompilerConfig config; + + // Detect CUDA toolkit path + if (const char* env = std::getenv("CUDA_PATH")) + { + config.cuda_toolkit_path = env; + } + else if (const char* env = std::getenv("CUDA_HOME")) + { + config.cuda_toolkit_path = env; + } +#ifdef CUDA_TOOLKIT_PATH + else + { + config.cuda_toolkit_path = CUDA_TOOLKIT_PATH; + } +#endif + + // Set up library paths if CUDA toolkit was found + if (!config.cuda_toolkit_path.empty()) + { + std::filesystem::path lib64_path = std::filesystem::path(config.cuda_toolkit_path) / "lib64"; + std::filesystem::path lib_path = std::filesystem::path(config.cuda_toolkit_path) / "lib"; + + if (std::filesystem::exists(lib64_path)) + { + config.library_paths.push_back(lib64_path.string()); + } + else if (std::filesystem::exists(lib_path)) + { + config.library_paths.push_back(lib_path.string()); + } + } + + // Auto-detect GPU compute capability using CUDA runtime + int device = 0; + if 
(cudaGetDevice(&device) == cudaSuccess) + { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) + { + int detected_sm = prop.major * 10 + prop.minor; + if (detected_sm >= 75) + { + config.sm_version = detected_sm; + } + } + } + + if (config.sm_version == 0) + { + config.sm_version = 75; + } + + config.optimization_level = 2; + config.debug = false; + config.verbose = false; + + // Detect hostjit include path + if (const char* env = std::getenv("HOSTJIT_INCLUDE_PATH")) + { + config.hostjit_include_path = env; + } +#ifdef HOSTJIT_INCLUDE_DIR + else + { + config.hostjit_include_path = HOSTJIT_INCLUDE_DIR; + } +#endif + + return config; +} + +bool validateConfig(const CompilerConfig& config, std::string* error_message) +{ + if (config.cuda_toolkit_path.empty()) + { + if (error_message) + { + *error_message = "CUDA toolkit path not found. Please set CUDA_PATH or CUDA_HOME environment variable."; + } + return false; + } + + if (!std::filesystem::exists(config.cuda_toolkit_path)) + { + if (error_message) + { + *error_message = "CUDA toolkit path does not exist: " + config.cuda_toolkit_path; + } + return false; + } + + std::filesystem::path cuda_h = std::filesystem::path(config.cuda_toolkit_path) / "include" / "cuda.h"; + if (!std::filesystem::exists(cuda_h)) + { + if (error_message) + { + *error_message = "CUDA headers not found at: " + cuda_h.string(); + } + return false; + } + + for (const auto& include_path : config.include_paths) + { + if (!std::filesystem::exists(include_path)) + { + if (error_message) + { + *error_message = "Include path does not exist: " + include_path; + } + return false; + } + } + + for (const auto& library_path : config.library_paths) + { + if (!std::filesystem::exists(library_path)) + { + if (error_message) + { + *error_message = "Library path does not exist: " + library_path; + } + return false; + } + } + + if (config.sm_version < 30 || config.sm_version > 150) + { + if (error_message) + { + *error_message = 
"Invalid SM version: " + std::to_string(config.sm_version) + " (must be between 30 and 150)"; + } + return false; + } + + if (config.optimization_level < 0 || config.optimization_level > 3) + { + if (error_message) + { + *error_message = + "Invalid optimization level: " + std::to_string(config.optimization_level) + " (must be between 0 and 3)"; + } + return false; + } + + return true; +} +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/compiler.hpp b/c/parallel/src/hostjit/include/hostjit/compiler.hpp new file mode 100644 index 00000000000..69f7e140fcd --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/compiler.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +namespace hostjit +{ +struct CompilationResult +{ + bool success; + std::string object_file_path; // Path to generated .o file + std::string diagnostics; // Compiler messages + std::vector cubin; // Device cubin extracted during compilation +}; + +struct BitcodeResult +{ + bool success; + std::string bitcode; // LLVM bitcode bytes + std::string diagnostics; +}; + +struct LinkResult +{ + bool success; + std::string library_path; // Path to .so file + std::string diagnostics; +}; + +// Forward declaration to avoid including heavy Clang headers +struct CompilerConfig; + +class CUDACompiler +{ +public: + CUDACompiler(); + ~CUDACompiler(); + + // Compile CUDA device source to LLVM bitcode + BitcodeResult compileToDeviceBitcode(const std::string& source_code, const CompilerConfig& config); + + // Compile CUDA source code to object file + CompilationResult + compileToObject(const std::string& source_code, const std::string& output_path, const CompilerConfig& config); + + // Link object files to shared library + LinkResult linkToSharedLibrary( + const std::vector& object_files, const std::string& output_path, const CompilerConfig& config); + +private: + class Impl; + Impl* impl_; +}; +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/config.hpp 
b/c/parallel/src/hostjit/include/hostjit/config.hpp new file mode 100644 index 00000000000..2bc248e7369 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/config.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +namespace hostjit +{ +struct CompilerConfig +{ + std::string cuda_toolkit_path; + std::string hostjit_include_path; // Path to hostjit include directory (for minimal CUDA runtime) + std::string clang_headers_path; // Path to Clang's built-in CUDA headers (overrides CLANG_HEADERS_DIR) + std::string cccl_include_path; // Path to CCCL headers (overrides CCCL_SOURCE_DIR); contains cub/, thrust/, cuda/ + std::vector include_paths; + std::vector library_paths; + std::vector device_bitcode_files; // Paths to .bc files to link into device code + std::unordered_map macro_definitions; // key=macro name, value=macro value (empty for flag + // macros) + int sm_version = 70; + int optimization_level = 2; + bool debug = false; + bool verbose = false; + bool trace_includes = false; // Show all included headers during compilation (for debugging header search) + bool keep_artifacts = false; // Keep compiled artifacts for inspection (PTX, object files, etc.) 
+ std::string entry_point_name; // Name of the exported entry point function (used for post-link optimization) + bool enable_pch = false; // Cache precompiled headers on disk to speed up repeated builds +}; + +// Auto-detect CUDA toolkit and create default configuration +CompilerConfig detectDefaultConfig(); + +// Validate that the configuration is usable +bool validateConfig(const CompilerConfig& config, std::string* error_message = nullptr); +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_device_functions.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_device_functions.h new file mode 100644 index 00000000000..fbc5b44808b --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_device_functions.h @@ -0,0 +1,1716 @@ +/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__ +#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__ + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. 
+#pragma push_macro("__DEVICE__") +#define __DEVICE__ static __device__ __forceinline__ + +__DEVICE__ int __all(int __a) +{ + return __nvvm_vote_all(__a); +} +__DEVICE__ int __any(int __a) +{ + return __nvvm_vote_any(__a); +} +__DEVICE__ unsigned int __ballot(int __a) +{ + return __nvvm_vote_ballot(__a); +} +__DEVICE__ unsigned int __brev(unsigned int __a) +{ + return __nv_brev(__a); +} +__DEVICE__ unsigned long long __brevll(unsigned long long __a) +{ + return __nv_brevll(__a); +} +#if defined(__cplusplus) +__DEVICE__ void __brkpt() +{ + __asm__ __volatile__("brkpt;"); +} +__DEVICE__ void __brkpt(int __a) +{ + __brkpt(); +} +#else +__DEVICE__ void __attribute__((overloadable)) __brkpt(void) +{ + __asm__ __volatile__("brkpt;"); +} +__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) +{ + __brkpt(); +} +#endif +__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, unsigned int __c) +{ + return __nv_byte_perm(__a, __b, __c); +} +__DEVICE__ int __clz(int __a) +{ + return __nv_clz(__a); +} +__DEVICE__ int __clzll(long long __a) +{ + return __nv_clzll(__a); +} +__DEVICE__ float __cosf(float __a) +{ + return __nv_fast_cosf(__a); +} +__DEVICE__ double __dAtomicAdd(double* __p, double __v) +{ + return __nvvm_atom_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_block(double* __p, double __v) +{ + return __nvvm_atom_cta_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_system(double* __p, double __v) +{ + return __nvvm_atom_sys_add_gen_d(__p, __v); +} +__DEVICE__ double __dadd_rd(double __a, double __b) +{ + return __nv_dadd_rd(__a, __b); +} +__DEVICE__ double __dadd_rn(double __a, double __b) +{ + return __nv_dadd_rn(__a, __b); +} +__DEVICE__ double __dadd_ru(double __a, double __b) +{ + return __nv_dadd_ru(__a, __b); +} +__DEVICE__ double __dadd_rz(double __a, double __b) +{ + return __nv_dadd_rz(__a, __b); +} +__DEVICE__ double __ddiv_rd(double __a, double __b) +{ + return __nv_ddiv_rd(__a, __b); +} +__DEVICE__ double 
__ddiv_rn(double __a, double __b) +{ + return __nv_ddiv_rn(__a, __b); +} +__DEVICE__ double __ddiv_ru(double __a, double __b) +{ + return __nv_ddiv_ru(__a, __b); +} +__DEVICE__ double __ddiv_rz(double __a, double __b) +{ + return __nv_ddiv_rz(__a, __b); +} +__DEVICE__ double __dmul_rd(double __a, double __b) +{ + return __nv_dmul_rd(__a, __b); +} +__DEVICE__ double __dmul_rn(double __a, double __b) +{ + return __nv_dmul_rn(__a, __b); +} +__DEVICE__ double __dmul_ru(double __a, double __b) +{ + return __nv_dmul_ru(__a, __b); +} +__DEVICE__ double __dmul_rz(double __a, double __b) +{ + return __nv_dmul_rz(__a, __b); +} +__DEVICE__ float __double2float_rd(double __a) +{ + return __nv_double2float_rd(__a); +} +__DEVICE__ float __double2float_rn(double __a) +{ + return __nv_double2float_rn(__a); +} +__DEVICE__ float __double2float_ru(double __a) +{ + return __nv_double2float_ru(__a); +} +__DEVICE__ float __double2float_rz(double __a) +{ + return __nv_double2float_rz(__a); +} +__DEVICE__ int __double2hiint(double __a) +{ + return __nv_double2hiint(__a); +} +__DEVICE__ int __double2int_rd(double __a) +{ + return __nv_double2int_rd(__a); +} +__DEVICE__ int __double2int_rn(double __a) +{ + return __nv_double2int_rn(__a); +} +__DEVICE__ int __double2int_ru(double __a) +{ + return __nv_double2int_ru(__a); +} +__DEVICE__ int __double2int_rz(double __a) +{ + return __nv_double2int_rz(__a); +} +__DEVICE__ long long __double2ll_rd(double __a) +{ + return __nv_double2ll_rd(__a); +} +__DEVICE__ long long __double2ll_rn(double __a) +{ + return __nv_double2ll_rn(__a); +} +__DEVICE__ long long __double2ll_ru(double __a) +{ + return __nv_double2ll_ru(__a); +} +__DEVICE__ long long __double2ll_rz(double __a) +{ + return __nv_double2ll_rz(__a); +} +__DEVICE__ int __double2loint(double __a) +{ + return __nv_double2loint(__a); +} +__DEVICE__ unsigned int __double2uint_rd(double __a) +{ + return __nv_double2uint_rd(__a); +} +__DEVICE__ unsigned int __double2uint_rn(double __a) +{ + return 
__nv_double2uint_rn(__a); +} +__DEVICE__ unsigned int __double2uint_ru(double __a) +{ + return __nv_double2uint_ru(__a); +} +__DEVICE__ unsigned int __double2uint_rz(double __a) +{ + return __nv_double2uint_rz(__a); +} +__DEVICE__ unsigned long long __double2ull_rd(double __a) +{ + return __nv_double2ull_rd(__a); +} +__DEVICE__ unsigned long long __double2ull_rn(double __a) +{ + return __nv_double2ull_rn(__a); +} +__DEVICE__ unsigned long long __double2ull_ru(double __a) +{ + return __nv_double2ull_ru(__a); +} +__DEVICE__ unsigned long long __double2ull_rz(double __a) +{ + return __nv_double2ull_rz(__a); +} +__DEVICE__ long long __double_as_longlong(double __a) +{ + return __nv_double_as_longlong(__a); +} +__DEVICE__ double __drcp_rd(double __a) +{ + return __nv_drcp_rd(__a); +} +__DEVICE__ double __drcp_rn(double __a) +{ + return __nv_drcp_rn(__a); +} +__DEVICE__ double __drcp_ru(double __a) +{ + return __nv_drcp_ru(__a); +} +__DEVICE__ double __drcp_rz(double __a) +{ + return __nv_drcp_rz(__a); +} +__DEVICE__ double __dsqrt_rd(double __a) +{ + return __nv_dsqrt_rd(__a); +} +__DEVICE__ double __dsqrt_rn(double __a) +{ + return __nv_dsqrt_rn(__a); +} +__DEVICE__ double __dsqrt_ru(double __a) +{ + return __nv_dsqrt_ru(__a); +} +__DEVICE__ double __dsqrt_rz(double __a) +{ + return __nv_dsqrt_rz(__a); +} +__DEVICE__ double __dsub_rd(double __a, double __b) +{ + return __nv_dsub_rd(__a, __b); +} +__DEVICE__ double __dsub_rn(double __a, double __b) +{ + return __nv_dsub_rn(__a, __b); +} +__DEVICE__ double __dsub_ru(double __a, double __b) +{ + return __nv_dsub_ru(__a, __b); +} +__DEVICE__ double __dsub_rz(double __a, double __b) +{ + return __nv_dsub_rz(__a, __b); +} +__DEVICE__ float __exp10f(float __a) +{ + return __nv_fast_exp10f(__a); +} +__DEVICE__ float __expf(float __a) +{ + return __nv_fast_expf(__a); +} +__DEVICE__ float __fAtomicAdd(float* __p, float __v) +{ + return __nvvm_atom_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_block(float* __p, float 
__v) +{ + return __nvvm_atom_cta_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_system(float* __p, float __v) +{ + return __nvvm_atom_sys_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicExch(float* __p, float __v) +{ + return __nv_int_as_float(__nvvm_atom_xchg_gen_i((int*) __p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_block(float* __p, float __v) +{ + return __nv_int_as_float(__nvvm_atom_cta_xchg_gen_i((int*) __p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_system(float* __p, float __v) +{ + return __nv_int_as_float(__nvvm_atom_sys_xchg_gen_i((int*) __p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fadd_rd(float __a, float __b) +{ + return __nv_fadd_rd(__a, __b); +} +__DEVICE__ float __fadd_rn(float __a, float __b) +{ + return __nv_fadd_rn(__a, __b); +} +__DEVICE__ float __fadd_ru(float __a, float __b) +{ + return __nv_fadd_ru(__a, __b); +} +__DEVICE__ float __fadd_rz(float __a, float __b) +{ + return __nv_fadd_rz(__a, __b); +} +__DEVICE__ float __fdiv_rd(float __a, float __b) +{ + return __nv_fdiv_rd(__a, __b); +} +__DEVICE__ float __fdiv_rn(float __a, float __b) +{ + return __nv_fdiv_rn(__a, __b); +} +__DEVICE__ float __fdiv_ru(float __a, float __b) +{ + return __nv_fdiv_ru(__a, __b); +} +__DEVICE__ float __fdiv_rz(float __a, float __b) +{ + return __nv_fdiv_rz(__a, __b); +} +__DEVICE__ float __fdividef(float __a, float __b) +{ + return __nv_fast_fdividef(__a, __b); +} +__DEVICE__ int __ffs(int __a) +{ + return __nv_ffs(__a); +} +__DEVICE__ int __ffsll(long long __a) +{ + return __nv_ffsll(__a); +} +__DEVICE__ int __finite(double __a) +{ + return __nv_isfinited(__a); +} +__DEVICE__ int __finitef(float __a) +{ + return __nv_finitef(__a); +} +#ifdef _MSC_VER +__DEVICE__ int __finitel(long double __a); +#endif +__DEVICE__ int __float2int_rd(float __a) +{ + return __nv_float2int_rd(__a); +} +__DEVICE__ int __float2int_rn(float __a) +{ + return __nv_float2int_rn(__a); +} +__DEVICE__ int __float2int_ru(float __a) +{ + 
return __nv_float2int_ru(__a); +} +__DEVICE__ int __float2int_rz(float __a) +{ + return __nv_float2int_rz(__a); +} +__DEVICE__ long long __float2ll_rd(float __a) +{ + return __nv_float2ll_rd(__a); +} +__DEVICE__ long long __float2ll_rn(float __a) +{ + return __nv_float2ll_rn(__a); +} +__DEVICE__ long long __float2ll_ru(float __a) +{ + return __nv_float2ll_ru(__a); +} +__DEVICE__ long long __float2ll_rz(float __a) +{ + return __nv_float2ll_rz(__a); +} +__DEVICE__ unsigned int __float2uint_rd(float __a) +{ + return __nv_float2uint_rd(__a); +} +__DEVICE__ unsigned int __float2uint_rn(float __a) +{ + return __nv_float2uint_rn(__a); +} +__DEVICE__ unsigned int __float2uint_ru(float __a) +{ + return __nv_float2uint_ru(__a); +} +__DEVICE__ unsigned int __float2uint_rz(float __a) +{ + return __nv_float2uint_rz(__a); +} +__DEVICE__ unsigned long long __float2ull_rd(float __a) +{ + return __nv_float2ull_rd(__a); +} +__DEVICE__ unsigned long long __float2ull_rn(float __a) +{ + return __nv_float2ull_rn(__a); +} +__DEVICE__ unsigned long long __float2ull_ru(float __a) +{ + return __nv_float2ull_ru(__a); +} +__DEVICE__ unsigned long long __float2ull_rz(float __a) +{ + return __nv_float2ull_rz(__a); +} +__DEVICE__ int __float_as_int(float __a) +{ + return __nv_float_as_int(__a); +} +__DEVICE__ unsigned int __float_as_uint(float __a) +{ + return __nv_float_as_uint(__a); +} +__DEVICE__ double __fma_rd(double __a, double __b, double __c) +{ + return __nv_fma_rd(__a, __b, __c); +} +__DEVICE__ double __fma_rn(double __a, double __b, double __c) +{ + return __nv_fma_rn(__a, __b, __c); +} +__DEVICE__ double __fma_ru(double __a, double __b, double __c) +{ + return __nv_fma_ru(__a, __b, __c); +} +__DEVICE__ double __fma_rz(double __a, double __b, double __c) +{ + return __nv_fma_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) +{ + return __nv_fmaf_ieee_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) +{ + 
return __nv_fmaf_ieee_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) +{ + return __nv_fmaf_ieee_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) +{ + return __nv_fmaf_ieee_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) +{ + return __nv_fmaf_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) +{ + return __nv_fmaf_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) +{ + return __nv_fmaf_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) +{ + return __nv_fmaf_rz(__a, __b, __c); +} +__DEVICE__ float __fmul_rd(float __a, float __b) +{ + return __nv_fmul_rd(__a, __b); +} +__DEVICE__ float __fmul_rn(float __a, float __b) +{ + return __nv_fmul_rn(__a, __b); +} +__DEVICE__ float __fmul_ru(float __a, float __b) +{ + return __nv_fmul_ru(__a, __b); +} +__DEVICE__ float __fmul_rz(float __a, float __b) +{ + return __nv_fmul_rz(__a, __b); +} +__DEVICE__ float __frcp_rd(float __a) +{ + return __nv_frcp_rd(__a); +} +__DEVICE__ float __frcp_rn(float __a) +{ + return __nv_frcp_rn(__a); +} +__DEVICE__ float __frcp_ru(float __a) +{ + return __nv_frcp_ru(__a); +} +__DEVICE__ float __frcp_rz(float __a) +{ + return __nv_frcp_rz(__a); +} +__DEVICE__ float __frsqrt_rn(float __a) +{ + return __nv_frsqrt_rn(__a); +} +__DEVICE__ float __fsqrt_rd(float __a) +{ + return __nv_fsqrt_rd(__a); +} +__DEVICE__ float __fsqrt_rn(float __a) +{ + return __nv_fsqrt_rn(__a); +} +__DEVICE__ float __fsqrt_ru(float __a) +{ + return __nv_fsqrt_ru(__a); +} +__DEVICE__ float __fsqrt_rz(float __a) +{ + return __nv_fsqrt_rz(__a); +} +__DEVICE__ float __fsub_rd(float __a, float __b) +{ + return __nv_fsub_rd(__a, __b); +} +__DEVICE__ float __fsub_rn(float __a, float __b) +{ + return __nv_fsub_rn(__a, __b); +} +__DEVICE__ float __fsub_ru(float __a, float __b) +{ + return __nv_fsub_ru(__a, __b); +} +__DEVICE__ 
float __fsub_rz(float __a, float __b) +{ + return __nv_fsub_rz(__a, __b); +} +__DEVICE__ int __hadd(int __a, int __b) +{ + return __nv_hadd(__a, __b); +} +__DEVICE__ double __hiloint2double(int __a, int __b) +{ + return __nv_hiloint2double(__a, __b); +} +__DEVICE__ int __iAtomicAdd(int* __p, int __v) +{ + return __nvvm_atom_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_block(int* __p, int __v) +{ + return __nvvm_atom_cta_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_system(int* __p, int __v) +{ + return __nvvm_atom_sys_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd(int* __p, int __v) +{ + return __nvvm_atom_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_block(int* __p, int __v) +{ + return __nvvm_atom_cta_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_system(int* __p, int __v) +{ + return __nvvm_atom_sys_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicCAS(int* __p, int __cmp, int __v) +{ + return __nvvm_atom_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_block(int* __p, int __cmp, int __v) +{ + return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_system(int* __p, int __cmp, int __v) +{ + return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicExch(int* __p, int __v) +{ + return __nvvm_atom_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_block(int* __p, int __v) +{ + return __nvvm_atom_cta_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_system(int* __p, int __v) +{ + return __nvvm_atom_sys_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax(int* __p, int __v) +{ + return __nvvm_atom_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_block(int* __p, int __v) +{ + return __nvvm_atom_cta_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_system(int* __p, int __v) +{ + return __nvvm_atom_sys_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin(int* __p, int __v) +{ + return __nvvm_atom_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_block(int* __p, 
int __v) +{ + return __nvvm_atom_cta_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_system(int* __p, int __v) +{ + return __nvvm_atom_sys_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr(int* __p, int __v) +{ + return __nvvm_atom_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_block(int* __p, int __v) +{ + return __nvvm_atom_cta_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_system(int* __p, int __v) +{ + return __nvvm_atom_sys_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor(int* __p, int __v) +{ + return __nvvm_atom_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_block(int* __p, int __v) +{ + return __nvvm_atom_cta_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_system(int* __p, int __v) +{ + return __nvvm_atom_sys_xor_gen_i(__p, __v); +} +__DEVICE__ long long __illAtomicMax(long long* __p, long long __v) +{ + return __nvvm_atom_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin(long long* __p, long long __v) +{ + return __nvvm_atom_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_min_gen_ll(__p, __v); +} +__DEVICE__ double __int2double_rn(int __a) +{ + return __nv_int2double_rn(__a); +} +__DEVICE__ float __int2float_rd(int __a) +{ + return __nv_int2float_rd(__a); +} +__DEVICE__ float __int2float_rn(int __a) +{ + return __nv_int2float_rn(__a); +} +__DEVICE__ float __int2float_ru(int __a) +{ + return __nv_int2float_ru(__a); +} +__DEVICE__ float __int2float_rz(int __a) +{ + return __nv_int2float_rz(__a); +} +__DEVICE__ float __int_as_float(int __a) +{ + return 
__nv_int_as_float(__a); +} +__DEVICE__ int __isfinited(double __a) +{ + return __nv_isfinited(__a); +} +__DEVICE__ int __isinf(double __a) +{ + return __nv_isinfd(__a); +} +__DEVICE__ int __isinff(float __a) +{ + return __nv_isinff(__a); +} +#ifdef _MSC_VER +__DEVICE__ int __isinfl(long double __a); +#endif +__DEVICE__ int __isnan(double __a) +{ + return __nv_isnand(__a); +} +__DEVICE__ int __isnanf(float __a) +{ + return __nv_isnanf(__a); +} +#ifdef _MSC_VER +__DEVICE__ int __isnanl(long double __a); +#endif +__DEVICE__ double __ll2double_rd(long long __a) +{ + return __nv_ll2double_rd(__a); +} +__DEVICE__ double __ll2double_rn(long long __a) +{ + return __nv_ll2double_rn(__a); +} +__DEVICE__ double __ll2double_ru(long long __a) +{ + return __nv_ll2double_ru(__a); +} +__DEVICE__ double __ll2double_rz(long long __a) +{ + return __nv_ll2double_rz(__a); +} +__DEVICE__ float __ll2float_rd(long long __a) +{ + return __nv_ll2float_rd(__a); +} +__DEVICE__ float __ll2float_rn(long long __a) +{ + return __nv_ll2float_rn(__a); +} +__DEVICE__ float __ll2float_ru(long long __a) +{ + return __nv_ll2float_ru(__a); +} +__DEVICE__ float __ll2float_rz(long long __a) +{ + return __nv_ll2float_rz(__a); +} +__DEVICE__ long long __llAtomicAnd(long long* __p, long long __v) +{ + return __nvvm_atom_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr(long long* __p, long long __v) +{ + return __nvvm_atom_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor(long long* __p, long 
long __v) +{ + return __nvvm_atom_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_xor_gen_ll(__p, __v); +} +__DEVICE__ float __log10f(float __a) +{ + return __nv_fast_log10f(__a); +} +__DEVICE__ float __log2f(float __a) +{ + return __nv_fast_log2f(__a); +} +__DEVICE__ float __logf(float __a) +{ + return __nv_fast_logf(__a); +} +__DEVICE__ double __longlong_as_double(long long __a) +{ + return __nv_longlong_as_double(__a); +} +__DEVICE__ int __mul24(int __a, int __b) +{ + return __nv_mul24(__a, __b); +} +__DEVICE__ long long __mul64hi(long long __a, long long __b) +{ + return __nv_mul64hi(__a, __b); +} +__DEVICE__ int __mulhi(int __a, int __b) +{ + return __nv_mulhi(__a, __b); +} +__DEVICE__ unsigned int __pm0(void) +{ + return __nvvm_read_ptx_sreg_pm0(); +} +__DEVICE__ unsigned int __pm1(void) +{ + return __nvvm_read_ptx_sreg_pm1(); +} +__DEVICE__ unsigned int __pm2(void) +{ + return __nvvm_read_ptx_sreg_pm2(); +} +__DEVICE__ unsigned int __pm3(void) +{ + return __nvvm_read_ptx_sreg_pm3(); +} +__DEVICE__ int __popc(unsigned int __a) +{ + return __nv_popc(__a); +} +__DEVICE__ int __popcll(unsigned long long __a) +{ + return __nv_popcll(__a); +} +__DEVICE__ float __powf(float __a, float __b) +{ + return __nv_fast_powf(__a, __b); +} + +// Parameter must have a known integer value. 
+#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a)) +__DEVICE__ int __rhadd(int __a, int __b) +{ + return __nv_rhadd(__a, __b); +} +__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) +{ + return __nv_sad(__a, __b, __c); +} +__DEVICE__ float __saturatef(float __a) +{ + return __nv_saturatef(__a); +} +__DEVICE__ int __signbitd(double __a) +{ + return __nv_signbitd(__a); +} +__DEVICE__ int __signbitf(float __a) +{ + return __nv_signbitf(__a); +} +__DEVICE__ void __sincosf(float __a, float* __s, float* __c) +{ + return __nv_fast_sincosf(__a, __s, __c); +} +__DEVICE__ float __sinf(float __a) +{ + return __nv_fast_sinf(__a); +} +__DEVICE__ int __syncthreads_and(int __a) +{ + return __nvvm_bar0_and(__a); +} +__DEVICE__ int __syncthreads_count(int __a) +{ + return __nvvm_bar0_popc(__a); +} +__DEVICE__ int __syncthreads_or(int __a) +{ + return __nvvm_bar0_or(__a); +} +__DEVICE__ float __tanf(float __a) +{ + return __nv_fast_tanf(__a); +} +__DEVICE__ void __threadfence(void) +{ + __nvvm_membar_gl(); +} +__DEVICE__ void __threadfence_block(void) +{ + __nvvm_membar_cta(); +}; +__DEVICE__ void __threadfence_system(void) +{ + __nvvm_membar_sys(); +}; +__DEVICE__ __attribute__((noreturn)) void __trap(void) +{ + __builtin_trap(); +} +__DEVICE__ unsigned short __usAtomicCAS(unsigned short* __p, unsigned short __cmp, unsigned short __v) +{ + return __nvvm_atom_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short* __p, unsigned short __cmp, unsigned short __v) +{ + return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short* __p, unsigned short __cmp, unsigned short __v) +{ + return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicAdd(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_add_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int* __p, unsigned int __v) +{ + 
return __nvvm_atom_cta_add_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_add_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_and_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_and_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_and_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicCAS(unsigned int* __p, unsigned int __cmp, unsigned int __v) +{ + return __nvvm_atom_cas_gen_i((int*) __p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicCAS_block(unsigned int* __p, unsigned int __cmp, unsigned int __v) +{ + return __nvvm_atom_cta_cas_gen_i((int*) __p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicCAS_system(unsigned int* __p, unsigned int __cmp, unsigned int __v) +{ + return __nvvm_atom_sys_cas_gen_i((int*) __p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicDec(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_xchg_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_xchg_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_xchg_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicInc(unsigned int* __p, unsigned int __v) +{ + return 
__nvvm_atom_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_or_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_or_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_or_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicXor(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_xor_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_xor_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_xor_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) +{ + return 
__nv_uhadd(__a, __b); +} +__DEVICE__ double __uint2double_rn(unsigned int __a) +{ + return __nv_uint2double_rn(__a); +} +__DEVICE__ float __uint2float_rd(unsigned int __a) +{ + return __nv_uint2float_rd(__a); +} +__DEVICE__ float __uint2float_rn(unsigned int __a) +{ + return __nv_uint2float_rn(__a); +} +__DEVICE__ float __uint2float_ru(unsigned int __a) +{ + return __nv_uint2float_ru(__a); +} +__DEVICE__ float __uint2float_rz(unsigned int __a) +{ + return __nv_uint2float_rz(__a); +} +__DEVICE__ float __uint_as_float(unsigned int __a) +{ + return __nv_uint_as_float(__a); +} // +__DEVICE__ double __ull2double_rd(unsigned long long __a) +{ + return __nv_ull2double_rd(__a); +} +__DEVICE__ double __ull2double_rn(unsigned long long __a) +{ + return __nv_ull2double_rn(__a); +} +__DEVICE__ double __ull2double_ru(unsigned long long __a) +{ + return __nv_ull2double_ru(__a); +} +__DEVICE__ double __ull2double_rz(unsigned long long __a) +{ + return __nv_ull2double_rz(__a); +} +__DEVICE__ float __ull2float_rd(unsigned long long __a) +{ + return __nv_ull2float_rd(__a); +} +__DEVICE__ float __ull2float_rn(unsigned long long __a) +{ + return __nv_ull2float_rn(__a); +} +__DEVICE__ float __ull2float_ru(unsigned long long __a) +{ + return __nv_ull2float_ru(__a); +} +__DEVICE__ float __ull2float_rz(unsigned long long __a) +{ + return __nv_ull2float_rz(__a); +} +__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_add_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_add_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_add_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_and_gen_ll((long long*) 
__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_and_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_and_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long* __p, unsigned long long __cmp, unsigned long long __v) +{ + return __nvvm_atom_cas_gen_ll((long long*) __p, __cmp, __v); +} +__DEVICE__ unsigned long long +__ullAtomicCAS_block(unsigned long long* __p, unsigned long long __cmp, unsigned long long __v) +{ + return __nvvm_atom_cta_cas_gen_ll((long long*) __p, __cmp, __v); +} +__DEVICE__ unsigned long long +__ullAtomicCAS_system(unsigned long long* __p, unsigned long long __cmp, unsigned long long __v) +{ + return __nvvm_atom_sys_cas_gen_ll((long long*) __p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_xchg_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_xchg_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_xchg_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long* __p, unsigned long long __v) +{ + 
return __nvvm_atom_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_or_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_or_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_or_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_xor_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_xor_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_xor_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) +{ + return __nv_umul24(__a, __b); +} +__DEVICE__ unsigned long long __umul64hi(unsigned long long __a, unsigned long long __b) +{ + return __nv_umul64hi(__a, __b); +} +__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) +{ + return __nv_umulhi(__a, __b); +} +__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) +{ + return __nv_urhadd(__a, __b); +} +__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b, unsigned int __c) +{ + return __nv_usad(__a, __b, __c); +} + +// CUDA no longer provides inline assembly (or bitcode) implementation of these 
+// functions, so we have to reimplement them. The implementation is naive and is +// not optimized for performance. + +// Helper function to convert N-bit boolean subfields into all-0 or all-1. +// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00 +// __bool2mask(0x00010000,16) -> 0xffff0000 +__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) +{ + return (__a << shift) - __a; +} +__DEVICE__ unsigned int __vabs2(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabs4(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + 
__asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), 
"r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vseteq2(__a, __b), 16); +} +__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vseteq4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetges2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetges4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgeu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgeu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) +{ + 
return __bool2mask(__vsetgts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgtu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgtu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetles2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetles4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetleu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetleu4(unsigned int 
__a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetleu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetlts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetlts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetltu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetltu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetne2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : 
"r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetne4(__a, __b), 8); +} + +// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086 +// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) => +// (a + b) / 2 = ((a ^ b) >> 1) + (a & b) +// To operate on multiple sub-elements we need to make sure to mask out bits +// that crossed over into adjacent elements during the shift. +__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) +{ + return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b); +} +__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) +{ + return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); +} + +__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + if ((__a & 0x8000) && (__b & 0x8000)) + { + // Work around a bug in ptxas which produces invalid result if low element + // is negative. + unsigned mask = __vcmpgts2(__a, __b); + r = (__a & mask) | (__b & ~mask); + } + else + { + __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + } + return r; +} +__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins4(unsigned int __a, 
unsigned int __b) +{ + unsigned int r; + __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg2(unsigned int __a) +{ + return __vsub2(0, __a); +} + +__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg4(unsigned int __a) +{ + return __vsub4(0, __a); +} +__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" : 
"=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss2(unsigned int __a) +{ + return __vsubss2(0, __a); +} +__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss4(unsigned int __a) +{ + return __vsubss4(0, __a); +} +__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ /* clock_t= */ int clock() +{ + return __nvvm_read_ptx_sreg_clock(); +} +__DEVICE__ long long clock64() +{ + return __nvvm_read_ptx_sreg_clock64(); +} + +__DEVICE__ void* memcpy(void* __a, const void* __b, size_t __c) +{ + return __builtin_memcpy(__a, __b, __c); +} +__DEVICE__ void* memset(void* __a, int __b, size_t __c) +{ + return __builtin_memset(__a, __b, __c); +} + +#pragma pop_macro("__DEVICE__") +#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_libdevice_declares.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_libdevice_declares.h new file mode 100644 index 00000000000..f83983e7d13 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_libdevice_declares.h @@ -0,0 +1,371 @@ +/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ + +#if defined(__cplusplus) +extern "C" { +#endif + +#define __DEVICE__ __device__ + +__DEVICE__ int __nv_abs(int __a); +__DEVICE__ double __nv_acos(double __a); +__DEVICE__ float __nv_acosf(float __a); +__DEVICE__ double __nv_acosh(double __a); +__DEVICE__ float __nv_acoshf(float __a); +__DEVICE__ double __nv_asin(double __a); +__DEVICE__ float __nv_asinf(float __a); +__DEVICE__ double __nv_asinh(double __a); +__DEVICE__ float __nv_asinhf(float __a); +__DEVICE__ double __nv_atan2(double __a, double __b); +__DEVICE__ float __nv_atan2f(float __a, float __b); +__DEVICE__ double __nv_atan(double __a); +__DEVICE__ float __nv_atanf(float __a); +__DEVICE__ double __nv_atanh(double __a); +__DEVICE__ float __nv_atanhf(float __a); +__DEVICE__ int __nv_brev(int __a); +__DEVICE__ long long __nv_brevll(long long __a); +__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c); +__DEVICE__ double __nv_cbrt(double __a); +__DEVICE__ float __nv_cbrtf(float __a); +__DEVICE__ double __nv_ceil(double __a); +__DEVICE__ float __nv_ceilf(float __a); +__DEVICE__ int __nv_clz(int __a); +__DEVICE__ int __nv_clzll(long long __a); +__DEVICE__ double __nv_copysign(double __a, double __b); +__DEVICE__ float __nv_copysignf(float __a, float __b); +__DEVICE__ double __nv_cos(double __a); +__DEVICE__ float __nv_cosf(float __a); +__DEVICE__ double __nv_cosh(double __a); +__DEVICE__ float __nv_coshf(float __a); +__DEVICE__ double __nv_cospi(double __a); +__DEVICE__ float __nv_cospif(float __a); +__DEVICE__ double __nv_cyl_bessel_i0(double __a); +__DEVICE__ float __nv_cyl_bessel_i0f(float __a); +__DEVICE__ double __nv_cyl_bessel_i1(double __a); +__DEVICE__ float __nv_cyl_bessel_i1f(float __a); +__DEVICE__ double __nv_dadd_rd(double __a, double __b); +__DEVICE__ 
double __nv_dadd_rn(double __a, double __b); +__DEVICE__ double __nv_dadd_ru(double __a, double __b); +__DEVICE__ double __nv_dadd_rz(double __a, double __b); +__DEVICE__ double __nv_ddiv_rd(double __a, double __b); +__DEVICE__ double __nv_ddiv_rn(double __a, double __b); +__DEVICE__ double __nv_ddiv_ru(double __a, double __b); +__DEVICE__ double __nv_ddiv_rz(double __a, double __b); +__DEVICE__ double __nv_dmul_rd(double __a, double __b); +__DEVICE__ double __nv_dmul_rn(double __a, double __b); +__DEVICE__ double __nv_dmul_ru(double __a, double __b); +__DEVICE__ double __nv_dmul_rz(double __a, double __b); +__DEVICE__ float __nv_double2float_rd(double __a); +__DEVICE__ float __nv_double2float_rn(double __a); +__DEVICE__ float __nv_double2float_ru(double __a); +__DEVICE__ float __nv_double2float_rz(double __a); +__DEVICE__ int __nv_double2hiint(double __a); +__DEVICE__ int __nv_double2int_rd(double __a); +__DEVICE__ int __nv_double2int_rn(double __a); +__DEVICE__ int __nv_double2int_ru(double __a); +__DEVICE__ int __nv_double2int_rz(double __a); +__DEVICE__ long long __nv_double2ll_rd(double __a); +__DEVICE__ long long __nv_double2ll_rn(double __a); +__DEVICE__ long long __nv_double2ll_ru(double __a); +__DEVICE__ long long __nv_double2ll_rz(double __a); +__DEVICE__ int __nv_double2loint(double __a); +__DEVICE__ unsigned int __nv_double2uint_rd(double __a); +__DEVICE__ unsigned int __nv_double2uint_rn(double __a); +__DEVICE__ unsigned int __nv_double2uint_ru(double __a); +__DEVICE__ unsigned int __nv_double2uint_rz(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rd(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rn(double __a); +__DEVICE__ unsigned long long __nv_double2ull_ru(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rz(double __a); +__DEVICE__ unsigned long long __nv_double_as_longlong(double __a); +__DEVICE__ double __nv_drcp_rd(double __a); +__DEVICE__ double __nv_drcp_rn(double __a); +__DEVICE__ double 
__nv_drcp_ru(double __a); +__DEVICE__ double __nv_drcp_rz(double __a); +__DEVICE__ double __nv_dsqrt_rd(double __a); +__DEVICE__ double __nv_dsqrt_rn(double __a); +__DEVICE__ double __nv_dsqrt_ru(double __a); +__DEVICE__ double __nv_dsqrt_rz(double __a); +__DEVICE__ double __nv_dsub_rd(double __a, double __b); +__DEVICE__ double __nv_dsub_rn(double __a, double __b); +__DEVICE__ double __nv_dsub_ru(double __a, double __b); +__DEVICE__ double __nv_dsub_rz(double __a, double __b); +__DEVICE__ double __nv_erfc(double __a); +__DEVICE__ float __nv_erfcf(float __a); +__DEVICE__ double __nv_erfcinv(double __a); +__DEVICE__ float __nv_erfcinvf(float __a); +__DEVICE__ double __nv_erfcx(double __a); +__DEVICE__ float __nv_erfcxf(float __a); +__DEVICE__ double __nv_erf(double __a); +__DEVICE__ float __nv_erff(float __a); +__DEVICE__ double __nv_erfinv(double __a); +__DEVICE__ float __nv_erfinvf(float __a); +__DEVICE__ double __nv_exp10(double __a); +__DEVICE__ float __nv_exp10f(float __a); +__DEVICE__ double __nv_exp2(double __a); +__DEVICE__ float __nv_exp2f(float __a); +__DEVICE__ double __nv_exp(double __a); +__DEVICE__ float __nv_expf(float __a); +__DEVICE__ double __nv_expm1(double __a); +__DEVICE__ float __nv_expm1f(float __a); +__DEVICE__ double __nv_fabs(double __a); +__DEVICE__ float __nv_fabsf(float __a); +__DEVICE__ float __nv_fadd_rd(float __a, float __b); +__DEVICE__ float __nv_fadd_rn(float __a, float __b); +__DEVICE__ float __nv_fadd_ru(float __a, float __b); +__DEVICE__ float __nv_fadd_rz(float __a, float __b); +__DEVICE__ float __nv_fast_cosf(float __a); +__DEVICE__ float __nv_fast_exp10f(float __a); +__DEVICE__ float __nv_fast_expf(float __a); +__DEVICE__ float __nv_fast_fdividef(float __a, float __b); +__DEVICE__ float __nv_fast_log10f(float __a); +__DEVICE__ float __nv_fast_log2f(float __a); +__DEVICE__ float __nv_fast_logf(float __a); +__DEVICE__ float __nv_fast_powf(float __a, float __b); +__DEVICE__ void __nv_fast_sincosf(float __a, float* __s, float* 
__c); +__DEVICE__ float __nv_fast_sinf(float __a); +__DEVICE__ float __nv_fast_tanf(float __a); +__DEVICE__ double __nv_fdim(double __a, double __b); +__DEVICE__ float __nv_fdimf(float __a, float __b); +__DEVICE__ float __nv_fdiv_rd(float __a, float __b); +__DEVICE__ float __nv_fdiv_rn(float __a, float __b); +__DEVICE__ float __nv_fdiv_ru(float __a, float __b); +__DEVICE__ float __nv_fdiv_rz(float __a, float __b); +__DEVICE__ int __nv_ffs(int __a); +__DEVICE__ int __nv_ffsll(long long __a); +__DEVICE__ int __nv_finitef(float __a); +__DEVICE__ unsigned short __nv_float2half_rn(float __a); +__DEVICE__ int __nv_float2int_rd(float __a); +__DEVICE__ int __nv_float2int_rn(float __a); +__DEVICE__ int __nv_float2int_ru(float __a); +__DEVICE__ int __nv_float2int_rz(float __a); +__DEVICE__ long long __nv_float2ll_rd(float __a); +__DEVICE__ long long __nv_float2ll_rn(float __a); +__DEVICE__ long long __nv_float2ll_ru(float __a); +__DEVICE__ long long __nv_float2ll_rz(float __a); +__DEVICE__ unsigned int __nv_float2uint_rd(float __a); +__DEVICE__ unsigned int __nv_float2uint_rn(float __a); +__DEVICE__ unsigned int __nv_float2uint_ru(float __a); +__DEVICE__ unsigned int __nv_float2uint_rz(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rd(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rn(float __a); +__DEVICE__ unsigned long long __nv_float2ull_ru(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rz(float __a); +__DEVICE__ int __nv_float_as_int(float __a); +__DEVICE__ unsigned int __nv_float_as_uint(float __a); +__DEVICE__ double __nv_floor(double __a); +__DEVICE__ float __nv_floorf(float __a); +__DEVICE__ double __nv_fma(double __a, double __b, double __c); +__DEVICE__ float __nv_fmaf(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); +__DEVICE__ float 
__nv_fmaf_ieee_rz(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c); +__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c); +__DEVICE__ double __nv_fmax(double __a, double __b); +__DEVICE__ float __nv_fmaxf(float __a, float __b); +__DEVICE__ double __nv_fmin(double __a, double __b); +__DEVICE__ float __nv_fminf(float __a, float __b); +__DEVICE__ double __nv_fmod(double __a, double __b); +__DEVICE__ float __nv_fmodf(float __a, float __b); +__DEVICE__ float __nv_fmul_rd(float __a, float __b); +__DEVICE__ float __nv_fmul_rn(float __a, float __b); +__DEVICE__ float __nv_fmul_ru(float __a, float __b); +__DEVICE__ float __nv_fmul_rz(float __a, float __b); +__DEVICE__ float __nv_frcp_rd(float __a); +__DEVICE__ float __nv_frcp_rn(float __a); +__DEVICE__ float __nv_frcp_ru(float __a); +__DEVICE__ float __nv_frcp_rz(float __a); +__DEVICE__ double __nv_frexp(double __a, int* __b); +__DEVICE__ float __nv_frexpf(float __a, int* __b); +__DEVICE__ float __nv_frsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_rd(float __a); +__DEVICE__ float __nv_fsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_ru(float __a); +__DEVICE__ float __nv_fsqrt_rz(float __a); +__DEVICE__ float __nv_fsub_rd(float __a, float __b); +__DEVICE__ float __nv_fsub_rn(float __a, float __b); +__DEVICE__ float __nv_fsub_ru(float __a, float __b); +__DEVICE__ float __nv_fsub_rz(float __a, float __b); +__DEVICE__ int __nv_hadd(int __a, int __b); +__DEVICE__ float __nv_half2float(unsigned short __h); +__DEVICE__ double __nv_hiloint2double(int __a, int __b); 
+__DEVICE__ double __nv_hypot(double __a, double __b); +__DEVICE__ float __nv_hypotf(float __a, float __b); +__DEVICE__ int __nv_ilogb(double __a); +__DEVICE__ int __nv_ilogbf(float __a); +__DEVICE__ double __nv_int2double_rn(int __a); +__DEVICE__ float __nv_int2float_rd(int __a); +__DEVICE__ float __nv_int2float_rn(int __a); +__DEVICE__ float __nv_int2float_ru(int __a); +__DEVICE__ float __nv_int2float_rz(int __a); +__DEVICE__ float __nv_int_as_float(int __a); +__DEVICE__ int __nv_isfinited(double __a); +__DEVICE__ int __nv_isinfd(double __a); +__DEVICE__ int __nv_isinff(float __a); +__DEVICE__ int __nv_isnand(double __a); +__DEVICE__ int __nv_isnanf(float __a); +__DEVICE__ double __nv_j0(double __a); +__DEVICE__ float __nv_j0f(float __a); +__DEVICE__ double __nv_j1(double __a); +__DEVICE__ float __nv_j1f(float __a); +__DEVICE__ float __nv_jnf(int __a, float __b); +__DEVICE__ double __nv_jn(int __a, double __b); +__DEVICE__ double __nv_ldexp(double __a, int __b); +__DEVICE__ float __nv_ldexpf(float __a, int __b); +__DEVICE__ double __nv_lgamma(double __a); +__DEVICE__ float __nv_lgammaf(float __a); +__DEVICE__ double __nv_ll2double_rd(long long __a); +__DEVICE__ double __nv_ll2double_rn(long long __a); +__DEVICE__ double __nv_ll2double_ru(long long __a); +__DEVICE__ double __nv_ll2double_rz(long long __a); +__DEVICE__ float __nv_ll2float_rd(long long __a); +__DEVICE__ float __nv_ll2float_rn(long long __a); +__DEVICE__ float __nv_ll2float_ru(long long __a); +__DEVICE__ float __nv_ll2float_rz(long long __a); +__DEVICE__ long long __nv_llabs(long long __a); +__DEVICE__ long long __nv_llmax(long long __a, long long __b); +__DEVICE__ long long __nv_llmin(long long __a, long long __b); +__DEVICE__ long long __nv_llrint(double __a); +__DEVICE__ long long __nv_llrintf(float __a); +__DEVICE__ long long __nv_llround(double __a); +__DEVICE__ long long __nv_llroundf(float __a); +__DEVICE__ double __nv_log10(double __a); +__DEVICE__ float __nv_log10f(float __a); +__DEVICE__ 
double __nv_log1p(double __a); +__DEVICE__ float __nv_log1pf(float __a); +__DEVICE__ double __nv_log2(double __a); +__DEVICE__ float __nv_log2f(float __a); +__DEVICE__ double __nv_logb(double __a); +__DEVICE__ float __nv_logbf(float __a); +__DEVICE__ double __nv_log(double __a); +__DEVICE__ float __nv_logf(float __a); +__DEVICE__ double __nv_longlong_as_double(long long __a); +__DEVICE__ int __nv_max(int __a, int __b); +__DEVICE__ int __nv_min(int __a, int __b); +__DEVICE__ double __nv_modf(double __a, double* __b); +__DEVICE__ float __nv_modff(float __a, float* __b); +__DEVICE__ int __nv_mul24(int __a, int __b); +__DEVICE__ long long __nv_mul64hi(long long __a, long long __b); +__DEVICE__ int __nv_mulhi(int __a, int __b); +__DEVICE__ double __nv_nan(const signed char* __a); +__DEVICE__ float __nv_nanf(const signed char* __a); +__DEVICE__ double __nv_nearbyint(double __a); +__DEVICE__ float __nv_nearbyintf(float __a); +__DEVICE__ double __nv_nextafter(double __a, double __b); +__DEVICE__ float __nv_nextafterf(float __a, float __b); +__DEVICE__ double __nv_norm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_norm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d); +__DEVICE__ double __nv_normcdf(double __a); +__DEVICE__ float __nv_normcdff(float __a); +__DEVICE__ double __nv_normcdfinv(double __a); +__DEVICE__ float __nv_normcdfinvf(float __a); +__DEVICE__ float __nv_normf(int __a, const float* __b); +__DEVICE__ double __nv_norm(int __a, const double* __b); +__DEVICE__ int __nv_popc(unsigned int __a); +__DEVICE__ int __nv_popcll(unsigned long long __a); +__DEVICE__ double __nv_pow(double __a, double __b); +__DEVICE__ float __nv_powf(float __a, float __b); +__DEVICE__ double __nv_powi(double __a, int __b); +__DEVICE__ float __nv_powif(float __a, int __b); +__DEVICE__ double __nv_rcbrt(double __a); +__DEVICE__ float 
__nv_rcbrtf(float __a); +__DEVICE__ double __nv_rcp64h(double __a); +__DEVICE__ double __nv_remainder(double __a, double __b); +__DEVICE__ float __nv_remainderf(float __a, float __b); +__DEVICE__ double __nv_remquo(double __a, double __b, int* __c); +__DEVICE__ float __nv_remquof(float __a, float __b, int* __c); +__DEVICE__ int __nv_rhadd(int __a, int __b); +__DEVICE__ double __nv_rhypot(double __a, double __b); +__DEVICE__ float __nv_rhypotf(float __a, float __b); +__DEVICE__ double __nv_rint(double __a); +__DEVICE__ float __nv_rintf(float __a); +__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); +__DEVICE__ float __nv_rnormf(int __a, const float* __b); +__DEVICE__ double __nv_rnorm(int __a, const double* __b); +__DEVICE__ double __nv_round(double __a); +__DEVICE__ float __nv_roundf(float __a); +__DEVICE__ double __nv_rsqrt(double __a); +__DEVICE__ float __nv_rsqrtf(float __a); +__DEVICE__ int __nv_sad(int __a, int __b, int __c); +__DEVICE__ float __nv_saturatef(float __a); +__DEVICE__ double __nv_scalbn(double __a, int __b); +__DEVICE__ float __nv_scalbnf(float __a, int __b); +__DEVICE__ int __nv_signbitd(double __a); +__DEVICE__ int __nv_signbitf(float __a); +__DEVICE__ void __nv_sincos(double __a, double* __b, double* __c); +__DEVICE__ void __nv_sincosf(float __a, float* __b, float* __c); +__DEVICE__ void __nv_sincospi(double __a, double* __b, double* __c); +__DEVICE__ void __nv_sincospif(float __a, float* __b, float* __c); +__DEVICE__ double __nv_sin(double __a); +__DEVICE__ float __nv_sinf(float __a); +__DEVICE__ double __nv_sinh(double __a); +__DEVICE__ float __nv_sinhf(float __a); +__DEVICE__ double __nv_sinpi(double __a); +__DEVICE__ float __nv_sinpif(float __a); +__DEVICE__ double __nv_sqrt(double __a); +__DEVICE__ 
float __nv_sqrtf(float __a); +__DEVICE__ double __nv_tan(double __a); +__DEVICE__ float __nv_tanf(float __a); +__DEVICE__ double __nv_tanh(double __a); +__DEVICE__ float __nv_tanhf(float __a); +__DEVICE__ double __nv_tgamma(double __a); +__DEVICE__ float __nv_tgammaf(float __a); +__DEVICE__ double __nv_trunc(double __a); +__DEVICE__ float __nv_truncf(float __a); +__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b); +__DEVICE__ double __nv_uint2double_rn(unsigned int __i); +__DEVICE__ float __nv_uint2float_rd(unsigned int __a); +__DEVICE__ float __nv_uint2float_rn(unsigned int __a); +__DEVICE__ float __nv_uint2float_ru(unsigned int __a); +__DEVICE__ float __nv_uint2float_rz(unsigned int __a); +__DEVICE__ float __nv_uint_as_float(unsigned int __a); +__DEVICE__ double __nv_ull2double_rd(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rn(unsigned long long __a); +__DEVICE__ double __nv_ull2double_ru(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rz(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rd(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rn(unsigned long long __a); +__DEVICE__ float __nv_ull2float_ru(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rz(unsigned long long __a); +__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a, unsigned long long __b); +__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a, unsigned long long __b); +__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a, unsigned long long __b); +__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, unsigned int __c); 
+__DEVICE__ double __nv_y0(double __a); +__DEVICE__ float __nv_y0f(float __a); +__DEVICE__ double __nv_y1(double __a); +__DEVICE__ float __nv_y1f(float __a); +__DEVICE__ float __nv_ynf(int __a, float __b); +__DEVICE__ double __nv_yn(int __a, double __b); + +#if defined(__cplusplus) +} // extern "C" +#endif +#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_math.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_math.h new file mode 100644 index 00000000000..964d949e889 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_math.h @@ -0,0 +1,809 @@ +/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __CLANG_CUDA_MATH_H__ +#define __CLANG_CUDA_MATH_H__ +#ifndef __CUDA__ +# error "This file is for CUDA compilation only." +#endif + +// The __CLANG_GPU_DISABLE_MATH_WRAPPERS macro provides a way to let standard +// libcalls reach the link step instead of being eagerly replaced. +#ifndef __CLANG_GPU_DISABLE_MATH_WRAPPERS + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. +# pragma push_macro("__DEVICE__") +# define __DEVICE__ static __device__ __forceinline__ + +// Specialized version of __DEVICE__ for functions with void return type. +# pragma push_macro("__DEVICE_VOID__") +# define __DEVICE_VOID__ __DEVICE__ + +// libdevice provides fast low precision and slow full-recision implementations +// for some functions. 
Which one gets selected depends on +// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if +// -ffast-math or -fgpu-approx-transcendentals are in effect. +# pragma push_macro("__FAST_OR_SLOW") +# if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) +# define __FAST_OR_SLOW(fast, slow) fast +# else +# define __FAST_OR_SLOW(fast, slow) slow +# endif + +__DEVICE__ int abs(int __a) +{ + return __nv_abs(__a); +} +__DEVICE__ double fabs(double __a) +{ + return __nv_fabs(__a); +} +__DEVICE__ double acos(double __a) +{ + return __nv_acos(__a); +} +__DEVICE__ float acosf(float __a) +{ + return __nv_acosf(__a); +} +__DEVICE__ double acosh(double __a) +{ + return __nv_acosh(__a); +} +__DEVICE__ float acoshf(float __a) +{ + return __nv_acoshf(__a); +} +__DEVICE__ double asin(double __a) +{ + return __nv_asin(__a); +} +__DEVICE__ float asinf(float __a) +{ + return __nv_asinf(__a); +} +__DEVICE__ double asinh(double __a) +{ + return __nv_asinh(__a); +} +__DEVICE__ float asinhf(float __a) +{ + return __nv_asinhf(__a); +} +__DEVICE__ double atan(double __a) +{ + return __nv_atan(__a); +} +__DEVICE__ double atan2(double __a, double __b) +{ + return __nv_atan2(__a, __b); +} +__DEVICE__ float atan2f(float __a, float __b) +{ + return __nv_atan2f(__a, __b); +} +__DEVICE__ float atanf(float __a) +{ + return __nv_atanf(__a); +} +__DEVICE__ double atanh(double __a) +{ + return __nv_atanh(__a); +} +__DEVICE__ float atanhf(float __a) +{ + return __nv_atanhf(__a); +} +__DEVICE__ double cbrt(double __a) +{ + return __nv_cbrt(__a); +} +__DEVICE__ float cbrtf(float __a) +{ + return __nv_cbrtf(__a); +} +__DEVICE__ double ceil(double __a) +{ + return __nv_ceil(__a); +} +__DEVICE__ float ceilf(float __a) +{ + return __nv_ceilf(__a); +} +__DEVICE__ double copysign(double __a, double __b) +{ + return __nv_copysign(__a, __b); +} +__DEVICE__ float copysignf(float __a, float __b) +{ + return __nv_copysignf(__a, __b); +} +__DEVICE__ double cos(double __a) +{ + return __nv_cos(__a); +} 
+__DEVICE__ float cosf(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a); +} +__DEVICE__ double cosh(double __a) +{ + return __nv_cosh(__a); +} +__DEVICE__ float coshf(float __a) +{ + return __nv_coshf(__a); +} +__DEVICE__ double cospi(double __a) +{ + return __nv_cospi(__a); +} +__DEVICE__ float cospif(float __a) +{ + return __nv_cospif(__a); +} +__DEVICE__ double cyl_bessel_i0(double __a) +{ + return __nv_cyl_bessel_i0(__a); +} +__DEVICE__ float cyl_bessel_i0f(float __a) +{ + return __nv_cyl_bessel_i0f(__a); +} +__DEVICE__ double cyl_bessel_i1(double __a) +{ + return __nv_cyl_bessel_i1(__a); +} +__DEVICE__ float cyl_bessel_i1f(float __a) +{ + return __nv_cyl_bessel_i1f(__a); +} +__DEVICE__ double erf(double __a) +{ + return __nv_erf(__a); +} +__DEVICE__ double erfc(double __a) +{ + return __nv_erfc(__a); +} +__DEVICE__ float erfcf(float __a) +{ + return __nv_erfcf(__a); +} +__DEVICE__ double erfcinv(double __a) +{ + return __nv_erfcinv(__a); +} +__DEVICE__ float erfcinvf(float __a) +{ + return __nv_erfcinvf(__a); +} +__DEVICE__ double erfcx(double __a) +{ + return __nv_erfcx(__a); +} +__DEVICE__ float erfcxf(float __a) +{ + return __nv_erfcxf(__a); +} +__DEVICE__ float erff(float __a) +{ + return __nv_erff(__a); +} +__DEVICE__ double erfinv(double __a) +{ + return __nv_erfinv(__a); +} +__DEVICE__ float erfinvf(float __a) +{ + return __nv_erfinvf(__a); +} +__DEVICE__ double exp(double __a) +{ + return __nv_exp(__a); +} +__DEVICE__ double exp10(double __a) +{ + return __nv_exp10(__a); +} +__DEVICE__ float exp10f(float __a) +{ + return __nv_exp10f(__a); +} +__DEVICE__ double exp2(double __a) +{ + return __nv_exp2(__a); +} +__DEVICE__ float exp2f(float __a) +{ + return __nv_exp2f(__a); +} +__DEVICE__ float expf(float __a) +{ + return __nv_expf(__a); +} +__DEVICE__ double expm1(double __a) +{ + return __nv_expm1(__a); +} +__DEVICE__ float expm1f(float __a) +{ + return __nv_expm1f(__a); +} +__DEVICE__ float fabsf(float __a) +{ + return 
__nv_fabsf(__a); +} +__DEVICE__ double fdim(double __a, double __b) +{ + return __nv_fdim(__a, __b); +} +__DEVICE__ float fdimf(float __a, float __b) +{ + return __nv_fdimf(__a, __b); +} +__DEVICE__ double fdivide(double __a, double __b) +{ + return __a / __b; +} +__DEVICE__ float fdividef(float __a, float __b) +{ +# if __FAST_MATH__ && !__CUDA_PREC_DIV + return __nv_fast_fdividef(__a, __b); +# else + return __a / __b; +# endif +} +__DEVICE__ double floor(double __f) +{ + return __nv_floor(__f); +} +__DEVICE__ float floorf(float __f) +{ + return __nv_floorf(__f); +} +__DEVICE__ double fma(double __a, double __b, double __c) +{ + return __nv_fma(__a, __b, __c); +} +__DEVICE__ float fmaf(float __a, float __b, float __c) +{ + return __nv_fmaf(__a, __b, __c); +} +__DEVICE__ double fmax(double __a, double __b) +{ + return __nv_fmax(__a, __b); +} +__DEVICE__ float fmaxf(float __a, float __b) +{ + return __nv_fmaxf(__a, __b); +} +__DEVICE__ double fmin(double __a, double __b) +{ + return __nv_fmin(__a, __b); +} +__DEVICE__ float fminf(float __a, float __b) +{ + return __nv_fminf(__a, __b); +} +__DEVICE__ double fmod(double __a, double __b) +{ + return __nv_fmod(__a, __b); +} +__DEVICE__ float fmodf(float __a, float __b) +{ + return __nv_fmodf(__a, __b); +} +__DEVICE__ double frexp(double __a, int* __b) +{ + return __nv_frexp(__a, __b); +} +__DEVICE__ float frexpf(float __a, int* __b) +{ + return __nv_frexpf(__a, __b); +} +__DEVICE__ double hypot(double __a, double __b) +{ + return __nv_hypot(__a, __b); +} +__DEVICE__ float hypotf(float __a, float __b) +{ + return __nv_hypotf(__a, __b); +} +__DEVICE__ int ilogb(double __a) +{ + return __nv_ilogb(__a); +} +__DEVICE__ int ilogbf(float __a) +{ + return __nv_ilogbf(__a); +} +__DEVICE__ double j0(double __a) +{ + return __nv_j0(__a); +} +__DEVICE__ float j0f(float __a) +{ + return __nv_j0f(__a); +} +__DEVICE__ double j1(double __a) +{ + return __nv_j1(__a); +} +__DEVICE__ float j1f(float __a) +{ + return __nv_j1f(__a); +} 
+__DEVICE__ double jn(int __n, double __a) +{ + return __nv_jn(__n, __a); +} +__DEVICE__ float jnf(int __n, float __a) +{ + return __nv_jnf(__n, __a); +} +# if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long labs(long __a) +{ + return __nv_llabs(__a); +}; +# else +__DEVICE__ long labs(long __a) +{ + return __nv_abs(__a); +}; +# endif +__DEVICE__ double ldexp(double __a, int __b) +{ + return __nv_ldexp(__a, __b); +} +__DEVICE__ float ldexpf(float __a, int __b) +{ + return __nv_ldexpf(__a, __b); +} +__DEVICE__ double lgamma(double __a) +{ + return __nv_lgamma(__a); +} +__DEVICE__ float lgammaf(float __a) +{ + return __nv_lgammaf(__a); +} +__DEVICE__ long long llabs(long long __a) +{ + return __nv_llabs(__a); +} +__DEVICE__ long long llmax(long long __a, long long __b) +{ + return __nv_llmax(__a, __b); +} +__DEVICE__ long long llmin(long long __a, long long __b) +{ + return __nv_llmin(__a, __b); +} +__DEVICE__ long long llrint(double __a) +{ + return __nv_llrint(__a); +} +__DEVICE__ long long llrintf(float __a) +{ + return __nv_llrintf(__a); +} +__DEVICE__ long long llround(double __a) +{ + return __nv_llround(__a); +} +__DEVICE__ long long llroundf(float __a) +{ + return __nv_llroundf(__a); +} +__DEVICE__ double round(double __a) +{ + return __nv_round(__a); +} +__DEVICE__ float roundf(float __a) +{ + return __nv_roundf(__a); +} +__DEVICE__ double log(double __a) +{ + return __nv_log(__a); +} +__DEVICE__ double log10(double __a) +{ + return __nv_log10(__a); +} +__DEVICE__ float log10f(float __a) +{ + return __nv_log10f(__a); +} +__DEVICE__ double log1p(double __a) +{ + return __nv_log1p(__a); +} +__DEVICE__ float log1pf(float __a) +{ + return __nv_log1pf(__a); +} +__DEVICE__ double log2(double __a) +{ + return __nv_log2(__a); +} +__DEVICE__ float log2f(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a); +} +__DEVICE__ double logb(double __a) +{ + return __nv_logb(__a); +} +__DEVICE__ float logbf(float __a) +{ + return __nv_logbf(__a); +} 
+__DEVICE__ float logf(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a); +} +# if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long lrint(double __a) +{ + return llrint(__a); +} +__DEVICE__ long lrintf(float __a) +{ + return __float2ll_rn(__a); +} +__DEVICE__ long lround(double __a) +{ + return llround(__a); +} +__DEVICE__ long lroundf(float __a) +{ + return llroundf(__a); +} +# else +__DEVICE__ long lrint(double __a) +{ + return (long) rint(__a); +} +__DEVICE__ long lrintf(float __a) +{ + return __float2int_rn(__a); +} +__DEVICE__ long lround(double __a) +{ + return round(__a); +} +__DEVICE__ long lroundf(float __a) +{ + return roundf(__a); +} +# endif +__DEVICE__ int max(int __a, int __b) +{ + return __nv_max(__a, __b); +} +__DEVICE__ int min(int __a, int __b) +{ + return __nv_min(__a, __b); +} +__DEVICE__ double modf(double __a, double* __b) +{ + return __nv_modf(__a, __b); +} +__DEVICE__ float modff(float __a, float* __b) +{ + return __nv_modff(__a, __b); +} +__DEVICE__ double nearbyint(double __a) +{ + return __builtin_nearbyint(__a); +} +__DEVICE__ float nearbyintf(float __a) +{ + return __builtin_nearbyintf(__a); +} +__DEVICE__ double nextafter(double __a, double __b) +{ + return __nv_nextafter(__a, __b); +} +__DEVICE__ float nextafterf(float __a, float __b) +{ + return __nv_nextafterf(__a, __b); +} +__DEVICE__ double norm(int __dim, const double* __t) +{ + return __nv_norm(__dim, __t); +} +__DEVICE__ double norm3d(double __a, double __b, double __c) +{ + return __nv_norm3d(__a, __b, __c); +} +__DEVICE__ float norm3df(float __a, float __b, float __c) +{ + return __nv_norm3df(__a, __b, __c); +} +__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) +{ + return __nv_norm4d(__a, __b, __c, __d); +} +__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) +{ + return __nv_norm4df(__a, __b, __c, __d); +} +__DEVICE__ double normcdf(double __a) +{ + return __nv_normcdf(__a); +} +__DEVICE__ float normcdff(float 
__a) +{ + return __nv_normcdff(__a); +} +__DEVICE__ double normcdfinv(double __a) +{ + return __nv_normcdfinv(__a); +} +__DEVICE__ float normcdfinvf(float __a) +{ + return __nv_normcdfinvf(__a); +} +__DEVICE__ float normf(int __dim, const float* __t) +{ + return __nv_normf(__dim, __t); +} +__DEVICE__ double pow(double __a, double __b) +{ + return __nv_pow(__a, __b); +} +__DEVICE__ float powf(float __a, float __b) +{ + return __nv_powf(__a, __b); +} +__DEVICE__ double powi(double __a, int __b) +{ + return __nv_powi(__a, __b); +} +__DEVICE__ float powif(float __a, int __b) +{ + return __nv_powif(__a, __b); +} +__DEVICE__ double rcbrt(double __a) +{ + return __nv_rcbrt(__a); +} +__DEVICE__ float rcbrtf(float __a) +{ + return __nv_rcbrtf(__a); +} +__DEVICE__ double remainder(double __a, double __b) +{ + return __nv_remainder(__a, __b); +} +__DEVICE__ float remainderf(float __a, float __b) +{ + return __nv_remainderf(__a, __b); +} +__DEVICE__ double remquo(double __a, double __b, int* __c) +{ + return __nv_remquo(__a, __b, __c); +} +__DEVICE__ float remquof(float __a, float __b, int* __c) +{ + return __nv_remquof(__a, __b, __c); +} +__DEVICE__ double rhypot(double __a, double __b) +{ + return __nv_rhypot(__a, __b); +} +__DEVICE__ float rhypotf(float __a, float __b) +{ + return __nv_rhypotf(__a, __b); +} +// __nv_rint* in libdevice is buggy and produces incorrect results. 
+__DEVICE__ double rint(double __a) +{ + return __builtin_rint(__a); +} +__DEVICE__ float rintf(float __a) +{ + return __builtin_rintf(__a); +} +__DEVICE__ double rnorm(int __a, const double* __b) +{ + return __nv_rnorm(__a, __b); +} +__DEVICE__ double rnorm3d(double __a, double __b, double __c) +{ + return __nv_rnorm3d(__a, __b, __c); +} +__DEVICE__ float rnorm3df(float __a, float __b, float __c) +{ + return __nv_rnorm3df(__a, __b, __c); +} +__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) +{ + return __nv_rnorm4d(__a, __b, __c, __d); +} +__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) +{ + return __nv_rnorm4df(__a, __b, __c, __d); +} +__DEVICE__ float rnormf(int __dim, const float* __t) +{ + return __nv_rnormf(__dim, __t); +} +__DEVICE__ double rsqrt(double __a) +{ + return __nv_rsqrt(__a); +} +__DEVICE__ float rsqrtf(float __a) +{ + return __nv_rsqrtf(__a); +} +__DEVICE__ double scalbn(double __a, int __b) +{ + return __nv_scalbn(__a, __b); +} +__DEVICE__ float scalbnf(float __a, int __b) +{ + return __nv_scalbnf(__a, __b); +} +__DEVICE__ double scalbln(double __a, long __b) +{ + if (__b > INT_MAX) + { + return __a > 0 ? HUGE_VAL : -HUGE_VAL; + } + if (__b < INT_MIN) + { + return __a > 0 ? 0.0 : -0.0; + } + return scalbn(__a, (int) __b); +} +__DEVICE__ float scalblnf(float __a, long __b) +{ + if (__b > INT_MAX) + { + return __a > 0 ? HUGE_VALF : -HUGE_VALF; + } + if (__b < INT_MIN) + { + return __a > 0 ? 
0.f : -0.f; + } + return scalbnf(__a, (int) __b); +} +__DEVICE__ double sin(double __a) +{ + return __nv_sin(__a); +} +__DEVICE_VOID__ void sincos(double __a, double* __s, double* __c) +{ + return __nv_sincos(__a, __s, __c); +} +__DEVICE_VOID__ void sincosf(float __a, float* __s, float* __c) +{ + return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c); +} +__DEVICE_VOID__ void sincospi(double __a, double* __s, double* __c) +{ + return __nv_sincospi(__a, __s, __c); +} +__DEVICE_VOID__ void sincospif(float __a, float* __s, float* __c) +{ + return __nv_sincospif(__a, __s, __c); +} +__DEVICE__ float sinf(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a); +} +__DEVICE__ double sinh(double __a) +{ + return __nv_sinh(__a); +} +__DEVICE__ float sinhf(float __a) +{ + return __nv_sinhf(__a); +} +__DEVICE__ double sinpi(double __a) +{ + return __nv_sinpi(__a); +} +__DEVICE__ float sinpif(float __a) +{ + return __nv_sinpif(__a); +} +__DEVICE__ double sqrt(double __a) +{ + return __nv_sqrt(__a); +} +__DEVICE__ float sqrtf(float __a) +{ + return __nv_sqrtf(__a); +} +__DEVICE__ double tan(double __a) +{ + return __nv_tan(__a); +} +__DEVICE__ float tanf(float __a) +{ + return __nv_tanf(__a); +} +__DEVICE__ double tanh(double __a) +{ + return __nv_tanh(__a); +} +__DEVICE__ float tanhf(float __a) +{ + return __nv_tanhf(__a); +} +__DEVICE__ double tgamma(double __a) +{ + return __nv_tgamma(__a); +} +__DEVICE__ float tgammaf(float __a) +{ + return __nv_tgammaf(__a); +} +__DEVICE__ double trunc(double __a) +{ + return __nv_trunc(__a); +} +__DEVICE__ float truncf(float __a) +{ + return __nv_truncf(__a); +} +__DEVICE__ unsigned long long ullmax(unsigned long long __a, unsigned long long __b) +{ + return __nv_ullmax(__a, __b); +} +__DEVICE__ unsigned long long ullmin(unsigned long long __a, unsigned long long __b) +{ + return __nv_ullmin(__a, __b); +} +__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) +{ + return __nv_umax(__a, __b); +} 
+__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) +{ + return __nv_umin(__a, __b); +} +__DEVICE__ double y0(double __a) +{ + return __nv_y0(__a); +} +__DEVICE__ float y0f(float __a) +{ + return __nv_y0f(__a); +} +__DEVICE__ double y1(double __a) +{ + return __nv_y1(__a); +} +__DEVICE__ float y1f(float __a) +{ + return __nv_y1f(__a); +} +__DEVICE__ double yn(int __a, double __b) +{ + return __nv_yn(__a, __b); +} +__DEVICE__ float ynf(int __a, float __b) +{ + return __nv_ynf(__a, __b); +} + +# pragma pop_macro("__DEVICE__") +# pragma pop_macro("__DEVICE_VOID__") +# pragma pop_macro("__FAST_OR_SLOW") + +#endif // __CLANG_GPU_DISABLE_MATH_WRAPPERS +#endif // __CLANG_CUDA_MATH_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h new file mode 100644 index 00000000000..4c18fdca836 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h @@ -0,0 +1,407 @@ +/*===---- HostJIT CUDA runtime wrapper - replaces clang's wrapper ----------=== + * + * This is a self-contained replacement for clang's __clang_cuda_runtime_wrapper.h. + * Instead of #include_next-ing the real wrapper (which has fragile ordering + * dependencies on system headers and CUDA toolkit version-specific branches), + * we directly include only the clang-provided CUDA helper headers we need and + * pull in the CUDA toolkit headers with explicit preprocessor guards. + * + * Key design decision: all clang-provided device function implementations and + * CCCL-required intrinsics are defined BEFORE any CUDA toolkit headers that + * might transitively include CCCL (via libcudacxx standard headers on our + * include path). This eliminates the need for forward declarations. 
+ * + * Assumptions: + * - CUDA >= 9.0 (no legacy code paths) + * - Clang CUDA compilation (__CUDA__ && __clang__) + * - Freestanding: all standard headers are stubs or from libcudacxx + * - cuda::std is bridged into std via using-directive + *===-----------------------------------------------------------------------===*/ +#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__ +#define __CLANG_CUDA_RUNTIME_WRAPPER_H__ +#pragma clang system_header + +#if defined(__CUDA__) && defined(__clang__) + +// ============================================================================ +// Phase 1: Forward-declare device math overloads before any inclusion +// ============================================================================ +// This prevents constexpr std library math functions from becoming implicitly +// host+device, which would block our __device__ overloads later. +# include <__clang_cuda_math_forward_declares.h> + +// ============================================================================ +// Phase 2: Device-side definitions before any CUDA toolkit headers +// ============================================================================ +// Everything here uses only compiler builtins and our stubs. No CUDA toolkit +// headers are included yet, so nothing can transitively pull in CCCL. + +# pragma push_macro("__THROW") +# pragma push_macro("__CUDA_ARCH__") + +# ifndef __CUDA_ARCH__ +# define __CUDA_ARCH__ 9999 +# endif + +// host_defines.h provides __device__, __host__, __forceinline__ macros. +// Its only transitive dep (ctype.h) hits our stub. +# define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +# define __CUDACC__ +# define __CUDA_LIBDEVICE__ +# include "host_defines.h" + +// ---- Builtin variables (threadIdx, blockIdx, etc.) 
---- +# include "__clang_cuda_builtin_vars.h" + +// ---- Stubs needed by clang device function headers below ---- +# include <cmath> +# include <cstdlib> +# include <climits> + +// ---- Clang device function wrappers (local copies, CUDA < 9.0 removed) ---- +// clang-format off +// Order matters: libdevice_declares must precede device_functions (declares __nv_* builtins used there). +# include "__clang_cuda_libdevice_declares.h" +# include "__clang_cuda_device_functions.h" +# include "__clang_cuda_math.h" +// clang-format on + +// ---- Address-space intrinsics needed by CCCL headers ---- +// (e.g. cuda/__memory/address_space.h, cuda/__ptx/ptx_helper_functions.h) +static __device__ __forceinline__ __attribute__((const)) unsigned int __isGlobal(const void* p) +{ + return __nvvm_isspacep_global(p); +} +static __device__ __forceinline__ __attribute__((const)) unsigned int __isShared(const void* p) +{ + return __nvvm_isspacep_shared(p); +} +static __device__ __forceinline__ __attribute__((const)) unsigned int __isConstant(const void* p) +{ + return __nvvm_isspacep_const(p); +} +static __device__ __forceinline__ __attribute__((const)) unsigned int __isLocal(const void* p) +{ + return __nvvm_isspacep_local(p); +} +# define __FWD_DEVICE static __device__ __forceinline__ +__FWD_DEVICE unsigned int __isClusterShared(const void*); +__FWD_DEVICE __SIZE_TYPE__ __cvta_generic_to_shared(const void*); +__FWD_DEVICE __SIZE_TYPE__ __cvta_generic_to_global(const void*); +__FWD_DEVICE void* __cvta_shared_to_generic(__SIZE_TYPE__); +__FWD_DEVICE void* __cvta_global_to_generic(__SIZE_TYPE__); +# undef __FWD_DEVICE +# ifndef _MSC_VER +__device__ bool __nv_fp128_isnan(__float128); +__device__ __float128 __nv_fp128_fmax(__float128, __float128); +__device__ __float128 __nv_fp128_fmin(__float128, __float128); +# endif + +// ---- Bridge cuda::std into std ---- +namespace cuda +{ +namespace std +{ +} +} // namespace cuda +namespace std +{ +using namespace cuda::std; +} + +// 
============================================================================ +// Phase 3: CUDA toolkit headers +// ============================================================================ +// By this point all device-side functions and intrinsics are defined, so +// any transitive CCCL includes from these headers will find them. +# pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") + +# define __DEVICE_LAUNCH_PARAMETERS_H__ + +// Guard out CUDA's declaration-only headers; clang provides its own. +# define __DEVICE_FUNCTIONS_H__ +# define __MATH_FUNCTIONS_H__ +# define __MATH_FUNCTIONS_HPP__ +# define __COMMON_FUNCTIONS_H__ +# define __DEVICE_FUNCTIONS_DECLS_H__ + +// ---- CUDA runtime types (cudaError_t, dim3, cudaStream_t, etc.) ---- +// (host_defines.h already included in Phase 2) +# undef __CUDACC__ +# include "cuda.h" +# include "driver_types.h" +# include "host_config.h" +# if !defined(CUDA_VERSION) || CUDA_VERSION < 9000 +# error "Unsupported CUDA version (need >= 9.0)!" +# endif + +// Clang does not have __nvvm_memcpy/__nvvm_memset; emulate with builtins. +# define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n) +# define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n) + +// __THROW may be in a weird state; keep it empty for CUDA includes. +# undef __THROW +# define __THROW + +// ============================================================================ +// Phase 4: Device-side function definitions from CUDA toolkit .hpp files +// ============================================================================ +// Poison __host__ to ensure none of these definitions get host attributes. +# pragma push_macro("__host__") +# define __host__ UNEXPECTED_HOST_ATTRIBUTE + +// Redefine __forceinline__ to include __device__. +# pragma push_macro("__forceinline__") +# define __forceinline__ __device__ __inline__ __attribute__((always_inline)) + +// Math functions: use fast or accurate variants based on compiler flag. 
+# pragma push_macro("__USE_FAST_MATH__") +# if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) +# define __USE_FAST_MATH__ 1 +# endif +# include "crt/math_functions.hpp" +# pragma pop_macro("__USE_FAST_MATH__") + +# pragma pop_macro("__forceinline__") + +# undef __MATH_FUNCTIONS_HPP__ +# undef __CUDABE__ + +// Re-include device functions with __host__ defined as empty to get +// the "other branch" of #if/#else in the .hpp files. +# define __host__ +# undef __CUDABE__ +# define __CUDACC__ + +// Atomic function declarations (became builtins in CUDA 9). +# include "device_atomic_functions.h" +# undef __DEVICE_FUNCTIONS_HPP__ +# include "crt/device_double_functions.hpp" +# include "crt/device_functions.hpp" +# include "device_atomic_functions.hpp" +# include "sm_20_atomic_functions.hpp" + +// sm_20_intrinsics.hpp defines __isGlobal etc. without const attribute. +// Rename them so the definitions from Phase 4 (with const) prevail. +# pragma push_macro("__isGlobal") +# pragma push_macro("__isShared") +# pragma push_macro("__isConstant") +# pragma push_macro("__isLocal") +# define __isGlobal __ignored_cuda___isGlobal +# define __isShared __ignored_cuda___isShared +# define __isConstant __ignored_cuda___isConstant +# define __isLocal __ignored_cuda___isLocal +# include "sm_20_intrinsics.hpp" +# pragma pop_macro("__isGlobal") +# pragma pop_macro("__isShared") +# pragma pop_macro("__isConstant") +# pragma pop_macro("__isLocal") + +# include "sm_32_atomic_functions.hpp" + +# pragma push_macro("__CUDA_ARCH__") +# undef __CUDA_ARCH__ +# include "sm_60_atomic_functions.hpp" +# include "sm_61_intrinsics.hpp" +# pragma pop_macro("__CUDA_ARCH__") + +# undef __MATH_FUNCTIONS_HPP__ + +// math_functions.hpp ::signbit conflicts with libstdc++ constexpr ::signbit. 
+# pragma push_macro("signbit") +# pragma push_macro("__GNUC__") +# undef __GNUC__ +# define signbit __ignored_cuda_signbit +# pragma push_macro("_GLIBCXX_MATH_H") +# pragma push_macro("_LIBCPP_VERSION") +# undef _GLIBCXX_MATH_H +# ifdef _LIBCPP_VERSION +# define _LIBCPP_VERSION 3700 +# endif +# include "crt/math_functions.hpp" +# pragma pop_macro("_GLIBCXX_MATH_H") +# pragma pop_macro("_LIBCPP_VERSION") +# pragma pop_macro("__GNUC__") +# pragma pop_macro("signbit") + +# pragma pop_macro("__host__") + +// ============================================================================ +// Phase 5: cuda_runtime.h (first header that transitively pulls in CCCL) +// ============================================================================ +// +// (There is no separate Phase 6; phase numbering continues at Phase 7 below.) +// +// Verify no libcudacxx header was pulled in yet. If this fires, a header +// above transitively included a system header that resolved to libcudacxx +// before all device-side definitions were ready. +# ifdef CCCL_VERSION +# error "libcudacxx was included before device-side definitions were set up" +# endif + +# pragma push_macro("nv_weak") +# define nv_weak weak +# undef __CUDA_LIBDEVICE__ +# define __CUDACC__ +# include "cuda_runtime.h" +# pragma pop_macro("nv_weak") +# undef __CUDACC__ +# define __CUDABE__ + +# include "crt/host_runtime.h" + +// device_runtime.h defines __cxa_* macros that conflict with cxxabi.h. +# undef __cxa_vec_ctor +# undef __cxa_vec_cctor +# undef __cxa_vec_dtor +# undef __cxa_vec_new +# undef __cxa_vec_new2 +# undef __cxa_vec_new3 +# undef __cxa_vec_delete2 +# undef __cxa_vec_delete +# undef __cxa_vec_delete3 +# undef __cxa_pure_virtual + +// Texture intrinsics (requires C++11). 
+# if __cplusplus >= 201103L +# include <__clang_cuda_texture_intrinsics.h> +# else +template <typename T> +struct __nv_tex_needs_cxx11 +{ + const static bool value = false; +}; +template <typename T> +__host__ __device__ void __nv_tex_surf_handler(const char* name, T* ptr, cudaTextureObject_t obj, float x) +{ + _Static_assert(__nv_tex_needs_cxx11<T>::value, "Texture support requires C++11"); +} +# endif +# include "surface_indirect_functions.h" +# if CUDA_VERSION < 13000 +# include "texture_fetch_functions.h" +# endif +# include "texture_indirect_functions.h" + +// ============================================================================ +// Phase 7: Restore saved state +// ============================================================================ +# pragma pop_macro("__CUDA_ARCH__") +# pragma pop_macro("__THROW") +# undef __CUDABE__ +# define __CUDACC__ + +// ============================================================================ +// Phase 8: Device-side system calls & std wrappers +// ============================================================================ +extern "C" { +__device__ int vprintf(const char*, const char*); +__device__ void free(void*) __attribute((nothrow)); +__device__ void* malloc(size_t) __attribute((nothrow)) __attribute__((malloc)); +__device__ void +__assertfail(const char* __message, const char* __file, unsigned __line, const char* __function, size_t __charSize); +__device__ static inline void +__assert_fail(const char* __message, const char* __file, unsigned __line, const char* __function) +{ + __assertfail(__message, __file, __line, __function, sizeof(char)); +} +__device__ int printf(const char*, ...); +} // extern "C" + +namespace std +{ +__device__ static inline void free(void* __ptr) +{ + ::free(__ptr); +} +__device__ static inline void* malloc(size_t __size) +{ + return ::malloc(__size); +} +} // namespace std + +// ============================================================================ +// Phase 9: Builtin variable conversion operators +// 
============================================================================ +// These need dim3 and uint3 to be fully defined (from vector_types.h, pulled +// in by driver_types.h in Phase 5). +__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const +{ + return {x, y, z}; +} +__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const +{ + return {x, y, z}; +} +__device__ inline __cuda_builtin_blockDim_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_blockDim_t::operator uint3() const +{ + return {x, y, z}; +} +__device__ inline __cuda_builtin_gridDim_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_gridDim_t::operator uint3() const +{ + return {x, y, z}; +} + +// ============================================================================ +// Phase 10: Remaining clang CUDA headers +// ============================================================================ +# include <__clang_cuda_cmath.h> +# include <__clang_cuda_complex_builtins.h> +# include <__clang_cuda_intrinsics.h> + +// curand_mtgp32_kernel redefines blockDim/threadIdx with dim3/uint3 types, +// which is incompatible with our builtins. Force-include it with types +// redefined to our builtin types. +// Skip when cuRAND headers are unavailable (e.g. pip-installed toolkit). +# if __has_include("curand_mtgp32_kernel.h") +# pragma push_macro("dim3") +# pragma push_macro("uint3") +# define dim3 __cuda_builtin_blockDim_t +# define uint3 __cuda_builtin_threadIdx_t +# include "curand_mtgp32_kernel.h" +# pragma pop_macro("dim3") +# pragma pop_macro("uint3") +# endif +# pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") + +// Kernel launch configuration function. 
+# if CUDA_VERSION >= 9020 +extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void* stream = 0); +# endif + +// The JIT shared library is linked without the C runtime (no libc on the link +// line) so atexit is unavailable. The CUDA module constructor calls atexit() +// to register a cleanup function. Provide a no-op stub — the JIT library is +// short-lived and unloaded explicitly. +# if !defined(__HOSTJIT_DEVICE_COMPILATION__) +# if defined(_MSC_VER) +extern "C" int atexit(void(__cdecl*)(void)) +{ + return 0; +} +# else +extern "C" int atexit(void (*)(void)) +{ + return 0; +} +# endif +# endif + +#endif // __CUDA__ && __clang__ +#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/climits b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/climits new file mode 100644 index 00000000000..3910c27321c --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/climits @@ -0,0 +1,7 @@ +// Minimal climits stub for CUDA JIT compilation +#ifndef _HOSTJIT_CLIMITS +#define _HOSTJIT_CLIMITS + +#include <limits.h> + +#endif // _HOSTJIT_CLIMITS diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cmath b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cmath new file mode 100644 index 00000000000..35f14baa610 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cmath @@ -0,0 +1,7 @@ +// Minimal cmath stub for CUDA JIT compilation +#ifndef _HOSTJIT_CMATH +#define _HOSTJIT_CMATH + +#include <math.h> + +#endif // _HOSTJIT_CMATH diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstddef b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstddef new file mode 100644 index 00000000000..4e628c9ce78 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstddef @@ -0,0 +1,14 @@ +// Minimal cstddef stub for CUDA JIT compilation +// Compatible with libcu++ which 
expects to pull types from global namespace +#ifndef _HOSTJIT_CSTDDEF +#define _HOSTJIT_CSTDDEF + +#include <stddef.h> + +namespace std { + using ::size_t; + using ::ptrdiff_t; + using nullptr_t = decltype(nullptr); +} + +#endif // _HOSTJIT_CSTDDEF diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstdlib b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstdlib new file mode 100644 index 00000000000..7033a7fd3ff --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstdlib @@ -0,0 +1,20 @@ +#ifndef _HOSTJIT_CSTDLIB +#define _HOSTJIT_CSTDLIB + +#include <stddef.h> + +#define EXIT_SUCCESS 0 +#define EXIT_FAILURE 1 +#define RAND_MAX 2147483647 + +extern "C" { +void* malloc(size_t); +void* calloc(size_t, size_t); +void* realloc(void*, size_t); +void free(void*); +void abort(void); +void exit(int); +void _Exit(int); +} + +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/ctype.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/ctype.h new file mode 100644 index 00000000000..c9e4f3eb462 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/ctype.h @@ -0,0 +1,3 @@ +#ifndef _HOSTJIT_CTYPE_H +#define _HOSTJIT_CTYPE_H +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/initializer_list b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/initializer_list new file mode 100644 index 00000000000..b8347f771b5 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/initializer_list @@ -0,0 +1,47 @@ +// Minimal initializer_list stub for CUDA JIT compilation +#ifndef _HOSTJIT_INITIALIZER_LIST +#define _HOSTJIT_INITIALIZER_LIST + +#include <stddef.h> + +namespace std { + +template <class T> +class initializer_list { +public: + using value_type = T; + using reference = const T&; + using const_reference = const T&; + using size_type = size_t; + using iterator = const T*; + using const_iterator = const T*; + +private: + const T* _begin; + size_t _size; + 
// This constructor is called by the compiler + constexpr initializer_list(const T* b, size_t s) noexcept + : _begin(b), _size(s) {} + +public: + constexpr initializer_list() noexcept : _begin(nullptr), _size(0) {} + + constexpr size_t size() const noexcept { return _size; } + constexpr const T* begin() const noexcept { return _begin; } + constexpr const T* end() const noexcept { return _begin + _size; } +}; + +template <class T> +constexpr const T* begin(initializer_list<T> il) noexcept { + return il.begin(); +} + +template <class T> +constexpr const T* end(initializer_list<T> il) noexcept { + return il.end(); +} + +} // namespace std + +#endif // _HOSTJIT_INITIALIZER_LIST diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/limits b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/limits new file mode 100644 index 00000000000..8d388108143 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/limits @@ -0,0 +1,61 @@ +// Minimal stub for hostjit device compilation. +// +// Clang's __clang_cuda_cmath.h includes <limits> unconditionally, then expands +// __CUDA_CLANG_FN_INTEGER_OVERLOAD_1/2 macros that reference +// std::numeric_limits<__T>::is_integer in return-type SFINAE at parse time. +// Clang evaluates these dependent names during template parsing, so the struct +// must be declared — not just forward-declared — before the macro expansion. +// +// In the hostjit device-compilation include path, <limits> would normally +// resolve to libcudacxx/include/cuda/std/limits, which cascades through +// numeric_limits, bit_cast, popcount, etc. — incompatible with freestanding. +// +// This stub (found first on -internal-isystem) stops that cascade, providing +// only the two members that __clang_cuda_cmath.h actually inspects. 
+#pragma once + +namespace std { + +template <class T> +struct numeric_limits { + static constexpr bool is_specialized = false; + static constexpr bool is_integer = false; +}; + +// Integer specializations — needed so the SFINAE in __clang_cuda_cmath.h +// correctly dispatches integer arguments. +#define _HOSTJIT_NUM_LIM_INT(_T) \ + template <> struct numeric_limits<_T> { \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = true; \ + }; + +_HOSTJIT_NUM_LIM_INT(bool) +_HOSTJIT_NUM_LIM_INT(char) +_HOSTJIT_NUM_LIM_INT(signed char) +_HOSTJIT_NUM_LIM_INT(unsigned char) +_HOSTJIT_NUM_LIM_INT(short) +_HOSTJIT_NUM_LIM_INT(unsigned short) +_HOSTJIT_NUM_LIM_INT(int) +_HOSTJIT_NUM_LIM_INT(unsigned int) +_HOSTJIT_NUM_LIM_INT(long) +_HOSTJIT_NUM_LIM_INT(unsigned long) +_HOSTJIT_NUM_LIM_INT(long long) +_HOSTJIT_NUM_LIM_INT(unsigned long long) + +#undef _HOSTJIT_NUM_LIM_INT + +// Floating-point specializations. +#define _HOSTJIT_NUM_LIM_FP(_T) \ + template <> struct numeric_limits<_T> { \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = false; \ + }; + +_HOSTJIT_NUM_LIM_FP(float) +_HOSTJIT_NUM_LIM_FP(double) +_HOSTJIT_NUM_LIM_FP(long double) + +#undef _HOSTJIT_NUM_LIM_FP + +} // namespace std diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/math.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/math.h new file mode 100644 index 00000000000..7884961cdb4 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/math.h @@ -0,0 +1,21 @@ +#ifndef _HOSTJIT_MATH_H +#define _HOSTJIT_MATH_H + +// Macros needed by __clang_cuda_math.h +#define HUGE_VAL __builtin_huge_val() +#define HUGE_VALF __builtin_huge_valf() +#define HUGE_VALL __builtin_huge_vall() +#define INFINITY __builtin_inff() +#define NAN __builtin_nanf("") +#define MATH_ERRNO 1 +#define MATH_ERREXCEPT 2 +#define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) +#define FP_NAN 0 +#define FP_INFINITE 1 +#define 
FP_ZERO 2 +#define FP_SUBNORMAL 3 +#define FP_NORMAL 4 +#define __signbit(x) __builtin_signbit(x) +#define __signbitl(x) __builtin_signbitl(x) + +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/memory.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/memory.h new file mode 100644 index 00000000000..19b77865017 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/memory.h @@ -0,0 +1,4 @@ +#ifndef _HOSTJIT_MEMORY_H +#define _HOSTJIT_MEMORY_H +#include <string.h> +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/new b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/new new file mode 100644 index 00000000000..8e21dbc7761 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/new @@ -0,0 +1,29 @@ +#ifndef _HOSTJIT_NEW +#define _HOSTJIT_NEW +#include <cstddef> + +namespace std { +struct nothrow_t { explicit nothrow_t() = default; }; +extern const nothrow_t nothrow; +enum class align_val_t : size_t {}; +} + +// Placement new — needs __host__ __device__ for CUDA +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void* operator new(std::size_t, void* p) noexcept { return p; } +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void* operator new[](std::size_t, void* p) noexcept { return p; } +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void operator delete(void*, void*) noexcept {} +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void operator delete[](void*, void*) noexcept {} + +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/stdlib.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/stdlib.h new file mode 100644 index 00000000000..d8d910d8c5a --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/stdlib.h @@ -0,0 +1,7 @@ +// Minimal stdlib.h stub for CUDA JIT compilation +#ifndef _HOSTJIT_STDLIB_H +#define _HOSTJIT_STDLIB_H + +#include <cstdlib> + +#endif // 
_HOSTJIT_STDLIB_H diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/string.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/string.h new file mode 100644 index 00000000000..c38628207bf --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/string.h @@ -0,0 +1,15 @@ +#ifndef _HOSTJIT_STRING_H +#define _HOSTJIT_STRING_H +#include +#ifdef __cplusplus +extern "C" { +#endif +void* memcpy(void*, const void*, size_t); +void* memset(void*, int, size_t); +int memcmp(const void*, const void*, size_t); +void* memmove(void*, const void*, size_t); +size_t strlen(const char*); +#ifdef __cplusplus +} +#endif +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/utility b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/utility new file mode 100644 index 00000000000..0a2b61ac1b0 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/utility @@ -0,0 +1,34 @@ +// Minimal stub for hostjit device compilation. +// +// cuda_runtime.h includes for std::forward/std::move. In the +// hostjit device-compilation include path, resolves to +// libcudacxx/include/cuda/std/utility, which cascades into the full CCCL +// utility/iterator/concepts hierarchy — incompatible with freestanding mode. +// +// This stub (found first on -internal-isystem) stops that cascade. +// Only std::forward and std::move are provided because that is all +// cuda_runtime.h actually uses at the top level; the full CCCL hierarchy +// is not required for a simple host+device kernel. 
+#pragma once + +namespace std { + +template struct remove_reference { using type = _Tp; }; +template struct remove_reference<_Tp&> { using type = _Tp; }; +template struct remove_reference<_Tp&&>{ using type = _Tp; }; +template +using remove_reference_t = typename remove_reference<_Tp>::type; + +template +__host__ __device__ constexpr _Tp&& +forward(remove_reference_t<_Tp>& __t) noexcept { return static_cast<_Tp&&>(__t); } + +template +__host__ __device__ constexpr _Tp&& +forward(remove_reference_t<_Tp>&& __t) noexcept { return static_cast<_Tp&&>(__t); } + +template +__host__ __device__ constexpr remove_reference_t<_Tp>&& +move(_Tp&& __t) noexcept { return static_cast&&>(__t); } + +} // namespace std diff --git a/c/parallel/src/hostjit/include/hostjit/jit_compiler.hpp b/c/parallel/src/hostjit/include/hostjit/jit_compiler.hpp new file mode 100644 index 00000000000..9667ee26f78 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/jit_compiler.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace hostjit +{ +class JITCompiler +{ +public: + // Create JIT compiler with default configuration (auto-detected) + JITCompiler(); + + // Create JIT compiler with custom configuration + explicit JITCompiler(const CompilerConfig& config); + + ~JITCompiler(); + + // Disable copy + JITCompiler(const JITCompiler&) = delete; + JITCompiler& operator=(const JITCompiler&) = delete; + + // Compile CUDA source code to shared library and load it + // Returns true on success, false on failure + bool compile(const std::string& source_code); + + // Get function pointer by name + // Returns nullptr if function not found + template + FuncType getFunction(const std::string& name) + { + if (!library_.isLoaded()) + { + last_error_ = "No library loaded"; + return nullptr; + } + + auto func = library_.getFunction(name); + if (!func) + { + last_error_ = "Failed to find function '" + name + "': " + library_.getLastError(); + } + return func; + } + + 
// Get the last error message + std::string getLastError() const + { + return last_error_; + } + + // Get the configuration being used + const CompilerConfig& getConfig() const + { + return config_; + } + + // Check if a library is currently loaded + bool isLoaded() const + { + return library_.isLoaded(); + } + + // Get the path to compiled artifacts (object file, shared library, etc.) + // Only valid after successful compile() and if keep_artifacts is set + std::string getArtifactsPath() const + { + return temp_dir_; + } + + // Get the cubin extracted during compilation + const std::vector& getCubin() const + { + return cubin_; + } + + // Unload the current library and clean up temporary files + void cleanup(); + +private: + std::string createTempDirectory(); + void removeTempDirectory(); + + CompilerConfig config_; + CUDACompiler compiler_; + DynamicLibrary library_; + std::string temp_dir_; + std::string last_error_; + std::vector cubin_; +}; +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/loader.hpp b/c/parallel/src/hostjit/include/hostjit/loader.hpp new file mode 100644 index 00000000000..f6ee9525c48 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/loader.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include + +namespace hostjit +{ +class DynamicLibrary +{ +public: + DynamicLibrary(); + ~DynamicLibrary(); + + // Disable copy + DynamicLibrary(const DynamicLibrary&) = delete; + DynamicLibrary& operator=(const DynamicLibrary&) = delete; + + // Enable move + DynamicLibrary(DynamicLibrary&& other) noexcept; + DynamicLibrary& operator=(DynamicLibrary&& other) noexcept; + + // Load a shared library + bool load(const std::string& library_path); + + // Get a symbol (function or variable) by name + void* getSymbol(const std::string& symbol_name); + + // Template helper to get function pointers with type safety + template + FuncType getFunction(const std::string& name) + { + return reinterpret_cast(getSymbol(name)); + } + + // Check if 
library is loaded + bool isLoaded() const; + + // Get the last error message + std::string getLastError() const; + + // Unload the library + void unload(); + +private: + void* handle_; + std::string last_error_; +}; +} // namespace hostjit diff --git a/c/parallel/src/hostjit/jit_compiler.cpp b/c/parallel/src/hostjit/jit_compiler.cpp new file mode 100644 index 00000000000..83490468aad --- /dev/null +++ b/c/parallel/src/hostjit/jit_compiler.cpp @@ -0,0 +1,192 @@ +#include +#include +#include +#include +#include + +#include + +#ifdef _WIN32 +# include +#else +# include +#endif + +namespace hostjit +{ +JITCompiler::JITCompiler() + : config_(detectDefaultConfig()) +{} + +JITCompiler::JITCompiler(const CompilerConfig& config) + : config_(config) +{} + +JITCompiler::~JITCompiler() +{ + cleanup(); +} + +bool JITCompiler::compile(const std::string& source_code) +{ + std::string config_error; + if (!validateConfig(config_, &config_error)) + { + last_error_ = "Configuration error: " + config_error; + return false; + } + + cleanup(); + + temp_dir_ = createTempDirectory(); + if (temp_dir_.empty()) + { + last_error_ = "Failed to create temporary directory"; + return false; + } + + std::string obj_path = temp_dir_ + "/cuda_code.o"; + auto compile_result = compiler_.compileToObject(source_code, obj_path, config_); + + if (!compile_result.success) + { + last_error_ = "Compilation failed:\n" + compile_result.diagnostics; + removeTempDirectory(); + return false; + } + + // Store the cubin for later inspection + cubin_ = std::move(compile_result.cubin); + + if (config_.verbose) + { + std::cout << "Compilation diagnostics:\n" << compile_result.diagnostics << "\n"; + } + +#ifdef _WIN32 + std::string lib_path = temp_dir_ + "/cuda_code.dll"; +#else + std::string lib_path = temp_dir_ + "/libcuda_code.so"; +#endif + auto link_result = compiler_.linkToSharedLibrary({obj_path}, lib_path, config_); + + if (!link_result.success) + { + last_error_ = "Linking failed:\n" + link_result.diagnostics; 
+ removeTempDirectory(); + return false; + } + + if (config_.verbose) + { + std::cout << "Linking diagnostics:\n" << link_result.diagnostics << "\n"; + } + + if (!library_.load(lib_path)) + { + last_error_ = "Failed to load library: " + library_.getLastError(); + removeTempDirectory(); + return false; + } + + if (config_.verbose) + { + std::cout << "Successfully loaded library: " << lib_path << "\n"; + } + + last_error_.clear(); + return true; +} + +void JITCompiler::cleanup() +{ + library_.unload(); + + if (!config_.keep_artifacts) + { + removeTempDirectory(); + } + + last_error_.clear(); +} + +std::string JITCompiler::createTempDirectory() +{ + std::filesystem::path base_tmp_dir; + +#ifdef _WIN32 + const char* tmp_dir = std::getenv("TEMP"); + if (!tmp_dir) + { + tmp_dir = std::getenv("TMP"); + } + if (tmp_dir) + { + base_tmp_dir = tmp_dir; + } + else + { + base_tmp_dir = std::filesystem::temp_directory_path(); + } +#else + const char* tmp_dir = std::getenv("TMPDIR"); + if (tmp_dir) + { + base_tmp_dir = tmp_dir; + } + else + { + base_tmp_dir = "/tmp"; + } +#endif + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, 999999); + +#ifdef _WIN32 + int pid = _getpid(); +#else + int pid = getpid(); +#endif + + for (int attempt = 0; attempt < 10; ++attempt) + { + std::string dir_name = "hostjit_" + std::to_string(pid) + "_" + std::to_string(dis(gen)); + std::filesystem::path full_path = base_tmp_dir / dir_name; + + std::error_code ec; + if (std::filesystem::create_directories(full_path, ec) && !ec) + { + return full_path.string(); + } + } + + return ""; +} + +void JITCompiler::removeTempDirectory() +{ + if (temp_dir_.empty()) + { + return; + } + + try + { + if (std::filesystem::exists(temp_dir_)) + { + std::filesystem::remove_all(temp_dir_); + } + } + catch (const std::filesystem::filesystem_error& e) + { + if (config_.verbose) + { + std::cerr << "Warning: Failed to remove temporary directory: " << e.what() << "\n"; + } + } + + 
temp_dir_.clear(); +} +} // namespace hostjit diff --git a/c/parallel/src/hostjit/loader.cpp b/c/parallel/src/hostjit/loader.cpp new file mode 100644 index 00000000000..33d49e4bc20 --- /dev/null +++ b/c/parallel/src/hostjit/loader.cpp @@ -0,0 +1,207 @@ +#include + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# include +#else +# include +#endif + +namespace hostjit +{ +#ifdef _WIN32 +namespace +{ +// Run C++ static constructors in a DLL loaded with /NOENTRY /NODEFAULTLIB. +// The compiler places CUDA fatbin registration in the .CRT$XCU section. +// Without CRT startup, these never run, so we walk the merged .CRT section +// in the PE and call each non-null function pointer. +void runStaticInitializers(HMODULE module) +{ + auto base = reinterpret_cast(module); + auto dos = reinterpret_cast(base); + auto nt = reinterpret_cast(base + dos->e_lfanew); + auto sec = IMAGE_FIRST_SECTION(nt); + + for (WORD i = 0; i < nt->FileHeader.NumberOfSections; ++i, ++sec) + { + if (memcmp(sec->Name, ".CRT", 4) == 0) + { + using InitFunc = void(__cdecl*)(); + auto funcs = reinterpret_cast(const_cast(base) + sec->VirtualAddress); + size_t count = sec->SizeOfRawData / sizeof(InitFunc); + for (size_t j = 0; j < count; ++j) + { + if (funcs[j]) + { + funcs[j](); + } + } + } + } +} + +std::string getWindowsError() +{ + DWORD error = GetLastError(); + if (error == 0) + { + return ""; + } + + LPSTR buffer = nullptr; + DWORD size = FormatMessageA( + FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + nullptr, + error, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + reinterpret_cast(&buffer), + 0, + nullptr); + + std::string message; + if (size > 0 && buffer) + { + message = std::string(buffer, size); + while (!message.empty() && (message.back() == '\n' || message.back() == '\r')) + { + message.pop_back(); + } + LocalFree(buffer); + } + else + { + message = "Unknown error (code: " + std::to_string(error) + ")"; + } + + return message; +} +} // anonymous 
namespace +#endif + +DynamicLibrary::DynamicLibrary() + : handle_(nullptr) +{} + +DynamicLibrary::~DynamicLibrary() +{ + unload(); +} + +DynamicLibrary::DynamicLibrary(DynamicLibrary&& other) noexcept + : handle_(other.handle_) + , last_error_(std::move(other.last_error_)) +{ + other.handle_ = nullptr; +} + +DynamicLibrary& DynamicLibrary::operator=(DynamicLibrary&& other) noexcept +{ + if (this != &other) + { + unload(); + handle_ = other.handle_; + last_error_ = std::move(other.last_error_); + other.handle_ = nullptr; + } + return *this; +} + +bool DynamicLibrary::load(const std::string& library_path) +{ + unload(); + +#ifdef _WIN32 + SetLastError(0); + handle_ = static_cast(LoadLibraryA(library_path.c_str())); + + if (!handle_) + { + last_error_ = getWindowsError(); + if (last_error_.empty()) + { + last_error_ = "Unknown LoadLibrary error"; + } + return false; + } + + // The DLL is linked with /NOENTRY (no CRT startup), so C++ static + // constructors (e.g. CUDA fatbin registration) haven't run yet. + runStaticInitializers(static_cast(handle_)); +#else + dlerror(); + handle_ = dlopen(library_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + + if (!handle_) + { + const char* error = dlerror(); + last_error_ = error ? 
error : "Unknown dlopen error"; + return false; + } +#endif + + last_error_.clear(); + return true; +} + +void* DynamicLibrary::getSymbol(const std::string& symbol_name) +{ + if (!handle_) + { + last_error_ = "Library not loaded"; + return nullptr; + } + +#ifdef _WIN32 + SetLastError(0); + void* symbol = reinterpret_cast(GetProcAddress(static_cast(handle_), symbol_name.c_str())); + + if (!symbol) + { + last_error_ = getWindowsError(); + if (last_error_.empty()) + { + last_error_ = "Symbol not found: " + symbol_name; + } + return nullptr; + } +#else + dlerror(); + void* symbol = dlsym(handle_, symbol_name.c_str()); + + const char* error = dlerror(); + if (error) + { + last_error_ = error; + return nullptr; + } +#endif + + last_error_.clear(); + return symbol; +} + +bool DynamicLibrary::isLoaded() const +{ + return handle_ != nullptr; +} + +std::string DynamicLibrary::getLastError() const +{ + return last_error_; +} + +void DynamicLibrary::unload() +{ + if (handle_) + { +#ifdef _WIN32 + FreeLibrary(static_cast(handle_)); +#else + dlclose(handle_); +#endif + handle_ = nullptr; + } + last_error_.clear(); +} +} // namespace hostjit diff --git a/c/parallel/test/CMakeLists.txt b/c/parallel/test/CMakeLists.txt index edf616ca8d4..dfde7a4a346 100644 --- a/c/parallel/test/CMakeLists.txt +++ b/c/parallel/test/CMakeLists.txt @@ -51,6 +51,23 @@ file( *.cpp ) +# test_freestanding_compiler is built separately (links hostjit_lib, not cccl.c.parallel) +list(REMOVE_ITEM test_srcs "test_freestanding_compiler.cpp") + foreach (test_src IN LISTS test_srcs) cccl_c_parallel_add_test(test_target "${test_src}") endforeach() + +if (CCCL_C_Parallel_ENABLE_HOSTJIT AND TARGET hostjit_lib) + cccl_add_executable( + cccl.c.parallel.test.freestanding_compiler + ADD_CTEST + NO_METATARGETS + DIALECT 20 + SOURCES test_freestanding_compiler.cpp + ) + target_link_libraries( + cccl.c.parallel.test.freestanding_compiler + PRIVATE hostjit_lib CUDA::cudart + ) +endif() diff --git 
a/c/parallel/test/test_freestanding_compiler.cpp b/c/parallel/test/test_freestanding_compiler.cpp new file mode 100644 index 00000000000..ee7271068a4 --- /dev/null +++ b/c/parallel/test/test_freestanding_compiler.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// test_freestanding_compiler.cpp +// +// Smoke-test for HostJIT compiler infrastructure (issue #7743). +// JIT-compiles a minimal host+device CUDA source, runs it, and verifies +// the result — without relying on any system CUDA headers at JIT time. +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include +#include + +static const char* k_source = R"( +#include + +#ifdef _WIN32 +# define EXPORT __declspec(dllexport) +#else +# define EXPORT __attribute__((visibility("default"))) +#endif + +__global__ void device_kernel(int* ptr) +{ + *ptr = 42; +} + +extern "C" EXPORT void host_entry(int* ptr) +{ + device_kernel<<<1, 1>>>(ptr); +} +)"; + +int main() +{ + // Detect Clang/CUDA configuration from the build environment + auto config = hostjit::detectDefaultConfig(); + + hostjit::JITCompiler compiler(config); + if (!compiler.compile(k_source)) + { + std::fprintf(stderr, "HostJIT compilation failed:\n%s\n", compiler.getLastError().c_str()); + return 1; + } + + auto host_fn = compiler.getFunction("host_entry"); + if (!host_fn) + { + std::fprintf(stderr, "Symbol 'host_entry' not found\n"); + return 1; + } + + int* d_ptr = nullptr; + cudaMalloc(&d_ptr, sizeof(int)); + + host_fn(d_ptr); + cudaDeviceSynchronize(); + + int result = 0; + cudaMemcpy(&result, d_ptr, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_ptr); + + assert(result == 42 && "device kernel did not write expected value"); + std::printf("freestanding compiler test passed (result=%d)\n", result); + return 0; +} diff --git a/ci/build_cccl_c_parallel_hostjit.sh b/ci/build_cccl_c_parallel_hostjit.sh new file mode 100755 index 
00000000000..8f3ea508bc3 --- /dev/null +++ b/ci/build_cccl_c_parallel_hostjit.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -euo pipefail + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +# libnvfatbin is required by hostjit but is not included in the base rapidsai devcontainer +# image. Detect the installed CTK version and install the matching package if missing. +if [[ "$(uname -s)" == "Linux" ]] && ! ldconfig -p 2>/dev/null | grep -q libnvfatbin; then + CTK_DEB_VER=$(nvcc --version 2>/dev/null \ + | grep -oP 'release \K[0-9]+\.[0-9]+' | tr '.' '-') + if [[ -n "$CTK_DEB_VER" ]]; then + echo "Installing libnvfatbin-dev-${CTK_DEB_VER}..." + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends "libnvfatbin-dev-${CTK_DEB_VER}" + else + echo "WARNING: could not determine CTK version; skipping libnvfatbin install" + fi +fi + +PRESET="cccl-c-parallel-hostjit" + +CMAKE_OPTIONS="-DCMAKE_CXX_STANDARD=${CXX_STANDARD} -DCMAKE_CUDA_STANDARD=${CXX_STANDARD}" + +configure_and_build_preset "CCCL C Parallel Library (HostJIT)" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 2604163c6c6..e4079a14ddb 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -63,6 +63,13 @@ workflows: - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080', 'l4', 'h100']} # RTX PRO 6000 coverage (limited due to small number of runners): - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13'], gpu: ['rtxpro6000']} + # c.parallel with HostJIT + # + # For now, this is a separate job run for Linux/CUDA13. + # Eventually we will want building with HostJIT to be the + # default, and will do it across the entire matrix. Currently + # blocked on libnvfatbin availability on Windows containers, and for CUDA <12.4. 
+ - {jobs: ['test'], project: 'cccl_c_parallel_hostjit', ctk: '13.X', cxx: ['gcc13'], gpu: 'rtx2080'} # c.experimental.stf-- pinned to gcc13 to match python - {jobs: ['test'], project: 'cccl_c_stf', ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']} - {jobs: ['test'], project: 'cccl_c_stf', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']} @@ -559,6 +566,9 @@ projects: cccl_c_parallel: name: 'CCCL C Parallel' stds: [20] + cccl_c_parallel_hostjit: + name: 'CCCL C Parallel (HostJIT)' + stds: [20] cccl_c_stf: name: 'CCCL C CUDASTF' stds: [20] diff --git a/ci/project_files_and_dependencies.yaml b/ci/project_files_and_dependencies.yaml index 79273d8e092..287fbf3ffb4 100644 --- a/ci/project_files_and_dependencies.yaml +++ b/ci/project_files_and_dependencies.yaml @@ -114,6 +114,16 @@ projects: include_regexes: ["c/parallel/"] exclude_project_files: [cccl_c_parallel_public] + cccl_c_parallel_hostjit: + name: "CCCL C Parallel Library (HostJIT)" + matrix_project: "cccl_c_parallel_hostjit" + lite_dependencies: [libcudacxx_public] + full_dependencies: [cccl_c_parallel_public] + include_regexes: + - "c/parallel/src/hostjit/" + - "ci/build_cccl_c_parallel_hostjit\\.sh" + - "ci/test_cccl_c_parallel_hostjit\\.sh" + cccl_c_stf: name: "CCCL C CUDASTF Library" matrix_project: "cccl_c_stf" diff --git a/ci/test/inspect_changes/core_dirty.output b/ci/test/inspect_changes/core_dirty.output index 18eba6f5119..18fb9e417b8 100644 --- a/ci/test/inspect_changes/core_dirty.output +++ b/ci/test/inspect_changes/core_dirty.output @@ -1,2 +1,2 @@ -FULL_BUILD=libcudacxx cub thrust cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper nvrtcc tidy +FULL_BUILD=libcudacxx cub thrust cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper nvrtcc tidy LITE_BUILD= diff --git a/ci/test/inspect_changes/libcudacxx_both.output b/ci/test/inspect_changes/libcudacxx_both.output index 50134a6a731..f7a59149b12 100644 --- 
a/ci/test/inspect_changes/libcudacxx_both.output +++ b/ci/test/inspect_changes/libcudacxx_both.output @@ -1,2 +1,2 @@ FULL_BUILD=libcudacxx tidy -LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper +LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper diff --git a/ci/test/inspect_changes/libcudacxx_public_only.output b/ci/test/inspect_changes/libcudacxx_public_only.output index 50134a6a731..f7a59149b12 100644 --- a/ci/test/inspect_changes/libcudacxx_public_only.output +++ b/ci/test/inspect_changes/libcudacxx_public_only.output @@ -1,2 +1,2 @@ FULL_BUILD=libcudacxx tidy -LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper +LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper diff --git a/ci/test/inspect_changes/libcudacxx_thrust.output b/ci/test/inspect_changes/libcudacxx_thrust.output index 0c8e975985b..1a24f29859d 100644 --- a/ci/test/inspect_changes/libcudacxx_thrust.output +++ b/ci/test/inspect_changes/libcudacxx_thrust.output @@ -1,2 +1,2 @@ FULL_BUILD=libcudacxx thrust tidy -LITE_BUILD=cub cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper +LITE_BUILD=cub cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper diff --git a/ci/test_cccl_c_parallel_hostjit.sh b/ci/test_cccl_c_parallel_hostjit.sh new file mode 100755 index 00000000000..0180997344e --- /dev/null +++ b/ci/test_cccl_c_parallel_hostjit.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +./build_cccl_c_parallel_hostjit.sh "$@" + +PRESET="cccl-c-parallel-hostjit" + +test_preset "CCCL C Parallel Library (HostJIT)" ${PRESET} + +print_time_summary diff --git a/ci/windows/build_cccl_c_parallel_hostjit.ps1 b/ci/windows/build_cccl_c_parallel_hostjit.ps1 new file mode 
100644 index 00000000000..3d9c941f40f --- /dev/null +++ b/ci/windows/build_cccl_c_parallel_hostjit.ps1 @@ -0,0 +1,28 @@ +Param( + [Parameter(Mandatory = $false)] + [Alias("arch")] + [string]$CUDA_ARCH = "", + [Parameter(Mandatory = $false)] + [Alias("cmake-options")] + [string]$CMAKE_OPTIONS = "" +) + +$ErrorActionPreference = "Stop" + +$CURRENT_PATH = Split-Path $pwd -leaf +If($CURRENT_PATH -ne "ci") { + Write-Host "Moving to ci folder" + pushd "$PSScriptRoot/.." +} + +Remove-Module -Name build_common -ErrorAction SilentlyContinue +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList @(20, $CUDA_ARCH, $CMAKE_OPTIONS) + +$PRESET = "cccl-c-parallel-hostjit" +$LOCAL_CMAKE_OPTIONS = "" + +configure_and_build_preset "CCCL C Parallel (HostJIT)" $PRESET $LOCAL_CMAKE_OPTIONS + +If($CURRENT_PATH -ne "ci") { + popd +} diff --git a/ci/windows/test_cccl_c_parallel_hostjit.ps1 b/ci/windows/test_cccl_c_parallel_hostjit.ps1 new file mode 100644 index 00000000000..30c3d675390 --- /dev/null +++ b/ci/windows/test_cccl_c_parallel_hostjit.ps1 @@ -0,0 +1,31 @@ +Param( + [Parameter(Mandatory = $false)] + [Alias("arch")] + [string]$CUDA_ARCH = "", + [Parameter(Mandatory = $false)] + [Alias("cmake-options")] + [string]$CMAKE_OPTIONS = "" +) + +$ErrorActionPreference = "Stop" + +$CURRENT_PATH = Split-Path $pwd -leaf +If($CURRENT_PATH -ne "ci") { + Write-Host "Moving to ci folder" + pushd "$PSScriptRoot/.." +} + +# Build first +$buildCmd = "$PSScriptRoot/build_cccl_c_parallel_hostjit.ps1 -arch '$CUDA_ARCH' -cmake-options '$CMAKE_OPTIONS'" +Write-Host "Running: $buildCmd" +Invoke-Expression $buildCmd + +Remove-Module -Name build_common -ErrorAction SilentlyContinue +Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList @(20, $CUDA_ARCH, $CMAKE_OPTIONS) + +$PRESET = "cccl-c-parallel-hostjit" +test_preset "CCCL C Parallel (HostJIT)" "$PRESET" + +If($CURRENT_PATH -ne "ci") { + popd +}