diff --git a/CMakePresets.json b/CMakePresets.json index 610fdeca3a9..f2a5e45a9f3 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -406,6 +406,14 @@ "CCCL_C_Parallel_ENABLE_HEADER_TESTING": true } }, + { + "name": "cccl-c-parallel-hostjit", + "displayName": "CCCL C Parallel Library (HostJIT)", + "inherits": "cccl-c-parallel", + "cacheVariables": { + "CCCL_C_Parallel_ENABLE_HOSTJIT": true + } + }, { "name": "cccl-c-stf", "displayName": "CCCL C CUDASTF Library", @@ -638,6 +646,10 @@ "name": "cccl-c-parallel", "configurePreset": "cccl-c-parallel" }, + { + "name": "cccl-c-parallel-hostjit", + "configurePreset": "cccl-c-parallel-hostjit" + }, { "name": "cccl-c-stf", "configurePreset": "cccl-c-stf" @@ -917,6 +929,11 @@ "configurePreset": "cccl-c-parallel", "inherits": "base" }, + { + "name": "cccl-c-parallel-hostjit", + "configurePreset": "cccl-c-parallel-hostjit", + "inherits": "base" + }, { "name": "cccl-c-stf", "configurePreset": "cccl-c-stf", diff --git a/c/parallel/CMakeLists.txt b/c/parallel/CMakeLists.txt index b1796e866ef..f29bdb11c9e 100644 --- a/c/parallel/CMakeLists.txt +++ b/c/parallel/CMakeLists.txt @@ -8,6 +8,11 @@ option( "Build cccl.c.parallel standalone headers." OFF ) +option( + CCCL_C_Parallel_ENABLE_HOSTJIT + "Build HostJIT testing infrastructure (requires LLVM fetch, ~20 min first build)." + OFF +) # FIXME Ideally this would be handled by presets and install rules, but for now # consumers may override this to control the target location of cccl.c.parallel. 
@@ -27,6 +32,9 @@ file( "src/*.cpp" ) +# hostjit sources are built as a separate library; exclude from cccl.c.parallel +list(FILTER srcs EXCLUDE REGEX "src/hostjit/") + add_library(cccl.c.parallel SHARED ${srcs}) set_property(TARGET cccl.c.parallel PROPERTY POSITION_INDEPENDENT_CODE ON) cccl_configure_target(cccl.c.parallel DIALECT 20) @@ -48,6 +56,10 @@ cccl_get_thrust() add_subdirectory(src/jit_templates) +if (CCCL_C_Parallel_ENABLE_HOSTJIT) + add_subdirectory(src/hostjit) +endif() + set_target_properties(cccl.c.parallel PROPERTIES CUDA_RUNTIME_LIBRARY STATIC) target_link_libraries( cccl.c.parallel diff --git a/c/parallel/src/hostjit/CMakeLists.txt b/c/parallel/src/hostjit/CMakeLists.txt new file mode 100644 index 00000000000..6301e55f454 --- /dev/null +++ b/c/parallel/src/hostjit/CMakeLists.txt @@ -0,0 +1,211 @@ +cmake_minimum_required(VERSION 3.20) + +# -------------------------------------------------------------------------- +# LLVM/Clang/LLD — fetched via CPM as static libraries +# -------------------------------------------------------------------------- +# CPM.cmake is at the cccl repo root: cccl/cmake/CPM.cmake +# From c/parallel/src/hostjit/ that's ../../../../cmake/CPM.cmake +set(_cccl_cmake_dir "${CMAKE_CURRENT_SOURCE_DIR}/../../../../cmake") +if (EXISTS "${_cccl_cmake_dir}/CPM.cmake") + include("${_cccl_cmake_dir}/CPM.cmake") +else() + message(FATAL_ERROR "CPM.cmake not found at ${_cccl_cmake_dir}/CPM.cmake") +endif() + +if (MSVC AND CMAKE_BUILD_TYPE STREQUAL "Debug") + message( + FATAL_ERROR + "hostjit does not support Debug builds on Windows. " + "The statically-linked LLVM Debug build is too large and causes stack " + "overflows at runtime. Use MinSizeRel, Release, or RelWithDebInfo instead." 
+ ) +endif() + +set(HOSTJIT_LLVM_VERSION "llvmorg-22.1.1" CACHE STRING "LLVM git tag to fetch") + +# List options must be set before CPMAddPackage +set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "" FORCE) +set(LLVM_TARGETS_TO_BUILD "X86;NVPTX" CACHE STRING "" FORCE) + +CPMAddPackage( + NAME llvm_project + GIT_REPOSITORY https://github.com/llvm/llvm-project.git + GIT_TAG ${HOSTJIT_LLVM_VERSION} + GIT_SHALLOW ON + SOURCE_SUBDIR llvm + EXCLUDE_FROM_ALL YES + OPTIONS + "LLVM_BUILD_LLVM_C_DYLIB OFF" + "LLVM_BUILD_TOOLS OFF" + "LLVM_BUILD_UTILS OFF" + "LLVM_BUILD_RUNTIME OFF" + "LLVM_BUILD_RUNTIMES OFF" + "LLVM_INCLUDE_BENCHMARKS OFF" + "LLVM_INCLUDE_DOCS OFF" + "LLVM_INCLUDE_EXAMPLES OFF" + "LLVM_INCLUDE_RUNTIMES OFF" + "LLVM_INCLUDE_TESTS OFF" + "LLVM_INCLUDE_TOOLS ON" + "LLVM_INCLUDE_UTILS OFF" + "LLVM_ENABLE_ZLIB OFF" + "LLVM_ENABLE_ZSTD OFF" + "LLVM_ENABLE_TERMINFO OFF" + "LLVM_ENABLE_BINDINGS OFF" + "CLANG_BUILD_TOOLS OFF" + "CLANG_ENABLE_ARCMT OFF" + "CLANG_ENABLE_STATIC_ANALYZER OFF" +) + +# Ensure the clang resource directory exists +file( + MAKE_DIRECTORY "${llvm_project_BINARY_DIR}/lib/clang/${LLVM_VERSION_MAJOR}" +) + +# Find CUDA toolkit (may already be found by parent) +if (NOT CUDAToolkit_FOUND) + find_package(CUDAToolkit) +endif() + +# -------------------------------------------------------------------------- +# hostjit library +# -------------------------------------------------------------------------- +add_library(hostjit_lib compiler.cpp config.cpp loader.cpp jit_compiler.cpp) + +# CCCL_SOURCE_DIR points to the cccl repo root +# From c/parallel/src/hostjit -> c/parallel/src -> c/parallel -> c -> cccl +cmake_path(GET CMAKE_CURRENT_SOURCE_DIR PARENT_PATH _src_dir) # c/parallel/src +cmake_path(GET _src_dir PARENT_PATH _c_parallel_dir) # c/parallel +cmake_path(GET _c_parallel_dir PARENT_PATH _c_dir) # c +cmake_path(GET _c_dir PARENT_PATH _cccl_root) # cccl + +target_include_directories( + hostjit_lib + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + 
${_c_parallel_dir}/include + ${llvm_project_SOURCE_DIR}/llvm/include + ${llvm_project_BINARY_DIR}/include + ${llvm_project_SOURCE_DIR}/clang/include + ${llvm_project_BINARY_DIR}/tools/clang/include + ${llvm_project_SOURCE_DIR}/lld/include + ${llvm_project_BINARY_DIR}/tools/lld/include +) + +target_compile_definitions( + hostjit_lib + PRIVATE + CCCL_C_EXPERIMENTAL=1 + CCCL_SOURCE_DIR="${_cccl_root}" + CLANG_RESOURCE_DIR="${llvm_project_BINARY_DIR}/lib/clang/${LLVM_VERSION_MAJOR}" + CLANG_HEADERS_DIR="${llvm_project_SOURCE_DIR}/clang/lib/Headers" + HOSTJIT_INCLUDE_DIR="${CMAKE_CURRENT_SOURCE_DIR}/include" +) + +if (CUDAToolkit_FOUND) + target_include_directories(hostjit_lib PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH CUDA_TOOLKIT_ROOT_FROM_CMAKE) + target_compile_definitions( + hostjit_lib + PRIVATE + CUDA_TOOLKIT_PATH="${CUDA_TOOLKIT_ROOT_FROM_CMAKE}" + CUDA_SDK_VERSION="${CUDAToolkit_VERSION_MAJOR}.0" + ) +endif() + +# Link against LLVM/Clang/LLD +target_link_libraries( + hostjit_lib + PUBLIC + # LLVM + LLVMCore + LLVMSupport + LLVMIRReader + LLVMMC + LLVMObject + LLVMX86CodeGen + LLVMX86AsmParser + LLVMX86Desc + LLVMX86Info + LLVMNVPTXCodeGen + LLVMNVPTXDesc + LLVMNVPTXInfo + LLVMLinker + LLVMPasses + # Clang + clangAST + clangBasic + clangCodeGen + clangDriver + clangFrontend + clangFrontendTool + clangLex + clangParse + clangSema + clangEdit + clangAnalysis + clangRewrite + clangSerialization + # LLD + $,lldCOFF,lldELF> + lldCommon +) + +if (NOT WIN32) + target_link_libraries(hostjit_lib PUBLIC dl) +endif() + +if (CUDAToolkit_FOUND) + target_link_libraries(hostjit_lib PUBLIC CUDA::cuda_driver CUDA::cudart) + # nvJitLink and nvfatbin are required at link and runtime. + # nvptxcompiler is a transitive dep of libnvJitLink_static. + # Prefer static variants on non-Windows; fall back to the dynamic imported target; + # fall back further to find_library in case FindCUDAToolkit didn't create the target + # (e.g. 
partial CTK installs on Ubuntu or Windows). + foreach (_lib nvJitLink nvptxcompiler nvfatbin) + if (NOT WIN32 AND TARGET CUDA::${_lib}_static) + target_link_libraries(hostjit_lib PUBLIC CUDA::${_lib}_static) + elseif (TARGET CUDA::${_lib}) + target_link_libraries(hostjit_lib PUBLIC CUDA::${_lib}) + else() + find_library( + _hostjit_${_lib} + NAMES ${_lib} + HINTS + "${CUDAToolkit_LIBRARY_DIR}" + "${CUDAToolkit_ROOT}/lib/x64" + "${CUDAToolkit_ROOT}/lib64" + "${CUDAToolkit_ROOT}/lib" + ) + if (_hostjit_${_lib}) + message(STATUS "hostjit: linking ${_lib} from ${_hostjit_${_lib}}") + target_link_libraries(hostjit_lib PUBLIC "${_hostjit_${_lib}}") + else() + message( + FATAL_ERROR + "hostjit requires ${_lib} but it was not found.\n" + " Ubuntu: apt-get install libnvfatbin-- or libnvjitlink--\n" + " Windows: reinstall the CUDA toolkit and ensure the nvfatbin/nvjitlink " + "components are selected." + ) + endif() + endif() + endforeach() +endif() + +if (NOT MSVC) + target_compile_options(hostjit_lib PRIVATE -fno-rtti) +endif() + +set_target_properties( + hostjit_lib + PROPERTIES CXX_STANDARD 20 POSITION_INDEPENDENT_CODE ON +) + +# On Windows with multi-config generators (Visual Studio), exclude hostjit +# targets from Debug builds — the LLVM Debug build causes stack overflows. 
+if (MSVC) + set_target_properties( + hostjit_lib + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_DEBUG TRUE + ) +endif() diff --git a/c/parallel/src/hostjit/compiler.cpp b/c/parallel/src/hostjit/compiler.cpp new file mode 100644 index 00000000000..5881cc6ace0 --- /dev/null +++ b/c/parallel/src/hostjit/compiler.cpp @@ -0,0 +1,1534 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Selective target initialization (X86 for host, NVPTX for device) +extern "C" { +void LLVMInitializeX86TargetInfo(); +void LLVMInitializeX86Target(); +void LLVMInitializeX86TargetMC(); +void LLVMInitializeX86AsmPrinter(); +void LLVMInitializeX86AsmParser(); +void LLVMInitializeNVPTXTargetInfo(); +void LLVMInitializeNVPTXTarget(); +void LLVMInitializeNVPTXTargetMC(); +void LLVMInitializeNVPTXAsmPrinter(); +} + +#ifdef _WIN32 +LLD_HAS_DRIVER(coff) +#else +LLD_HAS_DRIVER(elf) +#endif + +#ifdef _WIN32 +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace hostjit +{ +static bool llvm_initialized = false; + +static void initialize_llvm() +{ + if (llvm_initialized) + { + return; + } + + LLVMInitializeX86TargetInfo(); + LLVMInitializeX86Target(); + LLVMInitializeX86TargetMC(); + LLVMInitializeX86AsmPrinter(); + LLVMInitializeX86AsmParser(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + + llvm_initialized = true; +} + +#ifdef _WIN32 +// Generate a minimal COFF import library for a given DLL. +// This allows linking without requiring the Windows SDK or MSVC .lib files. +// Symbols can be "name" or "name=dllexport" for aliasing. 
+static bool generateImportLib( + const std::string& dll_name, + const std::vector& symbols, + const std::string& output_path, + bool data_only = false) +{ + std::vector exports; + for (const auto& sym : symbols) + { + llvm::object::COFFShortExport exp; + auto eq = sym.find('='); + if (eq != std::string::npos) + { + // "atexit=_crt_atexit" means: linker sees "atexit", DLL exports "_crt_atexit" + exp.Name = sym.substr(0, eq); // symbol name the linker resolves + exp.ImportName = sym.substr(eq + 1); // actual DLL export name + } + else + { + exp.Name = sym; + } + exp.Data = data_only; + exports.push_back(exp); + } + auto err = llvm::object::writeImportLibrary( + dll_name, + output_path, + exports, + llvm::COFF::IMAGE_FILE_MACHINE_AMD64, + /*MinGW=*/false); + if (err) + { + llvm::consumeError(std::move(err)); + return false; + } + return true; +} + +// Find the actual DLL filename for cudart (e.g. "cudart64_13.dll") by +// scanning the CUDA toolkit bin directory. +static std::string findCudartDllName(const std::string& cuda_toolkit_path) +{ + namespace fs = std::filesystem; + for (const auto& subdir : {"bin/x64", "bin"}) + { + fs::path dir = fs::path(cuda_toolkit_path) / subdir; + if (!fs::exists(dir)) + { + continue; + } + for (const auto& entry : fs::directory_iterator(dir)) + { + auto name = entry.path().filename().string(); + if (name.starts_with("cudart64_") && name.ends_with(".dll")) + { + return name; + } + } + } + return "cudart64_12.dll"; // fallback +} +#endif + +// Headers precompiled into the PCH cache. Covers the algorithms exposed +// by the C parallel library so that a single pair of PCH files (device + +// host) is reused across reduce, adjacent-difference, etc. +static constexpr const char* pch_preamble_source = + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n" + "#include \n"; + +class CUDACompiler::Impl +{ +public: + Impl() {} + + // Get the persistent PCH cache directory. 
+ static std::filesystem::path getPCHCacheDir() + { + auto dir = std::filesystem::temp_directory_path() / "hostjit_pch"; + std::filesystem::create_directories(dir); + return dir; + } + + // Get a persistent cache path for a PCH file. + static std::string getPCHPath(const std::string& kind, int sm_version) + { + return (getPCHCacheDir() / (kind + "_sm" + std::to_string(sm_version) + ".pch")).string(); + } + + // Get the persistent path for the PCH preamble source file. + // The PCH stores a reference to this path, so it must be stable across runs. + static std::string getPCHSourcePath(const std::string& kind, int sm_version) + { + return (getPCHCacheDir() / (kind + "_sm" + std::to_string(sm_version) + "_preamble.cu")).string(); + } + + // Write preamble to a persistent file and generate a PCH from it. + // arg_strings[0] will be replaced with the persistent preamble path. + bool generatePCH(const std::string& pch_source, + const std::string& pch_source_path, + const std::string& pch_output_path, + std::vector arg_strings, + std::string& diagnostics) + { + // Write preamble to the persistent source path + { + std::ofstream f(pch_source_path); + if (!f) + { + diagnostics += "Failed to write PCH preamble to " + pch_source_path; + return false; + } + f << pch_source; + } + + // Replace the source file arg with the persistent path + arg_strings[0] = pch_source_path; + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + auto* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if 
(!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + diagnostics += diag_output + "\nFailed to create PCH compiler invocation"; + return false; + } + + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.createFileManager(); + compiler.getFrontendOpts().OutputFile = pch_output_path; + + clang::GeneratePCHAction pch_action; + bool success = compiler.ExecuteAction(pch_action); + + diag_stream.flush(); + diagnostics += diag_output; + + return success; + } + + llvm::IntrusiveRefCntPtr + createVFSWithSource(const std::string& source_code, const std::string& virtual_path) + { + auto mem_fs = llvm::makeIntrusiveRefCnt(); + mem_fs->addFile(virtual_path, 0, llvm::MemoryBuffer::getMemBuffer(source_code)); + + auto overlay = llvm::makeIntrusiveRefCnt(llvm::vfs::getRealFileSystem()); + overlay->pushOverlay(mem_fs); + return overlay; + } + + bool compileDeviceToPTX( + const std::string& source_code, + const std::string& input_file, + const std::string& output_ptx, + const CompilerConfig& config, + std::string& diagnostics) + { + std::string temp_dir = std::filesystem::path(output_ptx).parent_path().string(); + std::string source_file = temp_dir + "/" + input_file; + + std::string resource_dir = CLANG_RESOURCE_DIR; + + int ptx_version = 70; + if (config.sm_version >= 120) + { + ptx_version = 87; + } + else if (config.sm_version >= 100) + { + ptx_version = 85; + } + else if (config.sm_version >= 90) + { + ptx_version = 80; + } + else if (config.sm_version >= 89) + { + ptx_version = 78; + } + else if (config.sm_version >= 80) + { + ptx_version = 75; + } + + std::vector arg_strings; + arg_strings.push_back(source_file); + arg_strings.push_back("-triple"); + arg_strings.push_back("nvptx64-nvidia-cuda"); + arg_strings.push_back("-aux-triple"); +#ifdef _WIN32 + arg_strings.push_back("x86_64-pc-windows-msvc"); +#else + arg_strings.push_back("x86_64-pc-linux-gnu"); +#endif + arg_strings.push_back("-S"); + 
arg_strings.push_back("-aux-target-cpu"); + arg_strings.push_back("x86-64"); + arg_strings.push_back("-fcuda-is-device"); + arg_strings.push_back("-fcuda-allow-variadic-functions"); +#ifdef _WIN32 + arg_strings.push_back("-fms-compatibility"); + arg_strings.push_back("-fms-compatibility-version=19.40"); +#else + arg_strings.push_back("-fgnuc-version=4.2.1"); +#endif + arg_strings.push_back("-mlink-builtin-bitcode"); + arg_strings.push_back(config.cuda_toolkit_path + "/nvvm/libdevice/libdevice.10.bc"); + arg_strings.push_back("-target-sdk-version=" CUDA_SDK_VERSION); + arg_strings.push_back("-target-cpu"); + arg_strings.push_back("sm_" + std::to_string(config.sm_version)); + arg_strings.push_back("-target-feature"); + arg_strings.push_back("+ptx" + std::to_string(ptx_version)); + arg_strings.push_back("-resource-dir"); + arg_strings.push_back(resource_dir); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/stubs"); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back( + config.clang_headers_path.empty() ? 
std::string(CLANG_HEADERS_DIR) : config.clang_headers_path); + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include/cuda/std"); + } + else + { + arg_strings.push_back(config.cccl_include_path + "/cuda/std"); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/cub"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/thrust"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.cuda_toolkit_path + "/include"); + arg_strings.push_back("-include"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h"); + + for (const auto& include_path : config.include_paths) + { + arg_strings.push_back("-I" + include_path); + } + + arg_strings.push_back("-D__HOSTJIT_DEVICE_COMPILATION__=1"); + arg_strings.push_back("-DNDEBUG"); + arg_strings.push_back("-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"); + arg_strings.push_back("-DCCCL_DISABLE_FP16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_BF16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_NVTX=1"); + arg_strings.push_back("-DCCCL_DISABLE_EXCEPTIONS=1"); + + std::vector bitcode_files_to_link = config.device_bitcode_files; + + for (const auto& [macro_name, macro_value] : config.macro_definitions) + { + if (macro_value.empty()) + { + arg_strings.push_back("-D" + macro_name); + 
} + else + { + arg_strings.push_back("-D" + macro_name + "=" + macro_value); + } + } + + arg_strings.push_back("-fdeprecated-macro"); + arg_strings.push_back("--offload-new-driver"); + arg_strings.push_back("-fskip-odr-check-in-gmf"); + arg_strings.push_back("-fcxx-exceptions"); + arg_strings.push_back("-fexceptions"); + arg_strings.push_back("-O" + std::to_string(config.optimization_level)); + arg_strings.push_back("-std=c++17"); + + if (config.trace_includes) + { + arg_strings.push_back("-H"); + } + + arg_strings.push_back("-x"); + arg_strings.push_back("cuda"); + + // --- PCH: ensure device PCH exists --- + std::string device_pch_path; + if (config.enable_pch) + { + device_pch_path = getPCHPath("device", config.sm_version); + if (!std::filesystem::exists(device_pch_path)) + { + auto pch_src_path = getPCHSourcePath("device", config.sm_version); + std::string pch_diag; + if (!generatePCH(pch_preamble_source, pch_src_path, device_pch_path, arg_strings, pch_diag)) + { + diagnostics += "Device PCH generation failed: " + pch_diag + "\n"; + device_pch_path.clear(); + } + else if (config.verbose) + { + diagnostics += "Generated device PCH: " + device_pch_path + "\n"; + } + } + } + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + if (config.verbose) + { + diagnostics += "Device args: "; + for (const auto& arg : arg_strings) + { + diagnostics += arg + " "; + } + diagnostics += "\n"; + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + clang::TextDiagnosticPrinter* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if 
(!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + diagnostics += diag_output; + diagnostics += "\nFailed to create device compiler invocation"; + return false; + } + + // --- PCH: load cached device PCH --- + if (!device_pch_path.empty() && std::filesystem::exists(device_pch_path)) + { + invocation.getPreprocessorOpts().ImplicitPCHInclude = device_pch_path; + } + + auto vfs = createVFSWithSource(source_code, source_file); + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.setVirtualFileSystem(vfs); + compiler.createFileManager(); + compiler.getFrontendOpts().OutputFile = output_ptx; + + if (config.trace_includes) + { + diagnostics += "\n=== Device Header Search Paths ===\n"; + const auto& hso = invocation.getHeaderSearchOpts(); + for (const auto& entry : hso.UserEntries) + { + diagnostics += " " + entry.Path + "\n"; + } + diagnostics += "=== End Header Search Paths ===\n\n"; + } + + llvm::LLVMContext llvm_context; + + clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); + bool success = compiler.ExecuteAction(emit_llvm_action); + + if (config.trace_includes && compiler.hasSourceManager()) + { + diagnostics += "\n=== Device Included Files ===\n"; + auto& sm = compiler.getSourceManager(); + for (auto it = sm.fileinfo_begin(); it != sm.fileinfo_end(); ++it) + { + diagnostics += " " + it->first.getName().str() + "\n"; + } + diagnostics += "=== End Included Files ===\n\n"; + } + + if (success) + { + std::unique_ptr mod = emit_llvm_action.takeModule(); + if (mod) + { + for (const auto& bc_file : bitcode_files_to_link) + { + llvm::SMDiagnostic err; + auto bc_mod = llvm::parseIRFile(bc_file, err, llvm_context); + if (bc_mod) + { + if (llvm::Linker::linkModules(*mod, std::move(bc_mod))) + { + diagnostics += "Failed to link bitcode: " + bc_file + "\n"; + success = false; + break; + } + } + else + { + diagnostics += "Failed to parse bitcode: " + bc_file + "\n"; + success = false; + break; + } + 
} + + // Re-link libdevice to resolve any new references (e.g. __nv_pow) + // introduced by the extra bitcode modules. + if (success && !bitcode_files_to_link.empty()) + { + std::string libdevice_path = config.cuda_toolkit_path + "/nvvm/libdevice/libdevice.10.bc"; + llvm::SMDiagnostic err; + auto libdevice = llvm::parseIRFile(libdevice_path, err, llvm_context); + if (libdevice) + { + // Use AppendToUsed to avoid internalization issues + llvm::Linker::linkModules(*mod, std::move(libdevice), llvm::Linker::LinkOnlyNeeded); + } + } + + if (success) + { + std::string err_str; + const llvm::Target* target = llvm::TargetRegistry::lookupTarget(mod->getTargetTriple(), err_str); + if (target) + { + llvm::TargetOptions opt; + auto tm = target->createTargetMachine( + mod->getTargetTriple(), + "sm_" + std::to_string(config.sm_version), + "+ptx" + std::to_string(ptx_version), + opt, + llvm::Reloc::PIC_); + if (tm) + { + mod->setDataLayout(tm->createDataLayout()); + + // Run optimization passes after linking to inline user-provided + // operations (from bitcode or embedded C++ source). + if (!config.entry_point_name.empty()) + { + // Internalize all functions except the entry point and + // GPU kernels, so the optimizer can inline the linked + // bitcode functions. 
+ for (auto& F : *mod) + { + if (!F.isDeclaration() && F.getLinkage() == llvm::GlobalValue::ExternalLinkage + && F.getName() != config.entry_point_name && F.getCallingConv() != llvm::CallingConv::PTX_Kernel) + { + F.setLinkage(llvm::GlobalValue::InternalLinkage); + // Remove attributes that conflict with inlining + F.removeFnAttr(llvm::Attribute::NoInline); + F.removeFnAttr(llvm::Attribute::OptimizeNone); + F.addFnAttr(llvm::Attribute::AlwaysInline); + } + } + + llvm::OptimizationLevel opt_level; + switch (config.optimization_level) + { + case 0: + opt_level = llvm::OptimizationLevel::O0; + break; + case 1: + opt_level = llvm::OptimizationLevel::O1; + break; + case 3: + opt_level = llvm::OptimizationLevel::O3; + break; + default: + opt_level = llvm::OptimizationLevel::O2; + break; + } + + llvm::LoopAnalysisManager LAM; + llvm::FunctionAnalysisManager FAM; + llvm::CGSCCAnalysisManager CGAM; + llvm::ModuleAnalysisManager MAM; + + llvm::PassBuilder PB(tm); + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + auto MPM = PB.buildPerModuleDefaultPipeline(opt_level); + MPM.run(*mod, MAM); + } + + std::error_code EC; + llvm::raw_fd_ostream dest(output_ptx, EC); + if (!EC) + { + llvm::legacy::PassManager pass; + tm->addPassesToEmitFile(pass, dest, nullptr, llvm::CodeGenFileType::AssemblyFile); + pass.run(*mod); + } + else + { + diagnostics += "Failed to open output file: " + output_ptx + "\n"; + success = false; + } + } + else + { + diagnostics += "Failed to create target machine\n"; + success = false; + } + } + else + { + diagnostics += "Failed to lookup target: " + err_str + "\n"; + success = false; + } + } + } + } + + diag_stream.flush(); + diagnostics += diag_output; + + return success; + } + + BitcodeResult compileToDeviceBitcode(const std::string& source_code, const CompilerConfig& config) + { + BitcodeResult result; + result.success = 
false; + + std::string error_msg; + if (!validateConfig(config, &error_msg)) + { + result.diagnostics = "Configuration error: " + error_msg; + return result; + } + + initialize_llvm(); + + std::string temp_dir = + (std::filesystem::temp_directory_path() / ("hostjit_bc_" + std::to_string(reinterpret_cast(this)))) + .string(); + std::filesystem::create_directories(temp_dir); + + std::string input_file = "input.cu"; + std::string source_file = temp_dir + "/" + input_file; + std::string resource_dir = CLANG_RESOURCE_DIR; + + int ptx_version = 70; + if (config.sm_version >= 120) + { + ptx_version = 87; + } + else if (config.sm_version >= 100) + { + ptx_version = 85; + } + else if (config.sm_version >= 90) + { + ptx_version = 80; + } + else if (config.sm_version >= 89) + { + ptx_version = 78; + } + else if (config.sm_version >= 80) + { + ptx_version = 75; + } + + std::vector arg_strings; + arg_strings.push_back(source_file); + arg_strings.push_back("-triple"); + arg_strings.push_back("nvptx64-nvidia-cuda"); + arg_strings.push_back("-aux-triple"); +#ifdef _WIN32 + arg_strings.push_back("x86_64-pc-windows-msvc"); +#else + arg_strings.push_back("x86_64-pc-linux-gnu"); +#endif + arg_strings.push_back("-S"); + arg_strings.push_back("-aux-target-cpu"); + arg_strings.push_back("x86-64"); + arg_strings.push_back("-fcuda-is-device"); + arg_strings.push_back("-fcuda-allow-variadic-functions"); +#ifdef _WIN32 + arg_strings.push_back("-fms-compatibility"); + arg_strings.push_back("-fms-compatibility-version=19.40"); +#else + arg_strings.push_back("-fgnuc-version=4.2.1"); +#endif + arg_strings.push_back("-mlink-builtin-bitcode"); + arg_strings.push_back(config.cuda_toolkit_path + "/nvvm/libdevice/libdevice.10.bc"); + arg_strings.push_back("-target-sdk-version=" CUDA_SDK_VERSION); + arg_strings.push_back("-target-cpu"); + arg_strings.push_back("sm_" + std::to_string(config.sm_version)); + arg_strings.push_back("-target-feature"); + arg_strings.push_back("+ptx" + 
std::to_string(ptx_version)); + arg_strings.push_back("-resource-dir"); + arg_strings.push_back(resource_dir); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/stubs"); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back( + config.clang_headers_path.empty() ? std::string(CLANG_HEADERS_DIR) : config.clang_headers_path); + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include/cuda/std"); + } + else + { + arg_strings.push_back(config.cccl_include_path + "/cuda/std"); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/cub"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/thrust"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.cuda_toolkit_path + "/include"); + arg_strings.push_back("-include"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h"); + arg_strings.push_back("-D__HOSTJIT_DEVICE_COMPILATION__=1"); + arg_strings.push_back("-DNDEBUG"); + arg_strings.push_back("-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"); + arg_strings.push_back("-DCCCL_DISABLE_FP16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_BF16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_NVTX=1"); + 
arg_strings.push_back("-DCCCL_DISABLE_EXCEPTIONS=1"); + arg_strings.push_back("-fdeprecated-macro"); + arg_strings.push_back("-fcxx-exceptions"); + arg_strings.push_back("-fexceptions"); + arg_strings.push_back("-O" + std::to_string(config.optimization_level)); + arg_strings.push_back("-Wno-c++11-narrowing"); + arg_strings.push_back("-std=c++17"); + arg_strings.push_back("-x"); + arg_strings.push_back("cuda"); + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + clang::TextDiagnosticPrinter* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if (!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + result.diagnostics = diag_output + "\nFailed to create compiler invocation"; + std::filesystem::remove_all(temp_dir); + return result; + } + + auto vfs = createVFSWithSource(source_code, source_file); + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.setVirtualFileSystem(vfs); + compiler.createFileManager(); + + llvm::LLVMContext llvm_context; + clang::EmitLLVMOnlyAction emit_llvm_action(&llvm_context); + bool success = compiler.ExecuteAction(emit_llvm_action); + + if (success) + { + std::unique_ptr mod = emit_llvm_action.takeModule(); + if (mod) + { + llvm::SmallVector buffer; + llvm::raw_svector_ostream os(buffer); + llvm::WriteBitcodeToFile(*mod, os); + result.bitcode = std::string(buffer.begin(), buffer.end()); + result.success = true; + } + else + { + result.diagnostics = "Failed to get LLVM module"; + } + } + + diag_stream.flush(); + 
result.diagnostics += diag_output; + std::filesystem::remove_all(temp_dir); + return result; + } + + bool compileHostCode( + const std::string& source_code, + const std::string& input_file, + const std::string& fatbin_path, + const std::string& output_obj, + const CompilerConfig& config, + std::string& diagnostics) + { + std::string temp_dir = std::filesystem::path(output_obj).parent_path().string(); + std::string source_file = temp_dir + "/host_" + input_file; + + std::string resource_dir = CLANG_RESOURCE_DIR; + + std::vector arg_strings; + arg_strings.push_back(source_file); + arg_strings.push_back("-triple"); +#ifdef _WIN32 + arg_strings.push_back("x86_64-pc-windows-msvc"); +#else + arg_strings.push_back("x86_64-pc-linux-gnu"); +#endif + arg_strings.push_back("-aux-triple"); + arg_strings.push_back("nvptx64-nvidia-cuda"); + arg_strings.push_back("-target-sdk-version=" CUDA_SDK_VERSION); + arg_strings.push_back("-emit-obj"); + arg_strings.push_back("-target-cpu"); + arg_strings.push_back("x86-64"); + arg_strings.push_back("-fcuda-allow-variadic-functions"); +#ifdef _WIN32 + arg_strings.push_back("-fms-compatibility"); + arg_strings.push_back("-fms-compatibility-version=19.40"); +#else + arg_strings.push_back("-fgnuc-version=4.2.1"); +#endif + arg_strings.push_back("-mrelocation-model"); + arg_strings.push_back("pic"); + arg_strings.push_back("-pic-level"); + arg_strings.push_back("2"); + arg_strings.push_back("-resource-dir"); + arg_strings.push_back(resource_dir); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/stubs"); + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back( + config.clang_headers_path.empty() ? 
std::string(CLANG_HEADERS_DIR) : config.clang_headers_path); + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include/cuda/std"); + } + else + { + arg_strings.push_back(config.cccl_include_path + "/cuda/std"); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/libcudacxx/include"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/cub"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + if (config.cccl_include_path.empty()) + { + arg_strings.push_back(std::string(CCCL_SOURCE_DIR) + "/thrust"); + } + else + { + arg_strings.push_back(config.cccl_include_path); + } + arg_strings.push_back("-internal-isystem"); + arg_strings.push_back(config.cuda_toolkit_path + "/include"); + arg_strings.push_back("-include"); + arg_strings.push_back(config.hostjit_include_path + "/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h"); + + for (const auto& include_path : config.include_paths) + { + arg_strings.push_back("-I" + include_path); + } + + arg_strings.push_back("-DNDEBUG"); + arg_strings.push_back("-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"); + arg_strings.push_back("-D_CCCL_ENABLE_FREESTANDING=1"); + arg_strings.push_back("-DCCCL_DISABLE_FP16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_BF16_SUPPORT=1"); + arg_strings.push_back("-DCCCL_DISABLE_NVTX=1"); + arg_strings.push_back("-DCCCL_DISABLE_EXCEPTIONS=1"); + + for (const auto& [macro_name, macro_value] : config.macro_definitions) + { + if (macro_value.empty()) + { + arg_strings.push_back("-D" + macro_name); + } + else + { + arg_strings.push_back("-D" + macro_name + "=" + 
macro_value); + } + } + + arg_strings.push_back("-fdeprecated-macro"); + arg_strings.push_back("--offload-new-driver"); + arg_strings.push_back("-fskip-odr-check-in-gmf"); + arg_strings.push_back("-O" + std::to_string(config.optimization_level)); + arg_strings.push_back("-std=c++17"); + + if (config.trace_includes) + { + arg_strings.push_back("-H"); + } + + arg_strings.push_back("-x"); + arg_strings.push_back("cuda"); + + // --- PCH: ensure host PCH exists (before adding fatbin-specific args) --- + std::string host_pch_path; + if (config.enable_pch) + { + host_pch_path = getPCHPath("host", config.sm_version); + if (!std::filesystem::exists(host_pch_path)) + { + auto pch_src_path = getPCHSourcePath("host", config.sm_version); + std::string pch_diag; + if (!generatePCH(pch_preamble_source, pch_src_path, host_pch_path, arg_strings, pch_diag)) + { + diagnostics += "Host PCH generation failed: " + pch_diag + "\n"; + host_pch_path.clear(); + } + else if (config.verbose) + { + diagnostics += "Generated host PCH: " + host_pch_path + "\n"; + } + } + } + + // Add fatbin embedding (per-build, not part of PCH) + arg_strings.push_back("-fcuda-include-gpubinary"); + arg_strings.push_back(fatbin_path); + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + if (config.verbose) + { + diagnostics += "Host args: "; + for (const auto& arg : arg_strings) + { + diagnostics += arg + " "; + } + diagnostics += "\n"; + } + + std::string diag_output; + llvm::raw_string_ostream diag_stream(diag_output); + + clang::DiagnosticOptions diag_opts; + diag_opts.ShowColors = false; + clang::TextDiagnosticPrinter* diag_printer = new clang::TextDiagnosticPrinter(diag_stream, diag_opts); + clang::IntrusiveRefCntPtr diag_ids(new clang::DiagnosticIDs()); + clang::DiagnosticsEngine diag_engine(diag_ids, diag_opts, diag_printer); + + clang::CompilerInstance compiler; + auto& invocation = compiler.getInvocation(); + + if 
(!clang::CompilerInvocation::CreateFromArgs(invocation, args, diag_engine)) + { + diag_stream.flush(); + diagnostics += diag_output; + diagnostics += "\nFailed to create host compiler invocation"; + return false; + } + + // --- PCH: load cached host PCH --- + if (!host_pch_path.empty() && std::filesystem::exists(host_pch_path)) + { + invocation.getPreprocessorOpts().ImplicitPCHInclude = host_pch_path; + } + + auto vfs = createVFSWithSource(source_code, source_file); + compiler.createDiagnostics(diag_engine.getClient(), false); + compiler.setVirtualFileSystem(vfs); + compiler.createFileManager(); + compiler.getFrontendOpts().OutputFile = output_obj; + + if (config.trace_includes) + { + diagnostics += "\n=== Host Header Search Paths ===\n"; + const auto& hso = invocation.getHeaderSearchOpts(); + for (const auto& entry : hso.UserEntries) + { + diagnostics += " " + entry.Path + "\n"; + } + diagnostics += "=== End Header Search Paths ===\n\n"; + } + + clang::EmitObjAction emit_action; + bool success = compiler.ExecuteAction(emit_action); + + if (config.trace_includes && compiler.hasSourceManager()) + { + diagnostics += "\n=== Host Included Files ===\n"; + auto& sm = compiler.getSourceManager(); + for (auto it = sm.fileinfo_begin(); it != sm.fileinfo_end(); ++it) + { + diagnostics += " " + it->first.getName().str() + "\n"; + } + diagnostics += "=== End Included Files ===\n\n"; + } + + diag_stream.flush(); + diagnostics += diag_output; + + return success; + } + + CompilationResult + compileToObject(const std::string& source_code, const std::string& output_path, const CompilerConfig& config) + { + CompilationResult result; + result.success = false; + result.object_file_path = output_path; + + std::string error_msg; + if (!validateConfig(config, &error_msg)) + { + result.diagnostics = "Configuration error: " + error_msg; + return result; + } + + initialize_llvm(); + + std::string temp_dir = + (std::filesystem::temp_directory_path() / ("hostjit_" + 
std::to_string(reinterpret_cast(this)))) + .string(); + std::filesystem::create_directories(temp_dir); + + std::string input_file = "input.cu"; + std::string ptx_file = temp_dir + "/device.ptx"; + std::string fatbin_file = temp_dir + "/device.fatbin"; + + if (config.verbose) + { + result.diagnostics += "=== Device compilation ===\n"; + } + + if (!compileDeviceToPTX(source_code, input_file, ptx_file, config, result.diagnostics)) + { + result.diagnostics += "\nDevice compilation failed"; + std::filesystem::remove_all(temp_dir); + return result; + } + + if (config.verbose) + { + result.diagnostics += "\n=== nvJitLink + fatbinary ===\n"; + } + + { + std::vector ptx_data; + { + std::ifstream f(ptx_file, std::ios::binary); + ptx_data.assign(std::istreambuf_iterator(f), std::istreambuf_iterator()); + } + if (ptx_data.empty()) + { + result.diagnostics += "\nFailed to read ptx file"; + std::filesystem::remove_all(temp_dir); + return result; + } + if (ptx_data.back() != '\0') + { + ptx_data.push_back('\0'); + } + + std::string arch_opt = "-arch=sm_" + std::to_string(config.sm_version); + std::string opt_level = "-O" + std::to_string(config.optimization_level >= 1 ? 
3 : 0); + const char* jitlink_options[] = {arch_opt.c_str(), opt_level.c_str()}; + nvJitLinkHandle jitlink_handle = nullptr; + nvJitLinkResult jlr = nvJitLinkCreate(&jitlink_handle, 2, jitlink_options); + if (jlr != NVJITLINK_SUCCESS) + { + result.diagnostics += "\nnvJitLinkCreate failed (error " + std::to_string(static_cast(jlr)) + ")"; + std::filesystem::remove_all(temp_dir); + return result; + } + + jlr = nvJitLinkAddData(jitlink_handle, NVJITLINK_INPUT_PTX, ptx_data.data(), ptx_data.size(), "device.ptx"); + if (jlr != NVJITLINK_SUCCESS) + { + size_t log_size = 0; + nvJitLinkGetErrorLogSize(jitlink_handle, &log_size); + if (log_size > 1) + { + std::string log(log_size, '\0'); + nvJitLinkGetErrorLog(jitlink_handle, log.data()); + result.diagnostics += "\n" + log; + } + result.diagnostics += "\nnvJitLinkAddData failed"; + nvJitLinkDestroy(&jitlink_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + jlr = nvJitLinkComplete(jitlink_handle); + if (jlr != NVJITLINK_SUCCESS) + { + size_t log_size = 0; + nvJitLinkGetErrorLogSize(jitlink_handle, &log_size); + if (log_size > 1) + { + std::string log(log_size, '\0'); + nvJitLinkGetErrorLog(jitlink_handle, log.data()); + result.diagnostics += "\n" + log; + } + result.diagnostics += "\nnvJitLinkComplete failed"; + nvJitLinkDestroy(&jitlink_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + size_t cubin_size = 0; + nvJitLinkGetLinkedCubinSize(jitlink_handle, &cubin_size); + std::vector cubin_data(cubin_size); + nvJitLinkGetLinkedCubin(jitlink_handle, cubin_data.data()); + nvJitLinkDestroy(&jitlink_handle); + + // Store cubin in the result for inspection + result.cubin = cubin_data; + + std::string arch = std::to_string(config.sm_version); + const char* fatbin_options[] = {"-64", "-cuda"}; + nvFatbinHandle fatbin_handle = nullptr; + nvFatbinResult fbr = nvFatbinCreate(&fatbin_handle, fatbin_options, 2); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += 
std::string("\nnvFatbinCreate failed: ") + nvFatbinGetErrorString(fbr); + std::filesystem::remove_all(temp_dir); + return result; + } + + fbr = nvFatbinAddCubin(fatbin_handle, cubin_data.data(), cubin_data.size(), arch.c_str(), "device.cubin"); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinAddCubin failed: ") + nvFatbinGetErrorString(fbr); + nvFatbinDestroy(&fatbin_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + fbr = nvFatbinAddPTX(fatbin_handle, ptx_data.data(), ptx_data.size(), arch.c_str(), "device.ptx", nullptr); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinAddPTX failed: ") + nvFatbinGetErrorString(fbr); + nvFatbinDestroy(&fatbin_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + size_t fatbin_size = 0; + fbr = nvFatbinSize(fatbin_handle, &fatbin_size); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinSize failed: ") + nvFatbinGetErrorString(fbr); + nvFatbinDestroy(&fatbin_handle); + std::filesystem::remove_all(temp_dir); + return result; + } + + std::vector fatbin_data(fatbin_size); + fbr = nvFatbinGet(fatbin_handle, fatbin_data.data()); + nvFatbinDestroy(&fatbin_handle); + if (fbr != NVFATBIN_SUCCESS) + { + result.diagnostics += std::string("\nnvFatbinGet failed: ") + nvFatbinGetErrorString(fbr); + std::filesystem::remove_all(temp_dir); + return result; + } + + std::ofstream out(fatbin_file, std::ios::binary); + out.write(fatbin_data.data(), static_cast(fatbin_data.size())); + if (!out) + { + result.diagnostics += "\nFailed to write fatbin file"; + std::filesystem::remove_all(temp_dir); + return result; + } + } + + if (config.verbose) + { + result.diagnostics += "\n=== Host compilation ===\n"; + } + + if (!compileHostCode(source_code, input_file, fatbin_file, output_path, config, result.diagnostics)) + { + result.diagnostics += "\nHost compilation failed"; + std::filesystem::remove_all(temp_dir); + 
return result; + } + + std::filesystem::remove_all(temp_dir); + result.success = true; + return result; + } + + LinkResult linkToSharedLibrary( + const std::vector& object_files, const std::string& output_path, const CompilerConfig& config) + { + LinkResult result; + result.success = false; + result.library_path = output_path; + + if (object_files.empty()) + { + result.diagnostics = "No object files provided"; + return result; + } + + std::vector arg_strings; + +#ifdef _WIN32 + arg_strings.push_back("lld-link"); + arg_strings.push_back("/DLL"); + arg_strings.push_back("/NOENTRY"); + arg_strings.push_back("/NODEFAULTLIB"); + arg_strings.push_back("/OUT:" + output_path); + + // Generate import libraries from DLLs present on the system, + // so we don't require the Windows SDK or MSVC .lib files. + std::string implib_dir = std::filesystem::path(output_path).parent_path().string(); + + std::string cudart_dll = findCudartDllName(config.cuda_toolkit_path); + generateImportLib( + cudart_dll, + {"cudaMalloc", + "cudaFree", + "cudaMemcpy", + "cudaMemcpyAsync", + "cudaMemset", + "cudaMemsetAsync", + "cudaDeviceSynchronize", + "cudaGetDevice", + "cudaGetDeviceProperties", + "cudaGetLastError", + "cudaPeekAtLastError", + "cudaGetErrorString", + "cudaStreamCreate", + "cudaStreamDestroy", + "cudaStreamSynchronize", + "cudaEventCreate", + "cudaEventDestroy", + "cudaEventRecord", + "cudaEventSynchronize", + "cudaEventElapsedTime", + "cudaMallocAsync", + "cudaFreeAsync", + "cudaDeviceGetAttribute", + "cudaOccupancyMaxActiveBlocksPerMultiprocessor", + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + "cudaFuncGetAttributes", + "cudaLaunchKernel", + "cudaLaunchKernelExC", + "__cudaRegisterFatBinary", + "__cudaRegisterFatBinaryEnd", + "__cudaUnregisterFatBinary", + "__cudaRegisterFunction", + "__cudaRegisterVar", + "__cudaPushCallConfiguration", + "__cudaPopCallConfiguration"}, + implib_dir + "/cudart.lib"); + + generateImportLib( + "ucrtbase.dll", + {"malloc", + "free", + 
"calloc", + "realloc", + "_callnewh", + "_errno", + "abort", + "exit", + "_exit", + "_register_onexit_function", + "_crt_atexit", + "_initterm", + "_initterm_e", + "memcpy", + "memset", + "memmove", + "memcmp", + "strlen", + "strcmp", + "strncmp", + "_initialize_onexit_table", + "_execute_onexit_table", + "_register_thread_local_exe_atexit_callback"}, + implib_dir + "/ucrt.lib"); + + generateImportLib( + "vcruntime140.dll", + {"__std_exception_copy", + "__std_exception_destroy", + "__CxxFrameHandler3", + "_CxxThrowException", + "memcpy", + "memset", + "memmove", + "memcmp", + "__std_type_info_destroy_list", + "_purecall"}, + implib_dir + "/vcruntime.lib"); + + generateImportLib( + "kernel32.dll", + {"InitializeCriticalSection", + "EnterCriticalSection", + "LeaveCriticalSection", + "DeleteCriticalSection", + "InitOnceExecuteOnce", + "LoadLibraryExA", + "LoadLibraryExW", + "GetProcAddress", + "FreeLibrary", + "GetModuleHandleA", + "GetLastError", + "SetLastError", + "GetCurrentProcess", + "GetCurrentThread", + "GetCurrentThreadId", + "VirtualProtect", + "FlushInstructionCache", + "QueryPerformanceCounter", + "QueryPerformanceFrequency"}, + implib_dir + "/kernel32.lib"); + + arg_strings.push_back("/LIBPATH:" + implib_dir); + + for (const auto& obj_file : object_files) + { + arg_strings.push_back(obj_file); + } + + arg_strings.push_back("cudart.lib"); + arg_strings.push_back("ucrt.lib"); + arg_strings.push_back("vcruntime.lib"); + arg_strings.push_back("kernel32.lib"); +#else + arg_strings.push_back("ld.lld"); + arg_strings.push_back("-shared"); + arg_strings.push_back("--build-id"); + arg_strings.push_back("--eh-frame-hdr"); + arg_strings.push_back("-m"); + arg_strings.push_back("elf_x86_64"); + // Allow unresolved symbols — they will be satisfied at dlopen() time + // by libraries already loaded in the host process (libc, libstdc++, + // cudart, etc.). This removes the need for system CRT objects and + // dev packages on the target machine. 
+ arg_strings.push_back("--allow-shlib-undefined"); + arg_strings.push_back("-o"); + arg_strings.push_back(output_path); + + for (const auto& lib_path : config.library_paths) + { + arg_strings.push_back("-L" + lib_path); + // Embed the library path as RPATH so the dynamic linker can find + // libcudart.so.XX at dlopen time without LD_LIBRARY_PATH. + arg_strings.push_back("-rpath"); + arg_strings.push_back(lib_path); + } + + for (const auto& obj_file : object_files) + { + arg_strings.push_back(obj_file); + } + + // pip packages ship libcudart.so.XX without an unversioned symlink, + // so -lcudart won't work. Find the actual .so by scanning library_paths. + { + bool found_cudart = false; + for (const auto& lib_path : config.library_paths) + { + namespace fs = std::filesystem; + if (!fs::exists(lib_path)) + { + continue; + } + for (const auto& entry : fs::directory_iterator(lib_path)) + { + auto fname = entry.path().filename().string(); + if (fname.starts_with("libcudart.so")) + { + arg_strings.push_back(entry.path().string()); + found_cudart = true; + break; + } + } + if (found_cudart) + { + break; + } + } + if (!found_cudart) + { + arg_strings.push_back("-lcudart"); + } + } +#endif + + std::vector args; + for (const auto& arg : arg_strings) + { + args.push_back(arg.c_str()); + } + + std::string stdout_str, stderr_str; + llvm::raw_string_ostream stdout_os(stdout_str); + llvm::raw_string_ostream stderr_os(stderr_str); + +#ifdef _WIN32 + bool link_success = lld::coff::link(args, stdout_os, stderr_os, false, false); +#else + bool link_success = lld::elf::link(args, stdout_os, stderr_os, false, false); +#endif + + stdout_os.flush(); + stderr_os.flush(); + + if (!stdout_str.empty()) + { + result.diagnostics += stdout_str; + } + if (!stderr_str.empty()) + { + result.diagnostics += stderr_str; + } + + if (!link_success) + { + result.diagnostics += "\nLinking failed"; + return result; + } + + result.success = true; + return result; + } +}; + +CUDACompiler::CUDACompiler() + : 
impl_(new Impl()) +{} +CUDACompiler::~CUDACompiler() +{ + delete impl_; +} + +BitcodeResult CUDACompiler::compileToDeviceBitcode(const std::string& source_code, const CompilerConfig& config) +{ + return impl_->compileToDeviceBitcode(source_code, config); +} + +CompilationResult CUDACompiler::compileToObject( + const std::string& source_code, const std::string& output_path, const CompilerConfig& config) +{ + return impl_->compileToObject(source_code, output_path, config); +} + +LinkResult CUDACompiler::linkToSharedLibrary( + const std::vector& object_files, const std::string& output_path, const CompilerConfig& config) +{ + return impl_->linkToSharedLibrary(object_files, output_path, config); +} +} // namespace hostjit diff --git a/c/parallel/src/hostjit/config.cpp b/c/parallel/src/hostjit/config.cpp new file mode 100644 index 00000000000..2bea7d807f0 --- /dev/null +++ b/c/parallel/src/hostjit/config.cpp @@ -0,0 +1,161 @@ +#include +#include +#include + +#include + +#include + +namespace hostjit +{ +CompilerConfig detectDefaultConfig() +{ + CompilerConfig config; + + // Detect CUDA toolkit path + if (const char* env = std::getenv("CUDA_PATH")) + { + config.cuda_toolkit_path = env; + } + else if (const char* env = std::getenv("CUDA_HOME")) + { + config.cuda_toolkit_path = env; + } +#ifdef CUDA_TOOLKIT_PATH + else + { + config.cuda_toolkit_path = CUDA_TOOLKIT_PATH; + } +#endif + + // Set up library paths if CUDA toolkit was found + if (!config.cuda_toolkit_path.empty()) + { + std::filesystem::path lib64_path = std::filesystem::path(config.cuda_toolkit_path) / "lib64"; + std::filesystem::path lib_path = std::filesystem::path(config.cuda_toolkit_path) / "lib"; + + if (std::filesystem::exists(lib64_path)) + { + config.library_paths.push_back(lib64_path.string()); + } + else if (std::filesystem::exists(lib_path)) + { + config.library_paths.push_back(lib_path.string()); + } + } + + // Auto-detect GPU compute capability using CUDA runtime + int device = 0; + if 
(cudaGetDevice(&device) == cudaSuccess) + { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) + { + int detected_sm = prop.major * 10 + prop.minor; + if (detected_sm >= 75) + { + config.sm_version = detected_sm; + } + } + } + + if (config.sm_version == 0) + { + config.sm_version = 75; + } + + config.optimization_level = 2; + config.debug = false; + config.verbose = false; + + // Detect hostjit include path + if (const char* env = std::getenv("HOSTJIT_INCLUDE_PATH")) + { + config.hostjit_include_path = env; + } +#ifdef HOSTJIT_INCLUDE_DIR + else + { + config.hostjit_include_path = HOSTJIT_INCLUDE_DIR; + } +#endif + + return config; +} + +bool validateConfig(const CompilerConfig& config, std::string* error_message) +{ + if (config.cuda_toolkit_path.empty()) + { + if (error_message) + { + *error_message = "CUDA toolkit path not found. Please set CUDA_PATH or CUDA_HOME environment variable."; + } + return false; + } + + if (!std::filesystem::exists(config.cuda_toolkit_path)) + { + if (error_message) + { + *error_message = "CUDA toolkit path does not exist: " + config.cuda_toolkit_path; + } + return false; + } + + std::filesystem::path cuda_h = std::filesystem::path(config.cuda_toolkit_path) / "include" / "cuda.h"; + if (!std::filesystem::exists(cuda_h)) + { + if (error_message) + { + *error_message = "CUDA headers not found at: " + cuda_h.string(); + } + return false; + } + + for (const auto& include_path : config.include_paths) + { + if (!std::filesystem::exists(include_path)) + { + if (error_message) + { + *error_message = "Include path does not exist: " + include_path; + } + return false; + } + } + + for (const auto& library_path : config.library_paths) + { + if (!std::filesystem::exists(library_path)) + { + if (error_message) + { + *error_message = "Library path does not exist: " + library_path; + } + return false; + } + } + + if (config.sm_version < 30 || config.sm_version > 150) + { + if (error_message) + { + *error_message = 
"Invalid SM version: " + std::to_string(config.sm_version) + " (must be between 30 and 150)"; + } + return false; + } + + if (config.optimization_level < 0 || config.optimization_level > 3) + { + if (error_message) + { + *error_message = + "Invalid optimization level: " + std::to_string(config.optimization_level) + " (must be between 0 and 3)"; + } + return false; + } + + return true; +} +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/compiler.hpp b/c/parallel/src/hostjit/include/hostjit/compiler.hpp new file mode 100644 index 00000000000..69f7e140fcd --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/compiler.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +namespace hostjit +{ +struct CompilationResult +{ + bool success; + std::string object_file_path; // Path to generated .o file + std::string diagnostics; // Compiler messages + std::vector cubin; // Device cubin extracted during compilation +}; + +struct BitcodeResult +{ + bool success; + std::string bitcode; // LLVM bitcode bytes + std::string diagnostics; +}; + +struct LinkResult +{ + bool success; + std::string library_path; // Path to .so file + std::string diagnostics; +}; + +// Forward declaration to avoid including heavy Clang headers +struct CompilerConfig; + +class CUDACompiler +{ +public: + CUDACompiler(); + ~CUDACompiler(); + + // Compile CUDA device source to LLVM bitcode + BitcodeResult compileToDeviceBitcode(const std::string& source_code, const CompilerConfig& config); + + // Compile CUDA source code to object file + CompilationResult + compileToObject(const std::string& source_code, const std::string& output_path, const CompilerConfig& config); + + // Link object files to shared library + LinkResult linkToSharedLibrary( + const std::vector& object_files, const std::string& output_path, const CompilerConfig& config); + +private: + class Impl; + Impl* impl_; +}; +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/config.hpp 
b/c/parallel/src/hostjit/include/hostjit/config.hpp new file mode 100644 index 00000000000..2bc248e7369 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/config.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +namespace hostjit +{ +struct CompilerConfig +{ + std::string cuda_toolkit_path; + std::string hostjit_include_path; // Path to hostjit include directory (for minimal CUDA runtime) + std::string clang_headers_path; // Path to Clang's built-in CUDA headers (overrides CLANG_HEADERS_DIR) + std::string cccl_include_path; // Path to CCCL headers (overrides CCCL_SOURCE_DIR); contains cub/, thrust/, cuda/ + std::vector include_paths; + std::vector library_paths; + std::vector device_bitcode_files; // Paths to .bc files to link into device code + std::unordered_map macro_definitions; // key=macro name, value=macro value (empty for flag + // macros) + int sm_version = 70; + int optimization_level = 2; + bool debug = false; + bool verbose = false; + bool trace_includes = false; // Show all included headers during compilation (for debugging header search) + bool keep_artifacts = false; // Keep compiled artifacts for inspection (PTX, object files, etc.) 
+ std::string entry_point_name; // Name of the exported entry point function (used for post-link optimization) + bool enable_pch = false; // Cache precompiled headers on disk to speed up repeated builds +}; + +// Auto-detect CUDA toolkit and create default configuration +CompilerConfig detectDefaultConfig(); + +// Validate that the configuration is usable +bool validateConfig(const CompilerConfig& config, std::string* error_message = nullptr); +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_device_functions.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_device_functions.h new file mode 100644 index 00000000000..fbc5b44808b --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_device_functions.h @@ -0,0 +1,1716 @@ +/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__ +#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__ + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. 
+#pragma push_macro("__DEVICE__") +#define __DEVICE__ static __device__ __forceinline__ + +__DEVICE__ int __all(int __a) +{ + return __nvvm_vote_all(__a); +} +__DEVICE__ int __any(int __a) +{ + return __nvvm_vote_any(__a); +} +__DEVICE__ unsigned int __ballot(int __a) +{ + return __nvvm_vote_ballot(__a); +} +__DEVICE__ unsigned int __brev(unsigned int __a) +{ + return __nv_brev(__a); +} +__DEVICE__ unsigned long long __brevll(unsigned long long __a) +{ + return __nv_brevll(__a); +} +#if defined(__cplusplus) +__DEVICE__ void __brkpt() +{ + __asm__ __volatile__("brkpt;"); +} +__DEVICE__ void __brkpt(int __a) +{ + __brkpt(); +} +#else +__DEVICE__ void __attribute__((overloadable)) __brkpt(void) +{ + __asm__ __volatile__("brkpt;"); +} +__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) +{ + __brkpt(); +} +#endif +__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, unsigned int __c) +{ + return __nv_byte_perm(__a, __b, __c); +} +__DEVICE__ int __clz(int __a) +{ + return __nv_clz(__a); +} +__DEVICE__ int __clzll(long long __a) +{ + return __nv_clzll(__a); +} +__DEVICE__ float __cosf(float __a) +{ + return __nv_fast_cosf(__a); +} +__DEVICE__ double __dAtomicAdd(double* __p, double __v) +{ + return __nvvm_atom_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_block(double* __p, double __v) +{ + return __nvvm_atom_cta_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_system(double* __p, double __v) +{ + return __nvvm_atom_sys_add_gen_d(__p, __v); +} +__DEVICE__ double __dadd_rd(double __a, double __b) +{ + return __nv_dadd_rd(__a, __b); +} +__DEVICE__ double __dadd_rn(double __a, double __b) +{ + return __nv_dadd_rn(__a, __b); +} +__DEVICE__ double __dadd_ru(double __a, double __b) +{ + return __nv_dadd_ru(__a, __b); +} +__DEVICE__ double __dadd_rz(double __a, double __b) +{ + return __nv_dadd_rz(__a, __b); +} +__DEVICE__ double __ddiv_rd(double __a, double __b) +{ + return __nv_ddiv_rd(__a, __b); +} +__DEVICE__ double 
__ddiv_rn(double __a, double __b) +{ + return __nv_ddiv_rn(__a, __b); +} +__DEVICE__ double __ddiv_ru(double __a, double __b) +{ + return __nv_ddiv_ru(__a, __b); +} +__DEVICE__ double __ddiv_rz(double __a, double __b) +{ + return __nv_ddiv_rz(__a, __b); +} +__DEVICE__ double __dmul_rd(double __a, double __b) +{ + return __nv_dmul_rd(__a, __b); +} +__DEVICE__ double __dmul_rn(double __a, double __b) +{ + return __nv_dmul_rn(__a, __b); +} +__DEVICE__ double __dmul_ru(double __a, double __b) +{ + return __nv_dmul_ru(__a, __b); +} +__DEVICE__ double __dmul_rz(double __a, double __b) +{ + return __nv_dmul_rz(__a, __b); +} +__DEVICE__ float __double2float_rd(double __a) +{ + return __nv_double2float_rd(__a); +} +__DEVICE__ float __double2float_rn(double __a) +{ + return __nv_double2float_rn(__a); +} +__DEVICE__ float __double2float_ru(double __a) +{ + return __nv_double2float_ru(__a); +} +__DEVICE__ float __double2float_rz(double __a) +{ + return __nv_double2float_rz(__a); +} +__DEVICE__ int __double2hiint(double __a) +{ + return __nv_double2hiint(__a); +} +__DEVICE__ int __double2int_rd(double __a) +{ + return __nv_double2int_rd(__a); +} +__DEVICE__ int __double2int_rn(double __a) +{ + return __nv_double2int_rn(__a); +} +__DEVICE__ int __double2int_ru(double __a) +{ + return __nv_double2int_ru(__a); +} +__DEVICE__ int __double2int_rz(double __a) +{ + return __nv_double2int_rz(__a); +} +__DEVICE__ long long __double2ll_rd(double __a) +{ + return __nv_double2ll_rd(__a); +} +__DEVICE__ long long __double2ll_rn(double __a) +{ + return __nv_double2ll_rn(__a); +} +__DEVICE__ long long __double2ll_ru(double __a) +{ + return __nv_double2ll_ru(__a); +} +__DEVICE__ long long __double2ll_rz(double __a) +{ + return __nv_double2ll_rz(__a); +} +__DEVICE__ int __double2loint(double __a) +{ + return __nv_double2loint(__a); +} +__DEVICE__ unsigned int __double2uint_rd(double __a) +{ + return __nv_double2uint_rd(__a); +} +__DEVICE__ unsigned int __double2uint_rn(double __a) +{ + return 
__nv_double2uint_rn(__a); +} +__DEVICE__ unsigned int __double2uint_ru(double __a) +{ + return __nv_double2uint_ru(__a); +} +__DEVICE__ unsigned int __double2uint_rz(double __a) +{ + return __nv_double2uint_rz(__a); +} +__DEVICE__ unsigned long long __double2ull_rd(double __a) +{ + return __nv_double2ull_rd(__a); +} +__DEVICE__ unsigned long long __double2ull_rn(double __a) +{ + return __nv_double2ull_rn(__a); +} +__DEVICE__ unsigned long long __double2ull_ru(double __a) +{ + return __nv_double2ull_ru(__a); +} +__DEVICE__ unsigned long long __double2ull_rz(double __a) +{ + return __nv_double2ull_rz(__a); +} +__DEVICE__ long long __double_as_longlong(double __a) +{ + return __nv_double_as_longlong(__a); +} +__DEVICE__ double __drcp_rd(double __a) +{ + return __nv_drcp_rd(__a); +} +__DEVICE__ double __drcp_rn(double __a) +{ + return __nv_drcp_rn(__a); +} +__DEVICE__ double __drcp_ru(double __a) +{ + return __nv_drcp_ru(__a); +} +__DEVICE__ double __drcp_rz(double __a) +{ + return __nv_drcp_rz(__a); +} +__DEVICE__ double __dsqrt_rd(double __a) +{ + return __nv_dsqrt_rd(__a); +} +__DEVICE__ double __dsqrt_rn(double __a) +{ + return __nv_dsqrt_rn(__a); +} +__DEVICE__ double __dsqrt_ru(double __a) +{ + return __nv_dsqrt_ru(__a); +} +__DEVICE__ double __dsqrt_rz(double __a) +{ + return __nv_dsqrt_rz(__a); +} +__DEVICE__ double __dsub_rd(double __a, double __b) +{ + return __nv_dsub_rd(__a, __b); +} +__DEVICE__ double __dsub_rn(double __a, double __b) +{ + return __nv_dsub_rn(__a, __b); +} +__DEVICE__ double __dsub_ru(double __a, double __b) +{ + return __nv_dsub_ru(__a, __b); +} +__DEVICE__ double __dsub_rz(double __a, double __b) +{ + return __nv_dsub_rz(__a, __b); +} +__DEVICE__ float __exp10f(float __a) +{ + return __nv_fast_exp10f(__a); +} +__DEVICE__ float __expf(float __a) +{ + return __nv_fast_expf(__a); +} +__DEVICE__ float __fAtomicAdd(float* __p, float __v) +{ + return __nvvm_atom_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_block(float* __p, float 
__v) +{ + return __nvvm_atom_cta_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_system(float* __p, float __v) +{ + return __nvvm_atom_sys_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicExch(float* __p, float __v) +{ + return __nv_int_as_float(__nvvm_atom_xchg_gen_i((int*) __p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_block(float* __p, float __v) +{ + return __nv_int_as_float(__nvvm_atom_cta_xchg_gen_i((int*) __p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_system(float* __p, float __v) +{ + return __nv_int_as_float(__nvvm_atom_sys_xchg_gen_i((int*) __p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fadd_rd(float __a, float __b) +{ + return __nv_fadd_rd(__a, __b); +} +__DEVICE__ float __fadd_rn(float __a, float __b) +{ + return __nv_fadd_rn(__a, __b); +} +__DEVICE__ float __fadd_ru(float __a, float __b) +{ + return __nv_fadd_ru(__a, __b); +} +__DEVICE__ float __fadd_rz(float __a, float __b) +{ + return __nv_fadd_rz(__a, __b); +} +__DEVICE__ float __fdiv_rd(float __a, float __b) +{ + return __nv_fdiv_rd(__a, __b); +} +__DEVICE__ float __fdiv_rn(float __a, float __b) +{ + return __nv_fdiv_rn(__a, __b); +} +__DEVICE__ float __fdiv_ru(float __a, float __b) +{ + return __nv_fdiv_ru(__a, __b); +} +__DEVICE__ float __fdiv_rz(float __a, float __b) +{ + return __nv_fdiv_rz(__a, __b); +} +__DEVICE__ float __fdividef(float __a, float __b) +{ + return __nv_fast_fdividef(__a, __b); +} +__DEVICE__ int __ffs(int __a) +{ + return __nv_ffs(__a); +} +__DEVICE__ int __ffsll(long long __a) +{ + return __nv_ffsll(__a); +} +__DEVICE__ int __finite(double __a) +{ + return __nv_isfinited(__a); +} +__DEVICE__ int __finitef(float __a) +{ + return __nv_finitef(__a); +} +#ifdef _MSC_VER +__DEVICE__ int __finitel(long double __a); +#endif +__DEVICE__ int __float2int_rd(float __a) +{ + return __nv_float2int_rd(__a); +} +__DEVICE__ int __float2int_rn(float __a) +{ + return __nv_float2int_rn(__a); +} +__DEVICE__ int __float2int_ru(float __a) +{ + 
return __nv_float2int_ru(__a); +} +__DEVICE__ int __float2int_rz(float __a) +{ + return __nv_float2int_rz(__a); +} +__DEVICE__ long long __float2ll_rd(float __a) +{ + return __nv_float2ll_rd(__a); +} +__DEVICE__ long long __float2ll_rn(float __a) +{ + return __nv_float2ll_rn(__a); +} +__DEVICE__ long long __float2ll_ru(float __a) +{ + return __nv_float2ll_ru(__a); +} +__DEVICE__ long long __float2ll_rz(float __a) +{ + return __nv_float2ll_rz(__a); +} +__DEVICE__ unsigned int __float2uint_rd(float __a) +{ + return __nv_float2uint_rd(__a); +} +__DEVICE__ unsigned int __float2uint_rn(float __a) +{ + return __nv_float2uint_rn(__a); +} +__DEVICE__ unsigned int __float2uint_ru(float __a) +{ + return __nv_float2uint_ru(__a); +} +__DEVICE__ unsigned int __float2uint_rz(float __a) +{ + return __nv_float2uint_rz(__a); +} +__DEVICE__ unsigned long long __float2ull_rd(float __a) +{ + return __nv_float2ull_rd(__a); +} +__DEVICE__ unsigned long long __float2ull_rn(float __a) +{ + return __nv_float2ull_rn(__a); +} +__DEVICE__ unsigned long long __float2ull_ru(float __a) +{ + return __nv_float2ull_ru(__a); +} +__DEVICE__ unsigned long long __float2ull_rz(float __a) +{ + return __nv_float2ull_rz(__a); +} +__DEVICE__ int __float_as_int(float __a) +{ + return __nv_float_as_int(__a); +} +__DEVICE__ unsigned int __float_as_uint(float __a) +{ + return __nv_float_as_uint(__a); +} +__DEVICE__ double __fma_rd(double __a, double __b, double __c) +{ + return __nv_fma_rd(__a, __b, __c); +} +__DEVICE__ double __fma_rn(double __a, double __b, double __c) +{ + return __nv_fma_rn(__a, __b, __c); +} +__DEVICE__ double __fma_ru(double __a, double __b, double __c) +{ + return __nv_fma_ru(__a, __b, __c); +} +__DEVICE__ double __fma_rz(double __a, double __b, double __c) +{ + return __nv_fma_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) +{ + return __nv_fmaf_ieee_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) +{ + 
return __nv_fmaf_ieee_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) +{ + return __nv_fmaf_ieee_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) +{ + return __nv_fmaf_ieee_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) +{ + return __nv_fmaf_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) +{ + return __nv_fmaf_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) +{ + return __nv_fmaf_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) +{ + return __nv_fmaf_rz(__a, __b, __c); +} +__DEVICE__ float __fmul_rd(float __a, float __b) +{ + return __nv_fmul_rd(__a, __b); +} +__DEVICE__ float __fmul_rn(float __a, float __b) +{ + return __nv_fmul_rn(__a, __b); +} +__DEVICE__ float __fmul_ru(float __a, float __b) +{ + return __nv_fmul_ru(__a, __b); +} +__DEVICE__ float __fmul_rz(float __a, float __b) +{ + return __nv_fmul_rz(__a, __b); +} +__DEVICE__ float __frcp_rd(float __a) +{ + return __nv_frcp_rd(__a); +} +__DEVICE__ float __frcp_rn(float __a) +{ + return __nv_frcp_rn(__a); +} +__DEVICE__ float __frcp_ru(float __a) +{ + return __nv_frcp_ru(__a); +} +__DEVICE__ float __frcp_rz(float __a) +{ + return __nv_frcp_rz(__a); +} +__DEVICE__ float __frsqrt_rn(float __a) +{ + return __nv_frsqrt_rn(__a); +} +__DEVICE__ float __fsqrt_rd(float __a) +{ + return __nv_fsqrt_rd(__a); +} +__DEVICE__ float __fsqrt_rn(float __a) +{ + return __nv_fsqrt_rn(__a); +} +__DEVICE__ float __fsqrt_ru(float __a) +{ + return __nv_fsqrt_ru(__a); +} +__DEVICE__ float __fsqrt_rz(float __a) +{ + return __nv_fsqrt_rz(__a); +} +__DEVICE__ float __fsub_rd(float __a, float __b) +{ + return __nv_fsub_rd(__a, __b); +} +__DEVICE__ float __fsub_rn(float __a, float __b) +{ + return __nv_fsub_rn(__a, __b); +} +__DEVICE__ float __fsub_ru(float __a, float __b) +{ + return __nv_fsub_ru(__a, __b); +} +__DEVICE__ 
float __fsub_rz(float __a, float __b) +{ + return __nv_fsub_rz(__a, __b); +} +__DEVICE__ int __hadd(int __a, int __b) +{ + return __nv_hadd(__a, __b); +} +__DEVICE__ double __hiloint2double(int __a, int __b) +{ + return __nv_hiloint2double(__a, __b); +} +__DEVICE__ int __iAtomicAdd(int* __p, int __v) +{ + return __nvvm_atom_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_block(int* __p, int __v) +{ + return __nvvm_atom_cta_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_system(int* __p, int __v) +{ + return __nvvm_atom_sys_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd(int* __p, int __v) +{ + return __nvvm_atom_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_block(int* __p, int __v) +{ + return __nvvm_atom_cta_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_system(int* __p, int __v) +{ + return __nvvm_atom_sys_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicCAS(int* __p, int __cmp, int __v) +{ + return __nvvm_atom_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_block(int* __p, int __cmp, int __v) +{ + return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_system(int* __p, int __cmp, int __v) +{ + return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicExch(int* __p, int __v) +{ + return __nvvm_atom_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_block(int* __p, int __v) +{ + return __nvvm_atom_cta_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_system(int* __p, int __v) +{ + return __nvvm_atom_sys_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax(int* __p, int __v) +{ + return __nvvm_atom_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_block(int* __p, int __v) +{ + return __nvvm_atom_cta_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_system(int* __p, int __v) +{ + return __nvvm_atom_sys_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin(int* __p, int __v) +{ + return __nvvm_atom_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_block(int* __p, 
int __v) +{ + return __nvvm_atom_cta_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_system(int* __p, int __v) +{ + return __nvvm_atom_sys_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr(int* __p, int __v) +{ + return __nvvm_atom_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_block(int* __p, int __v) +{ + return __nvvm_atom_cta_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_system(int* __p, int __v) +{ + return __nvvm_atom_sys_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor(int* __p, int __v) +{ + return __nvvm_atom_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_block(int* __p, int __v) +{ + return __nvvm_atom_cta_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_system(int* __p, int __v) +{ + return __nvvm_atom_sys_xor_gen_i(__p, __v); +} +__DEVICE__ long long __illAtomicMax(long long* __p, long long __v) +{ + return __nvvm_atom_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin(long long* __p, long long __v) +{ + return __nvvm_atom_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_min_gen_ll(__p, __v); +} +__DEVICE__ double __int2double_rn(int __a) +{ + return __nv_int2double_rn(__a); +} +__DEVICE__ float __int2float_rd(int __a) +{ + return __nv_int2float_rd(__a); +} +__DEVICE__ float __int2float_rn(int __a) +{ + return __nv_int2float_rn(__a); +} +__DEVICE__ float __int2float_ru(int __a) +{ + return __nv_int2float_ru(__a); +} +__DEVICE__ float __int2float_rz(int __a) +{ + return __nv_int2float_rz(__a); +} +__DEVICE__ float __int_as_float(int __a) +{ + return 
__nv_int_as_float(__a); +} +__DEVICE__ int __isfinited(double __a) +{ + return __nv_isfinited(__a); +} +__DEVICE__ int __isinf(double __a) +{ + return __nv_isinfd(__a); +} +__DEVICE__ int __isinff(float __a) +{ + return __nv_isinff(__a); +} +#ifdef _MSC_VER +__DEVICE__ int __isinfl(long double __a); +#endif +__DEVICE__ int __isnan(double __a) +{ + return __nv_isnand(__a); +} +__DEVICE__ int __isnanf(float __a) +{ + return __nv_isnanf(__a); +} +#ifdef _MSC_VER +__DEVICE__ int __isnanl(long double __a); +#endif +__DEVICE__ double __ll2double_rd(long long __a) +{ + return __nv_ll2double_rd(__a); +} +__DEVICE__ double __ll2double_rn(long long __a) +{ + return __nv_ll2double_rn(__a); +} +__DEVICE__ double __ll2double_ru(long long __a) +{ + return __nv_ll2double_ru(__a); +} +__DEVICE__ double __ll2double_rz(long long __a) +{ + return __nv_ll2double_rz(__a); +} +__DEVICE__ float __ll2float_rd(long long __a) +{ + return __nv_ll2float_rd(__a); +} +__DEVICE__ float __ll2float_rn(long long __a) +{ + return __nv_ll2float_rn(__a); +} +__DEVICE__ float __ll2float_ru(long long __a) +{ + return __nv_ll2float_ru(__a); +} +__DEVICE__ float __ll2float_rz(long long __a) +{ + return __nv_ll2float_rz(__a); +} +__DEVICE__ long long __llAtomicAnd(long long* __p, long long __v) +{ + return __nvvm_atom_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr(long long* __p, long long __v) +{ + return __nvvm_atom_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor(long long* __p, long 
long __v) +{ + return __nvvm_atom_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_block(long long* __p, long long __v) +{ + return __nvvm_atom_cta_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_system(long long* __p, long long __v) +{ + return __nvvm_atom_sys_xor_gen_ll(__p, __v); +} +__DEVICE__ float __log10f(float __a) +{ + return __nv_fast_log10f(__a); +} +__DEVICE__ float __log2f(float __a) +{ + return __nv_fast_log2f(__a); +} +__DEVICE__ float __logf(float __a) +{ + return __nv_fast_logf(__a); +} +__DEVICE__ double __longlong_as_double(long long __a) +{ + return __nv_longlong_as_double(__a); +} +__DEVICE__ int __mul24(int __a, int __b) +{ + return __nv_mul24(__a, __b); +} +__DEVICE__ long long __mul64hi(long long __a, long long __b) +{ + return __nv_mul64hi(__a, __b); +} +__DEVICE__ int __mulhi(int __a, int __b) +{ + return __nv_mulhi(__a, __b); +} +__DEVICE__ unsigned int __pm0(void) +{ + return __nvvm_read_ptx_sreg_pm0(); +} +__DEVICE__ unsigned int __pm1(void) +{ + return __nvvm_read_ptx_sreg_pm1(); +} +__DEVICE__ unsigned int __pm2(void) +{ + return __nvvm_read_ptx_sreg_pm2(); +} +__DEVICE__ unsigned int __pm3(void) +{ + return __nvvm_read_ptx_sreg_pm3(); +} +__DEVICE__ int __popc(unsigned int __a) +{ + return __nv_popc(__a); +} +__DEVICE__ int __popcll(unsigned long long __a) +{ + return __nv_popcll(__a); +} +__DEVICE__ float __powf(float __a, float __b) +{ + return __nv_fast_powf(__a, __b); +} + +// Parameter must have a known integer value. 
+#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a)) +__DEVICE__ int __rhadd(int __a, int __b) +{ + return __nv_rhadd(__a, __b); +} +__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) +{ + return __nv_sad(__a, __b, __c); +} +__DEVICE__ float __saturatef(float __a) +{ + return __nv_saturatef(__a); +} +__DEVICE__ int __signbitd(double __a) +{ + return __nv_signbitd(__a); +} +__DEVICE__ int __signbitf(float __a) +{ + return __nv_signbitf(__a); +} +__DEVICE__ void __sincosf(float __a, float* __s, float* __c) +{ + return __nv_fast_sincosf(__a, __s, __c); +} +__DEVICE__ float __sinf(float __a) +{ + return __nv_fast_sinf(__a); +} +__DEVICE__ int __syncthreads_and(int __a) +{ + return __nvvm_bar0_and(__a); +} +__DEVICE__ int __syncthreads_count(int __a) +{ + return __nvvm_bar0_popc(__a); +} +__DEVICE__ int __syncthreads_or(int __a) +{ + return __nvvm_bar0_or(__a); +} +__DEVICE__ float __tanf(float __a) +{ + return __nv_fast_tanf(__a); +} +__DEVICE__ void __threadfence(void) +{ + __nvvm_membar_gl(); +} +__DEVICE__ void __threadfence_block(void) +{ + __nvvm_membar_cta(); +}; +__DEVICE__ void __threadfence_system(void) +{ + __nvvm_membar_sys(); +}; +__DEVICE__ __attribute__((noreturn)) void __trap(void) +{ + __builtin_trap(); +} +__DEVICE__ unsigned short __usAtomicCAS(unsigned short* __p, unsigned short __cmp, unsigned short __v) +{ + return __nvvm_atom_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short* __p, unsigned short __cmp, unsigned short __v) +{ + return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short* __p, unsigned short __cmp, unsigned short __v) +{ + return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicAdd(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_add_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int* __p, unsigned int __v) +{ + 
return __nvvm_atom_cta_add_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_add_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_and_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_and_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_and_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicCAS(unsigned int* __p, unsigned int __cmp, unsigned int __v) +{ + return __nvvm_atom_cas_gen_i((int*) __p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicCAS_block(unsigned int* __p, unsigned int __cmp, unsigned int __v) +{ + return __nvvm_atom_cta_cas_gen_i((int*) __p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicCAS_system(unsigned int* __p, unsigned int __cmp, unsigned int __v) +{ + return __nvvm_atom_sys_cas_gen_i((int*) __p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicDec(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_xchg_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_xchg_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_xchg_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicInc(unsigned int* __p, unsigned int __v) +{ + return 
__nvvm_atom_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_or_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_or_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_or_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicXor(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_xor_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_block(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_cta_xor_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_system(unsigned int* __p, unsigned int __v) +{ + return __nvvm_atom_sys_xor_gen_i((int*) __p, __v); +} +__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) +{ + return 
__nv_uhadd(__a, __b); +} +__DEVICE__ double __uint2double_rn(unsigned int __a) +{ + return __nv_uint2double_rn(__a); +} +__DEVICE__ float __uint2float_rd(unsigned int __a) +{ + return __nv_uint2float_rd(__a); +} +__DEVICE__ float __uint2float_rn(unsigned int __a) +{ + return __nv_uint2float_rn(__a); +} +__DEVICE__ float __uint2float_ru(unsigned int __a) +{ + return __nv_uint2float_ru(__a); +} +__DEVICE__ float __uint2float_rz(unsigned int __a) +{ + return __nv_uint2float_rz(__a); +} +__DEVICE__ float __uint_as_float(unsigned int __a) +{ + return __nv_uint_as_float(__a); +} // +__DEVICE__ double __ull2double_rd(unsigned long long __a) +{ + return __nv_ull2double_rd(__a); +} +__DEVICE__ double __ull2double_rn(unsigned long long __a) +{ + return __nv_ull2double_rn(__a); +} +__DEVICE__ double __ull2double_ru(unsigned long long __a) +{ + return __nv_ull2double_ru(__a); +} +__DEVICE__ double __ull2double_rz(unsigned long long __a) +{ + return __nv_ull2double_rz(__a); +} +__DEVICE__ float __ull2float_rd(unsigned long long __a) +{ + return __nv_ull2float_rd(__a); +} +__DEVICE__ float __ull2float_rn(unsigned long long __a) +{ + return __nv_ull2float_rn(__a); +} +__DEVICE__ float __ull2float_ru(unsigned long long __a) +{ + return __nv_ull2float_ru(__a); +} +__DEVICE__ float __ull2float_rz(unsigned long long __a) +{ + return __nv_ull2float_rz(__a); +} +__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_add_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_add_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_add_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_and_gen_ll((long long*) 
__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_and_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_and_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long* __p, unsigned long long __cmp, unsigned long long __v) +{ + return __nvvm_atom_cas_gen_ll((long long*) __p, __cmp, __v); +} +__DEVICE__ unsigned long long +__ullAtomicCAS_block(unsigned long long* __p, unsigned long long __cmp, unsigned long long __v) +{ + return __nvvm_atom_cta_cas_gen_ll((long long*) __p, __cmp, __v); +} +__DEVICE__ unsigned long long +__ullAtomicCAS_system(unsigned long long* __p, unsigned long long __cmp, unsigned long long __v) +{ + return __nvvm_atom_sys_cas_gen_ll((long long*) __p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_xchg_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_xchg_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_xchg_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long* __p, unsigned long long __v) +{ + 
return __nvvm_atom_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_or_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_or_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_or_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_xor_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_cta_xor_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long* __p, unsigned long long __v) +{ + return __nvvm_atom_sys_xor_gen_ll((long long*) __p, __v); +} +__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) +{ + return __nv_umul24(__a, __b); +} +__DEVICE__ unsigned long long __umul64hi(unsigned long long __a, unsigned long long __b) +{ + return __nv_umul64hi(__a, __b); +} +__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) +{ + return __nv_umulhi(__a, __b); +} +__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) +{ + return __nv_urhadd(__a, __b); +} +__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b, unsigned int __c) +{ + return __nv_usad(__a, __b, __c); +} + +// CUDA no longer provides inline assembly (or bitcode) implementation of these 
+// functions, so we have to reimplement them. The implementation is naive and is +// not optimized for performance. + +// Helper function to convert N-bit boolean subfields into all-0 or all-1. +// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00 +// __bool2mask(0x00010000,16) -> 0xffff0000 +__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) +{ + return (__a << shift) - __a; +} +__DEVICE__ unsigned int __vabs2(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabs4(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + 
__asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), 
"r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vseteq2(__a, __b), 16); +} +__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vseteq4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetges2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetges4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgeu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgeu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) +{ + 
return __bool2mask(__vsetgts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgtu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetgtu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetles2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetles4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetleu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetleu4(unsigned int 
__a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetleu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetlts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetlts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetltu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetltu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetne2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : 
"r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) +{ + return __bool2mask(__vsetne4(__a, __b), 8); +} + +// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086 +// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) => +// (a + b) / 2 = ((a ^ b) >> 1) + (a & b) +// To operate on multiple sub-elements we need to make sure to mask out bits +// that crossed over into adjacent elements during the shift. +__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) +{ + return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b); +} +__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) +{ + return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); +} + +__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + if ((__a & 0x8000) && (__b & 0x8000)) + { + // Work around a bug in ptxas which produces invalid result if low element + // is negative. + unsigned mask = __vcmpgts2(__a, __b); + r = (__a & mask) | (__b & ~mask); + } + else + { + __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + } + return r; +} +__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins4(unsigned int __a, 
unsigned int __b) +{ + unsigned int r; + __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg2(unsigned int __a) +{ + return __vsub2(0, __a); +} + +__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg4(unsigned int __a) +{ + return __vsub4(0, __a); +} +__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" : 
"=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss2(unsigned int __a) +{ + return __vsubss2(0, __a); +} +__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss4(unsigned int __a) +{ + return __vsubss4(0, __a); +} +__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) +{ + unsigned int r; + __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ /* clock_t= */ int clock() +{ + return __nvvm_read_ptx_sreg_clock(); +} +__DEVICE__ long long clock64() +{ + return __nvvm_read_ptx_sreg_clock64(); +} + +__DEVICE__ void* memcpy(void* __a, const void* __b, size_t __c) +{ + return __builtin_memcpy(__a, __b, __c); +} +__DEVICE__ void* memset(void* __a, int __b, size_t __c) +{ + return __builtin_memset(__a, __b, __c); +} + +#pragma pop_macro("__DEVICE__") +#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_libdevice_declares.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_libdevice_declares.h new file mode 100644 index 00000000000..f83983e7d13 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_libdevice_declares.h @@ -0,0 +1,371 @@ +/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ + +#if defined(__cplusplus) +extern "C" { +#endif + +#define __DEVICE__ __device__ + +__DEVICE__ int __nv_abs(int __a); +__DEVICE__ double __nv_acos(double __a); +__DEVICE__ float __nv_acosf(float __a); +__DEVICE__ double __nv_acosh(double __a); +__DEVICE__ float __nv_acoshf(float __a); +__DEVICE__ double __nv_asin(double __a); +__DEVICE__ float __nv_asinf(float __a); +__DEVICE__ double __nv_asinh(double __a); +__DEVICE__ float __nv_asinhf(float __a); +__DEVICE__ double __nv_atan2(double __a, double __b); +__DEVICE__ float __nv_atan2f(float __a, float __b); +__DEVICE__ double __nv_atan(double __a); +__DEVICE__ float __nv_atanf(float __a); +__DEVICE__ double __nv_atanh(double __a); +__DEVICE__ float __nv_atanhf(float __a); +__DEVICE__ int __nv_brev(int __a); +__DEVICE__ long long __nv_brevll(long long __a); +__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c); +__DEVICE__ double __nv_cbrt(double __a); +__DEVICE__ float __nv_cbrtf(float __a); +__DEVICE__ double __nv_ceil(double __a); +__DEVICE__ float __nv_ceilf(float __a); +__DEVICE__ int __nv_clz(int __a); +__DEVICE__ int __nv_clzll(long long __a); +__DEVICE__ double __nv_copysign(double __a, double __b); +__DEVICE__ float __nv_copysignf(float __a, float __b); +__DEVICE__ double __nv_cos(double __a); +__DEVICE__ float __nv_cosf(float __a); +__DEVICE__ double __nv_cosh(double __a); +__DEVICE__ float __nv_coshf(float __a); +__DEVICE__ double __nv_cospi(double __a); +__DEVICE__ float __nv_cospif(float __a); +__DEVICE__ double __nv_cyl_bessel_i0(double __a); +__DEVICE__ float __nv_cyl_bessel_i0f(float __a); +__DEVICE__ double __nv_cyl_bessel_i1(double __a); +__DEVICE__ float __nv_cyl_bessel_i1f(float __a); +__DEVICE__ double __nv_dadd_rd(double __a, double __b); +__DEVICE__ 
double __nv_dadd_rn(double __a, double __b); +__DEVICE__ double __nv_dadd_ru(double __a, double __b); +__DEVICE__ double __nv_dadd_rz(double __a, double __b); +__DEVICE__ double __nv_ddiv_rd(double __a, double __b); +__DEVICE__ double __nv_ddiv_rn(double __a, double __b); +__DEVICE__ double __nv_ddiv_ru(double __a, double __b); +__DEVICE__ double __nv_ddiv_rz(double __a, double __b); +__DEVICE__ double __nv_dmul_rd(double __a, double __b); +__DEVICE__ double __nv_dmul_rn(double __a, double __b); +__DEVICE__ double __nv_dmul_ru(double __a, double __b); +__DEVICE__ double __nv_dmul_rz(double __a, double __b); +__DEVICE__ float __nv_double2float_rd(double __a); +__DEVICE__ float __nv_double2float_rn(double __a); +__DEVICE__ float __nv_double2float_ru(double __a); +__DEVICE__ float __nv_double2float_rz(double __a); +__DEVICE__ int __nv_double2hiint(double __a); +__DEVICE__ int __nv_double2int_rd(double __a); +__DEVICE__ int __nv_double2int_rn(double __a); +__DEVICE__ int __nv_double2int_ru(double __a); +__DEVICE__ int __nv_double2int_rz(double __a); +__DEVICE__ long long __nv_double2ll_rd(double __a); +__DEVICE__ long long __nv_double2ll_rn(double __a); +__DEVICE__ long long __nv_double2ll_ru(double __a); +__DEVICE__ long long __nv_double2ll_rz(double __a); +__DEVICE__ int __nv_double2loint(double __a); +__DEVICE__ unsigned int __nv_double2uint_rd(double __a); +__DEVICE__ unsigned int __nv_double2uint_rn(double __a); +__DEVICE__ unsigned int __nv_double2uint_ru(double __a); +__DEVICE__ unsigned int __nv_double2uint_rz(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rd(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rn(double __a); +__DEVICE__ unsigned long long __nv_double2ull_ru(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rz(double __a); +__DEVICE__ unsigned long long __nv_double_as_longlong(double __a); +__DEVICE__ double __nv_drcp_rd(double __a); +__DEVICE__ double __nv_drcp_rn(double __a); +__DEVICE__ double 
__nv_drcp_ru(double __a); +__DEVICE__ double __nv_drcp_rz(double __a); +__DEVICE__ double __nv_dsqrt_rd(double __a); +__DEVICE__ double __nv_dsqrt_rn(double __a); +__DEVICE__ double __nv_dsqrt_ru(double __a); +__DEVICE__ double __nv_dsqrt_rz(double __a); +__DEVICE__ double __nv_dsub_rd(double __a, double __b); +__DEVICE__ double __nv_dsub_rn(double __a, double __b); +__DEVICE__ double __nv_dsub_ru(double __a, double __b); +__DEVICE__ double __nv_dsub_rz(double __a, double __b); +__DEVICE__ double __nv_erfc(double __a); +__DEVICE__ float __nv_erfcf(float __a); +__DEVICE__ double __nv_erfcinv(double __a); +__DEVICE__ float __nv_erfcinvf(float __a); +__DEVICE__ double __nv_erfcx(double __a); +__DEVICE__ float __nv_erfcxf(float __a); +__DEVICE__ double __nv_erf(double __a); +__DEVICE__ float __nv_erff(float __a); +__DEVICE__ double __nv_erfinv(double __a); +__DEVICE__ float __nv_erfinvf(float __a); +__DEVICE__ double __nv_exp10(double __a); +__DEVICE__ float __nv_exp10f(float __a); +__DEVICE__ double __nv_exp2(double __a); +__DEVICE__ float __nv_exp2f(float __a); +__DEVICE__ double __nv_exp(double __a); +__DEVICE__ float __nv_expf(float __a); +__DEVICE__ double __nv_expm1(double __a); +__DEVICE__ float __nv_expm1f(float __a); +__DEVICE__ double __nv_fabs(double __a); +__DEVICE__ float __nv_fabsf(float __a); +__DEVICE__ float __nv_fadd_rd(float __a, float __b); +__DEVICE__ float __nv_fadd_rn(float __a, float __b); +__DEVICE__ float __nv_fadd_ru(float __a, float __b); +__DEVICE__ float __nv_fadd_rz(float __a, float __b); +__DEVICE__ float __nv_fast_cosf(float __a); +__DEVICE__ float __nv_fast_exp10f(float __a); +__DEVICE__ float __nv_fast_expf(float __a); +__DEVICE__ float __nv_fast_fdividef(float __a, float __b); +__DEVICE__ float __nv_fast_log10f(float __a); +__DEVICE__ float __nv_fast_log2f(float __a); +__DEVICE__ float __nv_fast_logf(float __a); +__DEVICE__ float __nv_fast_powf(float __a, float __b); +__DEVICE__ void __nv_fast_sincosf(float __a, float* __s, float* 
__c); +__DEVICE__ float __nv_fast_sinf(float __a); +__DEVICE__ float __nv_fast_tanf(float __a); +__DEVICE__ double __nv_fdim(double __a, double __b); +__DEVICE__ float __nv_fdimf(float __a, float __b); +__DEVICE__ float __nv_fdiv_rd(float __a, float __b); +__DEVICE__ float __nv_fdiv_rn(float __a, float __b); +__DEVICE__ float __nv_fdiv_ru(float __a, float __b); +__DEVICE__ float __nv_fdiv_rz(float __a, float __b); +__DEVICE__ int __nv_ffs(int __a); +__DEVICE__ int __nv_ffsll(long long __a); +__DEVICE__ int __nv_finitef(float __a); +__DEVICE__ unsigned short __nv_float2half_rn(float __a); +__DEVICE__ int __nv_float2int_rd(float __a); +__DEVICE__ int __nv_float2int_rn(float __a); +__DEVICE__ int __nv_float2int_ru(float __a); +__DEVICE__ int __nv_float2int_rz(float __a); +__DEVICE__ long long __nv_float2ll_rd(float __a); +__DEVICE__ long long __nv_float2ll_rn(float __a); +__DEVICE__ long long __nv_float2ll_ru(float __a); +__DEVICE__ long long __nv_float2ll_rz(float __a); +__DEVICE__ unsigned int __nv_float2uint_rd(float __a); +__DEVICE__ unsigned int __nv_float2uint_rn(float __a); +__DEVICE__ unsigned int __nv_float2uint_ru(float __a); +__DEVICE__ unsigned int __nv_float2uint_rz(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rd(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rn(float __a); +__DEVICE__ unsigned long long __nv_float2ull_ru(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rz(float __a); +__DEVICE__ int __nv_float_as_int(float __a); +__DEVICE__ unsigned int __nv_float_as_uint(float __a); +__DEVICE__ double __nv_floor(double __a); +__DEVICE__ float __nv_floorf(float __a); +__DEVICE__ double __nv_fma(double __a, double __b, double __c); +__DEVICE__ float __nv_fmaf(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); +__DEVICE__ float 
__nv_fmaf_ieee_rz(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c); +__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c); +__DEVICE__ double __nv_fmax(double __a, double __b); +__DEVICE__ float __nv_fmaxf(float __a, float __b); +__DEVICE__ double __nv_fmin(double __a, double __b); +__DEVICE__ float __nv_fminf(float __a, float __b); +__DEVICE__ double __nv_fmod(double __a, double __b); +__DEVICE__ float __nv_fmodf(float __a, float __b); +__DEVICE__ float __nv_fmul_rd(float __a, float __b); +__DEVICE__ float __nv_fmul_rn(float __a, float __b); +__DEVICE__ float __nv_fmul_ru(float __a, float __b); +__DEVICE__ float __nv_fmul_rz(float __a, float __b); +__DEVICE__ float __nv_frcp_rd(float __a); +__DEVICE__ float __nv_frcp_rn(float __a); +__DEVICE__ float __nv_frcp_ru(float __a); +__DEVICE__ float __nv_frcp_rz(float __a); +__DEVICE__ double __nv_frexp(double __a, int* __b); +__DEVICE__ float __nv_frexpf(float __a, int* __b); +__DEVICE__ float __nv_frsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_rd(float __a); +__DEVICE__ float __nv_fsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_ru(float __a); +__DEVICE__ float __nv_fsqrt_rz(float __a); +__DEVICE__ float __nv_fsub_rd(float __a, float __b); +__DEVICE__ float __nv_fsub_rn(float __a, float __b); +__DEVICE__ float __nv_fsub_ru(float __a, float __b); +__DEVICE__ float __nv_fsub_rz(float __a, float __b); +__DEVICE__ int __nv_hadd(int __a, int __b); +__DEVICE__ float __nv_half2float(unsigned short __h); +__DEVICE__ double __nv_hiloint2double(int __a, int __b); 
+__DEVICE__ double __nv_hypot(double __a, double __b); +__DEVICE__ float __nv_hypotf(float __a, float __b); +__DEVICE__ int __nv_ilogb(double __a); +__DEVICE__ int __nv_ilogbf(float __a); +__DEVICE__ double __nv_int2double_rn(int __a); +__DEVICE__ float __nv_int2float_rd(int __a); +__DEVICE__ float __nv_int2float_rn(int __a); +__DEVICE__ float __nv_int2float_ru(int __a); +__DEVICE__ float __nv_int2float_rz(int __a); +__DEVICE__ float __nv_int_as_float(int __a); +__DEVICE__ int __nv_isfinited(double __a); +__DEVICE__ int __nv_isinfd(double __a); +__DEVICE__ int __nv_isinff(float __a); +__DEVICE__ int __nv_isnand(double __a); +__DEVICE__ int __nv_isnanf(float __a); +__DEVICE__ double __nv_j0(double __a); +__DEVICE__ float __nv_j0f(float __a); +__DEVICE__ double __nv_j1(double __a); +__DEVICE__ float __nv_j1f(float __a); +__DEVICE__ float __nv_jnf(int __a, float __b); +__DEVICE__ double __nv_jn(int __a, double __b); +__DEVICE__ double __nv_ldexp(double __a, int __b); +__DEVICE__ float __nv_ldexpf(float __a, int __b); +__DEVICE__ double __nv_lgamma(double __a); +__DEVICE__ float __nv_lgammaf(float __a); +__DEVICE__ double __nv_ll2double_rd(long long __a); +__DEVICE__ double __nv_ll2double_rn(long long __a); +__DEVICE__ double __nv_ll2double_ru(long long __a); +__DEVICE__ double __nv_ll2double_rz(long long __a); +__DEVICE__ float __nv_ll2float_rd(long long __a); +__DEVICE__ float __nv_ll2float_rn(long long __a); +__DEVICE__ float __nv_ll2float_ru(long long __a); +__DEVICE__ float __nv_ll2float_rz(long long __a); +__DEVICE__ long long __nv_llabs(long long __a); +__DEVICE__ long long __nv_llmax(long long __a, long long __b); +__DEVICE__ long long __nv_llmin(long long __a, long long __b); +__DEVICE__ long long __nv_llrint(double __a); +__DEVICE__ long long __nv_llrintf(float __a); +__DEVICE__ long long __nv_llround(double __a); +__DEVICE__ long long __nv_llroundf(float __a); +__DEVICE__ double __nv_log10(double __a); +__DEVICE__ float __nv_log10f(float __a); +__DEVICE__ 
double __nv_log1p(double __a); +__DEVICE__ float __nv_log1pf(float __a); +__DEVICE__ double __nv_log2(double __a); +__DEVICE__ float __nv_log2f(float __a); +__DEVICE__ double __nv_logb(double __a); +__DEVICE__ float __nv_logbf(float __a); +__DEVICE__ double __nv_log(double __a); +__DEVICE__ float __nv_logf(float __a); +__DEVICE__ double __nv_longlong_as_double(long long __a); +__DEVICE__ int __nv_max(int __a, int __b); +__DEVICE__ int __nv_min(int __a, int __b); +__DEVICE__ double __nv_modf(double __a, double* __b); +__DEVICE__ float __nv_modff(float __a, float* __b); +__DEVICE__ int __nv_mul24(int __a, int __b); +__DEVICE__ long long __nv_mul64hi(long long __a, long long __b); +__DEVICE__ int __nv_mulhi(int __a, int __b); +__DEVICE__ double __nv_nan(const signed char* __a); +__DEVICE__ float __nv_nanf(const signed char* __a); +__DEVICE__ double __nv_nearbyint(double __a); +__DEVICE__ float __nv_nearbyintf(float __a); +__DEVICE__ double __nv_nextafter(double __a, double __b); +__DEVICE__ float __nv_nextafterf(float __a, float __b); +__DEVICE__ double __nv_norm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_norm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d); +__DEVICE__ double __nv_normcdf(double __a); +__DEVICE__ float __nv_normcdff(float __a); +__DEVICE__ double __nv_normcdfinv(double __a); +__DEVICE__ float __nv_normcdfinvf(float __a); +__DEVICE__ float __nv_normf(int __a, const float* __b); +__DEVICE__ double __nv_norm(int __a, const double* __b); +__DEVICE__ int __nv_popc(unsigned int __a); +__DEVICE__ int __nv_popcll(unsigned long long __a); +__DEVICE__ double __nv_pow(double __a, double __b); +__DEVICE__ float __nv_powf(float __a, float __b); +__DEVICE__ double __nv_powi(double __a, int __b); +__DEVICE__ float __nv_powif(float __a, int __b); +__DEVICE__ double __nv_rcbrt(double __a); +__DEVICE__ float 
__nv_rcbrtf(float __a); +__DEVICE__ double __nv_rcp64h(double __a); +__DEVICE__ double __nv_remainder(double __a, double __b); +__DEVICE__ float __nv_remainderf(float __a, float __b); +__DEVICE__ double __nv_remquo(double __a, double __b, int* __c); +__DEVICE__ float __nv_remquof(float __a, float __b, int* __c); +__DEVICE__ int __nv_rhadd(int __a, int __b); +__DEVICE__ double __nv_rhypot(double __a, double __b); +__DEVICE__ float __nv_rhypotf(float __a, float __b); +__DEVICE__ double __nv_rint(double __a); +__DEVICE__ float __nv_rintf(float __a); +__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); +__DEVICE__ float __nv_rnormf(int __a, const float* __b); +__DEVICE__ double __nv_rnorm(int __a, const double* __b); +__DEVICE__ double __nv_round(double __a); +__DEVICE__ float __nv_roundf(float __a); +__DEVICE__ double __nv_rsqrt(double __a); +__DEVICE__ float __nv_rsqrtf(float __a); +__DEVICE__ int __nv_sad(int __a, int __b, int __c); +__DEVICE__ float __nv_saturatef(float __a); +__DEVICE__ double __nv_scalbn(double __a, int __b); +__DEVICE__ float __nv_scalbnf(float __a, int __b); +__DEVICE__ int __nv_signbitd(double __a); +__DEVICE__ int __nv_signbitf(float __a); +__DEVICE__ void __nv_sincos(double __a, double* __b, double* __c); +__DEVICE__ void __nv_sincosf(float __a, float* __b, float* __c); +__DEVICE__ void __nv_sincospi(double __a, double* __b, double* __c); +__DEVICE__ void __nv_sincospif(float __a, float* __b, float* __c); +__DEVICE__ double __nv_sin(double __a); +__DEVICE__ float __nv_sinf(float __a); +__DEVICE__ double __nv_sinh(double __a); +__DEVICE__ float __nv_sinhf(float __a); +__DEVICE__ double __nv_sinpi(double __a); +__DEVICE__ float __nv_sinpif(float __a); +__DEVICE__ double __nv_sqrt(double __a); +__DEVICE__ 
float __nv_sqrtf(float __a); +__DEVICE__ double __nv_tan(double __a); +__DEVICE__ float __nv_tanf(float __a); +__DEVICE__ double __nv_tanh(double __a); +__DEVICE__ float __nv_tanhf(float __a); +__DEVICE__ double __nv_tgamma(double __a); +__DEVICE__ float __nv_tgammaf(float __a); +__DEVICE__ double __nv_trunc(double __a); +__DEVICE__ float __nv_truncf(float __a); +__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b); +__DEVICE__ double __nv_uint2double_rn(unsigned int __i); +__DEVICE__ float __nv_uint2float_rd(unsigned int __a); +__DEVICE__ float __nv_uint2float_rn(unsigned int __a); +__DEVICE__ float __nv_uint2float_ru(unsigned int __a); +__DEVICE__ float __nv_uint2float_rz(unsigned int __a); +__DEVICE__ float __nv_uint_as_float(unsigned int __a); +__DEVICE__ double __nv_ull2double_rd(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rn(unsigned long long __a); +__DEVICE__ double __nv_ull2double_ru(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rz(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rd(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rn(unsigned long long __a); +__DEVICE__ float __nv_ull2float_ru(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rz(unsigned long long __a); +__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a, unsigned long long __b); +__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a, unsigned long long __b); +__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a, unsigned long long __b); +__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, unsigned int __c); 
+__DEVICE__ double __nv_y0(double __a); +__DEVICE__ float __nv_y0f(float __a); +__DEVICE__ double __nv_y1(double __a); +__DEVICE__ float __nv_y1f(float __a); +__DEVICE__ float __nv_ynf(int __a, float __b); +__DEVICE__ double __nv_yn(int __a, double __b); + +#if defined(__cplusplus) +} // extern "C" +#endif +#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_math.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_math.h new file mode 100644 index 00000000000..964d949e889 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_math.h @@ -0,0 +1,809 @@ +/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __CLANG_CUDA_MATH_H__ +#define __CLANG_CUDA_MATH_H__ +#ifndef __CUDA__ +# error "This file is for CUDA compilation only." +#endif + +// The __CLANG_GPU_DISABLE_MATH_WRAPPERS macro provides a way to let standard +// libcalls reach the link step instead of being eagerly replaced. +#ifndef __CLANG_GPU_DISABLE_MATH_WRAPPERS + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. +# pragma push_macro("__DEVICE__") +# define __DEVICE__ static __device__ __forceinline__ + +// Specialized version of __DEVICE__ for functions with void return type. +# pragma push_macro("__DEVICE_VOID__") +# define __DEVICE_VOID__ __DEVICE__ + +// libdevice provides fast low precision and slow full-recision implementations +// for some functions. 
Which one gets selected depends on +// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if +// -ffast-math or -fgpu-approx-transcendentals are in effect. +# pragma push_macro("__FAST_OR_SLOW") +# if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) +# define __FAST_OR_SLOW(fast, slow) fast +# else +# define __FAST_OR_SLOW(fast, slow) slow +# endif + +__DEVICE__ int abs(int __a) +{ + return __nv_abs(__a); +} +__DEVICE__ double fabs(double __a) +{ + return __nv_fabs(__a); +} +__DEVICE__ double acos(double __a) +{ + return __nv_acos(__a); +} +__DEVICE__ float acosf(float __a) +{ + return __nv_acosf(__a); +} +__DEVICE__ double acosh(double __a) +{ + return __nv_acosh(__a); +} +__DEVICE__ float acoshf(float __a) +{ + return __nv_acoshf(__a); +} +__DEVICE__ double asin(double __a) +{ + return __nv_asin(__a); +} +__DEVICE__ float asinf(float __a) +{ + return __nv_asinf(__a); +} +__DEVICE__ double asinh(double __a) +{ + return __nv_asinh(__a); +} +__DEVICE__ float asinhf(float __a) +{ + return __nv_asinhf(__a); +} +__DEVICE__ double atan(double __a) +{ + return __nv_atan(__a); +} +__DEVICE__ double atan2(double __a, double __b) +{ + return __nv_atan2(__a, __b); +} +__DEVICE__ float atan2f(float __a, float __b) +{ + return __nv_atan2f(__a, __b); +} +__DEVICE__ float atanf(float __a) +{ + return __nv_atanf(__a); +} +__DEVICE__ double atanh(double __a) +{ + return __nv_atanh(__a); +} +__DEVICE__ float atanhf(float __a) +{ + return __nv_atanhf(__a); +} +__DEVICE__ double cbrt(double __a) +{ + return __nv_cbrt(__a); +} +__DEVICE__ float cbrtf(float __a) +{ + return __nv_cbrtf(__a); +} +__DEVICE__ double ceil(double __a) +{ + return __nv_ceil(__a); +} +__DEVICE__ float ceilf(float __a) +{ + return __nv_ceilf(__a); +} +__DEVICE__ double copysign(double __a, double __b) +{ + return __nv_copysign(__a, __b); +} +__DEVICE__ float copysignf(float __a, float __b) +{ + return __nv_copysignf(__a, __b); +} +__DEVICE__ double cos(double __a) +{ + return __nv_cos(__a); +} 
+__DEVICE__ float cosf(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a); +} +__DEVICE__ double cosh(double __a) +{ + return __nv_cosh(__a); +} +__DEVICE__ float coshf(float __a) +{ + return __nv_coshf(__a); +} +__DEVICE__ double cospi(double __a) +{ + return __nv_cospi(__a); +} +__DEVICE__ float cospif(float __a) +{ + return __nv_cospif(__a); +} +__DEVICE__ double cyl_bessel_i0(double __a) +{ + return __nv_cyl_bessel_i0(__a); +} +__DEVICE__ float cyl_bessel_i0f(float __a) +{ + return __nv_cyl_bessel_i0f(__a); +} +__DEVICE__ double cyl_bessel_i1(double __a) +{ + return __nv_cyl_bessel_i1(__a); +} +__DEVICE__ float cyl_bessel_i1f(float __a) +{ + return __nv_cyl_bessel_i1f(__a); +} +__DEVICE__ double erf(double __a) +{ + return __nv_erf(__a); +} +__DEVICE__ double erfc(double __a) +{ + return __nv_erfc(__a); +} +__DEVICE__ float erfcf(float __a) +{ + return __nv_erfcf(__a); +} +__DEVICE__ double erfcinv(double __a) +{ + return __nv_erfcinv(__a); +} +__DEVICE__ float erfcinvf(float __a) +{ + return __nv_erfcinvf(__a); +} +__DEVICE__ double erfcx(double __a) +{ + return __nv_erfcx(__a); +} +__DEVICE__ float erfcxf(float __a) +{ + return __nv_erfcxf(__a); +} +__DEVICE__ float erff(float __a) +{ + return __nv_erff(__a); +} +__DEVICE__ double erfinv(double __a) +{ + return __nv_erfinv(__a); +} +__DEVICE__ float erfinvf(float __a) +{ + return __nv_erfinvf(__a); +} +__DEVICE__ double exp(double __a) +{ + return __nv_exp(__a); +} +__DEVICE__ double exp10(double __a) +{ + return __nv_exp10(__a); +} +__DEVICE__ float exp10f(float __a) +{ + return __nv_exp10f(__a); +} +__DEVICE__ double exp2(double __a) +{ + return __nv_exp2(__a); +} +__DEVICE__ float exp2f(float __a) +{ + return __nv_exp2f(__a); +} +__DEVICE__ float expf(float __a) +{ + return __nv_expf(__a); +} +__DEVICE__ double expm1(double __a) +{ + return __nv_expm1(__a); +} +__DEVICE__ float expm1f(float __a) +{ + return __nv_expm1f(__a); +} +__DEVICE__ float fabsf(float __a) +{ + return 
__nv_fabsf(__a); +} +__DEVICE__ double fdim(double __a, double __b) +{ + return __nv_fdim(__a, __b); +} +__DEVICE__ float fdimf(float __a, float __b) +{ + return __nv_fdimf(__a, __b); +} +__DEVICE__ double fdivide(double __a, double __b) +{ + return __a / __b; +} +__DEVICE__ float fdividef(float __a, float __b) +{ +# if __FAST_MATH__ && !__CUDA_PREC_DIV + return __nv_fast_fdividef(__a, __b); +# else + return __a / __b; +# endif +} +__DEVICE__ double floor(double __f) +{ + return __nv_floor(__f); +} +__DEVICE__ float floorf(float __f) +{ + return __nv_floorf(__f); +} +__DEVICE__ double fma(double __a, double __b, double __c) +{ + return __nv_fma(__a, __b, __c); +} +__DEVICE__ float fmaf(float __a, float __b, float __c) +{ + return __nv_fmaf(__a, __b, __c); +} +__DEVICE__ double fmax(double __a, double __b) +{ + return __nv_fmax(__a, __b); +} +__DEVICE__ float fmaxf(float __a, float __b) +{ + return __nv_fmaxf(__a, __b); +} +__DEVICE__ double fmin(double __a, double __b) +{ + return __nv_fmin(__a, __b); +} +__DEVICE__ float fminf(float __a, float __b) +{ + return __nv_fminf(__a, __b); +} +__DEVICE__ double fmod(double __a, double __b) +{ + return __nv_fmod(__a, __b); +} +__DEVICE__ float fmodf(float __a, float __b) +{ + return __nv_fmodf(__a, __b); +} +__DEVICE__ double frexp(double __a, int* __b) +{ + return __nv_frexp(__a, __b); +} +__DEVICE__ float frexpf(float __a, int* __b) +{ + return __nv_frexpf(__a, __b); +} +__DEVICE__ double hypot(double __a, double __b) +{ + return __nv_hypot(__a, __b); +} +__DEVICE__ float hypotf(float __a, float __b) +{ + return __nv_hypotf(__a, __b); +} +__DEVICE__ int ilogb(double __a) +{ + return __nv_ilogb(__a); +} +__DEVICE__ int ilogbf(float __a) +{ + return __nv_ilogbf(__a); +} +__DEVICE__ double j0(double __a) +{ + return __nv_j0(__a); +} +__DEVICE__ float j0f(float __a) +{ + return __nv_j0f(__a); +} +__DEVICE__ double j1(double __a) +{ + return __nv_j1(__a); +} +__DEVICE__ float j1f(float __a) +{ + return __nv_j1f(__a); +} 
+__DEVICE__ double jn(int __n, double __a) +{ + return __nv_jn(__n, __a); +} +__DEVICE__ float jnf(int __n, float __a) +{ + return __nv_jnf(__n, __a); +} +# if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long labs(long __a) +{ + return __nv_llabs(__a); +}; +# else +__DEVICE__ long labs(long __a) +{ + return __nv_abs(__a); +}; +# endif +__DEVICE__ double ldexp(double __a, int __b) +{ + return __nv_ldexp(__a, __b); +} +__DEVICE__ float ldexpf(float __a, int __b) +{ + return __nv_ldexpf(__a, __b); +} +__DEVICE__ double lgamma(double __a) +{ + return __nv_lgamma(__a); +} +__DEVICE__ float lgammaf(float __a) +{ + return __nv_lgammaf(__a); +} +__DEVICE__ long long llabs(long long __a) +{ + return __nv_llabs(__a); +} +__DEVICE__ long long llmax(long long __a, long long __b) +{ + return __nv_llmax(__a, __b); +} +__DEVICE__ long long llmin(long long __a, long long __b) +{ + return __nv_llmin(__a, __b); +} +__DEVICE__ long long llrint(double __a) +{ + return __nv_llrint(__a); +} +__DEVICE__ long long llrintf(float __a) +{ + return __nv_llrintf(__a); +} +__DEVICE__ long long llround(double __a) +{ + return __nv_llround(__a); +} +__DEVICE__ long long llroundf(float __a) +{ + return __nv_llroundf(__a); +} +__DEVICE__ double round(double __a) +{ + return __nv_round(__a); +} +__DEVICE__ float roundf(float __a) +{ + return __nv_roundf(__a); +} +__DEVICE__ double log(double __a) +{ + return __nv_log(__a); +} +__DEVICE__ double log10(double __a) +{ + return __nv_log10(__a); +} +__DEVICE__ float log10f(float __a) +{ + return __nv_log10f(__a); +} +__DEVICE__ double log1p(double __a) +{ + return __nv_log1p(__a); +} +__DEVICE__ float log1pf(float __a) +{ + return __nv_log1pf(__a); +} +__DEVICE__ double log2(double __a) +{ + return __nv_log2(__a); +} +__DEVICE__ float log2f(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a); +} +__DEVICE__ double logb(double __a) +{ + return __nv_logb(__a); +} +__DEVICE__ float logbf(float __a) +{ + return __nv_logbf(__a); +} 
+__DEVICE__ float logf(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a); +} +# if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long lrint(double __a) +{ + return llrint(__a); +} +__DEVICE__ long lrintf(float __a) +{ + return __float2ll_rn(__a); +} +__DEVICE__ long lround(double __a) +{ + return llround(__a); +} +__DEVICE__ long lroundf(float __a) +{ + return llroundf(__a); +} +# else +__DEVICE__ long lrint(double __a) +{ + return (long) rint(__a); +} +__DEVICE__ long lrintf(float __a) +{ + return __float2int_rn(__a); +} +__DEVICE__ long lround(double __a) +{ + return round(__a); +} +__DEVICE__ long lroundf(float __a) +{ + return roundf(__a); +} +# endif +__DEVICE__ int max(int __a, int __b) +{ + return __nv_max(__a, __b); +} +__DEVICE__ int min(int __a, int __b) +{ + return __nv_min(__a, __b); +} +__DEVICE__ double modf(double __a, double* __b) +{ + return __nv_modf(__a, __b); +} +__DEVICE__ float modff(float __a, float* __b) +{ + return __nv_modff(__a, __b); +} +__DEVICE__ double nearbyint(double __a) +{ + return __builtin_nearbyint(__a); +} +__DEVICE__ float nearbyintf(float __a) +{ + return __builtin_nearbyintf(__a); +} +__DEVICE__ double nextafter(double __a, double __b) +{ + return __nv_nextafter(__a, __b); +} +__DEVICE__ float nextafterf(float __a, float __b) +{ + return __nv_nextafterf(__a, __b); +} +__DEVICE__ double norm(int __dim, const double* __t) +{ + return __nv_norm(__dim, __t); +} +__DEVICE__ double norm3d(double __a, double __b, double __c) +{ + return __nv_norm3d(__a, __b, __c); +} +__DEVICE__ float norm3df(float __a, float __b, float __c) +{ + return __nv_norm3df(__a, __b, __c); +} +__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) +{ + return __nv_norm4d(__a, __b, __c, __d); +} +__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) +{ + return __nv_norm4df(__a, __b, __c, __d); +} +__DEVICE__ double normcdf(double __a) +{ + return __nv_normcdf(__a); +} +__DEVICE__ float normcdff(float 
__a) +{ + return __nv_normcdff(__a); +} +__DEVICE__ double normcdfinv(double __a) +{ + return __nv_normcdfinv(__a); +} +__DEVICE__ float normcdfinvf(float __a) +{ + return __nv_normcdfinvf(__a); +} +__DEVICE__ float normf(int __dim, const float* __t) +{ + return __nv_normf(__dim, __t); +} +__DEVICE__ double pow(double __a, double __b) +{ + return __nv_pow(__a, __b); +} +__DEVICE__ float powf(float __a, float __b) +{ + return __nv_powf(__a, __b); +} +__DEVICE__ double powi(double __a, int __b) +{ + return __nv_powi(__a, __b); +} +__DEVICE__ float powif(float __a, int __b) +{ + return __nv_powif(__a, __b); +} +__DEVICE__ double rcbrt(double __a) +{ + return __nv_rcbrt(__a); +} +__DEVICE__ float rcbrtf(float __a) +{ + return __nv_rcbrtf(__a); +} +__DEVICE__ double remainder(double __a, double __b) +{ + return __nv_remainder(__a, __b); +} +__DEVICE__ float remainderf(float __a, float __b) +{ + return __nv_remainderf(__a, __b); +} +__DEVICE__ double remquo(double __a, double __b, int* __c) +{ + return __nv_remquo(__a, __b, __c); +} +__DEVICE__ float remquof(float __a, float __b, int* __c) +{ + return __nv_remquof(__a, __b, __c); +} +__DEVICE__ double rhypot(double __a, double __b) +{ + return __nv_rhypot(__a, __b); +} +__DEVICE__ float rhypotf(float __a, float __b) +{ + return __nv_rhypotf(__a, __b); +} +// __nv_rint* in libdevice is buggy and produces incorrect results. 
+__DEVICE__ double rint(double __a) +{ + return __builtin_rint(__a); +} +__DEVICE__ float rintf(float __a) +{ + return __builtin_rintf(__a); +} +__DEVICE__ double rnorm(int __a, const double* __b) +{ + return __nv_rnorm(__a, __b); +} +__DEVICE__ double rnorm3d(double __a, double __b, double __c) +{ + return __nv_rnorm3d(__a, __b, __c); +} +__DEVICE__ float rnorm3df(float __a, float __b, float __c) +{ + return __nv_rnorm3df(__a, __b, __c); +} +__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) +{ + return __nv_rnorm4d(__a, __b, __c, __d); +} +__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) +{ + return __nv_rnorm4df(__a, __b, __c, __d); +} +__DEVICE__ float rnormf(int __dim, const float* __t) +{ + return __nv_rnormf(__dim, __t); +} +__DEVICE__ double rsqrt(double __a) +{ + return __nv_rsqrt(__a); +} +__DEVICE__ float rsqrtf(float __a) +{ + return __nv_rsqrtf(__a); +} +__DEVICE__ double scalbn(double __a, int __b) +{ + return __nv_scalbn(__a, __b); +} +__DEVICE__ float scalbnf(float __a, int __b) +{ + return __nv_scalbnf(__a, __b); +} +__DEVICE__ double scalbln(double __a, long __b) +{ + if (__b > INT_MAX) + { + return __a > 0 ? HUGE_VAL : -HUGE_VAL; + } + if (__b < INT_MIN) + { + return __a > 0 ? 0.0 : -0.0; + } + return scalbn(__a, (int) __b); +} +__DEVICE__ float scalblnf(float __a, long __b) +{ + if (__b > INT_MAX) + { + return __a > 0 ? HUGE_VALF : -HUGE_VALF; + } + if (__b < INT_MIN) + { + return __a > 0 ? 
0.f : -0.f; + } + return scalbnf(__a, (int) __b); +} +__DEVICE__ double sin(double __a) +{ + return __nv_sin(__a); +} +__DEVICE_VOID__ void sincos(double __a, double* __s, double* __c) +{ + return __nv_sincos(__a, __s, __c); +} +__DEVICE_VOID__ void sincosf(float __a, float* __s, float* __c) +{ + return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c); +} +__DEVICE_VOID__ void sincospi(double __a, double* __s, double* __c) +{ + return __nv_sincospi(__a, __s, __c); +} +__DEVICE_VOID__ void sincospif(float __a, float* __s, float* __c) +{ + return __nv_sincospif(__a, __s, __c); +} +__DEVICE__ float sinf(float __a) +{ + return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a); +} +__DEVICE__ double sinh(double __a) +{ + return __nv_sinh(__a); +} +__DEVICE__ float sinhf(float __a) +{ + return __nv_sinhf(__a); +} +__DEVICE__ double sinpi(double __a) +{ + return __nv_sinpi(__a); +} +__DEVICE__ float sinpif(float __a) +{ + return __nv_sinpif(__a); +} +__DEVICE__ double sqrt(double __a) +{ + return __nv_sqrt(__a); +} +__DEVICE__ float sqrtf(float __a) +{ + return __nv_sqrtf(__a); +} +__DEVICE__ double tan(double __a) +{ + return __nv_tan(__a); +} +__DEVICE__ float tanf(float __a) +{ + return __nv_tanf(__a); +} +__DEVICE__ double tanh(double __a) +{ + return __nv_tanh(__a); +} +__DEVICE__ float tanhf(float __a) +{ + return __nv_tanhf(__a); +} +__DEVICE__ double tgamma(double __a) +{ + return __nv_tgamma(__a); +} +__DEVICE__ float tgammaf(float __a) +{ + return __nv_tgammaf(__a); +} +__DEVICE__ double trunc(double __a) +{ + return __nv_trunc(__a); +} +__DEVICE__ float truncf(float __a) +{ + return __nv_truncf(__a); +} +__DEVICE__ unsigned long long ullmax(unsigned long long __a, unsigned long long __b) +{ + return __nv_ullmax(__a, __b); +} +__DEVICE__ unsigned long long ullmin(unsigned long long __a, unsigned long long __b) +{ + return __nv_ullmin(__a, __b); +} +__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) +{ + return __nv_umax(__a, __b); +} 
+__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) +{ + return __nv_umin(__a, __b); +} +__DEVICE__ double y0(double __a) +{ + return __nv_y0(__a); +} +__DEVICE__ float y0f(float __a) +{ + return __nv_y0f(__a); +} +__DEVICE__ double y1(double __a) +{ + return __nv_y1(__a); +} +__DEVICE__ float y1f(float __a) +{ + return __nv_y1f(__a); +} +__DEVICE__ double yn(int __a, double __b) +{ + return __nv_yn(__a, __b); +} +__DEVICE__ float ynf(int __a, float __b) +{ + return __nv_ynf(__a, __b); +} + +# pragma pop_macro("__DEVICE__") +# pragma pop_macro("__DEVICE_VOID__") +# pragma pop_macro("__FAST_OR_SLOW") + +#endif // __CLANG_GPU_DISABLE_MATH_WRAPPERS +#endif // __CLANG_CUDA_MATH_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h new file mode 100644 index 00000000000..4c18fdca836 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/__clang_cuda_runtime_wrapper.h @@ -0,0 +1,407 @@ +/*===---- HostJIT CUDA runtime wrapper - replaces clang's wrapper ----------=== + * + * This is a self-contained replacement for clang's __clang_cuda_runtime_wrapper.h. + * Instead of #include_next-ing the real wrapper (which has fragile ordering + * dependencies on system headers and CUDA toolkit version-specific branches), + * we directly include only the clang-provided CUDA helper headers we need and + * pull in the CUDA toolkit headers with explicit preprocessor guards. + * + * Key design decision: all clang-provided device function implementations and + * CCCL-required intrinsics are defined BEFORE any CUDA toolkit headers that + * might transitively include CCCL (via libcudacxx standard headers on our + * include path). This eliminates the need for forward declarations. 
+ * + * Assumptions: + * - CUDA >= 9.0 (no legacy code paths) + * - Clang CUDA compilation (__CUDA__ && __clang__) + * - Freestanding: all standard headers are stubs or from libcudacxx + * - cuda::std is bridged into std via using-directive + *===-----------------------------------------------------------------------===*/ +#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__ +#define __CLANG_CUDA_RUNTIME_WRAPPER_H__ +#pragma clang system_header + +#if defined(__CUDA__) && defined(__clang__) + +// ============================================================================ +// Phase 1: Forward-declare device math overloads before any inclusion +// ============================================================================ +// This prevents constexpr std library math functions from becoming implicitly +// host+device, which would block our __device__ overloads later. +# include <__clang_cuda_math_forward_declares.h> + +// ============================================================================ +// Phase 2: Device-side definitions before any CUDA toolkit headers +// ============================================================================ +// Everything here uses only compiler builtins and our stubs. No CUDA toolkit +// headers are included yet, so nothing can transitively pull in CCCL. + +# pragma push_macro("__THROW") +# pragma push_macro("__CUDA_ARCH__") + +# ifndef __CUDA_ARCH__ +# define __CUDA_ARCH__ 9999 +# endif + +// host_defines.h provides __device__, __host__, __forceinline__ macros. +// Its only transitive dep (ctype.h) hits our stub. +# define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +# define __CUDACC__ +# define __CUDA_LIBDEVICE__ +# include "host_defines.h" + +// ---- Builtin variables (threadIdx, blockIdx, etc.) 
---- +# include "__clang_cuda_builtin_vars.h" + +// ---- Stubs needed by clang device function headers below ---- +# include <cmath> +# include <cstdlib> +# include <climits> + +// ---- Clang device function wrappers (local copies, CUDA < 9.0 removed) ---- +// clang-format off +// Order matters: libdevice_declares must precede device_functions (declares __nv_* builtins used there). +# include "__clang_cuda_libdevice_declares.h" +# include "__clang_cuda_device_functions.h" +# include "__clang_cuda_math.h" +// clang-format on + +// ---- Address-space intrinsics needed by CCCL headers ---- +// (e.g. cuda/__memory/address_space.h, cuda/__ptx/ptx_helper_functions.h) +static __device__ __forceinline__ __attribute__((const)) unsigned int __isGlobal(const void* p) +{ + return __nvvm_isspacep_global(p); +} +static __device__ __forceinline__ __attribute__((const)) unsigned int __isShared(const void* p) +{ + return __nvvm_isspacep_shared(p); +} +static __device__ __forceinline__ __attribute__((const)) unsigned int __isConstant(const void* p) +{ + return __nvvm_isspacep_const(p); +} +static __device__ __forceinline__ __attribute__((const)) unsigned int __isLocal(const void* p) +{ + return __nvvm_isspacep_local(p); +} +# define __FWD_DEVICE static __device__ __forceinline__ +__FWD_DEVICE unsigned int __isClusterShared(const void*); +__FWD_DEVICE __SIZE_TYPE__ __cvta_generic_to_shared(const void*); +__FWD_DEVICE __SIZE_TYPE__ __cvta_generic_to_global(const void*); +__FWD_DEVICE void* __cvta_shared_to_generic(__SIZE_TYPE__); +__FWD_DEVICE void* __cvta_global_to_generic(__SIZE_TYPE__); +# undef __FWD_DEVICE +# ifndef _MSC_VER +__device__ bool __nv_fp128_isnan(__float128); +__device__ __float128 __nv_fp128_fmax(__float128, __float128); +__device__ __float128 __nv_fp128_fmin(__float128, __float128); +# endif + +// ---- Bridge cuda::std into std ---- +namespace cuda +{ +namespace std +{ +} +} // namespace cuda +namespace std +{ +using namespace cuda::std; +} + +// 
============================================================================ +// Phase 3: CUDA toolkit headers +// ============================================================================ +// By this point all device-side functions and intrinsics are defined, so +// any transitive CCCL includes from these headers will find them. +# pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") + +# define __DEVICE_LAUNCH_PARAMETERS_H__ + +// Guard out CUDA's declaration-only headers; clang provides its own. +# define __DEVICE_FUNCTIONS_H__ +# define __MATH_FUNCTIONS_H__ +# define __MATH_FUNCTIONS_HPP__ +# define __COMMON_FUNCTIONS_H__ +# define __DEVICE_FUNCTIONS_DECLS_H__ + +// ---- CUDA runtime types (cudaError_t, dim3, cudaStream_t, etc.) ---- +// (host_defines.h already included in Phase 2) +# undef __CUDACC__ +# include "cuda.h" +# include "driver_types.h" +# include "host_config.h" +# if !defined(CUDA_VERSION) || CUDA_VERSION < 9000 +# error "Unsupported CUDA version (need >= 9.0)!" +# endif + +// Clang does not have __nvvm_memcpy/__nvvm_memset; emulate with builtins. +# define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n) +# define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n) + +// __THROW may be in a weird state; keep it empty for CUDA includes. +# undef __THROW +# define __THROW + +// ============================================================================ +// Phase 4: Device-side function definitions from CUDA toolkit .hpp files +// ============================================================================ +// Poison __host__ to ensure none of these definitions get host attributes. +# pragma push_macro("__host__") +# define __host__ UNEXPECTED_HOST_ATTRIBUTE + +// Redefine __forceinline__ to include __device__. +# pragma push_macro("__forceinline__") +# define __forceinline__ __device__ __inline__ __attribute__((always_inline)) + +// Math functions: use fast or accurate variants based on compiler flag. 
+# pragma push_macro("__USE_FAST_MATH__") +# if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) +# define __USE_FAST_MATH__ 1 +# endif +# include "crt/math_functions.hpp" +# pragma pop_macro("__USE_FAST_MATH__") + +# pragma pop_macro("__forceinline__") + +# undef __MATH_FUNCTIONS_HPP__ +# undef __CUDABE__ + +// Re-include device functions with __host__ defined as empty to get +// the "other branch" of #if/#else in the .hpp files. +# define __host__ +# undef __CUDABE__ +# define __CUDACC__ + +// Atomic function declarations (became builtins in CUDA 9). +# include "device_atomic_functions.h" +# undef __DEVICE_FUNCTIONS_HPP__ +# include "crt/device_double_functions.hpp" +# include "crt/device_functions.hpp" +# include "device_atomic_functions.hpp" +# include "sm_20_atomic_functions.hpp" + +// sm_20_intrinsics.hpp defines __isGlobal etc. without const attribute. +// Rename them so the definitions from Phase 4 (with const) prevail. +# pragma push_macro("__isGlobal") +# pragma push_macro("__isShared") +# pragma push_macro("__isConstant") +# pragma push_macro("__isLocal") +# define __isGlobal __ignored_cuda___isGlobal +# define __isShared __ignored_cuda___isShared +# define __isConstant __ignored_cuda___isConstant +# define __isLocal __ignored_cuda___isLocal +# include "sm_20_intrinsics.hpp" +# pragma pop_macro("__isGlobal") +# pragma pop_macro("__isShared") +# pragma pop_macro("__isConstant") +# pragma pop_macro("__isLocal") + +# include "sm_32_atomic_functions.hpp" + +# pragma push_macro("__CUDA_ARCH__") +# undef __CUDA_ARCH__ +# include "sm_60_atomic_functions.hpp" +# include "sm_61_intrinsics.hpp" +# pragma pop_macro("__CUDA_ARCH__") + +# undef __MATH_FUNCTIONS_HPP__ + +// math_functions.hpp ::signbit conflicts with libstdc++ constexpr ::signbit. 
+# pragma push_macro("signbit") +# pragma push_macro("__GNUC__") +# undef __GNUC__ +# define signbit __ignored_cuda_signbit +# pragma push_macro("_GLIBCXX_MATH_H") +# pragma push_macro("_LIBCPP_VERSION") +# undef _GLIBCXX_MATH_H +# ifdef _LIBCPP_VERSION +# define _LIBCPP_VERSION 3700 +# endif +# include "crt/math_functions.hpp" +# pragma pop_macro("_GLIBCXX_MATH_H") +# pragma pop_macro("_LIBCPP_VERSION") +# pragma pop_macro("__GNUC__") +# pragma pop_macro("signbit") + +# pragma pop_macro("__host__") + +// ============================================================================ +// Phase 5: cuda_runtime.h (first header that transitively pulls in CCCL) +// ============================================================================ +// +// (There is no separate Phase 6; phase numbering continues at Phase 7 below.) +// +// Verify no libcudacxx header was pulled in yet. If this fires, a header +// above transitively included a system header that resolved to libcudacxx +// before all device-side definitions were ready. +# ifdef CCCL_VERSION +# error "libcudacxx was included before device-side definitions were set up" +# endif + +# pragma push_macro("nv_weak") +# define nv_weak weak +# undef __CUDA_LIBDEVICE__ +# define __CUDACC__ +# include "cuda_runtime.h" +# pragma pop_macro("nv_weak") +# undef __CUDACC__ +# define __CUDABE__ + +# include "crt/host_runtime.h" + +// device_runtime.h defines __cxa_* macros that conflict with cxxabi.h. +# undef __cxa_vec_ctor +# undef __cxa_vec_cctor +# undef __cxa_vec_dtor +# undef __cxa_vec_new +# undef __cxa_vec_new2 +# undef __cxa_vec_new3 +# undef __cxa_vec_delete2 +# undef __cxa_vec_delete +# undef __cxa_vec_delete3 +# undef __cxa_pure_virtual + +// Texture intrinsics (requires C++11). 
+# if __cplusplus >= 201103L +# include <__clang_cuda_texture_intrinsics.h> +# else +template <typename T> +struct __nv_tex_needs_cxx11 +{ + const static bool value = false; +}; +template <typename T> +__host__ __device__ void __nv_tex_surf_handler(const char* name, T* ptr, cudaTextureObject_t obj, float x) +{ + _Static_assert(__nv_tex_needs_cxx11<T>::value, "Texture support requires C++11"); +} +# endif +# include "surface_indirect_functions.h" +# if CUDA_VERSION < 13000 +# include "texture_fetch_functions.h" +# endif +# include "texture_indirect_functions.h" + +// ============================================================================ +// Phase 7: Restore saved state +// ============================================================================ +# pragma pop_macro("__CUDA_ARCH__") +# pragma pop_macro("__THROW") +# undef __CUDABE__ +# define __CUDACC__ + +// ============================================================================ +// Phase 8: Device-side system calls & std wrappers +// ============================================================================ +extern "C" { +__device__ int vprintf(const char*, const char*); +__device__ void free(void*) __attribute((nothrow)); +__device__ void* malloc(size_t) __attribute((nothrow)) __attribute__((malloc)); +__device__ void +__assertfail(const char* __message, const char* __file, unsigned __line, const char* __function, size_t __charSize); +__device__ static inline void +__assert_fail(const char* __message, const char* __file, unsigned __line, const char* __function) +{ + __assertfail(__message, __file, __line, __function, sizeof(char)); +} +__device__ int printf(const char*, ...); +} // extern "C" + +namespace std +{ +__device__ static inline void free(void* __ptr) +{ + ::free(__ptr); +} +__device__ static inline void* malloc(size_t __size) +{ + return ::malloc(__size); +} +} // namespace std + +// ============================================================================ +// Phase 9: Builtin variable conversion operators +// 
============================================================================ +// These need dim3 and uint3 to be fully defined (from vector_types.h, pulled +// in by driver_types.h in Phase 5). +__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const +{ + return {x, y, z}; +} +__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const +{ + return {x, y, z}; +} +__device__ inline __cuda_builtin_blockDim_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_blockDim_t::operator uint3() const +{ + return {x, y, z}; +} +__device__ inline __cuda_builtin_gridDim_t::operator dim3() const +{ + return dim3(x, y, z); +} +__device__ inline __cuda_builtin_gridDim_t::operator uint3() const +{ + return {x, y, z}; +} + +// ============================================================================ +// Phase 10: Remaining clang CUDA headers +// ============================================================================ +# include <__clang_cuda_cmath.h> +# include <__clang_cuda_complex_builtins.h> +# include <__clang_cuda_intrinsics.h> + +// curand_mtgp32_kernel redefines blockDim/threadIdx with dim3/uint3 types, +// which is incompatible with our builtins. Force-include it with types +// redefined to our builtin types. +// Skip when cuRAND headers are unavailable (e.g. pip-installed toolkit). +# if __has_include("curand_mtgp32_kernel.h") +# pragma push_macro("dim3") +# pragma push_macro("uint3") +# define dim3 __cuda_builtin_blockDim_t +# define uint3 __cuda_builtin_threadIdx_t +# include "curand_mtgp32_kernel.h" +# pragma pop_macro("dim3") +# pragma pop_macro("uint3") +# endif +# pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") + +// Kernel launch configuration function. 
+# if CUDA_VERSION >= 9020 +extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void* stream = 0); +# endif + +// The JIT shared library is linked without the C runtime (no libc on the link +// line) so atexit is unavailable. The CUDA module constructor calls atexit() +// to register a cleanup function. Provide a no-op stub — the JIT library is +// short-lived and unloaded explicitly. +# if !defined(__HOSTJIT_DEVICE_COMPILATION__) +# if defined(_MSC_VER) +extern "C" int atexit(void(__cdecl*)(void)) +{ + return 0; +} +# else +extern "C" int atexit(void (*)(void)) +{ + return 0; +} +# endif +# endif + +#endif // __CUDA__ && __clang__ +#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__ diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/climits b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/climits new file mode 100644 index 00000000000..3910c27321c --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/climits @@ -0,0 +1,7 @@ +// Minimal climits stub for CUDA JIT compilation +#ifndef _HOSTJIT_CLIMITS +#define _HOSTJIT_CLIMITS + +#include <limits.h> + +#endif // _HOSTJIT_CLIMITS diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cmath b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cmath new file mode 100644 index 00000000000..35f14baa610 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cmath @@ -0,0 +1,7 @@ +// Minimal cmath stub for CUDA JIT compilation +#ifndef _HOSTJIT_CMATH +#define _HOSTJIT_CMATH + +#include <math.h> + +#endif // _HOSTJIT_CMATH diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstddef b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstddef new file mode 100644 index 00000000000..4e628c9ce78 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstddef @@ -0,0 +1,14 @@ +// Minimal cstddef stub for CUDA JIT compilation +// Compatible with libcu++ which 
expects to pull types from global namespace +#ifndef _HOSTJIT_CSTDDEF +#define _HOSTJIT_CSTDDEF + +#include <stddef.h> + +namespace std { + using ::size_t; + using ::ptrdiff_t; + using nullptr_t = decltype(nullptr); +} + +#endif // _HOSTJIT_CSTDDEF diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstdlib b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstdlib new file mode 100644 index 00000000000..7033a7fd3ff --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/cstdlib @@ -0,0 +1,20 @@ +#ifndef _HOSTJIT_CSTDLIB +#define _HOSTJIT_CSTDLIB + +#include <stddef.h> + +#define EXIT_SUCCESS 0 +#define EXIT_FAILURE 1 +#define RAND_MAX 2147483647 + +extern "C" { +void* malloc(size_t); +void* calloc(size_t, size_t); +void* realloc(void*, size_t); +void free(void*); +void abort(void); +void exit(int); +void _Exit(int); +} + +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/ctype.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/ctype.h new file mode 100644 index 00000000000..c9e4f3eb462 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/ctype.h @@ -0,0 +1,3 @@ +#ifndef _HOSTJIT_CTYPE_H +#define _HOSTJIT_CTYPE_H +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/initializer_list b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/initializer_list new file mode 100644 index 00000000000..b8347f771b5 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/initializer_list @@ -0,0 +1,47 @@ +// Minimal initializer_list stub for CUDA JIT compilation +#ifndef _HOSTJIT_INITIALIZER_LIST +#define _HOSTJIT_INITIALIZER_LIST + +#include <stddef.h> + +namespace std { + +template <class T> +class initializer_list { +public: + using value_type = T; + using reference = const T&; + using const_reference = const T&; + using size_type = size_t; + using iterator = const T*; + using const_iterator = const T*; + +private: + const T* _begin; + size_t _size; + 
// This constructor is called by the compiler + constexpr initializer_list(const T* b, size_t s) noexcept + : _begin(b), _size(s) {} + +public: + constexpr initializer_list() noexcept : _begin(nullptr), _size(0) {} + + constexpr size_t size() const noexcept { return _size; } + constexpr const T* begin() const noexcept { return _begin; } + constexpr const T* end() const noexcept { return _begin + _size; } +}; + +template <class T> +constexpr const T* begin(initializer_list<T> il) noexcept { + return il.begin(); +} + +template <class T> +constexpr const T* end(initializer_list<T> il) noexcept { + return il.end(); +} + +} // namespace std + +#endif // _HOSTJIT_INITIALIZER_LIST diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/limits b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/limits new file mode 100644 index 00000000000..8d388108143 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/limits @@ -0,0 +1,61 @@ +// Minimal stub for hostjit device compilation. +// +// Clang's __clang_cuda_cmath.h includes <limits> unconditionally, then expands +// __CUDA_CLANG_FN_INTEGER_OVERLOAD_1/2 macros that reference +// std::numeric_limits<__T>::is_integer in return-type SFINAE at parse time. +// Clang evaluates these dependent names during template parsing, so the struct +// must be declared — not just forward-declared — before the macro expansion. +// +// In the hostjit device-compilation include path, <limits> would normally +// resolve to libcudacxx/include/cuda/std/limits, which cascades through +// numeric_limits, bit_cast, popcount, etc. — incompatible with freestanding. +// +// This stub (found first on -internal-isystem) stops that cascade, providing +// only the two members that __clang_cuda_cmath.h actually inspects. 
+#pragma once + +namespace std { + +template <class T> +struct numeric_limits { + static constexpr bool is_specialized = false; + static constexpr bool is_integer = false; +}; + +// Integer specializations — needed so the SFINAE in __clang_cuda_cmath.h +// correctly dispatches integer arguments. +#define _HOSTJIT_NUM_LIM_INT(_T) \ + template <> struct numeric_limits<_T> { \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = true; \ + }; + +_HOSTJIT_NUM_LIM_INT(bool) +_HOSTJIT_NUM_LIM_INT(char) +_HOSTJIT_NUM_LIM_INT(signed char) +_HOSTJIT_NUM_LIM_INT(unsigned char) +_HOSTJIT_NUM_LIM_INT(short) +_HOSTJIT_NUM_LIM_INT(unsigned short) +_HOSTJIT_NUM_LIM_INT(int) +_HOSTJIT_NUM_LIM_INT(unsigned int) +_HOSTJIT_NUM_LIM_INT(long) +_HOSTJIT_NUM_LIM_INT(unsigned long) +_HOSTJIT_NUM_LIM_INT(long long) +_HOSTJIT_NUM_LIM_INT(unsigned long long) + +#undef _HOSTJIT_NUM_LIM_INT + +// Floating-point specializations. +#define _HOSTJIT_NUM_LIM_FP(_T) \ + template <> struct numeric_limits<_T> { \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = false; \ + }; + +_HOSTJIT_NUM_LIM_FP(float) +_HOSTJIT_NUM_LIM_FP(double) +_HOSTJIT_NUM_LIM_FP(long double) + +#undef _HOSTJIT_NUM_LIM_FP + +} // namespace std diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/math.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/math.h new file mode 100644 index 00000000000..7884961cdb4 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/math.h @@ -0,0 +1,21 @@ +#ifndef _HOSTJIT_MATH_H +#define _HOSTJIT_MATH_H + +// Macros needed by __clang_cuda_math.h +#define HUGE_VAL __builtin_huge_val() +#define HUGE_VALF __builtin_huge_valf() +#define HUGE_VALL __builtin_huge_vall() +#define INFINITY __builtin_inff() +#define NAN __builtin_nanf("") +#define MATH_ERRNO 1 +#define MATH_ERREXCEPT 2 +#define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) +#define FP_NAN 0 +#define FP_INFINITE 1 +#define 
FP_ZERO 2 +#define FP_SUBNORMAL 3 +#define FP_NORMAL 4 +#define __signbit(x) __builtin_signbit(x) +#define __signbitl(x) __builtin_signbitl(x) + +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/memory.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/memory.h new file mode 100644 index 00000000000..19b77865017 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/memory.h @@ -0,0 +1,4 @@ +#ifndef _HOSTJIT_MEMORY_H +#define _HOSTJIT_MEMORY_H +#include <string.h> +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/new b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/new new file mode 100644 index 00000000000..8e21dbc7761 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/new @@ -0,0 +1,29 @@ +#ifndef _HOSTJIT_NEW +#define _HOSTJIT_NEW +#include <cstddef> + +namespace std { +struct nothrow_t { explicit nothrow_t() = default; }; +extern const nothrow_t nothrow; +enum class align_val_t : size_t {}; +} + +// Placement new — needs __host__ __device__ for CUDA +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void* operator new(std::size_t, void* p) noexcept { return p; } +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void* operator new[](std::size_t, void* p) noexcept { return p; } +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void operator delete(void*, void*) noexcept {} +#if defined(__CUDA__) +__host__ __device__ +#endif +inline void operator delete[](void*, void*) noexcept {} + +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/stdlib.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/stdlib.h new file mode 100644 index 00000000000..d8d910d8c5a --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/stdlib.h @@ -0,0 +1,7 @@ +// Minimal stdlib.h stub for CUDA JIT compilation +#ifndef _HOSTJIT_STDLIB_H +#define _HOSTJIT_STDLIB_H + +#include <cstdlib> + +#endif // 
_HOSTJIT_STDLIB_H diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/string.h b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/string.h new file mode 100644 index 00000000000..c38628207bf --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/string.h @@ -0,0 +1,15 @@ +#ifndef _HOSTJIT_STRING_H +#define _HOSTJIT_STRING_H +#include +#ifdef __cplusplus +extern "C" { +#endif +void* memcpy(void*, const void*, size_t); +void* memset(void*, int, size_t); +int memcmp(const void*, const void*, size_t); +void* memmove(void*, const void*, size_t); +size_t strlen(const char*); +#ifdef __cplusplus +} +#endif +#endif diff --git a/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/utility b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/utility new file mode 100644 index 00000000000..0a2b61ac1b0 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/cuda_minimal/stubs/utility @@ -0,0 +1,34 @@ +// Minimal stub for hostjit device compilation. +// +// cuda_runtime.h includes for std::forward/std::move. In the +// hostjit device-compilation include path, resolves to +// libcudacxx/include/cuda/std/utility, which cascades into the full CCCL +// utility/iterator/concepts hierarchy — incompatible with freestanding mode. +// +// This stub (found first on -internal-isystem) stops that cascade. +// Only std::forward and std::move are provided because that is all +// cuda_runtime.h actually uses at the top level; the full CCCL hierarchy +// is not required for a simple host+device kernel. 
+#pragma once + +namespace std { + +template struct remove_reference { using type = _Tp; }; +template struct remove_reference<_Tp&> { using type = _Tp; }; +template struct remove_reference<_Tp&&>{ using type = _Tp; }; +template +using remove_reference_t = typename remove_reference<_Tp>::type; + +template +__host__ __device__ constexpr _Tp&& +forward(remove_reference_t<_Tp>& __t) noexcept { return static_cast<_Tp&&>(__t); } + +template +__host__ __device__ constexpr _Tp&& +forward(remove_reference_t<_Tp>&& __t) noexcept { return static_cast<_Tp&&>(__t); } + +template +__host__ __device__ constexpr remove_reference_t<_Tp>&& +move(_Tp&& __t) noexcept { return static_cast&&>(__t); } + +} // namespace std diff --git a/c/parallel/src/hostjit/include/hostjit/jit_compiler.hpp b/c/parallel/src/hostjit/include/hostjit/jit_compiler.hpp new file mode 100644 index 00000000000..9667ee26f78 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/jit_compiler.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace hostjit +{ +class JITCompiler +{ +public: + // Create JIT compiler with default configuration (auto-detected) + JITCompiler(); + + // Create JIT compiler with custom configuration + explicit JITCompiler(const CompilerConfig& config); + + ~JITCompiler(); + + // Disable copy + JITCompiler(const JITCompiler&) = delete; + JITCompiler& operator=(const JITCompiler&) = delete; + + // Compile CUDA source code to shared library and load it + // Returns true on success, false on failure + bool compile(const std::string& source_code); + + // Get function pointer by name + // Returns nullptr if function not found + template + FuncType getFunction(const std::string& name) + { + if (!library_.isLoaded()) + { + last_error_ = "No library loaded"; + return nullptr; + } + + auto func = library_.getFunction(name); + if (!func) + { + last_error_ = "Failed to find function '" + name + "': " + library_.getLastError(); + } + return func; + } + + 
// Get the last error message + std::string getLastError() const + { + return last_error_; + } + + // Get the configuration being used + const CompilerConfig& getConfig() const + { + return config_; + } + + // Check if a library is currently loaded + bool isLoaded() const + { + return library_.isLoaded(); + } + + // Get the path to compiled artifacts (object file, shared library, etc.) + // Only valid after successful compile() and if keep_artifacts is set + std::string getArtifactsPath() const + { + return temp_dir_; + } + + // Get the cubin extracted during compilation + const std::vector& getCubin() const + { + return cubin_; + } + + // Unload the current library and clean up temporary files + void cleanup(); + +private: + std::string createTempDirectory(); + void removeTempDirectory(); + + CompilerConfig config_; + CUDACompiler compiler_; + DynamicLibrary library_; + std::string temp_dir_; + std::string last_error_; + std::vector cubin_; +}; +} // namespace hostjit diff --git a/c/parallel/src/hostjit/include/hostjit/loader.hpp b/c/parallel/src/hostjit/include/hostjit/loader.hpp new file mode 100644 index 00000000000..f6ee9525c48 --- /dev/null +++ b/c/parallel/src/hostjit/include/hostjit/loader.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include + +namespace hostjit +{ +class DynamicLibrary +{ +public: + DynamicLibrary(); + ~DynamicLibrary(); + + // Disable copy + DynamicLibrary(const DynamicLibrary&) = delete; + DynamicLibrary& operator=(const DynamicLibrary&) = delete; + + // Enable move + DynamicLibrary(DynamicLibrary&& other) noexcept; + DynamicLibrary& operator=(DynamicLibrary&& other) noexcept; + + // Load a shared library + bool load(const std::string& library_path); + + // Get a symbol (function or variable) by name + void* getSymbol(const std::string& symbol_name); + + // Template helper to get function pointers with type safety + template + FuncType getFunction(const std::string& name) + { + return reinterpret_cast(getSymbol(name)); + } + + // Check if 
library is loaded + bool isLoaded() const; + + // Get the last error message + std::string getLastError() const; + + // Unload the library + void unload(); + +private: + void* handle_; + std::string last_error_; +}; +} // namespace hostjit diff --git a/c/parallel/src/hostjit/jit_compiler.cpp b/c/parallel/src/hostjit/jit_compiler.cpp new file mode 100644 index 00000000000..83490468aad --- /dev/null +++ b/c/parallel/src/hostjit/jit_compiler.cpp @@ -0,0 +1,192 @@ +#include +#include +#include +#include +#include + +#include + +#ifdef _WIN32 +# include +#else +# include +#endif + +namespace hostjit +{ +JITCompiler::JITCompiler() + : config_(detectDefaultConfig()) +{} + +JITCompiler::JITCompiler(const CompilerConfig& config) + : config_(config) +{} + +JITCompiler::~JITCompiler() +{ + cleanup(); +} + +bool JITCompiler::compile(const std::string& source_code) +{ + std::string config_error; + if (!validateConfig(config_, &config_error)) + { + last_error_ = "Configuration error: " + config_error; + return false; + } + + cleanup(); + + temp_dir_ = createTempDirectory(); + if (temp_dir_.empty()) + { + last_error_ = "Failed to create temporary directory"; + return false; + } + + std::string obj_path = temp_dir_ + "/cuda_code.o"; + auto compile_result = compiler_.compileToObject(source_code, obj_path, config_); + + if (!compile_result.success) + { + last_error_ = "Compilation failed:\n" + compile_result.diagnostics; + removeTempDirectory(); + return false; + } + + // Store the cubin for later inspection + cubin_ = std::move(compile_result.cubin); + + if (config_.verbose) + { + std::cout << "Compilation diagnostics:\n" << compile_result.diagnostics << "\n"; + } + +#ifdef _WIN32 + std::string lib_path = temp_dir_ + "/cuda_code.dll"; +#else + std::string lib_path = temp_dir_ + "/libcuda_code.so"; +#endif + auto link_result = compiler_.linkToSharedLibrary({obj_path}, lib_path, config_); + + if (!link_result.success) + { + last_error_ = "Linking failed:\n" + link_result.diagnostics; 
+ removeTempDirectory(); + return false; + } + + if (config_.verbose) + { + std::cout << "Linking diagnostics:\n" << link_result.diagnostics << "\n"; + } + + if (!library_.load(lib_path)) + { + last_error_ = "Failed to load library: " + library_.getLastError(); + removeTempDirectory(); + return false; + } + + if (config_.verbose) + { + std::cout << "Successfully loaded library: " << lib_path << "\n"; + } + + last_error_.clear(); + return true; +} + +void JITCompiler::cleanup() +{ + library_.unload(); + + if (!config_.keep_artifacts) + { + removeTempDirectory(); + } + + last_error_.clear(); +} + +std::string JITCompiler::createTempDirectory() +{ + std::filesystem::path base_tmp_dir; + +#ifdef _WIN32 + const char* tmp_dir = std::getenv("TEMP"); + if (!tmp_dir) + { + tmp_dir = std::getenv("TMP"); + } + if (tmp_dir) + { + base_tmp_dir = tmp_dir; + } + else + { + base_tmp_dir = std::filesystem::temp_directory_path(); + } +#else + const char* tmp_dir = std::getenv("TMPDIR"); + if (tmp_dir) + { + base_tmp_dir = tmp_dir; + } + else + { + base_tmp_dir = "/tmp"; + } +#endif + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, 999999); + +#ifdef _WIN32 + int pid = _getpid(); +#else + int pid = getpid(); +#endif + + for (int attempt = 0; attempt < 10; ++attempt) + { + std::string dir_name = "hostjit_" + std::to_string(pid) + "_" + std::to_string(dis(gen)); + std::filesystem::path full_path = base_tmp_dir / dir_name; + + std::error_code ec; + if (std::filesystem::create_directories(full_path, ec) && !ec) + { + return full_path.string(); + } + } + + return ""; +} + +void JITCompiler::removeTempDirectory() +{ + if (temp_dir_.empty()) + { + return; + } + + try + { + if (std::filesystem::exists(temp_dir_)) + { + std::filesystem::remove_all(temp_dir_); + } + } + catch (const std::filesystem::filesystem_error& e) + { + if (config_.verbose) + { + std::cerr << "Warning: Failed to remove temporary directory: " << e.what() << "\n"; + } + } + + 
temp_dir_.clear(); +} +} // namespace hostjit diff --git a/c/parallel/src/hostjit/loader.cpp b/c/parallel/src/hostjit/loader.cpp new file mode 100644 index 00000000000..33d49e4bc20 --- /dev/null +++ b/c/parallel/src/hostjit/loader.cpp @@ -0,0 +1,207 @@ +#include + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# include +#else +# include +#endif + +namespace hostjit +{ +#ifdef _WIN32 +namespace +{ +// Run C++ static constructors in a DLL loaded with /NOENTRY /NODEFAULTLIB. +// The compiler places CUDA fatbin registration in the .CRT$XCU section. +// Without CRT startup, these never run, so we walk the merged .CRT section +// in the PE and call each non-null function pointer. +void runStaticInitializers(HMODULE module) +{ + auto base = reinterpret_cast(module); + auto dos = reinterpret_cast(base); + auto nt = reinterpret_cast(base + dos->e_lfanew); + auto sec = IMAGE_FIRST_SECTION(nt); + + for (WORD i = 0; i < nt->FileHeader.NumberOfSections; ++i, ++sec) + { + if (memcmp(sec->Name, ".CRT", 4) == 0) + { + using InitFunc = void(__cdecl*)(); + auto funcs = reinterpret_cast(const_cast(base) + sec->VirtualAddress); + size_t count = sec->SizeOfRawData / sizeof(InitFunc); + for (size_t j = 0; j < count; ++j) + { + if (funcs[j]) + { + funcs[j](); + } + } + } + } +} + +std::string getWindowsError() +{ + DWORD error = GetLastError(); + if (error == 0) + { + return ""; + } + + LPSTR buffer = nullptr; + DWORD size = FormatMessageA( + FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + nullptr, + error, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + reinterpret_cast(&buffer), + 0, + nullptr); + + std::string message; + if (size > 0 && buffer) + { + message = std::string(buffer, size); + while (!message.empty() && (message.back() == '\n' || message.back() == '\r')) + { + message.pop_back(); + } + LocalFree(buffer); + } + else + { + message = "Unknown error (code: " + std::to_string(error) + ")"; + } + + return message; +} +} // anonymous 
namespace +#endif + +DynamicLibrary::DynamicLibrary() + : handle_(nullptr) +{} + +DynamicLibrary::~DynamicLibrary() +{ + unload(); +} + +DynamicLibrary::DynamicLibrary(DynamicLibrary&& other) noexcept + : handle_(other.handle_) + , last_error_(std::move(other.last_error_)) +{ + other.handle_ = nullptr; +} + +DynamicLibrary& DynamicLibrary::operator=(DynamicLibrary&& other) noexcept +{ + if (this != &other) + { + unload(); + handle_ = other.handle_; + last_error_ = std::move(other.last_error_); + other.handle_ = nullptr; + } + return *this; +} + +bool DynamicLibrary::load(const std::string& library_path) +{ + unload(); + +#ifdef _WIN32 + SetLastError(0); + handle_ = static_cast(LoadLibraryA(library_path.c_str())); + + if (!handle_) + { + last_error_ = getWindowsError(); + if (last_error_.empty()) + { + last_error_ = "Unknown LoadLibrary error"; + } + return false; + } + + // The DLL is linked with /NOENTRY (no CRT startup), so C++ static + // constructors (e.g. CUDA fatbin registration) haven't run yet. + runStaticInitializers(static_cast(handle_)); +#else + dlerror(); + handle_ = dlopen(library_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + + if (!handle_) + { + const char* error = dlerror(); + last_error_ = error ? 
error : "Unknown dlopen error"; + return false; + } +#endif + + last_error_.clear(); + return true; +} + +void* DynamicLibrary::getSymbol(const std::string& symbol_name) +{ + if (!handle_) + { + last_error_ = "Library not loaded"; + return nullptr; + } + +#ifdef _WIN32 + SetLastError(0); + void* symbol = reinterpret_cast(GetProcAddress(static_cast(handle_), symbol_name.c_str())); + + if (!symbol) + { + last_error_ = getWindowsError(); + if (last_error_.empty()) + { + last_error_ = "Symbol not found: " + symbol_name; + } + return nullptr; + } +#else + dlerror(); + void* symbol = dlsym(handle_, symbol_name.c_str()); + + const char* error = dlerror(); + if (error) + { + last_error_ = error; + return nullptr; + } +#endif + + last_error_.clear(); + return symbol; +} + +bool DynamicLibrary::isLoaded() const +{ + return handle_ != nullptr; +} + +std::string DynamicLibrary::getLastError() const +{ + return last_error_; +} + +void DynamicLibrary::unload() +{ + if (handle_) + { +#ifdef _WIN32 + FreeLibrary(static_cast(handle_)); +#else + dlclose(handle_); +#endif + handle_ = nullptr; + } + last_error_.clear(); +} +} // namespace hostjit diff --git a/c/parallel/test/CMakeLists.txt b/c/parallel/test/CMakeLists.txt index edf616ca8d4..dfde7a4a346 100644 --- a/c/parallel/test/CMakeLists.txt +++ b/c/parallel/test/CMakeLists.txt @@ -51,6 +51,23 @@ file( *.cpp ) +# test_freestanding_compiler is built separately (links hostjit_lib, not cccl.c.parallel) +list(REMOVE_ITEM test_srcs "test_freestanding_compiler.cpp") + foreach (test_src IN LISTS test_srcs) cccl_c_parallel_add_test(test_target "${test_src}") endforeach() + +if (CCCL_C_Parallel_ENABLE_HOSTJIT AND TARGET hostjit_lib) + cccl_add_executable( + cccl.c.parallel.test.freestanding_compiler + ADD_CTEST + NO_METATARGETS + DIALECT 20 + SOURCES test_freestanding_compiler.cpp + ) + target_link_libraries( + cccl.c.parallel.test.freestanding_compiler + PRIVATE hostjit_lib CUDA::cudart + ) +endif() diff --git 
a/c/parallel/test/test_freestanding_compiler.cpp b/c/parallel/test/test_freestanding_compiler.cpp new file mode 100644 index 00000000000..ee7271068a4 --- /dev/null +++ b/c/parallel/test/test_freestanding_compiler.cpp @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// test_freestanding_compiler.cpp +// +// Smoke-test for HostJIT compiler infrastructure (issue #7743). +// JIT-compiles a minimal host+device CUDA source, runs it, and verifies +// the result — without relying on any system CUDA headers at JIT time. +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include +#include + +static const char* k_source = R"( +#include + +#ifdef _WIN32 +# define EXPORT __declspec(dllexport) +#else +# define EXPORT __attribute__((visibility("default"))) +#endif + +__global__ void device_kernel(int* ptr) +{ + *ptr = 42; +} + +extern "C" EXPORT void host_entry(int* ptr) +{ + device_kernel<<<1, 1>>>(ptr); +} +)"; + +int main() +{ + // Detect Clang/CUDA configuration from the build environment + auto config = hostjit::detectDefaultConfig(); + + hostjit::JITCompiler compiler(config); + if (!compiler.compile(k_source)) + { + std::fprintf(stderr, "HostJIT compilation failed:\n%s\n", compiler.getLastError().c_str()); + return 1; + } + + auto host_fn = compiler.getFunction("host_entry"); + if (!host_fn) + { + std::fprintf(stderr, "Symbol 'host_entry' not found\n"); + return 1; + } + + int* d_ptr = nullptr; + cudaMalloc(&d_ptr, sizeof(int)); + + host_fn(d_ptr); + cudaDeviceSynchronize(); + + int result = 0; + cudaMemcpy(&result, d_ptr, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_ptr); + + assert(result == 42 && "device kernel did not write expected value"); + std::printf("freestanding compiler test passed (result=%d)\n", result); + return 0; +} diff --git a/ci/build_cccl_c_parallel_hostjit.sh b/ci/build_cccl_c_parallel_hostjit.sh new file mode 100755 index 
00000000000..8f3ea508bc3 --- /dev/null +++ b/ci/build_cccl_c_parallel_hostjit.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -euo pipefail + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +# libnvfatbin is required by hostjit but is not included in the base rapidsai devcontainer +# image. Detect the installed CTK version and install the matching package if missing. +if [[ "$(uname -s)" == "Linux" ]] && ! ldconfig -p 2>/dev/null | grep -q libnvfatbin; then + CTK_DEB_VER=$(nvcc --version 2>/dev/null \ + | grep -oP 'release \K[0-9]+\.[0-9]+' | tr '.' '-') + if [[ -n "$CTK_DEB_VER" ]]; then + echo "Installing libnvfatbin-dev-${CTK_DEB_VER}..." + sudo apt-get update -y + sudo apt-get install -y --no-install-recommends "libnvfatbin-dev-${CTK_DEB_VER}" + else + echo "WARNING: could not determine CTK version; skipping libnvfatbin install" + fi +fi + +PRESET="cccl-c-parallel-hostjit" + +CMAKE_OPTIONS="-DCMAKE_CXX_STANDARD=${CXX_STANDARD} -DCMAKE_CUDA_STANDARD=${CXX_STANDARD}" + +configure_and_build_preset "CCCL C Parallel Library (HostJIT)" "$PRESET" "$CMAKE_OPTIONS" + +print_time_summary diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 2604163c6c6..e4079a14ddb 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -63,6 +63,13 @@ workflows: - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13', 'msvc'], gpu: ['rtx2080', 'l4', 'h100']} # RTX PRO 6000 coverage (limited due to small number of runners): - {jobs: ['test'], project: 'cccl_c_parallel', ctk: '13.X', cxx: ['gcc13'], gpu: ['rtxpro6000']} + # c.parallel with HostJIT + # + # For now, this is a separate job run for Linux/CUDA13. + # Eventually we will want building with HostJIT to be the + # default, and will do it across the entire matrix. Currently + # blocked on libnvfatbin availability on Windows containers, and for CUDA <12.4. 
+ - {jobs: ['test'], project: 'cccl_c_parallel_hostjit', ctk: '13.X', cxx: ['gcc13'], gpu: 'rtx2080'} # c.experimental.stf-- pinned to gcc13 to match python - {jobs: ['test'], project: 'cccl_c_stf', ctk: '12.X', cxx: 'gcc13', gpu: ['rtx2080']} - {jobs: ['test'], project: 'cccl_c_stf', ctk: '13.X', cxx: 'gcc13', gpu: ['rtx2080', 'l4', 'h100']} @@ -559,6 +566,9 @@ projects: cccl_c_parallel: name: 'CCCL C Parallel' stds: [20] + cccl_c_parallel_hostjit: + name: 'CCCL C Parallel (HostJIT)' + stds: [20] cccl_c_stf: name: 'CCCL C CUDASTF' stds: [20] diff --git a/ci/project_files_and_dependencies.yaml b/ci/project_files_and_dependencies.yaml index 79273d8e092..287fbf3ffb4 100644 --- a/ci/project_files_and_dependencies.yaml +++ b/ci/project_files_and_dependencies.yaml @@ -114,6 +114,16 @@ projects: include_regexes: ["c/parallel/"] exclude_project_files: [cccl_c_parallel_public] + cccl_c_parallel_hostjit: + name: "CCCL C Parallel Library (HostJIT)" + matrix_project: "cccl_c_parallel_hostjit" + lite_dependencies: [libcudacxx_public] + full_dependencies: [cccl_c_parallel_public] + include_regexes: + - "c/parallel/src/hostjit/" + - "ci/build_cccl_c_parallel_hostjit\\.sh" + - "ci/test_cccl_c_parallel_hostjit\\.sh" + cccl_c_stf: name: "CCCL C CUDASTF Library" matrix_project: "cccl_c_stf" diff --git a/ci/test/inspect_changes/core_dirty.output b/ci/test/inspect_changes/core_dirty.output index 18eba6f5119..18fb9e417b8 100644 --- a/ci/test/inspect_changes/core_dirty.output +++ b/ci/test/inspect_changes/core_dirty.output @@ -1,2 +1,2 @@ -FULL_BUILD=libcudacxx cub thrust cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper nvrtcc tidy +FULL_BUILD=libcudacxx cub thrust cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper nvrtcc tidy LITE_BUILD= diff --git a/ci/test/inspect_changes/libcudacxx_both.output b/ci/test/inspect_changes/libcudacxx_both.output index 50134a6a731..f7a59149b12 100644 --- 
a/ci/test/inspect_changes/libcudacxx_both.output +++ b/ci/test/inspect_changes/libcudacxx_both.output @@ -1,2 +1,2 @@ FULL_BUILD=libcudacxx tidy -LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper +LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper diff --git a/ci/test/inspect_changes/libcudacxx_public_only.output b/ci/test/inspect_changes/libcudacxx_public_only.output index 50134a6a731..f7a59149b12 100644 --- a/ci/test/inspect_changes/libcudacxx_public_only.output +++ b/ci/test/inspect_changes/libcudacxx_public_only.output @@ -1,2 +1,2 @@ FULL_BUILD=libcudacxx tidy -LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper +LITE_BUILD=cub thrust cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper diff --git a/ci/test/inspect_changes/libcudacxx_thrust.output b/ci/test/inspect_changes/libcudacxx_thrust.output index 0c8e975985b..1a24f29859d 100644 --- a/ci/test/inspect_changes/libcudacxx_thrust.output +++ b/ci/test/inspect_changes/libcudacxx_thrust.output @@ -1,2 +1,2 @@ FULL_BUILD=libcudacxx thrust tidy -LITE_BUILD=cub cudax cccl_c_parallel cccl_c_stf python packaging stdpar nvbench_helper +LITE_BUILD=cub cudax cccl_c_parallel cccl_c_parallel_hostjit cccl_c_stf python packaging stdpar nvbench_helper diff --git a/ci/test_cccl_c_parallel_hostjit.sh b/ci/test_cccl_c_parallel_hostjit.sh new file mode 100755 index 00000000000..0180997344e --- /dev/null +++ b/ci/test_cccl_c_parallel_hostjit.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source "$(dirname "${BASH_SOURCE[0]}")/build_common.sh" + +print_environment_details + +./build_cccl_c_parallel_hostjit.sh "$@" + +PRESET="cccl-c-parallel-hostjit" + +test_preset "CCCL C Parallel Library (HostJIT)" ${PRESET} + +print_time_summary diff --git a/ci/windows/build_cccl_c_parallel_hostjit.ps1 b/ci/windows/build_cccl_c_parallel_hostjit.ps1 new file mode 
100644 index 00000000000..3d9c941f40f --- /dev/null +++ b/ci/windows/build_cccl_c_parallel_hostjit.ps1 @@ -0,0 +1,28 @@ +Param( + [Parameter(Mandatory = $false)] + [Alias("arch")] + [string]$CUDA_ARCH = "", + [Parameter(Mandatory = $false)] + [Alias("cmake-options")] + [string]$CMAKE_OPTIONS = "" +) + +$ErrorActionPreference = "Stop" + +$CURRENT_PATH = Split-Path $pwd -leaf +If($CURRENT_PATH -ne "ci") { + Write-Host "Moving to ci folder" + pushd "$PSScriptRoot/.." +} + +Remove-Module -Name build_common -ErrorAction SilentlyContinue +Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList @(20, $CUDA_ARCH, $CMAKE_OPTIONS) + +$PRESET = "cccl-c-parallel-hostjit" +$LOCAL_CMAKE_OPTIONS = "" + +configure_and_build_preset "CCCL C Parallel (HostJIT)" $PRESET $LOCAL_CMAKE_OPTIONS + +If($CURRENT_PATH -ne "ci") { + popd +} diff --git a/ci/windows/test_cccl_c_parallel_hostjit.ps1 b/ci/windows/test_cccl_c_parallel_hostjit.ps1 new file mode 100644 index 00000000000..30c3d675390 --- /dev/null +++ b/ci/windows/test_cccl_c_parallel_hostjit.ps1 @@ -0,0 +1,31 @@ +Param( + [Parameter(Mandatory = $false)] + [Alias("arch")] + [string]$CUDA_ARCH = "", + [Parameter(Mandatory = $false)] + [Alias("cmake-options")] + [string]$CMAKE_OPTIONS = "" +) + +$ErrorActionPreference = "Stop" + +$CURRENT_PATH = Split-Path $pwd -leaf +If($CURRENT_PATH -ne "ci") { + Write-Host "Moving to ci folder" + pushd "$PSScriptRoot/.." +} + +# Build first +$buildCmd = "$PSScriptRoot/build_cccl_c_parallel_hostjit.ps1 -arch '$CUDA_ARCH' -cmake-options '$CMAKE_OPTIONS'" +Write-Host "Running: $buildCmd" +Invoke-Expression $buildCmd + +Remove-Module -Name build_common -ErrorAction SilentlyContinue +Import-Module -Name "$PSScriptRoot/build_common.psm1" -ArgumentList @(20, $CUDA_ARCH, $CMAKE_OPTIONS) + +$PRESET = "cccl-c-parallel-hostjit" +test_preset "CCCL C Parallel (HostJIT)" "$PRESET" + +If($CURRENT_PATH -ne "ci") { + popd +}