diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index cb5ab11195..05c32fbc8f 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -62,101 +62,121 @@ static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *messa } //------------------------------------------------------------------------------ -// Compile CUDA kernel +// Build array of JIT flags //------------------------------------------------------------------------------ -using std::ifstream; -using std::ofstream; -using std::ostringstream; - -static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module, - const CeedInt num_defines, va_list args) { - size_t ptx_size; - char *ptx; - const int num_opts = 4; - CeedInt num_jit_source_dirs = 0, num_jit_defines = 0; - const char **opts; - nvrtcProgram prog; - struct cudaDeviceProp prop; - Ceed_Cuda *ceed_data; - - cudaFree(0); // Make sure a Context exists for nvrtc - - std::ostringstream code; - bool using_clang; - - CeedCallBackend(CeedGetIsClang(ceed, &using_clang)); - - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, - using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)" - : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment " - "variable GPU_CLANG=1"); +static inline int CeedJitGetOpts_Cuda(Ceed ceed, const char ***opts, int *num_opts) { + int opts_count = 4; - // Get kernel specific options, such as kernel constants - if (num_defines > 0) { - char *name; - int val; - - for (int i = 0; i < num_defines; i++) { - name = va_arg(args, char *); - val = va_arg(args, int); - code << "#define " << name << " " << val << "\n"; - } - } - - // Standard libCEED definitions for CUDA backends - code << "#include \n\n"; + // Standard options + CeedCallBackend(CeedCalloc(opts_count, opts)); + CeedCallBackend(CeedStringAllocCopy("-default-device", (char **)&(*opts)[0])); + { + Ceed_Cuda *ceed_data; + struct cudaDeviceProp prop; - // Non-macro options - CeedCallBackend(CeedCalloc(num_opts, &opts)); - opts[0] = "-default-device"; - CeedCallBackend(CeedGetData(ceed, &ceed_data)); - CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id)); - std::string arch_arg = + CeedCallBackend(CeedGetData(ceed, &ceed_data)); + CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id)); + std::string arch_arg = #if CUDA_VERSION >= 11010 - // NVRTC used to support only virtual architectures through the option - // -arch, since it was only emitting PTX. It will now support actual - // architectures as well to emit SASS. - // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation - "-arch=sm_" + // NVRTC used to support only virtual architectures through the option + // -arch, since it was only emitting PTX. It will now support actual + // architectures as well to emit SASS. + // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation + "-arch=sm_" #else - "-arch=compute_" + "-arch=compute_" #endif - + std::to_string(prop.major) + std::to_string(prop.minor); - opts[1] = arch_arg.c_str(); - opts[2] = "-Dint32_t=int"; - opts[3] = "-DCEED_RUNNING_JIT_PASS=1"; + + std::to_string(prop.major) + std::to_string(prop.minor); + + CeedCallBackend(CeedStringAllocCopy(arch_arg.c_str(), (char **)&(*opts)[1])); + } + CeedCallBackend(CeedStringAllocCopy("-Dint32_t=int", (char **)&(*opts)[2])); + CeedCallBackend(CeedStringAllocCopy("-DCEED_RUNNING_JIT_PASS=1", (char **)&(*opts)[3])); + // Additional include dirs { const char **jit_source_dirs; + CeedInt num_jit_source_dirs; CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs)); - CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts)); + CeedCallBackend(CeedRealloc(opts_count + num_jit_source_dirs, opts)); for (CeedInt i = 0; i < num_jit_source_dirs; i++) { std::ostringstream include_dir_arg; include_dir_arg << "-I" << jit_source_dirs[i]; - CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i])); + CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&(*opts)[opts_count + i])); } CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs)); + opts_count += num_jit_source_dirs; } + // User defines { const char **jit_defines; + CeedInt num_jit_defines; CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines)); - CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts)); + CeedCallBackend(CeedRealloc(opts_count + num_jit_defines, opts)); for (CeedInt i = 0; i < num_jit_defines; i++) { std::ostringstream define_arg; define_arg << "-D" << jit_defines[i]; - CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i])); + CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&(*opts)[opts_count + i])); } CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines)); + opts_count += num_jit_defines; } + *num_opts = opts_count; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Compile CUDA kernel +//------------------------------------------------------------------------------ +using std::ifstream; +using std::ofstream; +using std::ostringstream; + +static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module, + const CeedInt num_defines, va_list args) { + bool using_clang; + size_t ptx_size; + char *ptx; + const char **opts; + int num_opts; + nvrtcProgram prog; + std::ostringstream code; + + // Make sure a Context exists for nvrtc + cudaFree(0); + + CeedCallBackend(CeedGetIsClang(ceed, &using_clang)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, + using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)" + : "Compiling CUDA with NVRTC backend (without Rust QFunction support)." + "\nTo use the Clang backend, set the environment variable GPU_CLANG=1"); + + // Get kernel specific options, such as kernel constants + if (num_defines > 0) { + char *name; + int val; + + for (int i = 0; i < num_defines; i++) { + name = va_arg(args, char *); + val = va_arg(args, int); + code << "#define " << name << " " << val << "\n\n"; + } + } + + // Standard libCEED definitions for CUDA backends + code << "#include \n\n"; // Add string source argument provided in call code << source; + // Get compile options + CeedCallBackend(CeedJitGetOpts_Cuda(ceed, &opts, &num_opts)); + // Compile kernel CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n"); CeedDebug(ceed, "Source:\n%s\n", code.str().c_str()); @@ -168,23 +188,16 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ if (CeedDebugFlag(ceed)) { // LCOV_EXCL_START CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n"); - for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) { - CeedDebug(ceed, "Option %d: %s", i, opts[i]); - } + for (CeedInt i = 0; i < num_opts; i++) CeedDebug(ceed, "Option %d: %s", i, opts[i]); CeedDebug(ceed, ""); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n"); // LCOV_EXCL_STOP } + nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts); - nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts); - - for (CeedInt i = 0; i < num_jit_source_dirs; i++) { - CeedCallBackend(CeedFree(&opts[num_opts + i])); - } - for (CeedInt i = 0; i < num_jit_defines; i++) { - CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); - } + for (CeedInt i = 0; i < num_opts; i++) CeedCallBackend(CeedFree(&opts[i])); CeedCallBackend(CeedFree(&opts)); + *is_compile_good = result == NVRTC_SUCCESS; if (!*is_compile_good) { // LCOV_EXCL_START @@ -217,13 +230,13 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx)); #endif CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog)); - CeedCallCuda(ceed, cuModuleLoadData(module, ptx)); CeedCallBackend(CeedFree(&ptx)); return CEED_ERROR_SUCCESS; } else { srand(time(NULL)); - const int build_id = rand(); + const int build_id = rand(); + struct cudaDeviceProp prop; // Create temp dir if needed { @@ -283,6 +296,9 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ } // Get Clang version + Ceed_Cuda *ceed_data; + + CeedCallBackend(CeedGetData(ceed, &ceed_data)); bool use_llvm_version = ceed_data->use_llvm_version; int llvm_version = ceed_data->llvm_version; @@ -324,6 +340,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ } // Compile wrapper kernel + CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id)); command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) + "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll "; diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index 6cd147687e..c00f738aeb 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -32,103 +32,123 @@ } while (0) //------------------------------------------------------------------------------ -// Compile HIP kernel +// Build array of JIT flags //------------------------------------------------------------------------------ -static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module, - const CeedInt num_defines, va_list args) { - size_t ptx_size; - char *ptx; - CeedInt num_jit_source_dirs = 0, num_jit_defines = 0; - const char **opts; - int runtime_version; - hiprtcProgram prog; - struct hipDeviceProp_t prop; - Ceed_Hip *ceed_data; - - hipFree(0); // Make sure a Context exists for hiprtc - - std::ostringstream code; - - // Add hip runtime include statement for generation if runtime < 40400000 (implies ROCm < 4.5) - CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version)); - if (runtime_version < 40400000) { - code << "\n#include \n"; - } - // With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header) - else { - code << "#include \n"; - code << "#define __forceinline__ inline __attribute__((always_inline))\n"; - code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n"; - } - - // Kernel specific options, such as kernel constants - if (num_defines > 0) { - char *name; - int val; - - for (int i = 0; i < num_defines; i++) { - name = va_arg(args, char *); - val = va_arg(args, int); - code << "#define " << name << " " << val << "\n"; - } - } - - // Standard libCEED definitions for HIP backends - code << "#include \n\n"; - - // Non-macro options +static inline int CeedJitGetOpts_Hip(Ceed ceed, const char ***opts, int *num_opts) { + // Standard options #if CEED_HIP_USE_CHIPSTAR - const int num_opts = 1; + int opts_count = 1; - CeedCallBackend(CeedCalloc(num_opts, &opts)); - opts[0] = "-DCEED_RUNNING_JIT_PASS=1"; + CeedCallBackend(CeedCalloc(opts_count, opts)); + CeedCallBackend(CeedStringAllocCopy("-DCEED_RUNNING_JIT_PASS=1", (char **)&(*opts)[0])); #else - const int num_opts = 4; + int opts_count = 4; - CeedCallBackend(CeedCalloc(num_opts, &opts)); - opts[0] = "-default-device"; + CeedCallBackend(CeedCalloc(opts_count, opts)); + CeedCallBackend(CeedStringAllocCopy("-default-device", (char **)&(*opts)[0])); { + Ceed_Hip *ceed_data; + struct hipDeviceProp_t prop; + CeedCallBackend(CeedGetData(ceed, (void **)&ceed_data)); CeedCallHip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id)); std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName); - opts[1] = arch_arg.c_str(); + CeedCallBackend(CeedStringAllocCopy(arch_arg.c_str(), (char **)&(*opts)[1])); } - opts[2] = "-munsafe-fp-atomics"; - opts[3] = "-DCEED_RUNNING_JIT_PASS=1"; + CeedCallBackend(CeedStringAllocCopy("-munsafe-fp-atomics", (char **)&(*opts)[2])); + CeedCallBackend(CeedStringAllocCopy("-DCEED_RUNNING_JIT_PASS=1", (char **)&(*opts)[3])); #endif + // Additional include dirs { const char **jit_source_dirs; + CeedInt num_jit_source_dirs; CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs)); - CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts)); + CeedCallBackend(CeedRealloc(opts_count + num_jit_source_dirs, opts)); for (CeedInt i = 0; i < num_jit_source_dirs; i++) { std::ostringstream include_dir_arg; include_dir_arg << "-I" << jit_source_dirs[i]; - CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i])); + CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&(*opts)[opts_count + i])); } CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs)); + opts_count += num_jit_source_dirs; } + // User defines { const char **jit_defines; + CeedInt num_jit_defines; CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines)); - CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts)); + CeedCallBackend(CeedRealloc(opts_count + num_jit_defines, opts)); for (CeedInt i = 0; i < num_jit_defines; i++) { std::ostringstream define_arg; define_arg << "-D" << jit_defines[i]; - CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i])); + CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&(*opts)[opts_count + i])); } CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines)); + opts_count += num_jit_defines; } + *num_opts = opts_count; + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ +// Compile HIP kernel +//------------------------------------------------------------------------------ +static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module, + const CeedInt num_defines, va_list args) { + size_t ptx_size; + char *ptx; + const char **opts; + int num_opts; + hiprtcProgram prog; + std::ostringstream code; + + // Make sure a Context exists for hiprtc + hipFree(0); + + // Add hip runtime include statement for generation if runtime < 40400000 (implies ROCm < 4.5) + { + int runtime_version; + + CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version)); + if (runtime_version < 40400000) { + code << "#include \n\n"; + } + // With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header) + else { + code << "#include \n"; + code << "#define __forceinline__ inline __attribute__((always_inline))\n"; + code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n"; + } + } + + // Kernel specific options, such as kernel constants + if (num_defines > 0) { + char *name; + int val; + + for (int i = 0; i < num_defines; i++) { + name = va_arg(args, char *); + val = va_arg(args, int); + code << "#define " << name << " " << val << "\n"; + } + } + + // Standard libCEED definitions for HIP backends + code << "#include \n\n"; // Add string source argument provided in call code << source; + // Get compile options + CeedCallBackend(CeedJitGetOpts_Hip(ceed, &opts, &num_opts)); + // Create Program CeedCallHiprtc(ceed, hiprtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL)); @@ -139,22 +159,16 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e if (CeedDebugFlag(ceed)) { // LCOV_EXCL_START CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n"); - for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) { - CeedDebug(ceed, "Option %d: %s", i, opts[i]); - } + for (CeedInt i = 0; i < num_opts; i++) CeedDebug(ceed, "Option %d: %s", i, opts[i]); CeedDebug(ceed, ""); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n"); // LCOV_EXCL_STOP } - hiprtcResult result = hiprtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts); + hiprtcResult result = hiprtcCompileProgram(prog, num_opts, opts); - for (CeedInt i = 0; i < num_jit_source_dirs; i++) { - CeedCallBackend(CeedFree(&opts[num_opts + i])); - } - for (CeedInt i = 0; i < num_jit_defines; i++) { - CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); - } + for (CeedInt i = 0; i < num_opts; i++) CeedCallBackend(CeedFree(&opts[i])); CeedCallBackend(CeedFree(&opts)); + *is_compile_good = result == HIPRTC_SUCCESS; if (!*is_compile_good) { // LCOV_EXCL_START @@ -181,7 +195,6 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e CeedCallBackend(CeedMalloc(ptx_size, &ptx)); CeedCallHiprtc(ceed, hiprtcGetCode(prog, ptx)); CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog)); - CeedCallHip(ceed, hipModuleLoadData(module, ptx)); CeedCallBackend(CeedFree(&ptx)); return CEED_ERROR_SUCCESS; diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index f939ca940f..a72744d9a5 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -36,7 +36,7 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc oss << "#define " << name << " " << value << "\n"; } - // libCeed definitions for Sycl Backends + // libCEED definitions for Sycl Backends CeedCallBackend(CeedGetJitAbsolutePath(ceed, sycl_jith_path, &jit_defs_path)); { char *source; diff --git a/doc/sphinx/source/libCEEDdev.md b/doc/sphinx/source/libCEEDdev.md index e311171a79..a2e3297fb0 100644 --- a/doc/sphinx/source/libCEEDdev.md +++ b/doc/sphinx/source/libCEEDdev.md @@ -21,15 +21,15 @@ Once the user facing API and the default implementation are in place and verifie ## Backend Inheritance -A Ceed backend is not required to implement all libCeed objects or {ref}`CeedOperator` methods. +A Ceed backend is not required to implement all libCEED objects or {ref}`CeedOperator` methods. There are three mechanisms by which a Ceed backend can inherit implementations from another Ceed backend. 1. Delegation - Developers may use {c:func}`CeedSetDelegate` to set a general delegate {ref}`Ceed` object. - This delegate {ref}`Ceed` will provide the implementation of any libCeed objects that parent backend does not implement. + This delegate {ref}`Ceed` will provide the implementation of any libCEED objects that parent backend does not implement. For example, the `/cpu/self/xsmm/serial` backend implements the `CeedTensorContract` object itself but delegates all other functionality to the `/cpu/self/opt/serial` backend. 2. Object delegation - Developers may use {c:func}`CeedSetObjectDelegate` to set a delegate {ref}`Ceed` object for a specific libCEED object. - This delegate {ref}`Ceed` will only provide the implementation of that specific libCeed object for the parent backend. + This delegate {ref}`Ceed` will only provide the implementation of that specific libCEED object for the parent backend. Object delegation has higher precedence than delegation. 3. Operator fallback - Developers may use {c:func}`CeedSetOperatorFallbackCeed` to set a {ref}`Ceed` object to provide any unimplemented {ref}`CeedOperator` methods that support preconditioning, such as {c:func}`CeedOperatorLinearAssemble`. diff --git a/interface/ceed.c b/interface/ceed.c index 4adfdad887..2088980c8e 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -344,6 +344,7 @@ int CeedReallocArray(size_t n, size_t unit, void *p) { **/ int CeedStringAllocCopy(const char *source, char **copy) { size_t len = strlen(source); + CeedCall(CeedCalloc(len + 1, copy)); memcpy(*copy, source, len); return CEED_ERROR_SUCCESS;