@@ -62,101 +62,121 @@ static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *messa
6262}
6363
6464// ------------------------------------------------------------------------------
65- // Compile CUDA kernel
65+ // Build array of JIT flags
6666// ------------------------------------------------------------------------------
67- using std::ifstream;
68- using std::ofstream;
69- using std::ostringstream;
70-
71- static int CeedCompileCore_Cuda (Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module ,
72- const CeedInt num_defines, va_list args) {
73- size_t ptx_size;
74- char *ptx;
75- const int num_opts = 4 ;
76- CeedInt num_jit_source_dirs = 0 , num_jit_defines = 0 ;
77- const char **opts;
78- nvrtcProgram prog;
79- struct cudaDeviceProp prop;
80- Ceed_Cuda *ceed_data;
81-
82- cudaFree (0 ); // Make sure a Context exists for nvrtc
83-
84- std::ostringstream code;
85- bool using_clang;
86-
87- CeedCallBackend (CeedGetIsClang (ceed, &using_clang));
88-
89- CeedDebug256 (ceed, CEED_DEBUG_COLOR_SUCCESS,
90- using_clang ? " Compiling CUDA with Clang backend (with Rust QFunction support)"
91- : " Compiling CUDA with NVRTC backend (without Rust QFunction support).\n To use the Clang backend, set the environment "
92- " variable GPU_CLANG=1" );
67+ static inline int CeedJitGetOpts_Cuda (Ceed ceed, const char ***opts, int *num_opts) {
68+ int opts_count = 4 ;
9369
94- // Get kernel specific options, such as kernel constants
95- if (num_defines > 0 ) {
96- char *name;
97- int val;
98-
99- for (int i = 0 ; i < num_defines; i++) {
100- name = va_arg (args, char *);
101- val = va_arg (args, int );
102- code << " #define " << name << " " << val << " \n " ;
103- }
104- }
105-
106- // Standard libCEED definitions for CUDA backends
107- code << " #include <ceed/jit-source/cuda/cuda-jit.h>\n\n " ;
70+ // Standard options
71+ CeedCallBackend (CeedCalloc (opts_count, opts));
72+ CeedCallBackend (CeedStringAllocCopy (" -default-device" , (char **)&(*opts)[0 ]));
73+ {
74+ Ceed_Cuda *ceed_data;
75+ struct cudaDeviceProp prop;
10876
109- // Non-macro options
110- CeedCallBackend (CeedCalloc (num_opts, &opts));
111- opts[0 ] = " -default-device" ;
112- CeedCallBackend (CeedGetData (ceed, &ceed_data));
113- CeedCallCuda (ceed, cudaGetDeviceProperties (&prop, ceed_data->device_id ));
114- std::string arch_arg =
77+ CeedCallBackend (CeedGetData (ceed, &ceed_data));
78+ CeedCallCuda (ceed, cudaGetDeviceProperties (&prop, ceed_data->device_id ));
79+ std::string arch_arg =
11580#if CUDA_VERSION >= 11010
116- // NVRTC used to support only virtual architectures through the option
117- // -arch, since it was only emitting PTX. It will now support actual
118- // architectures as well to emit SASS.
119- // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
120- " -arch=sm_"
81+ // NVRTC used to support only virtual architectures through the option
82+ // -arch, since it was only emitting PTX. It will now support actual
83+ // architectures as well to emit SASS.
84+ // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
85+ " -arch=sm_"
12186#else
122- " -arch=compute_"
87+ " -arch=compute_"
12388#endif
124- + std::to_string (prop.major ) + std::to_string (prop.minor );
125- opts[1 ] = arch_arg.c_str ();
126- opts[2 ] = " -Dint32_t=int" ;
127- opts[3 ] = " -DCEED_RUNNING_JIT_PASS=1" ;
89+ + std::to_string (prop.major ) + std::to_string (prop.minor );
90+
91+ CeedCallBackend (CeedStringAllocCopy (arch_arg.c_str (), (char **)&(*opts)[1 ]));
92+ }
93+ CeedCallBackend (CeedStringAllocCopy (" -Dint32_t=int" , (char **)&(*opts)[2 ]));
94+ CeedCallBackend (CeedStringAllocCopy (" -DCEED_RUNNING_JIT_PASS=1" , (char **)&(*opts)[3 ]));
95+
12896 // Additional include dirs
12997 {
13098 const char **jit_source_dirs;
99+ CeedInt num_jit_source_dirs;
131100
132101 CeedCallBackend (CeedGetJitSourceRoots (ceed, &num_jit_source_dirs, &jit_source_dirs));
133- CeedCallBackend (CeedRealloc (num_opts + num_jit_source_dirs, & opts));
102+ CeedCallBackend (CeedRealloc (opts_count + num_jit_source_dirs, opts));
134103 for (CeedInt i = 0 ; i < num_jit_source_dirs; i++) {
135104 std::ostringstream include_dir_arg;
136105
137106 include_dir_arg << " -I" << jit_source_dirs[i];
138- CeedCallBackend (CeedStringAllocCopy (include_dir_arg.str ().c_str (), (char **)&opts[num_opts + i]));
107+ CeedCallBackend (CeedStringAllocCopy (include_dir_arg.str ().c_str (), (char **)&(* opts)[opts_count + i]));
139108 }
140109 CeedCallBackend (CeedRestoreJitSourceRoots (ceed, &jit_source_dirs));
110+ opts_count += num_jit_source_dirs;
141111 }
112+
142113 // User defines
143114 {
144115 const char **jit_defines;
116+ CeedInt num_jit_defines;
145117
146118 CeedCallBackend (CeedGetJitDefines (ceed, &num_jit_defines, &jit_defines));
147- CeedCallBackend (CeedRealloc (num_opts + num_jit_source_dirs + num_jit_defines, & opts));
119+ CeedCallBackend (CeedRealloc (opts_count + num_jit_defines, opts));
148120 for (CeedInt i = 0 ; i < num_jit_defines; i++) {
149121 std::ostringstream define_arg;
150122
151123 define_arg << " -D" << jit_defines[i];
152- CeedCallBackend (CeedStringAllocCopy (define_arg.str ().c_str (), (char **)&opts[num_opts + num_jit_source_dirs + i]));
124+ CeedCallBackend (CeedStringAllocCopy (define_arg.str ().c_str (), (char **)&(* opts)[opts_count + i]));
153125 }
154126 CeedCallBackend (CeedRestoreJitDefines (ceed, &jit_defines));
127+ opts_count += num_jit_defines;
155128 }
129+ *num_opts = opts_count;
130+ return CEED_ERROR_SUCCESS;
131+ }
132+
133+ // ------------------------------------------------------------------------------
134+ // Compile CUDA kernel
135+ // ------------------------------------------------------------------------------
136+ using std::ifstream;
137+ using std::ofstream;
138+ using std::ostringstream;
139+
140+ static int CeedCompileCore_Cuda (Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module ,
141+ const CeedInt num_defines, va_list args) {
142+ bool using_clang;
143+ size_t ptx_size;
144+ char *ptx;
145+ const char **opts;
146+ int num_opts;
147+ nvrtcProgram prog;
148+ std::ostringstream code;
149+
150+ // Make sure a Context exists for nvrtc
151+ cudaFree (0 );
152+
153+ CeedCallBackend (CeedGetIsClang (ceed, &using_clang));
154+ CeedDebug256 (ceed, CEED_DEBUG_COLOR_SUCCESS,
155+ using_clang ? " Compiling CUDA with Clang backend (with Rust QFunction support)"
156+ : " Compiling CUDA with NVRTC backend (without Rust QFunction support)."
157+ " \n To use the Clang backend, set the environment variable GPU_CLANG=1" );
158+
159+ // Get kernel specific options, such as kernel constants
160+ if (num_defines > 0 ) {
161+ char *name;
162+ int val;
163+
164+ for (int i = 0 ; i < num_defines; i++) {
165+ name = va_arg (args, char *);
166+ val = va_arg (args, int );
167+ code << " #define " << name << " " << val << " \n\n " ;
168+ }
169+ }
170+
171+ // Standard libCEED definitions for CUDA backends
172+ code << " #include <ceed/jit-source/cuda/cuda-jit.h>\n\n " ;
156173
157174 // Add string source argument provided in call
158175 code << source;
159176
177+ // Get compile options
178+ CeedCallBackend (CeedJitGetOpts_Cuda (ceed, &opts, &num_opts));
179+
160180 // Compile kernel
161181 CeedDebug256 (ceed, CEED_DEBUG_COLOR_SUCCESS, " ---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n " );
162182 CeedDebug (ceed, " Source:\n %s\n " , code.str ().c_str ());
@@ -168,23 +188,16 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
168188 if (CeedDebugFlag (ceed)) {
169189 // LCOV_EXCL_START
170190 CeedDebug256 (ceed, CEED_DEBUG_COLOR_SUCCESS, " ---------- JiT COMPILER OPTIONS ----------\n " );
171- for (CeedInt i = 0 ; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
172- CeedDebug (ceed, " Option %d: %s" , i, opts[i]);
173- }
191+ for (CeedInt i = 0 ; i < num_opts; i++) CeedDebug (ceed, " Option %d: %s" , i, opts[i]);
174192 CeedDebug (ceed, " " );
175193 CeedDebug256 (ceed, CEED_DEBUG_COLOR_SUCCESS, " ---------- END OF JiT COMPILER OPTIONS ----------\n " );
176194 // LCOV_EXCL_STOP
177195 }
196+ nvrtcResult result = nvrtcCompileProgram (prog, num_opts, opts);
178197
179- nvrtcResult result = nvrtcCompileProgram (prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
180-
181- for (CeedInt i = 0 ; i < num_jit_source_dirs; i++) {
182- CeedCallBackend (CeedFree (&opts[num_opts + i]));
183- }
184- for (CeedInt i = 0 ; i < num_jit_defines; i++) {
185- CeedCallBackend (CeedFree (&opts[num_opts + num_jit_source_dirs + i]));
186- }
198+ for (CeedInt i = 0 ; i < num_opts; i++) CeedCallBackend (CeedFree (&opts[i]));
187199 CeedCallBackend (CeedFree (&opts));
200+
188201 *is_compile_good = result == NVRTC_SUCCESS;
189202 if (!*is_compile_good) {
190203 // LCOV_EXCL_START
@@ -217,13 +230,13 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
217230 CeedCallNvrtc (ceed, nvrtcGetPTX (prog, ptx));
218231#endif
219232 CeedCallNvrtc (ceed, nvrtcDestroyProgram (&prog));
220-
221233 CeedCallCuda (ceed, cuModuleLoadData (module , ptx));
222234 CeedCallBackend (CeedFree (&ptx));
223235 return CEED_ERROR_SUCCESS;
224236 } else {
225237 srand (time (NULL ));
226- const int build_id = rand ();
238+ const int build_id = rand ();
239+ struct cudaDeviceProp prop;
227240
228241 // Create temp dir if needed
229242 {
@@ -283,6 +296,9 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
283296 }
284297
285298 // Get Clang version
299+ Ceed_Cuda *ceed_data;
300+
301+ CeedCallBackend (CeedGetData (ceed, &ceed_data));
286302 bool use_llvm_version = ceed_data->use_llvm_version ;
287303 int llvm_version = ceed_data->llvm_version ;
288304
@@ -324,6 +340,7 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
324340 }
325341
326342 // Compile wrapper kernel
343+ CeedCallCuda (ceed, cudaGetDeviceProperties (&prop, ceed_data->device_id ));
327344 command = " clang++" + (use_llvm_version ? (std::string (" -" ) + std::to_string (llvm_version)) : " " ) + " -flto=thin --cuda-gpu-arch=sm_" +
328345 std::to_string (prop.major ) + std::to_string (prop.minor ) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string (build_id) +
329346 " _0_source.cu -o temp/kernel_" + std::to_string (build_id) + " _1_wrapped.ll " ;
0 commit comments