From fe9c17097c1a585ad208da84eff645e62ec1ae4c Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Fri, 13 Sep 2024 17:27:34 -0500 Subject: [PATCH 01/14] Add CUDA wrapper capability. --- ffcx/codegeneration/C/integrals.py | 10 ++++- ffcx/codegeneration/C/integrals_template.py | 46 +++++++++++++++++++++ ffcx/codegeneration/jit.py | 3 ++ ffcx/codegeneration/ufcx.h | 22 ++++++++++ ffcx/options.py | 1 + 5 files changed, 81 insertions(+), 1 deletion(-) diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py index 1115e3731..4c37efc76 100644 --- a/ffcx/codegeneration/C/integrals.py +++ b/ffcx/codegeneration/C/integrals.py @@ -69,6 +69,13 @@ def generator(ir: IntegralIR, options): else: code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL," code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL," + if options.get("cuda"): + code["tabulate_tensor_cuda"] = ( + f".tabulate_tensor_cuda = tabulate_tensor_cuda_{factory_name}" + ) + else: + code["tabulate_tensor_cuda"] = "" + np_scalar_type = np.dtype(options["scalar_type"]).name code[f"tabulate_tensor_{np_scalar_type}"] = ( f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name}," @@ -76,7 +83,7 @@ def generator(ir: IntegralIR, options): element_hash = 0 if ir.coordinate_element_hash is None else ir.coordinate_element_hash - implementation = ufcx_integrals.factory.format( + implementation = ufcx_integrals.get_factory(options).format( factory_name=factory_name, enabled_coefficients=code["enabled_coefficients"], enabled_coefficients_init=code["enabled_coefficients_init"], @@ -89,6 +96,7 @@ def generator(ir: IntegralIR, options): tabulate_tensor_float64=code["tabulate_tensor_float64"], tabulate_tensor_complex64=code["tabulate_tensor_complex64"], tabulate_tensor_complex128=code["tabulate_tensor_complex128"], + tabulate_tensor_cuda=code["tabulate_tensor_cuda"], ) return declaration, implementation diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py index 2bb1568ec..87cd5228c 100644 --- a/ffcx/codegeneration/C/integrals_template.py +++ b/ffcx/codegeneration/C/integrals_template.py @@ -30,9 +30,55 @@ {tabulate_tensor_float64} {tabulate_tensor_complex64} {tabulate_tensor_complex128} + {tabulate_tensor_cuda} .needs_facet_permutations = {needs_facet_permutations}, .coordinate_element_hash = {coordinate_element_hash}, }}; // End of code for integral {factory_name} """ + +cuda_wrapper = """ + +// Begin CUDA wrapper for integral {factory_name} +void tabulate_tensor_cuda_{factory_name}(int* num_program_headers, + const char*** program_headers, + const char*** program_include_names, + const char** out_program_src, + const char** tabulate_tensor_function_name) +{{ + const char* program_src = "" + "#define alignas(x)\\n" + "#define restrict __restrict__\\n" + "\\n" + "typedef unsigned char uint8_t;\\n" + "typedef unsigned int uint32_t;\\n" + "typedef double ufc_scalar_t;\\n" + "\\n" + "extern \\"C\\" __global__\\n" + "void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n" + " const {scalar_type}* restrict w,\\n" + " const {scalar_type}* restrict c,\\n" + " const {geom_type}* restrict coordinate_dofs,\\n" + " const int* restrict entity_local_index,\\n" + " const uint8_t* restrict quadrature_permutation\\n" + " )\\n" + "{{\\n" + "{tabulate_tensor_quoted}\\n" + "}}"; + *num_program_headers = 0; + *program_headers = NULL; + *program_include_names = NULL; + *out_program_src = program_src; + *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}"; +}} + +// End CUDA wrapper for integral {factory_name} + +""" + +def get_factory(options): + if options.get("cuda"): + return cuda_wrapper + factory + else: + return factory diff --git a/ffcx/codegeneration/jit.py b/ffcx/codegeneration/jit.py index 6eb5dbb8f..94bef517f 100644 --- a/ffcx/codegeneration/jit.py +++ b/ffcx/codegeneration/jit.py @@ -68,6 +68,9 @@ UFC_INTEGRAL_DECL += "\n".join( re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL) ) +UFC_INTEGRAL_DECL += "\n".join( + re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda\).*?\);", ufcx_h, re.DOTALL) +) UFC_INTEGRAL_DECL += "\n".join( re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL) diff --git a/ffcx/codegeneration/ufcx.h b/ffcx/codegeneration/ufcx.h index e1dd838d1..a4bc8ae21 100644 --- a/ffcx/codegeneration/ufcx.h +++ b/ffcx/codegeneration/ufcx.h @@ -125,6 +125,27 @@ extern "C" const uint8_t* restrict quadrature_permutation); #endif // __STDC_NO_COMPLEX__ + /// Return CUDA C++ source code for the ufc_tabulate_tensor kernel + /// + /// @param[out] num_program_headers + /// The number of headers required by the program + /// @param[out] program_headers + /// Entire contents of each header file + /// @param[out] program_include_names + /// Names of each header file + /// @param[out] program_src + /// CUDA C++ source code for the program containing the + /// tabulate_tensor function. + /// @param[out] tabulate_tensor_function_name + /// The name of the device-side function. + /// + typedef void(ufcx_tabulate_tensor_cuda)( + int* num_program_headers, + const char*** program_headers, + const char*** program_include_names, + const char** program_src, + const char** tabulate_tensor_function_name); + typedef struct ufcx_integral { const bool* enabled_coefficients; @@ -134,6 +155,7 @@ extern "C" ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64; ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128; #endif // __STDC_NO_COMPLEX__ + ufcx_tabulate_tensor_cuda* tabulate_tensor_cuda; bool needs_facet_permutations; /// Get the hash of the coordinate element associated with the geometry of the mesh. diff --git a/ffcx/options.py b/ffcx/options.py index 536f02a35..252ea74a8 100644 --- a/ffcx/options.py +++ b/ffcx/options.py @@ -20,6 +20,7 @@ logger = logging.getLogger("ffcx") FFCX_DEFAULT_OPTIONS = { + "cuda": (bool, False, "generate CUDA wrapped versions of tabulate tensor functions", None), "epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None), "scalar_type": ( str, From 3369f070c58ea2d372a1e4aef170e45597601612 Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Wed, 18 Sep 2024 19:18:45 -0500 Subject: [PATCH 02/14] Rename tabulate_tensor_cuda to tabulate_tensor_cuda_nvrtc. Add comments clarifying the use of NVRTC and the need for typedefs in generated CUDA source code. --- ffcx/codegeneration/C/integrals.py | 8 ++++---- ffcx/codegeneration/C/integrals_template.py | 13 +++++++++---- ffcx/codegeneration/jit.py | 2 +- ffcx/codegeneration/ufcx.h | 5 +++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py index 4c37efc76..226b2dabc 100644 --- a/ffcx/codegeneration/C/integrals.py +++ b/ffcx/codegeneration/C/integrals.py @@ -70,11 +70,11 @@ def generator(ir: IntegralIR, options): code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL," code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL," if options.get("cuda"): - code["tabulate_tensor_cuda"] = ( - f".tabulate_tensor_cuda = tabulate_tensor_cuda_{factory_name}" + code["tabulate_tensor_cuda_nvrtc"] = ( + f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}" ) else: - code["tabulate_tensor_cuda"] = "" + code["tabulate_tensor_cuda_nvrtc"] = "" np_scalar_type = np.dtype(options["scalar_type"]).name code[f"tabulate_tensor_{np_scalar_type}"] = ( @@ -96,7 +96,7 @@ def generator(ir: IntegralIR, options): tabulate_tensor_float64=code["tabulate_tensor_float64"], tabulate_tensor_complex64=code["tabulate_tensor_complex64"], tabulate_tensor_complex128=code["tabulate_tensor_complex128"], - tabulate_tensor_cuda=code["tabulate_tensor_cuda"], + tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"], ) return declaration, implementation diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py index 87cd5228c..28e36a508 100644 --- a/ffcx/codegeneration/C/integrals_template.py +++ b/ffcx/codegeneration/C/integrals_template.py @@ -30,7 +30,7 @@ {tabulate_tensor_float64} {tabulate_tensor_complex64} {tabulate_tensor_complex128} - {tabulate_tensor_cuda} + {tabulate_tensor_cuda_nvrtc} .needs_facet_permutations = {needs_facet_permutations}, .coordinate_element_hash = {coordinate_element_hash}, }}; @@ -40,13 +40,16 @@ cuda_wrapper = """ -// Begin CUDA wrapper for integral {factory_name} -void tabulate_tensor_cuda_{factory_name}(int* num_program_headers, +// Begin NVRTC CUDA wrapper for integral {factory_name} +// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate +// source code which is then compiled into a CUDA kernel at runtime via NVRTC. +void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers, const char*** program_headers, const char*** program_include_names, const char** out_program_src, const char** tabulate_tensor_function_name) {{ + // The below typedefs are needed due to issues with including stdint.h in NVRTC source code const char* program_src = "" "#define alignas(x)\\n" "#define restrict __restrict__\\n" @@ -73,11 +76,13 @@ *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}"; }} -// End CUDA wrapper for integral {factory_name} +// End NVRTC CUDA wrapper for integral {factory_name} """ + def get_factory(options): + """Return the template string for constructing form integrals.""" if options.get("cuda"): return cuda_wrapper + factory else: diff --git a/ffcx/codegeneration/jit.py b/ffcx/codegeneration/jit.py index 94bef517f..467de689c 100644 --- a/ffcx/codegeneration/jit.py +++ b/ffcx/codegeneration/jit.py @@ -69,7 +69,7 @@ re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL) ) UFC_INTEGRAL_DECL += "\n".join( - re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda\).*?\);", ufcx_h, re.DOTALL) + re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL) ) UFC_INTEGRAL_DECL += "\n".join( diff --git a/ffcx/codegeneration/ufcx.h b/ffcx/codegeneration/ufcx.h index a4bc8ae21..782a1a04d 100644 --- a/ffcx/codegeneration/ufcx.h +++ b/ffcx/codegeneration/ufcx.h @@ -126,6 +126,7 @@ extern "C" #endif // __STDC_NO_COMPLEX__ /// Return CUDA C++ source code for the ufc_tabulate_tensor kernel + /// The resulting source code is passed to NVRTC for runtime compilation /// /// @param[out] num_program_headers /// The number of headers required by the program @@ -139,7 +140,7 @@ extern "C" /// @param[out] tabulate_tensor_function_name /// The name of the device-side function. /// - typedef void(ufcx_tabulate_tensor_cuda)( + typedef void(ufcx_tabulate_tensor_cuda_nvrtc)( int* num_program_headers, const char*** program_headers, const char*** program_include_names, @@ -155,7 +156,7 @@ extern "C" ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64; ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128; #endif // __STDC_NO_COMPLEX__ - ufcx_tabulate_tensor_cuda* tabulate_tensor_cuda; + ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc; bool needs_facet_permutations; /// Get the hash of the coordinate element associated with the geometry of the mesh. From b8e717b1e9316a59fce321bdffad726c044e46e4 Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Mon, 11 Nov 2024 15:10:06 -0600 Subject: [PATCH 03/14] Added unit test for cuda wrapper functionality via nvrtc. --- demo/nvrtc_test.cpp | 89 ++++++++++++++++++++++++++++++ demo/test_demos.py | 40 +++++++++++++- ffcx/codegeneration/C/integrals.py | 4 +- pyproject.toml | 2 +- 4 files changed, 132 insertions(+), 3 deletions(-) create mode 100644 demo/nvrtc_test.cpp diff --git a/demo/nvrtc_test.cpp b/demo/nvrtc_test.cpp new file mode 100644 index 000000000..983ba335b --- /dev/null +++ b/demo/nvrtc_test.cpp @@ -0,0 +1,89 @@ +#include "Components.h" +#include "ufcx.h" +#include "nvrtc.h" +#include +#include +#include +#include + +int main() +{ + // extract kernel + ufcx_integral* integral = form_Components_L->form_integrals[0]; + ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc; + // call kernel to get CUDA-wrapped source code + int num_program_headers; + const char** program_headers; + const char** program_include_names; + const char* program_src; + const char* tabulate_tensor_function_name; + if (!kernel) { + throw std::runtime_error("NVRTC wrapper function is NULL!"); + } + (*kernel)( + &num_program_headers, &program_headers, + &program_include_names, &program_src, + &tabulate_tensor_function_name); + // compile CUDA-wrapped source code with NVRTC + // with proper error checking + + nvrtcResult nvrtc_err; + nvrtcProgram program; + nvrtc_err = nvrtcCreateProgram( + &program, program_src, tabulate_tensor_function_name, + num_program_headers, program_headers, + program_include_names); + + if (nvrtc_err != NVRTC_SUCCESS) { + throw std::runtime_error( + "nvrtcCreateProgram() failed with " + + std::string(nvrtcGetErrorString(nvrtc_err)) + " " + "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); + } + + int num_compile_options = 0; + const char** compile_options; + // Compile the CUDA C++ program + nvrtcResult nvrtc_compile_err = nvrtcCompileProgram( + program, num_compile_options, compile_options); + if (nvrtc_compile_err != NVRTC_SUCCESS) { + // If the compiler failed, obtain the compiler log + std::string program_log; + size_t log_size; + nvrtc_err = nvrtcGetProgramLogSize(program, &log_size); + if (nvrtc_err != NVRTC_SUCCESS) { + program_log = std::string( + "nvrtcGetProgramLogSize() failed with " + + std::string(nvrtcGetErrorString(nvrtc_err)) + " " + "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); + } else { + program_log.resize(log_size); + nvrtc_err = nvrtcGetProgramLog( + program, const_cast(program_log.c_str())); + if (nvrtc_err != NVRTC_SUCCESS) { + program_log = std::string( + "nvrtcGetProgramLog() failed with " + + std::string(nvrtcGetErrorString(nvrtc_err))) + " " + "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__); + } + if (log_size > 0) + program_log.resize(log_size-1); + } + nvrtcDestroyProgram(&program); + + std::stringstream ss; + ss << "nvrtcCompileProgram() failed with " + << nvrtcGetErrorString(nvrtc_compile_err) << "\n" + << "CUDA C++ source code:\n" + << std::string(60, '-') << "\n" + << program_src + << std::string(60, '-') << "\n" + << "NVRTC compiler log:\n" + << std::string(60, '-') << "\n" + << program_log << "\n" + << std::string(60, '-') << "\n"; + throw std::runtime_error(ss.str()); + } + + return 0; +} diff --git a/demo/test_demos.py b/demo/test_demos.py index 78b18234f..3ccb722e3 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -12,7 +12,6 @@ if file.endswith(".py") and not file == "test_demos.py": ufl_files.append(file[:-3]) - @pytest.mark.parametrize("file", ufl_files) @pytest.mark.parametrize("scalar_type", ["float64", "float32", "complex128", "complex64"]) def test_demo(file, scalar_type): @@ -73,3 +72,42 @@ def test_demo(file, scalar_type): ) == 0 ) + +@pytest.mark.parametrize("scalar_type", ["float64", "float32"]) +def test_demo_nvrtc(scalar_type): + """Test generated CUDA code with NVRTC.""" + file = "Components" + opts = f"--scalar_type {scalar_type} --cuda" + if sys.platform.startswith("win32"): + pytest.skip(reason="NVRTC support not tested on Windows") + else: + from nvidia import cuda_nvrtc + nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__)) + cc = os.environ.get("CC", "cc") + extra_flags = ( + "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" + ) + assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0 + assert ( + os.system( + f"cd {demo_dir} && " + f"{cc} -I../ffcx/codegeneration " + f"{extra_flags} " + f"-c {file}.c" + ) + == 0 + ) + cxx = os.environ.get("CXX", "c++") + assert ( + os.system( + f"cd {demo_dir} && " + f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib" + f" -o nvrtc_test nvrtc_test.cpp {file}.o -l:libnvrtc.so.12" + ) + == 0 + ) + assert ( + os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") + == 0 + ) + diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py index 226b2dabc..a169ffdf7 100644 --- a/ffcx/codegeneration/C/integrals.py +++ b/ffcx/codegeneration/C/integrals.py @@ -60,6 +60,7 @@ def generator(ir: IntegralIR, options): code["enabled_coefficients"] = "NULL" code["tabulate_tensor"] = body + code["tabulate_tensor_quoted"] = body.replace('\n', '\\n"\n "') code["tabulate_tensor_float32"] = ".tabulate_tensor_float32 = NULL," code["tabulate_tensor_float64"] = ".tabulate_tensor_float64 = NULL," @@ -71,7 +72,7 @@ def generator(ir: IntegralIR, options): code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL," if options.get("cuda"): code["tabulate_tensor_cuda_nvrtc"] = ( - f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}" + f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}," ) else: code["tabulate_tensor_cuda_nvrtc"] = "" @@ -88,6 +89,7 @@ def generator(ir: IntegralIR, options): enabled_coefficients=code["enabled_coefficients"], enabled_coefficients_init=code["enabled_coefficients_init"], tabulate_tensor=code["tabulate_tensor"], + tabulate_tensor_quoted=code["tabulate_tensor_quoted"], needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false", scalar_type=dtype_to_c_type(options["scalar_type"]), geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])), diff --git a/pyproject.toml b/pyproject.toml index 82061ab0d..3714dd41c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main" [project.optional-dependencies] lint = ["ruff"] docs = ["sphinx", "sphinx_rtd_theme"] -optional = ["numba", "pygraphviz==1.7"] +optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12"] test = ["pytest >= 6.0", "sympy", "numba"] ci = [ "coveralls", From 5b8bddeece476ada3e3df63cd7b7b7c38f6e9fa9 Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Mon, 11 Nov 2024 15:17:00 -0600 Subject: [PATCH 04/14] Fixed lint issues. --- demo/test_demos.py | 4 +++- ffcx/codegeneration/C/integrals.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/demo/test_demos.py b/demo/test_demos.py index 3ccb722e3..b64e9cbd5 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -12,6 +12,7 @@ if file.endswith(".py") and not file == "test_demos.py": ufl_files.append(file[:-3]) + @pytest.mark.parametrize("file", ufl_files) @pytest.mark.parametrize("scalar_type", ["float64", "float32", "complex128", "complex64"]) def test_demo(file, scalar_type): @@ -73,6 +74,7 @@ def test_demo(file, scalar_type): == 0 ) + @pytest.mark.parametrize("scalar_type", ["float64", "float32"]) def test_demo_nvrtc(scalar_type): """Test generated CUDA code with NVRTC.""" @@ -82,6 +84,7 @@ def test_demo_nvrtc(scalar_type): pytest.skip(reason="NVRTC support not tested on Windows") else: from nvidia import cuda_nvrtc + nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__)) cc = os.environ.get("CC", "cc") extra_flags = ( @@ -110,4 +113,3 @@ def test_demo_nvrtc(scalar_type): os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0 ) - diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py index a169ffdf7..5616faa1f 100644 --- a/ffcx/codegeneration/C/integrals.py +++ b/ffcx/codegeneration/C/integrals.py @@ -60,7 +60,7 @@ def generator(ir: IntegralIR, options): code["enabled_coefficients"] = "NULL" code["tabulate_tensor"] = body - code["tabulate_tensor_quoted"] = body.replace('\n', '\\n"\n "') + code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "') code["tabulate_tensor_float32"] = ".tabulate_tensor_float32 = NULL," code["tabulate_tensor_float64"] = ".tabulate_tensor_float64 = NULL," From 359b7099193b76b043e78d3e3f9d6a1a580da40c Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Mon, 11 Nov 2024 15:26:47 -0600 Subject: [PATCH 05/14] Only run NVRTC tests on Linux. --- demo/test_demos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo/test_demos.py b/demo/test_demos.py index b64e9cbd5..c5055dda5 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -80,8 +80,8 @@ def test_demo_nvrtc(scalar_type): """Test generated CUDA code with NVRTC.""" file = "Components" opts = f"--scalar_type {scalar_type} --cuda" - if sys.platform.startswith("win32"): - pytest.skip(reason="NVRTC support not tested on Windows") + if not sys.platform.startswith("linux"): + pytest.skip(reason="NVRTC support only tested on Linux") else: from nvidia import cuda_nvrtc From 86bca5ed773e79afd47ad6e64f5ae38477d8abe7 Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Wed, 13 Nov 2024 19:39:00 -0600 Subject: [PATCH 06/14] Updated NVRTC test with more forms, more explicit naming, and experimental windows support. --- demo/nvrtc_test.cpp | 27 ++++++++- demo/test_demos.py | 62 ++++++++++++++++----- ffcx/codegeneration/C/integrals.py | 5 +- ffcx/codegeneration/C/integrals_template.py | 2 +- ffcx/options.py | 7 ++- 5 files changed, 81 insertions(+), 22 deletions(-) diff --git a/demo/nvrtc_test.cpp b/demo/nvrtc_test.cpp index 983ba335b..ed426baf3 100644 --- a/demo/nvrtc_test.cpp +++ b/demo/nvrtc_test.cpp @@ -1,15 +1,21 @@ #include "Components.h" +#include "FacetIntegrals.h" +#include "HyperElasticity.h" +#include "MathFunctions.h" +#include "StabilisedStokes.h" +#include "VectorPoisson.h" #include "ufcx.h" #include "nvrtc.h" -#include #include +#include #include #include +#include -int main() +void check_nvrtc_compilation(ufcx_form* form) { // extract kernel - ufcx_integral* integral = form_Components_L->form_integrals[0]; + ufcx_integral* integral = form->form_integrals[0]; ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc; // call kernel to get CUDA-wrapped source code int num_program_headers; @@ -84,6 +90,21 @@ int main() << std::string(60, '-') << "\n"; throw std::runtime_error(ss.str()); } +} +int main() +{ + std::vector forms = { + form_Components_L, + form_FacetIntegrals_a, + form_HyperElasticity_a_F, form_HyperElasticity_a_J, + form_MathFunctions_a, + form_StabilisedStokes_a, form_StabilisedStokes_L, + form_VectorPoisson_a, form_VectorPoisson_L + }; + + for (ufcx_form* form : forms) check_nvrtc_compilation(form); + return 0; } + diff --git a/demo/test_demos.py b/demo/test_demos.py index c5055dda5..23943501e 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -78,34 +78,66 @@ def test_demo(file, scalar_type): @pytest.mark.parametrize("scalar_type", ["float64", "float32"]) def test_demo_nvrtc(scalar_type): """Test generated CUDA code with NVRTC.""" - file = "Components" - opts = f"--scalar_type {scalar_type} --cuda" - if not sys.platform.startswith("linux"): - pytest.skip(reason="NVRTC support only tested on Linux") - else: + try: from nvidia import cuda_nvrtc + except ImportError: + pytest.skip(reason="Must have NVRTC pip package installed to run test.") - nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__)) - cc = os.environ.get("CC", "cc") - extra_flags = ( - "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" - ) + files = [ + "Components", + "FacetIntegrals", + "HyperElasticity", + "MathFunctions", + "StabilisedStokes", + "VectorPoisson", + ] + opts = f"--scalar_type {scalar_type} --cuda_nvrtc" + windows = sys.platform.startswith("win32") + nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__)) + for file in files: assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0 + if windows: + extra_flags = "/std:c17" + assert ( + os.system( + f"cd {demo_dir} && " + f'cl.exe /I "../ffcx/codegeneration" {extra_flags} /c {file}.c' + ) + ) == 0 + else: + cc = os.environ.get("CC", "cc") + extra_flags = ( + "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" + ) + assert ( + os.system( + f"cd {demo_dir} && " + f"{cc} -I../ffcx/codegeneration " + f"{extra_flags} " + f"-c {file}.c" + ) + == 0 + ) + + if windows: assert ( os.system( f"cd {demo_dir} && " - f"{cc} -I../ffcx/codegeneration " - f"{extra_flags} " - f"-c {file}.c" + f'cl.exe /I "../ffcx/codegeneration" /I "{nvrtc_dir}\\include" ' + f'/link /LIBPATH:"{nvrtc_dir}\\lib" nvrtc_test.cpp ' + f"{' '.join([file+'.obj' for file in files])} nvrtc.lib /OUT:nvrtc_test.exe" ) == 0 ) + assert os.system(f"set PATH=%PATH%;{nvrtc_dir}\\lib && {demo_dir}\\nvrtc_test.exe") == 0 + else: cxx = os.environ.get("CXX", "c++") assert ( os.system( f"cd {demo_dir} && " - f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib" - f" -o nvrtc_test nvrtc_test.cpp {file}.o -l:libnvrtc.so.12" + f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib " + f" -Werror -o nvrtc_test nvrtc_test.cpp " + f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12" ) == 0 ) diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py index 5616faa1f..4497a543c 100644 --- a/ffcx/codegeneration/C/integrals.py +++ b/ffcx/codegeneration/C/integrals.py @@ -60,7 +60,6 @@ def generator(ir: IntegralIR, options): code["enabled_coefficients"] = "NULL" code["tabulate_tensor"] = body - code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "') code["tabulate_tensor_float32"] = ".tabulate_tensor_float32 = NULL," code["tabulate_tensor_float64"] = ".tabulate_tensor_float64 = NULL," @@ -70,12 +69,14 @@ def generator(ir: IntegralIR, options): else: code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL," code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL," - if options.get("cuda"): + if options.get("cuda_nvrtc"): code["tabulate_tensor_cuda_nvrtc"] = ( f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}," ) + code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "') else: code["tabulate_tensor_cuda_nvrtc"] = "" + code["tabulate_tensor_quoted"] = "" np_scalar_type = np.dtype(options["scalar_type"]).name code[f"tabulate_tensor_{np_scalar_type}"] = ( diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py index 28e36a508..732e29702 100644 --- a/ffcx/codegeneration/C/integrals_template.py +++ b/ffcx/codegeneration/C/integrals_template.py @@ -83,7 +83,7 @@ def get_factory(options): """Return the template string for constructing form integrals.""" - if options.get("cuda"): + if options.get("cuda_nvrtc"): return cuda_wrapper + factory else: return factory diff --git a/ffcx/options.py b/ffcx/options.py index 252ea74a8..71783a3fe 100644 --- a/ffcx/options.py +++ b/ffcx/options.py @@ -20,7 +20,12 @@ logger = logging.getLogger("ffcx") FFCX_DEFAULT_OPTIONS = { - "cuda": (bool, False, "generate CUDA wrapped versions of tabulate tensor functions", None), + "cuda_nvrtc": ( + bool, + False, + "generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC", + None, + ), "epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None), "scalar_type": ( str, From 0a5af4f5b363cfe8f7c07dcd414a03309a281deb Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Wed, 13 Nov 2024 20:34:28 -0600 Subject: [PATCH 07/14] Only explicitly skip tests for Windows. --- demo/test_demos.py | 64 +++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 43 deletions(-) diff --git a/demo/test_demos.py b/demo/test_demos.py index 23943501e..d18d773c1 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -83,6 +83,9 @@ def test_demo_nvrtc(scalar_type): except ImportError: pytest.skip(reason="Must have NVRTC pip package installed to run test.") + if sys.platform.startswith("win32"): + pytest.skip(reason="NVRTC CUDA wrappers not currently supported for Windows.") + files = [ "Components", "FacetIntegrals", @@ -92,56 +95,31 @@ def test_demo_nvrtc(scalar_type): "VectorPoisson", ] opts = f"--scalar_type {scalar_type} --cuda_nvrtc" - windows = sys.platform.startswith("win32") nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__)) + cc = os.environ.get("CC", "cc") + extra_flags = ( + "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" + ) for file in files: assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0 - if windows: - extra_flags = "/std:c17" - assert ( - os.system( - f"cd {demo_dir} && " - f'cl.exe /I "../ffcx/codegeneration" {extra_flags} /c {file}.c' - ) - ) == 0 - else: - cc = os.environ.get("CC", "cc") - extra_flags = ( - "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" - ) - assert ( - os.system( - f"cd {demo_dir} && " - f"{cc} -I../ffcx/codegeneration " - f"{extra_flags} " - f"-c {file}.c" - ) - == 0 - ) - - if windows: - assert ( - os.system( - f"cd {demo_dir} && " - f'cl.exe /I "../ffcx/codegeneration" /I "{nvrtc_dir}\\include" ' - f'/link /LIBPATH:"{nvrtc_dir}\\lib" nvrtc_test.cpp ' - f"{' '.join([file+'.obj' for file in files])} nvrtc.lib /OUT:nvrtc_test.exe" - ) - == 0 - ) - assert os.system(f"set PATH=%PATH%;{nvrtc_dir}\\lib && {demo_dir}\\nvrtc_test.exe") == 0 - else: - cxx = os.environ.get("CXX", "c++") assert ( os.system( f"cd {demo_dir} && " - f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib " - f" -Werror -o nvrtc_test nvrtc_test.cpp " - f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12" + f"{cc} -I../ffcx/codegeneration " + f"{extra_flags} " + f"-c {file}.c" ) == 0 ) - assert ( - os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") - == 0 + + cxx = os.environ.get("CXX", "c++") + assert ( + os.system( + f"cd {demo_dir} && " + f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib " + f" -Werror -o nvrtc_test nvrtc_test.cpp " + f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12" ) + == 0 + ) + assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0 From 3338af942e5622e4d4a783de69b5c844bbf823ec Mon Sep 17 00:00:00 2001 From: "Jack S. Hale" Date: Thu, 19 Jun 2025 12:46:48 +0200 Subject: [PATCH 08/14] ruff format --- demo/test_demos.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/demo/test_demos.py b/demo/test_demos.py index bf131eacc..168054e93 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -88,12 +88,7 @@ def test_demo_nvrtc(scalar_type): for file in files: assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0 assert ( - os.system( - f"cd {demo_dir} && " - f"{cc} -I../ffcx/codegeneration " - f"{extra_flags} " - f"-c {file}.c" - ) + os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c") == 0 ) @@ -103,7 +98,7 @@ def test_demo_nvrtc(scalar_type): f"cd {demo_dir} && " f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib " f" -Werror -o nvrtc_test nvrtc_test.cpp " - f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12" + f"{' '.join([file + '.o' for file in files])} -l:libnvrtc.so.12" ) == 0 ) From 06627c6f87bb167571f8dc24730dc9aad2bb9aac Mon Sep 17 00:00:00 2001 From: "Jack S. Hale" Date: Thu, 19 Jun 2025 14:56:17 +0200 Subject: [PATCH 09/14] Try adding pyindex --- .github/workflows/pythonapp.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index a6a8dd453..bef25ad39 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -79,7 +79,9 @@ jobs: - name: Install FFCx (Linux, with optional dependencies) if: runner.os == 'Linux' - run: pip install .[ci,optional] + run: | + pip install nvidia-pyindex # NVIDIA has its own pypi index + pip install .[ci,optional] - name: Install FFCx (macOS, Windows) if: runner.os != 'Linux' run: pip install .[ci] From 4e8ac19edd3cb024e84229ebc862cc05d12dca4f Mon Sep 17 00:00:00 2001 From: "Jack S. Hale" Date: Thu, 19 Jun 2025 15:55:15 +0200 Subject: [PATCH 10/14] Doesn't work anyway. --- .github/workflows/pythonapp.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index bef25ad39..5a42fffd9 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -80,7 +80,6 @@ jobs: - name: Install FFCx (Linux, with optional dependencies) if: runner.os == 'Linux' run: | - pip install nvidia-pyindex # NVIDIA has its own pypi index pip install .[ci,optional] - name: Install FFCx (macOS, Windows) if: runner.os != 'Linux' From 1426b4c699485641f1278d98beb1d3a20a9561fc Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Fri, 20 Jun 2025 09:02:48 +0000 Subject: [PATCH 11/14] Use importlib to find nvrtc source files, and pin nvrtc version. --- demo/test_demos.py | 9 +++++---- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/demo/test_demos.py b/demo/test_demos.py index 168054e93..1eca5a741 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -63,9 +63,10 @@ def test_demo(file, scalar_type): @pytest.mark.parametrize("scalar_type", ["float64", "float32"]) def test_demo_nvrtc(scalar_type): """Test generated CUDA code with NVRTC.""" - try: - from nvidia import cuda_nvrtc - except ImportError: + import importlib.util + + spec = importlib.util.find_spec("nvidia.cuda_nvrtc") + if spec is None: pytest.skip(reason="Must have NVRTC pip package installed to run test.") if sys.platform.startswith("win32"): @@ -80,7 +81,7 @@ def test_demo_nvrtc(scalar_type): "VectorPoisson", ] opts = f"--scalar_type {scalar_type} --cuda_nvrtc" - nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__)) + nvrtc_dir = os.path.realpath(spec.submodule_search_locations[0]) cc = os.environ.get("CC", "cc") extra_flags = ( "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" diff --git a/pyproject.toml b/pyproject.toml index 37e269dcd..9df075b62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main" [project.optional-dependencies] lint = ["ruff"] docs = ["sphinx", "sphinx_rtd_theme"] -optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12"] +optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12==12.9.86"] test = ["pytest >= 6.0", "sympy", "numba"] ci = [ "coveralls", From 006c23c7ce30c5eb6656a3a53a36c809b009ef9d Mon Sep 17 00:00:00 2001 From: Benjamin Pachev Date: Fri, 20 Jun 2025 09:16:07 +0000 Subject: [PATCH 12/14] Properly handle missing NVRTC pip package. --- demo/test_demos.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/demo/test_demos.py b/demo/test_demos.py index 1eca5a741..eb8519230 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -65,8 +65,9 @@ def test_demo_nvrtc(scalar_type): """Test generated CUDA code with NVRTC.""" import importlib.util - spec = importlib.util.find_spec("nvidia.cuda_nvrtc") - if spec is None: + try: + spec = importlib.util.find_spec("nvidia.cuda_nvrtc") + except ModuleNotFoundError: pytest.skip(reason="Must have NVRTC pip package installed to run test.") if sys.platform.startswith("win32"): From 9e0464ed07b4b8dac7edfbef9ba31f5db7adc25e Mon Sep 17 00:00:00 2001 From: "Jack S. Hale" Date: Thu, 26 Jun 2025 08:51:50 +0200 Subject: [PATCH 13/14] Change to lower bound --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9df075b62..97907db29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main" [project.optional-dependencies] lint = ["ruff"] docs = ["sphinx", "sphinx_rtd_theme"] -optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12==12.9.86"] +optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12>=12.9.86"] test = ["pytest >= 6.0", "sympy", "numba"] ci = [ "coveralls", From 8981aa97764f9fa772540c03aed9f112efa69445 Mon Sep 17 00:00:00 2001 From: "Jack S. Hale" Date: Thu, 26 Jun 2025 08:52:45 +0200 Subject: [PATCH 14/14] Try without pygraphviz equality --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 97907db29..38220d5f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main" [project.optional-dependencies] lint = ["ruff"] docs = ["sphinx", "sphinx_rtd_theme"] -optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12>=12.9.86"] +optional = ["numba", "pygraphviz", "nvidia-cuda-nvrtc-cu12>=12.9.86"] test = ["pytest >= 6.0", "sympy", "numba"] ci = [ "coveralls",