From fe9c17097c1a585ad208da84eff645e62ec1ae4c Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Fri, 13 Sep 2024 17:27:34 -0500
Subject: [PATCH 01/14] Add CUDA wrapper capability.

---
 ffcx/codegeneration/C/integrals.py          | 10 ++++-
 ffcx/codegeneration/C/integrals_template.py | 46 +++++++++++++++++++++
 ffcx/codegeneration/jit.py                  |  3 ++
 ffcx/codegeneration/ufcx.h                  | 22 ++++++++++
 ffcx/options.py                             |  1 +
 5 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py
index 1115e3731..4c37efc76 100644
--- a/ffcx/codegeneration/C/integrals.py
+++ b/ffcx/codegeneration/C/integrals.py
@@ -69,6 +69,13 @@ def generator(ir: IntegralIR, options):
     else:
         code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
         code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
+    if options.get("cuda"):
+        code["tabulate_tensor_cuda"] = (
+            f".tabulate_tensor_cuda = tabulate_tensor_cuda_{factory_name}"
+        )
+    else:
+        code["tabulate_tensor_cuda"] = ""
+
     np_scalar_type = np.dtype(options["scalar_type"]).name
     code[f"tabulate_tensor_{np_scalar_type}"] = (
         f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name},"
@@ -76,7 +83,7 @@ def generator(ir: IntegralIR, options):
 
     element_hash = 0 if ir.coordinate_element_hash is None else ir.coordinate_element_hash
 
-    implementation = ufcx_integrals.factory.format(
+    implementation = ufcx_integrals.get_factory(options).format(
         factory_name=factory_name,
         enabled_coefficients=code["enabled_coefficients"],
         enabled_coefficients_init=code["enabled_coefficients_init"],
@@ -89,6 +96,7 @@ def generator(ir: IntegralIR, options):
         tabulate_tensor_float64=code["tabulate_tensor_float64"],
         tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
         tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
+        tabulate_tensor_cuda=code["tabulate_tensor_cuda"],
     )
 
     return declaration, implementation
diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py
index 2bb1568ec..87cd5228c 100644
--- a/ffcx/codegeneration/C/integrals_template.py
+++ b/ffcx/codegeneration/C/integrals_template.py
@@ -30,9 +30,55 @@
   {tabulate_tensor_float64}
   {tabulate_tensor_complex64}
   {tabulate_tensor_complex128}
+  {tabulate_tensor_cuda}
   .needs_facet_permutations = {needs_facet_permutations},
   .coordinate_element_hash = {coordinate_element_hash},
 }};
 
 // End of code for integral {factory_name}
 """
+
+cuda_wrapper = """
+
+// Begin CUDA wrapper for integral {factory_name}
+void tabulate_tensor_cuda_{factory_name}(int* num_program_headers,
+                                         const char*** program_headers,
+                                         const char*** program_include_names,
+                                         const char** out_program_src,
+                                         const char** tabulate_tensor_function_name)
+{{
+  const char* program_src = ""
+    "#define alignas(x)\\n"
+    "#define restrict __restrict__\\n"
+    "\\n"
+    "typedef unsigned char uint8_t;\\n"
+    "typedef unsigned int uint32_t;\\n"
+    "typedef double ufc_scalar_t;\\n"
+    "\\n"
+    "extern \\"C\\" __global__\\n"
+    "void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n"
+    "                                    const {scalar_type}* restrict w,\\n"
+    "                                    const {scalar_type}* restrict c,\\n"
+    "                                    const {geom_type}* restrict coordinate_dofs,\\n"
+    "                                    const int* restrict entity_local_index,\\n"
+    "                                    const uint8_t* restrict quadrature_permutation\\n"
+    "                                    )\\n"
+    "{{\\n"
+    "{tabulate_tensor_quoted}\\n"
+    "}}";
+  *num_program_headers = 0;
+  *program_headers = NULL;
+  *program_include_names = NULL;
+  *out_program_src = program_src;
+  *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
+}}
+
+// End CUDA wrapper for integral {factory_name}
+
+"""
+
+def get_factory(options):
+    if options.get("cuda"):
+        return cuda_wrapper + factory
+    else:
+        return factory
diff --git a/ffcx/codegeneration/jit.py b/ffcx/codegeneration/jit.py
index 6eb5dbb8f..94bef517f 100644
--- a/ffcx/codegeneration/jit.py
+++ b/ffcx/codegeneration/jit.py
@@ -68,6 +68,9 @@
 UFC_INTEGRAL_DECL += "\n".join(
     re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
 )
+UFC_INTEGRAL_DECL += "\n".join(
+    re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda\).*?\);", ufcx_h, re.DOTALL)
+)
 
 UFC_INTEGRAL_DECL += "\n".join(
     re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL)
diff --git a/ffcx/codegeneration/ufcx.h b/ffcx/codegeneration/ufcx.h
index e1dd838d1..a4bc8ae21 100644
--- a/ffcx/codegeneration/ufcx.h
+++ b/ffcx/codegeneration/ufcx.h
@@ -125,6 +125,27 @@ extern "C"
       const uint8_t* restrict quadrature_permutation);
 #endif // __STDC_NO_COMPLEX__
 
+  /// Return CUDA C++ source code for the ufc_tabulate_tensor kernel
+  ///
+  /// @param[out] num_program_headers
+  ///         The number of headers required by the program
+  /// @param[out] program_headers
+  ///         Entire contents of each header file
+  /// @param[out] program_include_names
+  ///         Names of each header file
+  /// @param[out] program_src
+  ///         CUDA C++ source code for the program containing the
+  ///         tabulate_tensor function.
+  /// @param[out] tabulate_tensor_function_name
+  ///         The name of the device-side function.
+  ///
+  typedef void(ufcx_tabulate_tensor_cuda)(
+      int* num_program_headers,
+      const char*** program_headers,
+      const char*** program_include_names,
+      const char** program_src,
+      const char** tabulate_tensor_function_name);
+
   typedef struct ufcx_integral
   {
     const bool* enabled_coefficients;
@@ -134,6 +155,7 @@ extern "C"
     ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
     ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
 #endif // __STDC_NO_COMPLEX__
+    ufcx_tabulate_tensor_cuda* tabulate_tensor_cuda;
     bool needs_facet_permutations;
 
     /// Get the hash of the coordinate element associated with the geometry of the mesh.
diff --git a/ffcx/options.py b/ffcx/options.py
index 536f02a35..252ea74a8 100644
--- a/ffcx/options.py
+++ b/ffcx/options.py
@@ -20,6 +20,7 @@
 logger = logging.getLogger("ffcx")
 
 FFCX_DEFAULT_OPTIONS = {
+    "cuda": (bool, False, "generate CUDA wrapped versions of tabulate tensor functions", None),
     "epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
     "scalar_type": (
         str,

From 3369f070c58ea2d372a1e4aef170e45597601612 Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Wed, 18 Sep 2024 19:18:45 -0500
Subject: [PATCH 02/14] Rename tabulate_tensor_cuda to
 tabulate_tensor_cuda_nvrtc. Add comments clarifying the use of NVRTC and the
 need for typedefs in generated CUDA source code.

---
 ffcx/codegeneration/C/integrals.py          |  8 ++++----
 ffcx/codegeneration/C/integrals_template.py | 13 +++++++++----
 ffcx/codegeneration/jit.py                  |  2 +-
 ffcx/codegeneration/ufcx.h                  |  5 +++--
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py
index 4c37efc76..226b2dabc 100644
--- a/ffcx/codegeneration/C/integrals.py
+++ b/ffcx/codegeneration/C/integrals.py
@@ -70,11 +70,11 @@ def generator(ir: IntegralIR, options):
         code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
         code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
     if options.get("cuda"):
-        code["tabulate_tensor_cuda"] = (
-            f".tabulate_tensor_cuda = tabulate_tensor_cuda_{factory_name}"
+        code["tabulate_tensor_cuda_nvrtc"] = (
+            f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}"
         )
     else:
-        code["tabulate_tensor_cuda"] = ""
+        code["tabulate_tensor_cuda_nvrtc"] = ""
 
     np_scalar_type = np.dtype(options["scalar_type"]).name
     code[f"tabulate_tensor_{np_scalar_type}"] = (
@@ -96,7 +96,7 @@ def generator(ir: IntegralIR, options):
         tabulate_tensor_float64=code["tabulate_tensor_float64"],
         tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
         tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
-        tabulate_tensor_cuda=code["tabulate_tensor_cuda"],
+        tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"],
     )
 
     return declaration, implementation
diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py
index 87cd5228c..28e36a508 100644
--- a/ffcx/codegeneration/C/integrals_template.py
+++ b/ffcx/codegeneration/C/integrals_template.py
@@ -30,7 +30,7 @@
   {tabulate_tensor_float64}
   {tabulate_tensor_complex64}
   {tabulate_tensor_complex128}
-  {tabulate_tensor_cuda}
+  {tabulate_tensor_cuda_nvrtc}
   .needs_facet_permutations = {needs_facet_permutations},
   .coordinate_element_hash = {coordinate_element_hash},
 }};
@@ -40,13 +40,16 @@
 
 cuda_wrapper = """
 
-// Begin CUDA wrapper for integral {factory_name}
-void tabulate_tensor_cuda_{factory_name}(int* num_program_headers,
+// Begin NVRTC CUDA wrapper for integral {factory_name}
+// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate
+// source code which is then compiled into a CUDA kernel at runtime via NVRTC.
+void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers,
                                          const char*** program_headers,
                                          const char*** program_include_names,
                                          const char** out_program_src,
                                          const char** tabulate_tensor_function_name)
 {{
+  // The below typedefs are needed due to issues with including stdint.h in NVRTC source code
   const char* program_src = ""
     "#define alignas(x)\\n"
     "#define restrict __restrict__\\n"
@@ -73,11 +76,13 @@
   *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
 }}
 
-// End CUDA wrapper for integral {factory_name}
+// End NVRTC CUDA wrapper for integral {factory_name}
 
 """
 
+
 def get_factory(options):
+    """Return the template string for constructing form integrals."""
     if options.get("cuda"):
         return cuda_wrapper + factory
     else:
diff --git a/ffcx/codegeneration/jit.py b/ffcx/codegeneration/jit.py
index 94bef517f..467de689c 100644
--- a/ffcx/codegeneration/jit.py
+++ b/ffcx/codegeneration/jit.py
@@ -69,7 +69,7 @@
     re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
 )
 UFC_INTEGRAL_DECL += "\n".join(
-    re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda\).*?\);", ufcx_h, re.DOTALL)
+    re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL)
 )
 
 UFC_INTEGRAL_DECL += "\n".join(
diff --git a/ffcx/codegeneration/ufcx.h b/ffcx/codegeneration/ufcx.h
index a4bc8ae21..782a1a04d 100644
--- a/ffcx/codegeneration/ufcx.h
+++ b/ffcx/codegeneration/ufcx.h
@@ -126,6 +126,7 @@ extern "C"
 #endif // __STDC_NO_COMPLEX__
 
   /// Return CUDA C++ source code for the ufc_tabulate_tensor kernel
+  /// The resulting source code is passed to NVRTC for runtime compilation
   ///
   /// @param[out] num_program_headers
   ///         The number of headers required by the program
@@ -139,7 +140,7 @@ extern "C"
   /// @param[out] tabulate_tensor_function_name
   ///         The name of the device-side function.
   ///
-  typedef void(ufcx_tabulate_tensor_cuda)(
+  typedef void(ufcx_tabulate_tensor_cuda_nvrtc)(
       int* num_program_headers,
       const char*** program_headers,
       const char*** program_include_names,
@@ -155,7 +156,7 @@ extern "C"
     ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
     ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
 #endif // __STDC_NO_COMPLEX__
-    ufcx_tabulate_tensor_cuda* tabulate_tensor_cuda;
+    ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc;
     bool needs_facet_permutations;
 
     /// Get the hash of the coordinate element associated with the geometry of the mesh.

From b8e717b1e9316a59fce321bdffad726c044e46e4 Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Mon, 11 Nov 2024 15:10:06 -0600
Subject: [PATCH 03/14] Added unit test for CUDA wrapper functionality via
 NVRTC.

---
 demo/nvrtc_test.cpp                | 89 ++++++++++++++++++++++++++++++
 demo/test_demos.py                 | 40 +++++++++++++-
 ffcx/codegeneration/C/integrals.py |  4 +-
 pyproject.toml                     |  2 +-
 4 files changed, 132 insertions(+), 3 deletions(-)
 create mode 100644 demo/nvrtc_test.cpp

diff --git a/demo/nvrtc_test.cpp b/demo/nvrtc_test.cpp
new file mode 100644
index 000000000..983ba335b
--- /dev/null
+++ b/demo/nvrtc_test.cpp
@@ -0,0 +1,89 @@
+#include "Components.h"
+#include "ufcx.h"
+#include "nvrtc.h"
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+int main()
+{
+  // extract kernel
+  ufcx_integral* integral = form_Components_L->form_integrals[0];
+  ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc;
+  // call kernel to get CUDA-wrapped source code
+  int num_program_headers;
+  const char** program_headers;
+  const char** program_include_names;
+  const char* program_src;
+  const char* tabulate_tensor_function_name;
+  if (!kernel) {
+    throw std::runtime_error("NVRTC wrapper function is NULL!");
+  }
+  (*kernel)(
+    &num_program_headers, &program_headers,
+    &program_include_names, &program_src,
+    &tabulate_tensor_function_name);
+  // compile CUDA-wrapped source code with NVRTC
+  // with proper error checking
+
+  nvrtcResult nvrtc_err;
+  nvrtcProgram program;
+  nvrtc_err = nvrtcCreateProgram(
+    &program, program_src, tabulate_tensor_function_name,
+    num_program_headers, program_headers,
+    program_include_names);
+
+ if (nvrtc_err != NVRTC_SUCCESS) {
+    throw std::runtime_error(
+      "nvrtcCreateProgram() failed with " +
+      std::string(nvrtcGetErrorString(nvrtc_err)) + " "
+      "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
+  }
+
+  int num_compile_options = 0;
+  const char** compile_options;
+  // Compile the CUDA C++ program
+  nvrtcResult nvrtc_compile_err = nvrtcCompileProgram(
+    program, num_compile_options, compile_options);
+  if (nvrtc_compile_err != NVRTC_SUCCESS) {
+    // If the compiler failed, obtain the compiler log
+    std::string program_log;
+    size_t log_size;
+    nvrtc_err = nvrtcGetProgramLogSize(program, &log_size);
+    if (nvrtc_err != NVRTC_SUCCESS) {
+      program_log = std::string(
+        "nvrtcGetProgramLogSize() failed with " +
+        std::string(nvrtcGetErrorString(nvrtc_err)) + " "
+        "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
+    } else {
+      program_log.resize(log_size);
+      nvrtc_err = nvrtcGetProgramLog(
+        program, const_cast<char*>(program_log.c_str()));
+      if (nvrtc_err != NVRTC_SUCCESS) {
+        program_log = std::string(
+          "nvrtcGetProgramLog() failed with " +
+          std::string(nvrtcGetErrorString(nvrtc_err))) + " "
+          "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      }
+      if (log_size > 0)
+        program_log.resize(log_size-1);
+    }
+    nvrtcDestroyProgram(&program);
+
+    std::stringstream ss;
+    ss << "nvrtcCompileProgram() failed with "
+       << nvrtcGetErrorString(nvrtc_compile_err) << "\n"
+       << "CUDA C++ source code:\n"
+       << std::string(60, '-') << "\n"
+       << program_src
+       << std::string(60, '-') << "\n"
+       << "NVRTC compiler log:\n"
+       << std::string(60, '-') << "\n"
+       << program_log << "\n"
+       << std::string(60, '-') << "\n";
+    throw std::runtime_error(ss.str());
+  }
+
+  return 0;
+}
diff --git a/demo/test_demos.py b/demo/test_demos.py
index 78b18234f..3ccb722e3 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -12,7 +12,6 @@
     if file.endswith(".py") and not file == "test_demos.py":
         ufl_files.append(file[:-3])
 
-
 @pytest.mark.parametrize("file", ufl_files)
 @pytest.mark.parametrize("scalar_type", ["float64", "float32", "complex128", "complex64"])
 def test_demo(file, scalar_type):
@@ -73,3 +72,42 @@ def test_demo(file, scalar_type):
             )
             == 0
         )
+
+@pytest.mark.parametrize("scalar_type", ["float64", "float32"])
+def test_demo_nvrtc(scalar_type):
+    """Test generated CUDA code with NVRTC."""
+    file = "Components"
+    opts = f"--scalar_type {scalar_type} --cuda"
+    if sys.platform.startswith("win32"):
+        pytest.skip(reason="NVRTC support not tested on Windows")
+    else:
+        from nvidia import cuda_nvrtc
+        nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
+        cc = os.environ.get("CC", "cc")
+        extra_flags = (
+            "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
+        )
+        assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
+        assert (
+            os.system(
+                f"cd {demo_dir} && "
+                f"{cc} -I../ffcx/codegeneration "
+                f"{extra_flags} "
+                f"-c {file}.c"
+            )
+            == 0
+        )
+        cxx = os.environ.get("CXX", "c++")
+        assert (
+            os.system(
+                f"cd {demo_dir} && "
+                f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib"
+                f" -o nvrtc_test nvrtc_test.cpp {file}.o -l:libnvrtc.so.12"
+            )
+            == 0
+        )
+        assert (
+            os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test")
+            == 0
+        )
+
diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py
index 226b2dabc..a169ffdf7 100644
--- a/ffcx/codegeneration/C/integrals.py
+++ b/ffcx/codegeneration/C/integrals.py
@@ -60,6 +60,7 @@ def generator(ir: IntegralIR, options):
         code["enabled_coefficients"] = "NULL"
 
     code["tabulate_tensor"] = body
+    code["tabulate_tensor_quoted"] = body.replace('\n', '\\n"\n    "')
 
     code["tabulate_tensor_float32"] = ".tabulate_tensor_float32 = NULL,"
     code["tabulate_tensor_float64"] = ".tabulate_tensor_float64 = NULL,"
@@ -71,7 +72,7 @@ def generator(ir: IntegralIR, options):
         code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
     if options.get("cuda"):
         code["tabulate_tensor_cuda_nvrtc"] = (
-            f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}"
+            f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
         )
     else:
         code["tabulate_tensor_cuda_nvrtc"] = ""
@@ -88,6 +89,7 @@ def generator(ir: IntegralIR, options):
         enabled_coefficients=code["enabled_coefficients"],
         enabled_coefficients_init=code["enabled_coefficients_init"],
         tabulate_tensor=code["tabulate_tensor"],
+        tabulate_tensor_quoted=code["tabulate_tensor_quoted"],
         needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false",
         scalar_type=dtype_to_c_type(options["scalar_type"]),
         geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])),
diff --git a/pyproject.toml b/pyproject.toml
index 82061ab0d..3714dd41c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
 [project.optional-dependencies]
 lint = ["ruff"]
 docs = ["sphinx", "sphinx_rtd_theme"]
-optional = ["numba", "pygraphviz==1.7"]
+optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12"]
 test = ["pytest >= 6.0", "sympy", "numba"]
 ci = [
     "coveralls",

From 5b8bddeece476ada3e3df63cd7b7b7c38f6e9fa9 Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Mon, 11 Nov 2024 15:17:00 -0600
Subject: [PATCH 04/14] Fixed lint issues.

---
 demo/test_demos.py                 | 4 +++-
 ffcx/codegeneration/C/integrals.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/demo/test_demos.py b/demo/test_demos.py
index 3ccb722e3..b64e9cbd5 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -12,6 +12,7 @@
     if file.endswith(".py") and not file == "test_demos.py":
         ufl_files.append(file[:-3])
 
+
 @pytest.mark.parametrize("file", ufl_files)
 @pytest.mark.parametrize("scalar_type", ["float64", "float32", "complex128", "complex64"])
 def test_demo(file, scalar_type):
@@ -73,6 +74,7 @@ def test_demo(file, scalar_type):
             == 0
         )
 
+
 @pytest.mark.parametrize("scalar_type", ["float64", "float32"])
 def test_demo_nvrtc(scalar_type):
     """Test generated CUDA code with NVRTC."""
@@ -82,6 +84,7 @@ def test_demo_nvrtc(scalar_type):
         pytest.skip(reason="NVRTC support not tested on Windows")
     else:
         from nvidia import cuda_nvrtc
+
         nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
         cc = os.environ.get("CC", "cc")
         extra_flags = (
@@ -110,4 +113,3 @@ def test_demo_nvrtc(scalar_type):
             os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test")
             == 0
         )
-
diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py
index a169ffdf7..5616faa1f 100644
--- a/ffcx/codegeneration/C/integrals.py
+++ b/ffcx/codegeneration/C/integrals.py
@@ -60,7 +60,7 @@ def generator(ir: IntegralIR, options):
         code["enabled_coefficients"] = "NULL"
 
     code["tabulate_tensor"] = body
-    code["tabulate_tensor_quoted"] = body.replace('\n', '\\n"\n    "')
+    code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n    "')
 
     code["tabulate_tensor_float32"] = ".tabulate_tensor_float32 = NULL,"
     code["tabulate_tensor_float64"] = ".tabulate_tensor_float64 = NULL,"

From 359b7099193b76b043e78d3e3f9d6a1a580da40c Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Mon, 11 Nov 2024 15:26:47 -0600
Subject: [PATCH 05/14] Only run NVRTC tests on Linux.

---
 demo/test_demos.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/demo/test_demos.py b/demo/test_demos.py
index b64e9cbd5..c5055dda5 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -80,8 +80,8 @@ def test_demo_nvrtc(scalar_type):
     """Test generated CUDA code with NVRTC."""
     file = "Components"
     opts = f"--scalar_type {scalar_type} --cuda"
-    if sys.platform.startswith("win32"):
-        pytest.skip(reason="NVRTC support not tested on Windows")
+    if not sys.platform.startswith("linux"):
+        pytest.skip(reason="NVRTC support only tested on Linux")
     else:
         from nvidia import cuda_nvrtc
 

From 86bca5ed773e79afd47ad6e64f5ae38477d8abe7 Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Wed, 13 Nov 2024 19:39:00 -0600
Subject: [PATCH 06/14] Updated NVRTC test with more forms, more explicit
 naming, and experimental Windows support.

---
 demo/nvrtc_test.cpp                         | 27 ++++++++-
 demo/test_demos.py                          | 62 ++++++++++++++++-----
 ffcx/codegeneration/C/integrals.py          |  5 +-
 ffcx/codegeneration/C/integrals_template.py |  2 +-
 ffcx/options.py                             |  7 ++-
 5 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/demo/nvrtc_test.cpp b/demo/nvrtc_test.cpp
index 983ba335b..ed426baf3 100644
--- a/demo/nvrtc_test.cpp
+++ b/demo/nvrtc_test.cpp
@@ -1,15 +1,21 @@
 #include "Components.h"
+#include "FacetIntegrals.h"
+#include "HyperElasticity.h"
+#include "MathFunctions.h"
+#include "StabilisedStokes.h"
+#include "VectorPoisson.h"
 #include "ufcx.h"
 #include "nvrtc.h"
-#include <stdexcept>
 #include <iostream>
+#include <stdexcept>
 #include <sstream>
 #include <string>
+#include <vector>
 
-int main()
+void check_nvrtc_compilation(ufcx_form* form)
 {
   // extract kernel
-  ufcx_integral* integral = form_Components_L->form_integrals[0];
+  ufcx_integral* integral = form->form_integrals[0];
   ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc;
   // call kernel to get CUDA-wrapped source code
   int num_program_headers;
@@ -84,6 +90,21 @@ int main()
        << std::string(60, '-') << "\n";
     throw std::runtime_error(ss.str());
   }
+}
 
+int main()
+{
+  std::vector<ufcx_form*> forms = {
+    form_Components_L,
+    form_FacetIntegrals_a,
+    form_HyperElasticity_a_F, form_HyperElasticity_a_J,
+    form_MathFunctions_a,
+    form_StabilisedStokes_a, form_StabilisedStokes_L,
+    form_VectorPoisson_a, form_VectorPoisson_L  
+  };
+  
+  for (ufcx_form* form : forms) check_nvrtc_compilation(form);
+ 
   return 0;
 }
+
diff --git a/demo/test_demos.py b/demo/test_demos.py
index c5055dda5..23943501e 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -78,34 +78,66 @@ def test_demo(file, scalar_type):
 @pytest.mark.parametrize("scalar_type", ["float64", "float32"])
 def test_demo_nvrtc(scalar_type):
     """Test generated CUDA code with NVRTC."""
-    file = "Components"
-    opts = f"--scalar_type {scalar_type} --cuda"
-    if not sys.platform.startswith("linux"):
-        pytest.skip(reason="NVRTC support only tested on Linux")
-    else:
+    try:
         from nvidia import cuda_nvrtc
+    except ImportError:
+        pytest.skip(reason="Must have NVRTC pip package installed to run test.")
 
-        nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
-        cc = os.environ.get("CC", "cc")
-        extra_flags = (
-            "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
-        )
+    files = [
+        "Components",
+        "FacetIntegrals",
+        "HyperElasticity",
+        "MathFunctions",
+        "StabilisedStokes",
+        "VectorPoisson",
+    ]
+    opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
+    windows = sys.platform.startswith("win32")
+    nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
+    for file in files:
         assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
+        if windows:
+            extra_flags = "/std:c17"
+            assert (
+                os.system(
+                    f"cd {demo_dir} && "
+                    f'cl.exe /I "../ffcx/codegeneration" {extra_flags} /c {file}.c'
+                )
+            ) == 0
+        else:
+            cc = os.environ.get("CC", "cc")
+            extra_flags = (
+                "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
+            )
+            assert (
+                os.system(
+                    f"cd {demo_dir} && "
+                    f"{cc} -I../ffcx/codegeneration "
+                    f"{extra_flags} "
+                    f"-c {file}.c"
+                )
+                == 0
+            )
+
+    if windows:
         assert (
             os.system(
                 f"cd {demo_dir} && "
-                f"{cc} -I../ffcx/codegeneration "
-                f"{extra_flags} "
-                f"-c {file}.c"
+                f'cl.exe /I "../ffcx/codegeneration" /I "{nvrtc_dir}\\include" '
+                f'/link /LIBPATH:"{nvrtc_dir}\\lib" nvrtc_test.cpp '
+                f"{' '.join([file+'.obj' for file in files])} nvrtc.lib /OUT:nvrtc_test.exe"
             )
             == 0
         )
+        assert os.system(f"set PATH=%PATH%;{nvrtc_dir}\\lib && {demo_dir}\\nvrtc_test.exe") == 0
+    else:
         cxx = os.environ.get("CXX", "c++")
         assert (
             os.system(
                 f"cd {demo_dir} && "
-                f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib"
-                f" -o nvrtc_test nvrtc_test.cpp {file}.o -l:libnvrtc.so.12"
+                f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
+                f" -Werror -o nvrtc_test nvrtc_test.cpp "
+                f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12"
             )
             == 0
         )
diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py
index 5616faa1f..4497a543c 100644
--- a/ffcx/codegeneration/C/integrals.py
+++ b/ffcx/codegeneration/C/integrals.py
@@ -60,7 +60,6 @@ def generator(ir: IntegralIR, options):
         code["enabled_coefficients"] = "NULL"
 
     code["tabulate_tensor"] = body
-    code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n    "')
 
     code["tabulate_tensor_float32"] = ".tabulate_tensor_float32 = NULL,"
     code["tabulate_tensor_float64"] = ".tabulate_tensor_float64 = NULL,"
@@ -70,12 +69,14 @@ def generator(ir: IntegralIR, options):
     else:
         code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
         code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
-    if options.get("cuda"):
+    if options.get("cuda_nvrtc"):
         code["tabulate_tensor_cuda_nvrtc"] = (
             f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
         )
+        code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n    "')
     else:
         code["tabulate_tensor_cuda_nvrtc"] = ""
+        code["tabulate_tensor_quoted"] = ""
 
     np_scalar_type = np.dtype(options["scalar_type"]).name
     code[f"tabulate_tensor_{np_scalar_type}"] = (
diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py
index 28e36a508..732e29702 100644
--- a/ffcx/codegeneration/C/integrals_template.py
+++ b/ffcx/codegeneration/C/integrals_template.py
@@ -83,7 +83,7 @@
 
 def get_factory(options):
     """Return the template string for constructing form integrals."""
-    if options.get("cuda"):
+    if options.get("cuda_nvrtc"):
         return cuda_wrapper + factory
     else:
         return factory
diff --git a/ffcx/options.py b/ffcx/options.py
index 252ea74a8..71783a3fe 100644
--- a/ffcx/options.py
+++ b/ffcx/options.py
@@ -20,7 +20,12 @@
 logger = logging.getLogger("ffcx")
 
 FFCX_DEFAULT_OPTIONS = {
-    "cuda": (bool, False, "generate CUDA wrapped versions of tabulate tensor functions", None),
+    "cuda_nvrtc": (
+        bool,
+        False,
+        "generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC",
+        None,
+    ),
     "epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
     "scalar_type": (
         str,

From 0a5af4f5b363cfe8f7c07dcd414a03309a281deb Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <benjaminpachev@gmail.com>
Date: Wed, 13 Nov 2024 20:34:28 -0600
Subject: [PATCH 07/14] Only explicitly skip tests for Windows.

---
 demo/test_demos.py | 64 +++++++++++++++-------------------------------
 1 file changed, 21 insertions(+), 43 deletions(-)

diff --git a/demo/test_demos.py b/demo/test_demos.py
index 23943501e..d18d773c1 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -83,6 +83,9 @@ def test_demo_nvrtc(scalar_type):
     except ImportError:
         pytest.skip(reason="Must have NVRTC pip package installed to run test.")
 
+    if sys.platform.startswith("win32"):
+        pytest.skip(reason="NVRTC CUDA wrappers not currently supported for Windows.")
+
     files = [
         "Components",
         "FacetIntegrals",
@@ -92,56 +95,31 @@ def test_demo_nvrtc(scalar_type):
         "VectorPoisson",
     ]
     opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
-    windows = sys.platform.startswith("win32")
     nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
+    cc = os.environ.get("CC", "cc")
+    extra_flags = (
+        "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
+    )
     for file in files:
         assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
-        if windows:
-            extra_flags = "/std:c17"
-            assert (
-                os.system(
-                    f"cd {demo_dir} && "
-                    f'cl.exe /I "../ffcx/codegeneration" {extra_flags} /c {file}.c'
-                )
-            ) == 0
-        else:
-            cc = os.environ.get("CC", "cc")
-            extra_flags = (
-                "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
-            )
-            assert (
-                os.system(
-                    f"cd {demo_dir} && "
-                    f"{cc} -I../ffcx/codegeneration "
-                    f"{extra_flags} "
-                    f"-c {file}.c"
-                )
-                == 0
-            )
-
-    if windows:
-        assert (
-            os.system(
-                f"cd {demo_dir} && "
-                f'cl.exe /I "../ffcx/codegeneration" /I "{nvrtc_dir}\\include" '
-                f'/link /LIBPATH:"{nvrtc_dir}\\lib" nvrtc_test.cpp '
-                f"{' '.join([file+'.obj' for file in files])} nvrtc.lib /OUT:nvrtc_test.exe"
-            )
-            == 0
-        )
-        assert os.system(f"set PATH=%PATH%;{nvrtc_dir}\\lib && {demo_dir}\\nvrtc_test.exe") == 0
-    else:
-        cxx = os.environ.get("CXX", "c++")
         assert (
             os.system(
                 f"cd {demo_dir} && "
-                f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
-                f" -Werror -o nvrtc_test nvrtc_test.cpp "
-                f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12"
+                f"{cc} -I../ffcx/codegeneration "
+                f"{extra_flags} "
+                f"-c {file}.c"
             )
             == 0
         )
-        assert (
-            os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test")
-            == 0
+
+    cxx = os.environ.get("CXX", "c++")
+    assert (
+        os.system(
+            f"cd {demo_dir} && "
+            f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
+            f" -Werror -o nvrtc_test nvrtc_test.cpp "
+            f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12"
         )
+        == 0
+    )
+    assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0

From 3338af942e5622e4d4a783de69b5c844bbf823ec Mon Sep 17 00:00:00 2001
From: "Jack S. Hale" <mail@jackhale.co.uk>
Date: Thu, 19 Jun 2025 12:46:48 +0200
Subject: [PATCH 08/14] ruff format

---
 demo/test_demos.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/demo/test_demos.py b/demo/test_demos.py
index bf131eacc..168054e93 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -88,12 +88,7 @@ def test_demo_nvrtc(scalar_type):
     for file in files:
         assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
         assert (
-            os.system(
-                f"cd {demo_dir} && "
-                f"{cc} -I../ffcx/codegeneration "
-                f"{extra_flags} "
-                f"-c {file}.c"
-            )
+            os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
             == 0
         )
 
@@ -103,7 +98,7 @@ def test_demo_nvrtc(scalar_type):
             f"cd {demo_dir} && "
             f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
             f" -Werror -o nvrtc_test nvrtc_test.cpp "
-            f"{' '.join([file+'.o' for file in files])} -l:libnvrtc.so.12"
+            f"{' '.join([file + '.o' for file in files])} -l:libnvrtc.so.12"
         )
         == 0
     )

From 06627c6f87bb167571f8dc24730dc9aad2bb9aac Mon Sep 17 00:00:00 2001
From: "Jack S. Hale" <mail@jackhale.co.uk>
Date: Thu, 19 Jun 2025 14:56:17 +0200
Subject: [PATCH 09/14] Try adding pyindex

---
 .github/workflows/pythonapp.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index a6a8dd453..bef25ad39 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -79,7 +79,9 @@ jobs:
 
       - name: Install FFCx (Linux, with optional dependencies)
         if: runner.os == 'Linux'
-        run: pip install .[ci,optional]
+        run: | 
+          pip install nvidia-pyindex # NVIDIA has its own pypi index
+          pip install .[ci,optional]
       - name: Install FFCx (macOS, Windows)
         if: runner.os != 'Linux'
         run: pip install .[ci]

From 4e8ac19edd3cb024e84229ebc862cc05d12dca4f Mon Sep 17 00:00:00 2001
From: "Jack S. Hale" <mail@jackhale.co.uk>
Date: Thu, 19 Jun 2025 15:55:15 +0200
Subject: [PATCH 10/14] Doesn't work anyway.

---
 .github/workflows/pythonapp.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index bef25ad39..5a42fffd9 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -80,7 +80,6 @@ jobs:
       - name: Install FFCx (Linux, with optional dependencies)
         if: runner.os == 'Linux'
         run: | 
-          pip install nvidia-pyindex # NVIDIA has its own pypi index
           pip install .[ci,optional]
       - name: Install FFCx (macOS, Windows)
         if: runner.os != 'Linux'

From 1426b4c699485641f1278d98beb1d3a20a9561fc Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <bpachev@umass.edu>
Date: Fri, 20 Jun 2025 09:02:48 +0000
Subject: [PATCH 11/14] Use importlib to find nvrtc source files, and pin nvrtc
 version.

---
 demo/test_demos.py | 9 +++++----
 pyproject.toml     | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/demo/test_demos.py b/demo/test_demos.py
index 168054e93..1eca5a741 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -63,9 +63,10 @@ def test_demo(file, scalar_type):
 @pytest.mark.parametrize("scalar_type", ["float64", "float32"])
 def test_demo_nvrtc(scalar_type):
     """Test generated CUDA code with NVRTC."""
-    try:
-        from nvidia import cuda_nvrtc
-    except ImportError:
+    import importlib.util
+
+    spec = importlib.util.find_spec("nvidia.cuda_nvrtc")
+    if spec is None:
         pytest.skip(reason="Must have NVRTC pip package installed to run test.")
 
     if sys.platform.startswith("win32"):
@@ -80,7 +81,7 @@ def test_demo_nvrtc(scalar_type):
         "VectorPoisson",
     ]
     opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
-    nvrtc_dir = os.path.dirname(os.path.realpath(cuda_nvrtc.__file__))
+    nvrtc_dir = os.path.realpath(spec.submodule_search_locations[0])
     cc = os.environ.get("CC", "cc")
     extra_flags = (
         "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
diff --git a/pyproject.toml b/pyproject.toml
index 37e269dcd..9df075b62 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
 [project.optional-dependencies]
 lint = ["ruff"]
 docs = ["sphinx", "sphinx_rtd_theme"]
-optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12"]
+optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12==12.9.86"]
 test = ["pytest >= 6.0", "sympy", "numba"]
 ci = [
     "coveralls",

From 006c23c7ce30c5eb6656a3a53a36c809b009ef9d Mon Sep 17 00:00:00 2001
From: Benjamin Pachev <bpachev@umass.edu>
Date: Fri, 20 Jun 2025 09:16:07 +0000
Subject: [PATCH 12/14] Properly handle missing NVRTC pip package.

importlib.util.find_spec() imports the parent package of a dotted name, so
it raises ModuleNotFoundError when "nvidia" is not installed at all, but it
returns None when "nvidia" is importable and only the "cuda_nvrtc" submodule
is missing. Handle both cases so the test is skipped rather than failing
later on spec.submodule_search_locations.

---
 demo/test_demos.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/demo/test_demos.py b/demo/test_demos.py
index 1eca5a741..eb8519230 100644
--- a/demo/test_demos.py
+++ b/demo/test_demos.py
@@ -65,8 +65,13 @@ def test_demo_nvrtc(scalar_type):
     """Test generated CUDA code with NVRTC."""
     import importlib.util
 
-    spec = importlib.util.find_spec("nvidia.cuda_nvrtc")
+    try:
+        spec = importlib.util.find_spec("nvidia.cuda_nvrtc")
+    except ModuleNotFoundError:
+        # Raised when the parent "nvidia" package itself is absent.
+        spec = None
+    # find_spec returns None when "nvidia" exists but "cuda_nvrtc" does not.
     if spec is None:
         pytest.skip(reason="Must have NVRTC pip package installed to run test.")
 
     if sys.platform.startswith("win32"):

From 9e0464ed07b4b8dac7edfbef9ba31f5db7adc25e Mon Sep 17 00:00:00 2001
From: "Jack S. Hale" <mail@jackhale.co.uk>
Date: Thu, 26 Jun 2025 08:51:50 +0200
Subject: [PATCH 13/14] Change to lower bound

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9df075b62..97907db29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
 [project.optional-dependencies]
 lint = ["ruff"]
 docs = ["sphinx", "sphinx_rtd_theme"]
-optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12==12.9.86"]
+optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12>=12.9.86"]
 test = ["pytest >= 6.0", "sympy", "numba"]
 ci = [
     "coveralls",

From 8981aa97764f9fa772540c03aed9f112efa69445 Mon Sep 17 00:00:00 2001
From: "Jack S. Hale" <mail@jackhale.co.uk>
Date: Thu, 26 Jun 2025 08:52:45 +0200
Subject: [PATCH 14/14] Try without pygraphviz equality

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 97907db29..38220d5f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
 [project.optional-dependencies]
 lint = ["ruff"]
 docs = ["sphinx", "sphinx_rtd_theme"]
-optional = ["numba", "pygraphviz==1.7", "nvidia-cuda-nvrtc-cu12>=12.9.86"]
+optional = ["numba", "pygraphviz", "nvidia-cuda-nvrtc-cu12>=12.9.86"]
 test = ["pytest >= 6.0", "sympy", "numba"]
 ci = [
     "coveralls",