Commit ecba4dc

implement GPU codegen helpers
1 parent a41672c commit ecba4dc

4 files changed

Lines changed: 265 additions & 1 deletion

pyop2/compilation.py

Lines changed: 108 additions & 0 deletions
@@ -696,3 +696,111 @@ def clear_cache(prompt=False):
        shutil.rmtree(cachedir)
    else:
        print("Not removing cached libraries")


@collective
def get_prepared_cuda_function(comm, global_kernel):
    from pycuda.compiler import SourceModule

    # Determine cache key
    hsh = md5(str(global_kernel.cache_key[1:]).encode())
    basename = hsh.hexdigest()
    cachedir = configuration["cache_dir"]
    dirpart, basename = basename[:2], basename[2:]
    cachedir = os.path.join(cachedir, dirpart)
    cname = os.path.join(cachedir, f"{basename}_code.cu")

    nvcc_opts = ["-use_fast_math", "-w"]

    if configuration["check_src_hashes"] or configuration["debug"]:
        matching = comm.allreduce(basename, op=_check_op)
        if matching != basename:
            # Dump all src code to disk for debugging
            output = os.path.join(cachedir, "mismatching-kernels")
            srcfile = os.path.join(output, "src-rank%d.cu" % comm.rank)
            if comm.rank == 0:
                os.makedirs(output, exist_ok=True)
            comm.barrier()
            with open(srcfile, "w") as f:
                f.write(global_kernel.code_to_compile)
            comm.barrier()
            raise CompilationError("Generated code differs across ranks"
                                   f" (see output in {output})")

    if os.path.isfile(cname):
        # Are we in the cache?
        with open(cname, "r") as f:
            source_module = SourceModule(f.read(), options=nvcc_opts,
                                         cache_dir=cachedir)
    else:
        # No, let's go ahead and build
        if comm.rank == 0:
            # No need to do this on all ranks
            os.makedirs(cachedir, exist_ok=True)
            with progress(INFO, "Compiling wrapper"):
                # make sure it compiles successfully before writing to file
                source_module = SourceModule(global_kernel.code_to_compile,
                                             options=nvcc_opts,
                                             cache_dir=cachedir)
                with open(cname, "w") as f:
                    f.write(global_kernel.code_to_compile)
        comm.barrier()

    cu_func = source_module.get_function(global_kernel.name)

    type_map = {ctypes.c_void_p: "P", ctypes.c_int: "i"}
    argtypes = "".join(type_map[t] for t in global_kernel.argtypes)
    cu_func.prepare(argtypes)

    return cu_func
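For orientation, a launch through PyCUDA's prepared-call interface might look like the sketch below. The launch shape, buffer sizes, and the names n_cells, d_arg0 and d_arg1 are assumptions for illustration; only prepare/prepared_call come from PyCUDA's documented API, and comm/global_kernel are assumed to exist in the calling context.

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.driver as cuda

cu_func = get_prepared_cuda_function(comm, global_kernel)

n_cells = 1024                               # hypothetical iteration count
block_x = 128                                # caller-chosen launch shape
grid_x = (n_cells + block_x - 1) // block_x
d_arg0 = cuda.mem_alloc(n_cells * 8)         # hypothetical device buffers
d_arg1 = cuda.mem_alloc(n_cells * 8)

# prepare(argtypes) enables the low-overhead prepared_call path; the format
# string built above uses "P" for pointers and "i" for ints, so the launch
# arguments must follow the wrapper's argument order.
cu_func.prepared_call((grid_x, 1), (block_x, 1, 1),
                      np.int32(0), np.int32(n_cells), d_arg0, d_arg1)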
@collective
def get_opencl_kernel(comm, global_kernel):
    import pyopencl as cl
    from pyop2.backends.opencl import opencl_backend
    cl_ctx = opencl_backend.context

    # Determine cache key
    hsh = md5(str(global_kernel.cache_key[1:]).encode())
    basename = hsh.hexdigest()
    cachedir = configuration["cache_dir"]
    dirpart, basename = basename[:2], basename[2:]
    cachedir = os.path.join(cachedir, dirpart)
    cname = os.path.join(cachedir, f"{basename}_code.cl")

    if configuration["check_src_hashes"] or configuration["debug"]:
        matching = comm.allreduce(basename, op=_check_op)
        if matching != basename:
            # Dump all src code to disk for debugging
            output = os.path.join(cachedir, "mismatching-kernels")
            srcfile = os.path.join(output, "src-rank%d.cl" % comm.rank)
            if comm.rank == 0:
                os.makedirs(output, exist_ok=True)
            comm.barrier()
            with open(srcfile, "w") as f:
                f.write(global_kernel.code_to_compile)
            comm.barrier()
            raise CompilationError("Generated code differs across ranks"
                                   f" (see output in {output})")

    if os.path.isfile(cname):
        # Are we in the cache?
        with open(cname, "r") as f:
            prg = cl.Program(cl_ctx, f.read()).build(options=[],
                                                     cache_dir=cachedir)
    else:
        # No, let's go ahead and build
        if comm.rank == 0:
            # No need to do this on all ranks
            os.makedirs(cachedir, exist_ok=True)
            with progress(INFO, "Compiling wrapper"):
                # make sure it compiles successfully before writing to file
                prg = (cl.Program(cl_ctx,
                                  global_kernel.code_to_compile)
                       .build(options=[], cache_dir=cachedir))
                with open(cname, "w") as f:
                    f.write(global_kernel.code_to_compile)
        comm.barrier()

    cl_knl = cl.Kernel(prg, global_kernel.name)
    return cl_knl
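The OpenCL side would be enqueued through pyopencl in a similar spirit; the queue creation, work sizes, and argument names below are again illustrative assumptions, not part of this commit.

import numpy as np
import pyopencl as cl

from pyop2.backends.opencl import opencl_backend

cl_knl = get_opencl_kernel(comm, global_kernel)

ctx = opencl_backend.context
queue = cl.CommandQueue(ctx)                 # assumed queue on the backend's context
local_size, global_size = 128, 1024          # caller-chosen work sizes
d_arg0 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=global_size * 8)
d_arg1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=global_size * 8)

# A pyopencl Kernel object is callable as knl(queue, gsize, lsize, *args).
cl_knl(queue, (global_size,), (local_size,),
       np.int32(0), np.int32(global_size), d_arg0, d_arg1)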

pyop2/configuration.py

Lines changed: 3 additions & 1 deletion
@@ -113,7 +113,9 @@ class Configuration(dict):
         "matnest":
             ("PYOP2_MATNEST", bool, True),
         "block_sparsity":
-            ("PYOP2_BLOCK_SPARSITY", bool, True)
+            ("PYOP2_BLOCK_SPARSITY", bool, True),
+        "gpu_strategy":
+            ("PYOP2_GPU_STRATEGY", str, "scpt"),
     }
     """Default values for PyOP2 configuration parameters"""
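Since the new option goes through the standard Configuration machinery, it should be overridable from the environment before PyOP2 is imported; a minimal sketch:

import os
os.environ["PYOP2_GPU_STRATEGY"] = "scpt"  # must be set before pyop2 is imported

from pyop2.configuration import configuration
print(configuration["gpu_strategy"])       # -> "scpt" (the default)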

pyop2/transforms/gpu_utils.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
import loopy as lp


def get_loopy_target(target):
    if target == "opencl":
        return lp.PyOpenCLTarget()
    elif target == "cuda":
        return lp.CudaTarget()
    else:
        raise NotImplementedError()


def preprocess_t_unit_for_gpu(t_unit):

    # {{{ inline all kernels in t_unit

    kernels_to_inline = {
        name for name, clbl in t_unit.callables_table.items()
        if isinstance(clbl, lp.CallableKernel)}

    for knl_name in kernels_to_inline:
        t_unit = lp.inline_callable_kernel(t_unit, knl_name)

    # }}}

    kernel = t_unit.default_entrypoint

    # changing the address space of temps
    def _change_aspace_tvs(tv):
        if tv.read_only:
            assert tv.initializer is not None
            return tv.copy(address_space=lp.AddressSpace.GLOBAL)
        else:
            return tv.copy(address_space=lp.AddressSpace.PRIVATE)

    new_tvs = {tv_name: _change_aspace_tvs(tv) for tv_name, tv in
               kernel.temporary_variables.items()}
    kernel = kernel.copy(temporary_variables=new_tvs)

    def insn_needs_atomic(insn):
        # updates to global variables are atomic
        import pymbolic
        if isinstance(insn, lp.Assignment):
            if isinstance(insn.assignee, pymbolic.primitives.Subscript):
                assignee_name = insn.assignee.aggregate.name
            else:
                assert isinstance(insn.assignee, pymbolic.primitives.Variable)
                assignee_name = insn.assignee.name

            if assignee_name in kernel.arg_dict:
                return assignee_name in insn.read_dependency_names()
        return False

    new_insns = []
    args_marked_for_atomic = set()
    for insn in kernel.instructions:
        if insn_needs_atomic(insn):
            atomicity = (lp.AtomicUpdate(insn.assignee.aggregate.name), )
            insn = insn.copy(atomicity=atomicity)
            args_marked_for_atomic |= set([insn.assignee.aggregate.name])

        new_insns.append(insn)

    # label args as atomic
    new_args = []
    for arg in kernel.args:
        if arg.name in args_marked_for_atomic:
            new_args.append(arg.copy(for_atomic=True))
        else:
            new_args.append(arg)

    kernel = kernel.copy(instructions=new_insns, args=new_args)

    return t_unit.with_kernel(kernel)
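To illustrate the atomic-marking pass in isolation, consider a toy translation unit (a made-up stand-in for a PyOP2 scatter, not commit code) in which a global argument is updated in place; by the rules above, the assignment should come back tagged with lp.AtomicUpdate.

import numpy as np
import loopy as lp

from pyop2.transforms.gpu_utils import preprocess_t_unit_for_gpu

# "out" is a kernel argument that is both read and written by the same
# assignment, so insn_needs_atomic() should fire for it.
t_unit = lp.make_kernel(
    "{[n]: 0 <= n < N}",
    "out[map[n]] = out[map[n]] + x[n]",
    [lp.GlobalArg("out", np.float64, shape=("N",)),
     lp.GlobalArg("x", np.float64, shape=("N",)),
     lp.GlobalArg("map", np.int32, shape=("N",)),
     lp.ValueArg("N", np.int32)],
    name="toy_scatter",
    target=lp.CudaTarget())

t_unit = preprocess_t_unit_for_gpu(t_unit)
insn, = t_unit.default_entrypoint.instructions
print(insn.atomicity)   # expected: (AtomicUpdate("out"),)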
def apply_gpu_transforms(t_unit, target):
    t_unit = t_unit.copy(target=get_loopy_target(target))
    t_unit = preprocess_t_unit_for_gpu(t_unit)
    kernel = t_unit.default_entrypoint

    kernel = lp.assume(kernel, "end > start")

    if kernel.name in [
            "wrap_form0_cell_integral_otherwise",
            "wrap_form0_exterior_facet_integral_otherwise",
            "wrap_form0_interior_facet_integral_otherwise",
            "wrap_form1_cell_integral_otherwise",
            "wrap_zero", "wrap_expression_kernel",
            "wrap_expression", "wrap_pyop2_kernel_uniform_extrusion",
            "wrap_form_cell_integral_otherwise",
            "wrap_loopy_kernel_prolong",
            "wrap_loopy_kernel_restrict",
            "wrap_loopy_kernel_inject", "wrap_copy", "wrap_inner"]:
        from pyop2.transforms.snpt import snpt_transform
        kernel, args_to_make_global = snpt_transform(kernel, 32)
    else:
        raise NotImplementedError(f"Transformation for '{kernel.name}'.")

    t_unit = t_unit.with_kernel(kernel)

    return t_unit, args_to_make_global
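As a quick smoke test of the whole pipeline, one of the whitelisted wrapper names can be exercised with a stand-in kernel; apart from apply_gpu_transforms itself, everything below is an illustrative assumption rather than real PyOP2 wrapper output.

import numpy as np
import loopy as lp

from pyop2.transforms.gpu_utils import apply_gpu_transforms

# A trivial stand-in for the "wrap_copy" wrapper.
t_unit = lp.make_kernel(
    "{[n]: start <= n < end}",
    "out[n] = x[n]",
    [lp.GlobalArg("out", np.float64, shape=("end",)),
     lp.GlobalArg("x", np.float64, shape=("end",)),
     lp.ValueArg("start", np.int32),
     lp.ValueArg("end", np.int32)],
    name="wrap_copy")

t_unit, extra_args = apply_gpu_transforms(t_unit, "cuda")
print(lp.generate_code_v2(t_unit).device_code())   # CUDA source for the split loop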

pyop2/transforms/snpt.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import loopy as lp


def _make_tv_array_arg(tv):
    assert tv.address_space != lp.AddressSpace.PRIVATE
    arg = lp.ArrayArg(name=tv.name,
                      dtype=tv.dtype,
                      shape=tv.shape,
                      dim_tags=tv.dim_tags,
                      offset=tv.offset,
                      dim_names=tv.dim_names,
                      order=tv.order,
                      alignment=tv.alignment,
                      address_space=tv.address_space,
                      is_output=not tv.read_only,
                      is_input=tv.read_only)
    return arg


def snpt_transform(kernel, block_size):
    """
    SNPT := Single 'n' Per Thread.

    Implements an outer-loop parallelization strategy.

    PyOP2 uses 'n' as the outer loop iname. In Firedrake, 'n' might denote
    either a cell or a DOF.
    """

    kernel = lp.assume(kernel, "start < end")
    kernel = lp.split_iname(kernel, "n", block_size,
                            outer_tag="g.0", inner_tag="l.0")

    # {{{ make constants global: necessary for the strategy to emit valid
    # kernels for all forms

    old_temps = kernel.temporary_variables.copy()
    args_to_make_global = [tv.initializer.flatten()
                           for tv in old_temps.values()
                           if tv.initializer is not None]

    new_temps = {tv.name: tv
                 for tv in old_temps.values()
                 if tv.initializer is None}
    kernel = kernel.copy(args=kernel.args + [_make_tv_array_arg(tv)
                                             for tv in old_temps.values()
                                             if tv.initializer is not None],
                         temporary_variables=new_temps)

    # }}}

    return kernel, args_to_make_global
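The constants-to-globals step can be seen on a toy kernel (again a stand-in, not commit code) with a constant-initialized temporary: the temporary disappears from the kernel, an ArrayArg of the same name is appended, and the flattened initializer is handed back for the caller to transfer to the device.

import numpy as np
import loopy as lp

from pyop2.transforms.snpt import snpt_transform

t_unit = lp.make_kernel(
    "{[n]: start <= n < end}",
    "out[n] = w[n % 3] * x[n]",
    [lp.GlobalArg("out", np.float64, shape=("end",)),
     lp.GlobalArg("x", np.float64, shape=("end",)),
     lp.TemporaryVariable("w", initializer=np.array([1., 2., 3.]),
                          read_only=True,
                          address_space=lp.AddressSpace.GLOBAL),
     lp.ValueArg("start", np.int32),
     lp.ValueArg("end", np.int32)],
    name="toy_wrap",
    target=lp.CudaTarget())

kernel, extra = snpt_transform(t_unit.default_entrypoint, 32)
print(extra)   # [array([1., 2., 3.])] -- data the caller must place on the device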
