Commit ecba4dc

implement GPU codegen helpers
1 parent a41672c commit ecba4dc

4 files changed

Lines changed: 265 additions & 1 deletion

pyop2/compilation.py

Lines changed: 108 additions & 0 deletions
@@ -696,3 +696,111 @@ def clear_cache(prompt=False):
        shutil.rmtree(cachedir)
    else:
        print("Not removing cached libraries")


@collective
def get_prepared_cuda_function(comm, global_kernel):
    from pycuda.compiler import SourceModule

    # Determine cache key
    hsh = md5(str(global_kernel.cache_key[1:]).encode())
    basename = hsh.hexdigest()
    cachedir = configuration["cache_dir"]
    dirpart, basename = basename[:2], basename[2:]
    cachedir = os.path.join(cachedir, dirpart)
    cname = os.path.join(cachedir, f"{basename}_code.cu")

    nvcc_opts = ["-use_fast_math", "-w"]

    if configuration["check_src_hashes"] or configuration["debug"]:
        matching = comm.allreduce(basename, op=_check_op)
        if matching != basename:
            # Dump all src code to disk for debugging
            output = os.path.join(cachedir, "mismatching-kernels")
            srcfile = os.path.join(output, "src-rank%d.cu" % comm.rank)
            if comm.rank == 0:
                os.makedirs(output, exist_ok=True)
            comm.barrier()
            with open(srcfile, "w") as f:
                f.write(global_kernel.code_to_compile)
            comm.barrier()
            raise CompilationError("Generated code differs across ranks"
                                   f" (see output in {output})")

    if os.path.isfile(cname):
        # Are we in the cache?
        with open(cname, "r") as f:
            source_module = SourceModule(f.read(), options=nvcc_opts,
                                         cache_dir=cachedir)
    else:
        # No, let's go ahead and build
        if comm.rank == 0:
            # No need to do this on all ranks
            os.makedirs(cachedir, exist_ok=True)
            with progress(INFO, "Compiling wrapper"):
                # make sure it compiles successfully before writing to file
                source_module = SourceModule(global_kernel.code_to_compile,
                                             options=nvcc_opts,
                                             cache_dir=cachedir)
                with open(cname, "w") as f:
                    f.write(global_kernel.code_to_compile)
        comm.barrier()

    cu_func = source_module.get_function(global_kernel.name)

    type_map = {ctypes.c_void_p: "P", ctypes.c_int: "i"}
    argtypes = "".join(type_map[t] for t in global_kernel.argtypes)
    cu_func.prepare(argtypes)

    return cu_func
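For orientation, a launch through PyCUDA's prepared-call interface might look like the sketch below. The launch shape, buffer sizes, and the names n_cells, d_arg0 and d_arg1 are assumptions for illustration; only prepare/prepared_call come from PyCUDA's documented API, and comm/global_kernel are assumed to exist in the calling context.

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.driver as cuda

cu_func = get_prepared_cuda_function(comm, global_kernel)

n_cells = 1024                               # hypothetical iteration count
block_x = 128                                # caller-chosen launch shape
grid_x = (n_cells + block_x - 1) // block_x
d_arg0 = cuda.mem_alloc(n_cells * 8)         # hypothetical device buffers
d_arg1 = cuda.mem_alloc(n_cells * 8)

# prepare(argtypes) enables the low-overhead prepared_call path; the format
# string built above uses "P" for pointers and "i" for ints, so the launch
# arguments must follow the wrapper's argument order.
cu_func.prepared_call((grid_x, 1), (block_x, 1, 1),
                      np.int32(0), np.int32(n_cells), d_arg0, d_arg1)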
@collective
def get_opencl_kernel(comm, global_kernel):
    import pyopencl as cl
    from pyop2.backends.opencl import opencl_backend
    cl_ctx = opencl_backend.context

    # Determine cache key
    hsh = md5(str(global_kernel.cache_key[1:]).encode())
    basename = hsh.hexdigest()
    cachedir = configuration["cache_dir"]
    dirpart, basename = basename[:2], basename[2:]
    cachedir = os.path.join(cachedir, dirpart)
    cname = os.path.join(cachedir, f"{basename}_code.cl")

    if configuration["check_src_hashes"] or configuration["debug"]:
        matching = comm.allreduce(basename, op=_check_op)
        if matching != basename:
            # Dump all src code to disk for debugging
            output = os.path.join(cachedir, "mismatching-kernels")
            srcfile = os.path.join(output, "src-rank%d.cl" % comm.rank)
            if comm.rank == 0:
                os.makedirs(output, exist_ok=True)
            comm.barrier()
            with open(srcfile, "w") as f:
                f.write(global_kernel.code_to_compile)
            comm.barrier()
            raise CompilationError("Generated code differs across ranks"
                                   f" (see output in {output})")

    if os.path.isfile(cname):
        # Are we in the cache?
        with open(cname, "r") as f:
            prg = cl.Program(cl_ctx, f.read()).build(options=[],
                                                     cache_dir=cachedir)
    else:
        # No, let's go ahead and build
        if comm.rank == 0:
            # No need to do this on all ranks
            os.makedirs(cachedir, exist_ok=True)
            with progress(INFO, "Compiling wrapper"):
                # make sure it compiles successfully before writing to file
                prg = (cl.Program(cl_ctx,
                                  global_kernel.code_to_compile)
                       .build(options=[], cache_dir=cachedir))
                with open(cname, "w") as f:
                    f.write(global_kernel.code_to_compile)
        comm.barrier()

    cl_knl = cl.Kernel(prg, global_kernel.name)
    return cl_knl
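The OpenCL side would be enqueued through pyopencl in a similar spirit; the queue creation, work sizes, and argument names below are again illustrative assumptions, not part of this commit.

import numpy as np
import pyopencl as cl

from pyop2.backends.opencl import opencl_backend

cl_knl = get_opencl_kernel(comm, global_kernel)

ctx = opencl_backend.context
queue = cl.CommandQueue(ctx)                 # assumed queue on the backend's context
local_size, global_size = 128, 1024          # caller-chosen work sizes
d_arg0 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=global_size * 8)
d_arg1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=global_size * 8)

# A pyopencl Kernel object is callable as knl(queue, gsize, lsize, *args).
cl_knl(queue, (global_size,), (local_size,),
       np.int32(0), np.int32(global_size), d_arg0, d_arg1)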

pyop2/configuration.py

Lines changed: 3 additions & 1 deletion
@@ -113,7 +113,9 @@ class Configuration(dict):
         "matnest":
             ("PYOP2_MATNEST", bool, True),
         "block_sparsity":
-            ("PYOP2_BLOCK_SPARSITY", bool, True)
+            ("PYOP2_BLOCK_SPARSITY", bool, True),
+        "gpu_strategy":
+            ("PYOP2_GPU_STRATEGY", str, "scpt"),
     }
     """Default values for PyOP2 configuration parameters"""
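Since the new option goes through the standard Configuration machinery, it should be overridable from the environment before PyOP2 is imported; a minimal sketch:

import os
os.environ["PYOP2_GPU_STRATEGY"] = "scpt"  # must be set before pyop2 is imported

from pyop2.configuration import configuration
print(configuration["gpu_strategy"])       # -> "scpt" (the default)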

pyop2/transforms/gpu_utils.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
import loopy as lp


def get_loopy_target(target):
    if target == "opencl":
        return lp.PyOpenCLTarget()
    elif target == "cuda":
        return lp.CudaTarget()
    else:
        raise NotImplementedError()


def preprocess_t_unit_for_gpu(t_unit):

    # {{{ inline all kernels in t_unit

    kernels_to_inline = {
        name for name, clbl in t_unit.callables_table.items()
        if isinstance(clbl, lp.CallableKernel)}

    for knl_name in kernels_to_inline:
        t_unit = lp.inline_callable_kernel(t_unit, knl_name)

    # }}}

    kernel = t_unit.default_entrypoint

    # changing the address space of temps
    def _change_aspace_tvs(tv):
        if tv.read_only:
            assert tv.initializer is not None
            return tv.copy(address_space=lp.AddressSpace.GLOBAL)
        else:
            return tv.copy(address_space=lp.AddressSpace.PRIVATE)

    new_tvs = {tv_name: _change_aspace_tvs(tv) for tv_name, tv in
               kernel.temporary_variables.items()}
    kernel = kernel.copy(temporary_variables=new_tvs)

    def insn_needs_atomic(insn):
        # updates to global variables are atomic
        import pymbolic
        if isinstance(insn, lp.Assignment):
            if isinstance(insn.assignee, pymbolic.primitives.Subscript):
                assignee_name = insn.assignee.aggregate.name
            else:
                assert isinstance(insn.assignee, pymbolic.primitives.Variable)
                assignee_name = insn.assignee.name

            if assignee_name in kernel.arg_dict:
                return assignee_name in insn.read_dependency_names()
        return False

    new_insns = []
    args_marked_for_atomic = set()
    for insn in kernel.instructions:
        if insn_needs_atomic(insn):
            atomicity = (lp.AtomicUpdate(insn.assignee.aggregate.name), )
            insn = insn.copy(atomicity=atomicity)
            args_marked_for_atomic |= set([insn.assignee.aggregate.name])

        new_insns.append(insn)

    # label args as atomic
    new_args = []
    for arg in kernel.args:
        if arg.name in args_marked_for_atomic:
            new_args.append(arg.copy(for_atomic=True))
        else:
            new_args.append(arg)

    kernel = kernel.copy(instructions=new_insns, args=new_args)

    return t_unit.with_kernel(kernel)
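To illustrate the atomic-marking pass in isolation, consider a toy translation unit (a made-up stand-in for a PyOP2 scatter, not commit code) in which a global argument is updated in place; by the rules above, the assignment should come back tagged with lp.AtomicUpdate.

import numpy as np
import loopy as lp

from pyop2.transforms.gpu_utils import preprocess_t_unit_for_gpu

# "out" is a kernel argument that is both read and written by the same
# assignment, so insn_needs_atomic() should fire for it.
t_unit = lp.make_kernel(
    "{[n]: 0 <= n < N}",
    "out[map[n]] = out[map[n]] + x[n]",
    [lp.GlobalArg("out", np.float64, shape=("N",)),
     lp.GlobalArg("x", np.float64, shape=("N",)),
     lp.GlobalArg("map", np.int32, shape=("N",)),
     lp.ValueArg("N", np.int32)],
    name="toy_scatter",
    target=lp.CudaTarget())

t_unit = preprocess_t_unit_for_gpu(t_unit)
insn, = t_unit.default_entrypoint.instructions
print(insn.atomicity)   # expected: (AtomicUpdate("out"),)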
def apply_gpu_transforms(t_unit, target):
    t_unit = t_unit.copy(target=get_loopy_target(target))
    t_unit = preprocess_t_unit_for_gpu(t_unit)
    kernel = t_unit.default_entrypoint

    kernel = lp.assume(kernel, "end > start")

    if kernel.name in [
            "wrap_form0_cell_integral_otherwise",
            "wrap_form0_exterior_facet_integral_otherwise",
            "wrap_form0_interior_facet_integral_otherwise",
            "wrap_form1_cell_integral_otherwise",
            "wrap_zero", "wrap_expression_kernel",
            "wrap_expression", "wrap_pyop2_kernel_uniform_extrusion",
            "wrap_form_cell_integral_otherwise",
            "wrap_loopy_kernel_prolong",
            "wrap_loopy_kernel_restrict",
            "wrap_loopy_kernel_inject", "wrap_copy", "wrap_inner"]:
        from pyop2.transforms.snpt import snpt_transform
        kernel, args_to_make_global = snpt_transform(kernel, 32)
    else:
        raise NotImplementedError(f"Transformation for '{kernel.name}'.")

    t_unit = t_unit.with_kernel(kernel)

    return t_unit, args_to_make_global
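As a quick smoke test of the whole pipeline, one of the whitelisted wrapper names can be exercised with a stand-in kernel; apart from apply_gpu_transforms itself, everything below is an illustrative assumption rather than real PyOP2 wrapper output.

import numpy as np
import loopy as lp

from pyop2.transforms.gpu_utils import apply_gpu_transforms

# A trivial stand-in for the "wrap_copy" wrapper.
t_unit = lp.make_kernel(
    "{[n]: start <= n < end}",
    "out[n] = x[n]",
    [lp.GlobalArg("out", np.float64, shape=("end",)),
     lp.GlobalArg("x", np.float64, shape=("end",)),
     lp.ValueArg("start", np.int32),
     lp.ValueArg("end", np.int32)],
    name="wrap_copy")

t_unit, extra_args = apply_gpu_transforms(t_unit, "cuda")
print(lp.generate_code_v2(t_unit).device_code())   # CUDA source for the split loop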

pyop2/transforms/snpt.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import loopy as lp


def _make_tv_array_arg(tv):
    assert tv.address_space != lp.AddressSpace.PRIVATE
    arg = lp.ArrayArg(name=tv.name,
                      dtype=tv.dtype,
                      shape=tv.shape,
                      dim_tags=tv.dim_tags,
                      offset=tv.offset,
                      dim_names=tv.dim_names,
                      order=tv.order,
                      alignment=tv.alignment,
                      address_space=tv.address_space,
                      is_output=not tv.read_only,
                      is_input=tv.read_only)
    return arg


def snpt_transform(kernel, block_size):
    """
    SNPT := Single 'n' Per Thread.

    Implements an outer-loop parallelization strategy.

    PyOP2 uses 'n' as the outer loop iname. In Firedrake, 'n' might denote
    either a cell or a DOF.
    """

    kernel = lp.assume(kernel, "start < end")
    kernel = lp.split_iname(kernel, "n", block_size,
                            outer_tag="g.0", inner_tag="l.0")

    # {{{ make constants global: necessary for the strategy to emit valid
    # kernels for all forms

    old_temps = kernel.temporary_variables.copy()
    args_to_make_global = [tv.initializer.flatten()
                           for tv in old_temps.values()
                           if tv.initializer is not None]

    new_temps = {tv.name: tv
                 for tv in old_temps.values()
                 if tv.initializer is None}
    kernel = kernel.copy(args=kernel.args + [_make_tv_array_arg(tv)
                                             for tv in old_temps.values()
                                             if tv.initializer is not None],
                         temporary_variables=new_temps)

    # }}}

    return kernel, args_to_make_global
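The constants-to-globals step can be seen on a toy kernel (again a stand-in, not commit code) with a constant-initialized temporary: the temporary disappears from the kernel, an ArrayArg of the same name is appended, and the flattened initializer is handed back for the caller to transfer to the device.

import numpy as np
import loopy as lp

from pyop2.transforms.snpt import snpt_transform

t_unit = lp.make_kernel(
    "{[n]: start <= n < end}",
    "out[n] = w[n % 3] * x[n]",
    [lp.GlobalArg("out", np.float64, shape=("end",)),
     lp.GlobalArg("x", np.float64, shape=("end",)),
     lp.TemporaryVariable("w", initializer=np.array([1., 2., 3.]),
                          read_only=True,
                          address_space=lp.AddressSpace.GLOBAL),
     lp.ValueArg("start", np.int32),
     lp.ValueArg("end", np.int32)],
    name="toy_wrap",
    target=lp.CudaTarget())

kernel, extra = snpt_transform(t_unit.default_entrypoint, 32)
print(extra)   # [array([1., 2., 3.])] -- data the caller must place on the device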
