Skip to content

Commit 0c07d2a

Browse files
committed
implement GPU codegen helpers
1 parent 3238129 commit 0c07d2a

6 files changed

Lines changed: 258 additions & 15 deletions

File tree

pyop2/compilation.py

Lines changed: 103 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,29 @@ def sniff_compiler(exe):
147147
return compiler
148148

149149

150+
def _check_src_hashes(comm, global_kernel):
151+
hsh = md5(str(global_kernel.cache_key[1:]).encode())
152+
basename = hsh.hexdigest()
153+
dirpart, basename = basename[:2], basename[2:]
154+
cachedir = configuration["cache_dir"]
155+
cachedir = os.path.join(cachedir, dirpart)
156+
157+
if configuration["check_src_hashes"] or configuration["debug"]:
158+
matching = comm.allreduce(basename, op=_check_op)
159+
if matching != basename:
160+
# Dump all src code to disk for debugging
161+
output = os.path.join(cachedir, "mismatching-kernels")
162+
srcfile = os.path.join(output, "src-rank%d.c" % comm.rank)
163+
if comm.rank == 0:
164+
os.makedirs(output, exist_ok=True)
165+
comm.barrier()
166+
with open(srcfile, "w") as f:
167+
f.write(global_kernel.code_to_compile)
168+
comm.barrier()
169+
raise CompilationError("Generated code differs across ranks"
170+
f" (see output in {output})")
171+
172+
150173
class Compiler(ABC):
151174
"""A compiler for shared libraries.
152175
@@ -317,19 +340,8 @@ def get_so(self, jitmodule, extension):
317340
# atomically (avoiding races).
318341
tmpname = os.path.join(cachedir, "%s_p%d.so.tmp" % (basename, pid))
319342

320-
if configuration['check_src_hashes'] or configuration['debug']:
321-
matching = self.comm.allreduce(basename, op=_check_op)
322-
if matching != basename:
323-
# Dump all src code to disk for debugging
324-
output = os.path.join(configuration["cache_dir"], "mismatching-kernels")
325-
srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
326-
if self.comm.rank == 0:
327-
os.makedirs(output, exist_ok=True)
328-
self.comm.barrier()
329-
with open(srcfile, "w") as f:
330-
f.write(jitmodule.code_to_compile)
331-
self.comm.barrier()
332-
raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
343+
_check_src_hashes(self.comm, jitmodule)
344+
333345
try:
334346
# Are we in the cache?
335347
return ctypes.CDLL(soname)
@@ -662,3 +674,81 @@ def clear_cache(prompt=False):
662674
shutil.rmtree(cachedir)
663675
else:
664676
print("Not removing cached libraries")
677+
678+
679+
def _get_code_to_compile(comm, global_kernel):
680+
# Determine cache key
681+
hsh = md5(str(global_kernel.cache_key[1:]).encode())
682+
basename = hsh.hexdigest()
683+
cachedir = configuration["cache_dir"]
684+
dirpart, basename = basename[:2], basename[2:]
685+
cachedir = os.path.join(cachedir, dirpart)
686+
cname = os.path.join(cachedir, f"{basename}_code.cu")
687+
688+
_check_src_hashes(comm, global_kernel)
689+
690+
if os.path.isfile(cname):
691+
# Are we in the cache?
692+
with open(cname, "r") as f:
693+
code_to_compile = f.read()
694+
else:
695+
# No, let"s go ahead and build
696+
if comm.rank == 0:
697+
# No need to do this on all ranks
698+
os.makedirs(cachedir, exist_ok=True)
699+
with progress(INFO, "Compiling wrapper"):
700+
# make sure that compiles successfully before writing to file
701+
code_to_compile = global_kernel.code_to_compile
702+
with open(cname, "w") as f:
703+
f.write(code_to_compile)
704+
comm.barrier()
705+
706+
return code_to_compile
707+
708+
709+
@mpi.collective
710+
def get_prepared_cuda_function(comm, global_kernel):
711+
from pycuda.compiler import SourceModule
712+
713+
# Determine cache key
714+
hsh = md5(str(global_kernel.cache_key[1:]).encode())
715+
basename = hsh.hexdigest()
716+
cachedir = configuration["cache_dir"]
717+
dirpart, basename = basename[:2], basename[2:]
718+
cachedir = os.path.join(cachedir, dirpart)
719+
720+
nvcc_opts = ["-use_fast_math", "-w"]
721+
722+
code_to_compile = _get_code_to_compile(comm, global_kernel)
723+
source_module = SourceModule(code_to_compile, options=nvcc_opts,
724+
cache_dir=cachedir)
725+
726+
cu_func = source_module.get_function(global_kernel.name)
727+
728+
type_map = {ctypes.c_void_p: "P", ctypes.c_int: "i"}
729+
argtypes = "".join(type_map[t] for t in global_kernel.argtypes)
730+
cu_func.prepare(argtypes)
731+
732+
return cu_func
733+
734+
735+
@mpi.collective
736+
def get_opencl_kernel(comm, global_kernel):
737+
import pyopencl as cl
738+
from pyop2.backends.opencl import opencl_backend
739+
cl_ctx = opencl_backend.context
740+
741+
# Determine cache key
742+
hsh = md5(str(global_kernel.cache_key[1:]).encode())
743+
basename = hsh.hexdigest()
744+
cachedir = configuration["cache_dir"]
745+
dirpart, basename = basename[:2], basename[2:]
746+
cachedir = os.path.join(cachedir, dirpart)
747+
748+
code_to_compile = _get_code_to_compile(comm, global_kernel)
749+
750+
prg = cl.Program(cl_ctx, code_to_compile).build(options=[],
751+
cache_dir=cachedir)
752+
753+
cl_knl = cl.Kernel(prg, global_kernel.name)
754+
return cl_knl

pyop2/configuration.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ class Configuration(dict):
7474
cdim > 1 be built as block sparsities, or dof sparsities. The
7575
former saves memory but changes which preconditioners are
7676
available for the resulting matrices. (Default yes)
77+
:param gpu_strategy: A :class:str` indicating the transformation strategy
78+
that must be applied to a :class:`pyop2.global_kernel.GlobalKernel`
79+
when offloading to a GPGPU. Can be one of:
80+
- ``"snpt"``: Single-"N" Per Thread. In the transform strategy, the
81+
work of each element of the iteration set over which a global kernel
82+
operates is assigned to a work-item (i.e. a CUDA thread)
7783
"""
7884
# name, env variable, type, default, write once
7985
cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid())
@@ -113,7 +119,9 @@ class Configuration(dict):
113119
"matnest":
114120
("PYOP2_MATNEST", bool, True),
115121
"block_sparsity":
116-
("PYOP2_BLOCK_SPARSITY", bool, True)
122+
("PYOP2_BLOCK_SPARSITY", bool, True),
123+
"gpu_strategy":
124+
("PYOP2_GPU_STRATEGY", str, "snpt"),
117125
}
118126
"""Default values for PyOP2 configuration parameters"""
119127

pyop2/transforms/__init__.py

Whitespace-only changes.

pyop2/transforms/gpu_utils.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import loopy as lp
2+
from pyop2.configuration import configuration
3+
4+
5+
def get_loopy_target(target):
6+
if target == "opencl":
7+
return lp.PyOpenCLTarget()
8+
elif target == "cuda":
9+
return lp.CudaTarget()
10+
else:
11+
raise NotImplementedError()
12+
13+
14+
def preprocess_t_unit_for_gpu(t_unit):
15+
16+
# {{{ inline all kernels in t_unit
17+
18+
kernels_to_inline = {
19+
name for name, clbl in t_unit.callables_table.items()
20+
if isinstance(clbl, lp.CallableKernel)}
21+
22+
for knl_name in kernels_to_inline:
23+
t_unit = lp.inline_callable_kernel(t_unit, knl_name)
24+
25+
# }}}
26+
27+
kernel = t_unit.default_entrypoint
28+
29+
# changing the address space of temps
30+
def _change_aspace_tvs(tv):
31+
if tv.read_only:
32+
assert tv.initializer is not None
33+
return tv.copy(address_space=lp.AddressSpace.GLOBAL)
34+
else:
35+
return tv.copy(address_space=lp.AddressSpace.PRIVATE)
36+
37+
new_tvs = {tv_name: _change_aspace_tvs(tv) for tv_name, tv in
38+
kernel.temporary_variables.items()}
39+
kernel = kernel.copy(temporary_variables=new_tvs)
40+
41+
def insn_needs_atomic(insn):
42+
# updates to global variables are atomic
43+
import pymbolic
44+
if isinstance(insn, lp.Assignment):
45+
if isinstance(insn.assignee, pymbolic.primitives.Subscript):
46+
assignee_name = insn.assignee.aggregate.name
47+
else:
48+
assert isinstance(insn.assignee, pymbolic.primitives.Variable)
49+
assignee_name = insn.assignee.name
50+
51+
if assignee_name in kernel.arg_dict:
52+
return assignee_name in insn.read_dependency_names()
53+
return False
54+
55+
new_insns = []
56+
args_marked_for_atomic = set()
57+
for insn in kernel.instructions:
58+
if insn_needs_atomic(insn):
59+
atomicity = (lp.AtomicUpdate(insn.assignee.aggregate.name), )
60+
insn = insn.copy(atomicity=atomicity)
61+
args_marked_for_atomic |= set([insn.assignee.aggregate.name])
62+
63+
new_insns.append(insn)
64+
65+
# label args as atomic
66+
new_args = []
67+
for arg in kernel.args:
68+
if arg.name in args_marked_for_atomic:
69+
new_args.append(arg.copy(for_atomic=True))
70+
else:
71+
new_args.append(arg)
72+
73+
kernel = kernel.copy(instructions=new_insns, args=new_args)
74+
75+
return t_unit.with_kernel(kernel)
76+
77+
78+
def apply_gpu_transforms(t_unit, target):
79+
t_unit = t_unit.copy(target=get_loopy_target(target))
80+
t_unit = preprocess_t_unit_for_gpu(t_unit)
81+
kernel = t_unit.default_entrypoint
82+
transform_strategy = configuration["gpu_strategy"]
83+
84+
kernel = lp.assume(kernel, "end > start")
85+
86+
if transform_strategy == "snpt":
87+
from pyop2.transforms.snpt import split_n_across_workgroups
88+
kernel, args_to_make_global = split_n_across_workgroups(kernel, 32)
89+
else:
90+
raise NotImplementedError(f"'{transform_strategy}' transform strategy.")
91+
92+
t_unit = t_unit.with_kernel(kernel)
93+
94+
return t_unit, args_to_make_global

pyop2/transforms/snpt.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import loopy as lp
2+
3+
4+
def _make_tv_array_arg(tv):
5+
assert tv.address_space != lp.AddressSpace.PRIVATE
6+
arg = lp.ArrayArg(name=tv.name,
7+
dtype=tv.dtype,
8+
shape=tv.shape,
9+
dim_tags=tv.dim_tags,
10+
offset=tv.offset,
11+
dim_names=tv.dim_names,
12+
order=tv.order,
13+
alignment=tv.alignment,
14+
address_space=tv.address_space,
15+
is_output=not tv.read_only,
16+
is_input=tv.read_only)
17+
return arg
18+
19+
20+
def split_n_across_workgroups(kernel, workgroup_size):
21+
"""
22+
Returns a transformed version of *kernel* with the workload in the loop
23+
with induction variable 'n' distributed across work-groups of size
24+
*workgroup_size* and each work-item in the work-group performing the work
25+
of a single iteration of 'n'.
26+
"""
27+
28+
kernel = lp.assume(kernel, "start < end")
29+
kernel = lp.split_iname(kernel, "n", workgroup_size,
30+
outer_tag="g.0", inner_tag="l.0")
31+
32+
# {{{ making consts as globals: necessary to make the strategy emit valid
33+
# kernels for all forms
34+
35+
old_temps = kernel.temporary_variables.copy()
36+
args_to_make_global = [tv.initializer.flatten()
37+
for tv in old_temps.values()
38+
if tv.initializer is not None]
39+
40+
new_temps = {tv.name: tv
41+
for tv in old_temps.values()
42+
if tv.initializer is None}
43+
kernel = kernel.copy(args=kernel.args+[_make_tv_array_arg(tv)
44+
for tv in old_temps.values()
45+
if tv.initializer is not None],
46+
temporary_variables=new_temps)
47+
48+
# }}}
49+
50+
return kernel, args_to_make_global

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ def run(self):
142142
],
143143
install_requires=install_requires + test_requires,
144144
dependency_links=dep_links,
145-
packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types'],
145+
packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types',
146+
'pyop2.transforms'],
146147
package_data={
147148
'pyop2': ['assets/*', '*.h', '*.pxd', '*.pyx', 'codegen/c/*.c']},
148149
scripts=glob('scripts/*'),

0 commit comments

Comments
 (0)