Skip to content

Commit d0b6daf

Browse files
committed
squash commits
1 parent 5bdbc5c commit d0b6daf

11 files changed

Lines changed: 1984 additions & 25 deletions

File tree

pyop2/base.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@
6565

6666

6767
def _make_object(name, *args, **kwargs):
68-
from pyop2 import sequential
69-
return getattr(sequential, name)(*args, **kwargs)
68+
from pyop2.gpu import cuda as backend
69+
return getattr(backend, name)(*args, **kwargs)
7070

7171

7272
# Data API
@@ -2475,10 +2475,6 @@ def __itruediv__(self, other):
24752475
"""Pointwise division or scaling of fields."""
24762476
return self._iop(other, operator.itruediv)
24772477

2478-
def inner(self, other):
2479-
assert isinstance(other, Global)
2480-
return np.dot(self.data_ro, other.data_ro)
2481-
24822478

24832479
class Map(object):
24842480

@@ -2506,9 +2502,16 @@ def __init__(self, iterset, toset, arity, values=None, name=None, offset=None):
25062502
self._toset = toset
25072503
self.comm = toset.comm
25082504
self._arity = arity
2509-
self._values = verify_reshape(values, IntType,
2510-
(iterset.total_size, arity),
2511-
allow_none=True)
2505+
if False:
2506+
# maps indexed as `map[idof, icell]`
2507+
self._values = verify_reshape(values, IntType,
2508+
(arity, iterset.total_size),
2509+
allow_none=True)
2510+
else:
2511+
# maps indexed as `map[icell, idof]`
2512+
self._values = verify_reshape(values, IntType,
2513+
(iterset.total_size, arity),
2514+
allow_none=True)
25122515
self.shape = (iterset.total_size, arity)
25132516
self._name = name or "map_%d" % Map._globalcount
25142517
if offset is None or len(offset) == 0:
@@ -2586,7 +2589,11 @@ def values(self):
25862589
25872590
This only returns the map values for local points, to see the
25882591
halo points too, use :meth:`values_with_halo`."""
2589-
return self._values[:self.iterset.size]
2592+
if False:
2593+
# Transposed maps
2594+
return self._values[:, :self.iterset.size]
2595+
else:
2596+
return self._values[:self.iterset.size]
25902597

25912598
@cached_property
25922599
def values_with_halo(self):
@@ -3655,6 +3662,8 @@ def update_arg_data_state(self):
36553662
state = {WRITE: Mat.INSERT_VALUES,
36563663
INC: Mat.ADD_VALUES}[access]
36573664
arg.data.assembly_state = state
3665+
if arg._is_global and arg.access is not READ:
3666+
pass
36583667

36593668
@cached_property
36603669
def dat_args(self):

pyop2/codegen/rep2loopy.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -528,14 +528,20 @@ def statement_assign(expr, context):
528528
if isinstance(lvalue, Indexed):
529529
context.index_ordering.append(tuple(i.name for i in lvalue.index_ordering()))
530530
lvalue, rvalue = tuple(expression(c, context.parameters) for c in expr.children)
531-
within_inames = context.within_inames[expr]
531+
if isinstance(expr.label, UnpackInst):
532+
tag = "scatter"
533+
elif isinstance(expr.label, PackInst):
534+
tag = "gather"
532535

536+
within_inames = context.within_inames[expr]
533537
id, depends_on = context.instruction_dependencies[expr]
534538
predicates = frozenset(context.conditions)
535539
return loopy.Assignment(lvalue, rvalue, within_inames=within_inames,
536540
predicates=predicates,
537541
id=id,
538-
depends_on=depends_on, depends_on_is_final=True)
542+
depends_on=depends_on, depends_on_is_final=True,
543+
tags=frozenset([tag]))
544+
539545

540546

541547
@statement.register(FunctionCall)
@@ -719,7 +725,7 @@ def expression_namedliteral(expr, parameters):
719725
val = loopy.TemporaryVariable(name,
720726
dtype=expr.dtype,
721727
shape=expr.shape,
722-
address_space=loopy.AddressSpace.LOCAL,
728+
address_space=loopy.AddressSpace.GLOBAL,
723729
read_only=True,
724730
initializer=expr.value)
725731
parameters.temporaries[name] = val

pyop2/compilation.py

Lines changed: 103 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ def workaround_cflags(self):
219219
# combination (disappears without
220220
# -fno-tree-loop-vectorize!)
221221
return ["-fno-tree-loop-vectorize", "-mno-avx512f"]
222+
222223
return []
223224

224225
@collective
@@ -349,6 +350,14 @@ def get_so(self, jitmodule, extension):
349350
# Load resulting library
350351
return ctypes.CDLL(soname)
351352

353+
def get_function(self, code, extension, fn_name, argtypes, restype):
354+
dll = self.get_so(code, extension)
355+
356+
fn = getattr(dll, fn_name)
357+
fn.argtypes = code.argtypes
358+
fn.restype = restype
359+
return fn
360+
352361

353362
class MacCompiler(Compiler):
354363
"""A compiler for building a shared library on mac systems.
@@ -407,6 +416,97 @@ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
407416
cpp=cpp, comm=comm)
408417

409418

419+
class CUDACompiler(Compiler):
420+
"""Compiler for the Nvidia CUDA backend.
421+
422+
:arg cppargs: A list of arguments to pass to the nvcc compiler
423+
(optional).
424+
:arg ldargs: A list of arguments to pass to the linker (optional).
425+
:arg cpp: Are we actually using the C++ compiler?
426+
:kwarg comm: Optional communicator to compile the code on (only
427+
rank 0 compiles code) (defaults to COMM_WORLD)."""
428+
def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
429+
cppargs = ["-use_fast_math", "-w"] # , "-lineinfo"]
430+
# TODO: Should we get the nvcc from petsc config?
431+
cc = "nvcc"
432+
433+
super(CUDACompiler, self).__init__(cc, cppargs=cppargs, ldargs=[],
434+
cpp=False, comm=comm)
435+
436+
@collective
437+
def get_source_module(self, jitmodule):
438+
"""Build a shared library and load it
439+
440+
:arg jitmodule: The JIT Module which can generate the code to compile.
441+
:arg extension: extension of the source file (c, cpp).
442+
443+
Returns a :class:`ctypes.CDLL` object of the resulting shared
444+
library."""
445+
446+
from pycuda.compiler import SourceModule
447+
448+
# Determine cache key
449+
hsh = md5(str(jitmodule.cache_key).encode())
450+
hsh.update(self._cc.encode())
451+
hsh.update("".join(self._cppargs).encode())
452+
hsh.update("".join(self._ldargs).encode())
453+
454+
basename = hsh.hexdigest()
455+
456+
cachedir = configuration['cache_dir']
457+
458+
dirpart, basename = basename[:2], basename[2:]
459+
cachedir = os.path.join(cachedir, dirpart)
460+
cname = os.path.join(cachedir, "%s_code.cu" % basename)
461+
462+
if configuration['check_src_hashes'] or configuration['debug']:
463+
matching = self.comm.allreduce(basename, op=_check_op)
464+
if matching != basename:
465+
# Dump all src code to disk for debugging
466+
output = os.path.join(cachedir, "mismatching-kernels")
467+
srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
468+
if self.comm.rank == 0:
469+
os.makedirs(output, exist_ok=True)
470+
self.comm.barrier()
471+
with open(srcfile, "w") as f:
472+
f.write(jitmodule.code_to_compile)
473+
self.comm.barrier()
474+
raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
475+
476+
if os.path.isfile(cname):
477+
# Are we in the cache?
478+
with open(cname, 'r') as f:
479+
source_module = SourceModule(f.read(), nvcc=self._cc,
480+
options=self._cppargs, cache_dir=cachedir)
481+
else:
482+
# No, let's go ahead and build
483+
if self.comm.rank == 0:
484+
# No need to do this on all ranks
485+
os.makedirs(cachedir, exist_ok=True)
486+
with progress(INFO, 'Compiling wrapper'):
487+
# make sure that compiles successfully before writing to file
488+
source_module = SourceModule(jitmodule.code_to_compile,
489+
nvcc=self._cc, options=self._cppargs,
490+
cache_dir=cachedir)
491+
with open(cname, "w") as f:
492+
f.write(jitmodule.code_to_compile)
493+
self.comm.barrier()
494+
495+
return source_module
496+
497+
def get_function(self, code, extension, fn_name, argtypes=None,
498+
restype=None):
499+
"""
500+
.. warning::
501+
Callee does not prepare the function
502+
"""
503+
504+
assert argtypes is None
505+
assert restype is None
506+
fn = self.get_source_module(code).get_function(fn_name)
507+
return fn
508+
509+
410510
class LinuxIntelCompiler(Compiler):
411511
"""The intel compiler for building a shared library on linux systems.
412512
@@ -473,19 +573,17 @@ def __init__(self, code, argtypes):
473573
compiler = LinuxIntelCompiler(cppargs, ldargs, cpp=cpp, comm=comm)
474574
elif compiler == 'gcc':
475575
compiler = LinuxCompiler(cppargs, ldargs, cpp=cpp, comm=comm)
576+
elif compiler == 'nvcc':
577+
compiler = CUDACompiler(cppargs, ldargs, cpp=cpp, comm=comm)
476578
else:
477579
raise CompilationError("Unrecognized compiler name '%s'" % compiler)
478580
elif platform.find('darwin') == 0:
479581
compiler = MacCompiler(cppargs, ldargs, cpp=cpp, comm=comm)
480582
else:
481583
raise CompilationError("Don't know what compiler to use for platform '%s'" %
482584
platform)
483-
dll = compiler.get_so(code, extension)
484585

485-
fn = getattr(dll, fn_name)
486-
fn.argtypes = code.argtypes
487-
fn.restype = restype
488-
return fn
586+
return compiler.get_function(code, extension, fn_name, argtypes, restype)
489587

490588

491589
def clear_cache(prompt=False):

pyop2/configuration.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,25 @@ class Configuration(dict):
7676
DEFAULTS = {
7777
"compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
7878
"simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
79+
80+
# {{{ GPU params
81+
82+
"gpu_timer": ("PYOP2_GPU_TIMER", bool, False),
83+
"gpu_cells_per_block": ("PYOP2_GPU_CELLS_PER_BLOCK", int, 32),
84+
"gpu_strategy": ("PYOP2_GPU_STRATEGY", str, "scpt"),
85+
"gpu_threads_per_cell": ("PYOP2_GPU_THREADS_PER_CELL", int, 1),
86+
"gpu_op_tile_descriptions": ("PYOP2_GPU_OP_TILE_DESCRS", tuple, ()),
87+
"gpu_quad_rowtile_lengths": ("PYOP2_GPU_QUAD_ROWTILE_LENGTHS", tuple, ()),
88+
"gpu_coords_to_shared": ("PYOP2_GPU_COORDS_TO_SHARED", bool, False),
89+
"gpu_input_to_shared": ("PYOP2_GPU_INPUT_TO_SHARED", bool, False),
90+
"gpu_mats_to_shared": ("PYOP2_GPU_MATS_TO_SHARED", bool, False),
91+
"gpu_quad_weights_to_shared": ("PYOP2_GPU_QUAD_WEIGHTS_TO_SHARED", bool, False),
92+
"gpu_tiled_prefetch_of_input": ("PYOP2_GPU_TILED_PREFETCH_OF_INPUTS", bool, False),
93+
"gpu_tiled_prefetch_of_quad_weights": ("PYOP2_GPU_TILED_PREFETCH_OF_QUAD_WEIGHTS", bool, False),
94+
"gpu_planner_kernel_evals": ("PYOP2_GPU_PLANNER_KNL_EVLS", int, 10),
95+
96+
# }}}
97+
7998
"debug": ("PYOP2_DEBUG", bool, False),
8099
"cflags": ("PYOP2_CFLAGS", str, ""),
81100
"ldflags": ("PYOP2_LDFLAGS", str, ""),

pyop2/gpu/TODO.org

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
* Limitations/TODOs
2+
** Changes in TSFC so that PyOP2 could have a better understanding of the variable names
3+
- [[https://github.com/OP2/PyOP2/blob/630e55118013966e84dcc62328c45fc9061196e6/pyop2/gpu/tile.py#L65-L79][Currently]] variable names have been hard coded for CG type FE kernel on
4+
triangular meshes.
5+
- Once this has been done it would then be reasonable to tackle other elements
6+
7+
*** Information to be fed from TSFC
8+
- [ ] variable name of the action input
9+
- [ ] variable name of the action output
10+
- [ ] variable name of mesh coordinates
11+
- [ ] variable name of quadrature weights
12+
- [ ] quadrature iname
13+
- [ ] DOF iname(s)
14+
- [ ] tagging instructions responsible for computing the Jacobian
15+
- [ ] tagging the stages(init, update, assign) for each of the two sum
16+
reductions in the TSFC kernel
17+
18+
One way to solve this is tagging these names into loopy kernels from TSFC while
19+
we are going from GEM representation to loopy kernel.
20+
21+
** Adding support for explicit matrix assembly
22+
*** Proposed path
23+
- The pyop2 configuration should have a configuration parameter ~backend~ which
24+
would be one of ~"cpu", "gpu.cuda", "gpu.opencl"~
25+
- And based on the "backend" parameter the appropriate instance of ~Dat, Mat, Map, ...~
26+
should be init-ed at runtime.
27+
28+
*** Obstacles
29+
- [[https://github.com/OP2/PyOP2/blob/8e1c5720fe0a8f7b4e870a49c43608d97c66ad14/pyop2/op2.py#L45-L49][Currently in PyOP2]], backend selection happens only once, which would be incorrect
30+
e.g. when we are running the matrix-free kernel, ~op2.Map~ should stay in
31+
the device's address space while during explicit assembly it should be a part
32+
of the host's address space. (Similarly, the kernel execution in matrix-free
33+
mode happens on the device, which is not the case for explicit assembly.)
34+
- Is transformation strategy selection alone sufficient?
35+
- This might lead to some refactoring in ~firedrake~, especially where the
36+
objects are instantiated.
37+
- Backend switching would be a bit tricky for subclasses like [[https://github.com/firedrakeproject/firedrake/blob/3498fdf3e33721adda448755addc11c20bef75a9/firedrake/preconditioners/patch.py#L77][here.]]
38+
39+
** Global reduction kernels. For ex. ~assemble(dot(f,f)*dx)~
40+
- Currently all the threads write to a single memory location atomically,
41+
thereby losing concurrency.
42+
- Possible solution:
43+
- Fix the block size, say 256.
44+
- Map single cell to single thread.
45+
- Reduce across threads and get the result for each block
46+
- Write the solution of each group to a global intermediary variable.
47+
- Finally another reduction across the newly created intermediary variable.
48+
- One starting step would be to map the '+=' to loopy's sum node.
49+
50+
** Do we need atomic additions of the output DoFs for a DG kernel?
51+
52+
** Tiling transformation logic fails for low orders
53+
- The TSFC kernel has a slightly different representation at low orders
54+
like P_0, P_1, DG0, DG1, etc., because some loops are unrolled, causing it to
55+
diverge from the "assumed" template of all the kernel's loop structures.
56+
57+
** CI for the GPU target

0 commit comments

Comments
 (0)