Skip to content

Commit d0b6daf

Browse files
committed
squash commits
1 parent 5bdbc5c commit d0b6daf

11 files changed

Lines changed: 1984 additions & 25 deletions

File tree

pyop2/base.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@
6565

6666

6767
def _make_object(name, *args, **kwargs):
68-
from pyop2 import sequential
69-
return getattr(sequential, name)(*args, **kwargs)
68+
from pyop2.gpu import cuda as backend
69+
return getattr(backend, name)(*args, **kwargs)
7070

7171

7272
# Data API
@@ -2475,10 +2475,6 @@ def __itruediv__(self, other):
24752475
"""Pointwise division or scaling of fields."""
24762476
return self._iop(other, operator.itruediv)
24772477

2478-
def inner(self, other):
2479-
assert isinstance(other, Global)
2480-
return np.dot(self.data_ro, other.data_ro)
2481-
24822478

24832479
class Map(object):
24842480

@@ -2506,9 +2502,16 @@ def __init__(self, iterset, toset, arity, values=None, name=None, offset=None):
25062502
self._toset = toset
25072503
self.comm = toset.comm
25082504
self._arity = arity
2509-
self._values = verify_reshape(values, IntType,
2510-
(iterset.total_size, arity),
2511-
allow_none=True)
2505+
if False:
2506+
# maps indexed as `map[idof, icell]`
2507+
self._values = verify_reshape(values, IntType,
2508+
(arity, iterset.total_size),
2509+
allow_none=True)
2510+
else:
2511+
# maps indexed as `map[icell, idof]`
2512+
self._values = verify_reshape(values, IntType,
2513+
(iterset.total_size, arity),
2514+
allow_none=True)
25122515
self.shape = (iterset.total_size, arity)
25132516
self._name = name or "map_%d" % Map._globalcount
25142517
if offset is None or len(offset) == 0:
@@ -2586,7 +2589,11 @@ def values(self):
25862589
25872590
This only returns the map values for local points, to see the
25882591
halo points too, use :meth:`values_with_halo`."""
2589-
return self._values[:self.iterset.size]
2592+
if False:
2593+
# Transposed maps
2594+
return self._values[:, :self.iterset.size]
2595+
else:
2596+
return self._values[:self.iterset.size]
25902597

25912598
@cached_property
25922599
def values_with_halo(self):
@@ -3655,6 +3662,8 @@ def update_arg_data_state(self):
36553662
state = {WRITE: Mat.INSERT_VALUES,
36563663
INC: Mat.ADD_VALUES}[access]
36573664
arg.data.assembly_state = state
3665+
if arg._is_global and arg.access is not READ:
3666+
pass
36583667

36593668
@cached_property
36603669
def dat_args(self):

pyop2/codegen/rep2loopy.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -528,14 +528,20 @@ def statement_assign(expr, context):
528528
if isinstance(lvalue, Indexed):
529529
context.index_ordering.append(tuple(i.name for i in lvalue.index_ordering()))
530530
lvalue, rvalue = tuple(expression(c, context.parameters) for c in expr.children)
531-
within_inames = context.within_inames[expr]
531+
if isinstance(expr.label, UnpackInst):
532+
tag = "scatter"
533+
elif isinstance(expr.label, PackInst):
534+
tag = "gather"
532535

536+
within_inames = context.within_inames[expr]
533537
id, depends_on = context.instruction_dependencies[expr]
534538
predicates = frozenset(context.conditions)
535539
return loopy.Assignment(lvalue, rvalue, within_inames=within_inames,
536540
predicates=predicates,
537541
id=id,
538-
depends_on=depends_on, depends_on_is_final=True)
542+
depends_on=depends_on, depends_on_is_final=True,
543+
tags=frozenset([tag]))
544+
539545

540546

541547
@statement.register(FunctionCall)
@@ -719,7 +725,7 @@ def expression_namedliteral(expr, parameters):
719725
val = loopy.TemporaryVariable(name,
720726
dtype=expr.dtype,
721727
shape=expr.shape,
722-
address_space=loopy.AddressSpace.LOCAL,
728+
address_space=loopy.AddressSpace.GLOBAL,
723729
read_only=True,
724730
initializer=expr.value)
725731
parameters.temporaries[name] = val

pyop2/compilation.py

Lines changed: 103 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ def workaround_cflags(self):
219219
# combination (disappears without
220220
# -fno-tree-loop-vectorize!)
221221
return ["-fno-tree-loop-vectorize", "-mno-avx512f"]
222+
222223
return []
223224

224225
@collective
@@ -349,6 +350,14 @@ def get_so(self, jitmodule, extension):
349350
# Load resulting library
350351
return ctypes.CDLL(soname)
351352

353+
def get_function(self, code, extension, fn_name, argtypes, restype):
354+
dll = self.get_so(code, extension)
355+
356+
fn = getattr(dll, fn_name)
357+
fn.argtypes = code.argtypes
358+
fn.restype = restype
359+
return fn
360+
352361

353362
class MacCompiler(Compiler):
354363
"""A compiler for building a shared library on mac systems.
@@ -407,6 +416,97 @@ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
407416
cpp=cpp, comm=comm)
408417

409418

419+
class CUDACompiler(Compiler):
420+
"""Compiler for the Nvidia CUDA backend.
421+
422+
:arg cppargs: A list of arguments to pass to the nvcc compiler
423+
(optional).
424+
:arg ldargs: A list of arguments to pass to the linker (optional).
425+
:arg cpp: Are we actually using the C++ compiler?
426+
:kwarg comm: Optional communicator to compile the code on (only
427+
rank 0 compiles code) (defaults to COMM_WORLD)."""
428+
def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
429+
cppargs = ["-use_fast_math", "-w"] # , "-lineinfo"]
430+
# TODO: Should we get the nvcc from petsc config?
431+
cc = "nvcc"
432+
433+
super(CUDACompiler, self).__init__(cc, cppargs=cppargs, ldargs=[],
434+
cpp=False, comm=comm)
435+
436+
@collective
437+
def get_source_module(self, jitmodule):
438+
"""Build a shared library and load it
439+
440+
:arg jitmodule: The JIT Module which can generate the code to compile.
441+
:arg extension: extension of the source file (c, cpp).
442+
443+
Returns a :class:`ctypes.CDLL` object of the resulting shared
444+
library."""
445+
446+
from pycuda.compiler import SourceModule
447+
448+
# Determine cache key
449+
hsh = md5(str(jitmodule.cache_key).encode())
450+
hsh.update(self._cc.encode())
451+
hsh.update("".join(self._cppargs).encode())
452+
hsh.update("".join(self._ldargs).encode())
453+
454+
basename = hsh.hexdigest()
455+
456+
cachedir = configuration['cache_dir']
457+
458+
dirpart, basename = basename[:2], basename[2:]
459+
cachedir = os.path.join(cachedir, dirpart)
460+
cname = os.path.join(cachedir, "%s_code.cu" % basename)
461+
462+
if configuration['check_src_hashes'] or configuration['debug']:
463+
matching = self.comm.allreduce(basename, op=_check_op)
464+
if matching != basename:
465+
# Dump all src code to disk for debugging
466+
output = os.path.join(cachedir, "mismatching-kernels")
467+
srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
468+
if self.comm.rank == 0:
469+
os.makedirs(output, exist_ok=True)
470+
self.comm.barrier()
471+
with open(srcfile, "w") as f:
472+
f.write(jitmodule.code_to_compile)
473+
self.comm.barrier()
474+
raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
475+
476+
if os.path.isfile(cname):
477+
# Are we in the cache?
478+
with open(cname, 'r') as f:
479+
source_module = SourceModule(f.read(), nvcc=self._cc,
480+
options=self._cppargs, cache_dir=cachedir)
481+
else:
482+
# No, let's go ahead and build
483+
if self.comm.rank == 0:
484+
# No need to do this on all ranks
485+
os.makedirs(cachedir, exist_ok=True)
486+
with progress(INFO, 'Compiling wrapper'):
487+
# make sure that compiles successfully before writing to file
488+
source_module = SourceModule(jitmodule.code_to_compile,
489+
nvcc=self._cc, options=self._cppargs,
490+
cache_dir=cachedir)
491+
with open(cname, "w") as f:
492+
f.write(jitmodule.code_to_compile)
493+
self.comm.barrier()
494+
495+
return source_module
496+
497+
def get_function(self, code, extension, fn_name, argtypes=None,
498+
restype=None):
499+
"""
500+
.. warning::
501+
Callee does not prepare the function
502+
"""
503+
504+
assert argtypes is None
505+
assert restype is None
506+
fn = self.get_source_module(code).get_function(fn_name)
507+
return fn
508+
509+
410510
class LinuxIntelCompiler(Compiler):
411511
"""The intel compiler for building a shared library on linux systems.
412512
@@ -473,19 +573,17 @@ def __init__(self, code, argtypes):
473573
compiler = LinuxIntelCompiler(cppargs, ldargs, cpp=cpp, comm=comm)
474574
elif compiler == 'gcc':
475575
compiler = LinuxCompiler(cppargs, ldargs, cpp=cpp, comm=comm)
576+
elif compiler == 'nvcc':
577+
compiler = CUDACompiler(cppargs, ldargs, cpp=cpp, comm=comm)
476578
else:
477579
raise CompilationError("Unrecognized compiler name '%s'" % compiler)
478580
elif platform.find('darwin') == 0:
479581
compiler = MacCompiler(cppargs, ldargs, cpp=cpp, comm=comm)
480582
else:
481583
raise CompilationError("Don't know what compiler to use for platform '%s'" %
482584
platform)
483-
dll = compiler.get_so(code, extension)
484585

485-
fn = getattr(dll, fn_name)
486-
fn.argtypes = code.argtypes
487-
fn.restype = restype
488-
return fn
586+
return compiler.get_function(code, extension, fn_name, argtypes, restype)
489587

490588

491589
def clear_cache(prompt=False):

pyop2/configuration.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,25 @@ class Configuration(dict):
7676
DEFAULTS = {
7777
"compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
7878
"simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
79+
80+
# {{{ GPU params
81+
82+
"gpu_timer": ("PYOP2_GPU_TIMER", bool, False),
83+
"gpu_cells_per_block": ("PYOP2_GPU_CELLS_PER_BLOCK", int, 32),
84+
"gpu_strategy": ("PYOP2_GPU_STRATEGY", str, "scpt"),
85+
"gpu_threads_per_cell": ("PYOP2_GPU_THREADS_PER_CELL", int, 1),
86+
"gpu_op_tile_descriptions": ("PYOP2_GPU_OP_TILE_DESCRS", tuple, ()),
87+
"gpu_quad_rowtile_lengths": ("PYOP2_GPU_QUAD_ROWTILE_LENGTHS", tuple, ()),
88+
"gpu_coords_to_shared": ("PYOP2_GPU_COORDS_TO_SHARED", bool, False),
89+
"gpu_input_to_shared": ("PYOP2_GPU_INPUT_TO_SHARED", bool, False),
90+
"gpu_mats_to_shared": ("PYOP2_GPU_MATS_TO_SHARED", bool, False),
91+
"gpu_quad_weights_to_shared": ("PYOP2_GPU_QUAD_WEIGHTS_TO_SHARED", bool, False),
92+
"gpu_tiled_prefetch_of_input": ("PYOP2_GPU_TILED_PREFETCH_OF_INPUTS", bool, False),
93+
"gpu_tiled_prefetch_of_quad_weights": ("PYOP2_GPU_TILED_PREFETCH_OF_QUAD_WEIGHTS", bool, False),
94+
"gpu_planner_kernel_evals": ("PYOP2_GPU_PLANNER_KNL_EVLS", int, 10),
95+
96+
# }}}
97+
7998
"debug": ("PYOP2_DEBUG", bool, False),
8099
"cflags": ("PYOP2_CFLAGS", str, ""),
81100
"ldflags": ("PYOP2_LDFLAGS", str, ""),

pyop2/gpu/TODO.org

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
* Limitations/TODOs
2+
** Changes in TSFC so that PyOP2 could have a better understanding of the variable names
3+
- [[https://github.com/OP2/PyOP2/blob/630e55118013966e84dcc62328c45fc9061196e6/pyop2/gpu/tile.py#L65-L79][Currently]] variable names have been hard coded for CG type FE kernel on
4+
triangular meshes.
5+
- Once this has been done it would then be reasonable to tackle other elements
6+
7+
*** Information to be fed from TSFC
8+
- [ ] variable name of the action input
9+
- [ ] variable name of the action output
10+
- [ ] variable name of mesh coordinates
11+
- [ ] variable name of quadrature weights
12+
- [ ] quadrature iname
13+
- [ ] DOF iname(s)
14+
- [ ] tagging instructions responsible for computing the Jacobian
15+
- [ ] tagging the stages(init, update, assign) for each of the two sum
16+
reductions in the TSFC kernel
17+
18+
One way to solve this is tagging these names into loopy kernels from TSFC while
19+
we are going from GEM representation to loopy kernel.
20+
21+
** Adding support for explicit matrix assembly
22+
*** Proposed path
23+
- The pyop2 configuration should have a configuration parameter ~backend~ which
24+
would be one of ~"cpu", "gpu.cuda", "gpu.opencl"~
25+
- And based on the "backend" parameter the appropriate instance of ~Dat, Mat, Map, ...~
26+
should be init-ed at runtime.
27+
28+
*** Obstacles
29+
- [[https://github.com/OP2/PyOP2/blob/8e1c5720fe0a8f7b4e870a49c43608d97c66ad14/pyop2/op2.py#L45-L49][Currently in PyOP2]], backend selection happens only once, which would be incorrect
30+
e.g. when we are running the matrix-free kernel, ~op2.Map~ should stay in
31+
the device's address space while during explicit assembly it should be a part
32+
of the host's address space. (Similarly, the kernel execution in matrix-free
33+
mode happens on the device, which is not the case for explicit assembly.)
34+
- Is transformation strategy selection alone sufficient?
35+
- This might lead to some refactoring in ~firedrake~, especially where the
36+
objects are instantiated.
37+
- Backend switching would be a bit tricky for subclasses like [[https://github.com/firedrakeproject/firedrake/blob/3498fdf3e33721adda448755addc11c20bef75a9/firedrake/preconditioners/patch.py#L77][here.]]
38+
39+
** Global reduction kernels. For ex. ~assemble(dot(f,f)*dx)~
40+
- Currently all the threads write to a single memory location atomically,
41+
thereby losing concurrency.
42+
- Possible solution:
43+
- Fix the block size, say 256.
44+
- Map single cell to single thread.
45+
- Reduce across threads and get the result for each block
46+
- Write the solution of each group to a global intermediary variable.
47+
- Finally another reduction across the newly created intermediary variable.
48+
- One starting step would be to map the '+=' to loopy's sum node.
49+
50+
** Do we need atomic additions of the output DoFs for a DG kernel?
51+
52+
** Tiling transformation logic fails for low orders
53+
- The TSFC kernel has a slightly different representation at low orders
54+
like P_0, P_1, DG0, DG1, etc., because some loops are unrolled, causing it to
55+
diverge from the "assumed" template of all the kernel's loop structures.
56+
57+
** CI for the GPU target

0 commit comments

Comments
 (0)