Skip to content

[FEA] NVIDIA nvvm defaults to fma=1 and wish there was a way to tell it fma=0 #118

@marioroy

Description

@marioroy

I like to compare CUDA and CPU results. NVIDIA's nvvm defaults to fma=1 behind the scene. This I can manage using PyCUDA but not NUMBA cuda.jit. The following exposes the fma option. Note: there is a blank line at the end of the diff. Oh how happy seeing the cuda.jit results matching the CPU.

Exposing the fma option:

diff -uarp a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -63,7 +63,7 @@ class _Kernel(serialize.ReduceMixin):
     ]

     @global_compiler_lock
-    def __init__(self, py_func, argtypes, link=None, debug=False,
+    def __init__(self, py_func, argtypes, link=None, debug=False, fma=True,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
                  max_registers=None, lto=False, opt=True, device=False):

@@ -95,6 +95,7 @@ class _Kernel(serialize.ReduceMixin):

         nvvm_options = {
             'fastmath': fastmath,
+            'fma': fma,
             'opt': 3 if opt else 0
         }

diff -uarp a/numba_cuda/numba/cuda/cudadrv/nvvm.py b/numba_cuda/numba/cuda/cudadrv/nvvm.py
--- a/numba_cuda/numba/cuda/cudadrv/nvvm.py
+++ b/numba_cuda/numba/cuda/cudadrv/nvvm.py
@@ -632,6 +632,9 @@ def compile_ir(llvmir, **opts):
             'prec_sqrt': False,
         })
 
+    if not opts.pop('fma', True):
+        opts.update({ 'fma': False })
+
     cu = CompilationUnit()
     libdevice = LibDevice()
 

Use case: https://github.com/marioroy/mandelbrot-python

Disabling fma allows me to witness cuda.jit match the CPU results. E.g. Launch mandel_kernel.py and press the letter x. That will auto zoom to location 2. The RGB values total matches the CPU.

diff --git a/app/mandel_common.py b/app/mandel_common.py
index e9055f5..2c7fc47 100644
--- a/app/mandel_common.py
+++ b/app/mandel_common.py
@@ -124,7 +124,7 @@ def _mandel1(colors, creal, cimag, max_iters):
     return INSIDE_COLOR1
 
 mandel1 = \
-    cuda.jit(device=True)(_mandel1) if USE_CUDA else \
+    cuda.jit(device=True, fma=False)(_mandel1) if USE_CUDA else \
     njit('UniTuple(u1,3)(i2[:,:], f8, f8, i4)', nogil=True)(_mandel1)
 
 
diff --git a/app/mandel_kernel.py b/app/mandel_kernel.py
index 056e57a..7037e6f 100644
--- a/app/mandel_kernel.py
+++ b/app/mandel_kernel.py
@@ -17,7 +17,7 @@ from numba import cuda, float32, uint8, int16, int32
 
 ESCAPE_RADIUS_2 = RADIUS * RADIUS
 
-@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4)')
+@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4)', fma=False)
 def mandelbrot1(temp, colors, width, height, min_x, min_y, step_x, step_y, max_iters):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -31,7 +31,7 @@ def mandelbrot1(temp, colors, width, height, min_x, min_y, step_x, step_y, max_i
     temp[y,x] = mandel1(colors, creal, cimag, max_iters)
 
 
-@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4, i4, f8[:], u1[:,:,:])')
+@cuda.jit('void(u1[:,:,:], i2[:,:], i4, i4, f8, f8, f8, f8, i4, i4, f8[:], u1[:,:,:])', fma=False)
 def mandelbrot2(temp, colors, width, height, min_x, min_y, step_x, step_y, max_iters, aafactor, offset, output):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -131,7 +131,7 @@ def mandelbrot2(temp, colors, width, height, min_x, min_y, step_x, step_y, max_i
     output[y,x] = (uint8(r/aaarea), uint8(g/aaarea), uint8(b/aaarea))
 
 
-@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
 def horizontal_gaussian_blur(matrix, src, dst, width, height):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -169,7 +169,7 @@ def horizontal_gaussian_blur(matrix, src, dst, width, height):
     dst[y,x] = (uint8(r), uint8(g), uint8(b))
 
 
-@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(f4[:], u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
 def vertical_gaussian_blur(matrix, src, dst, width, height):
 
     y = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y
@@ -207,7 +207,7 @@ def vertical_gaussian_blur(matrix, src, dst, width, height):
     dst[y,x] = (uint8(r), uint8(g), uint8(b))
 
 
-@cuda.jit('void(u1[:,:,:], u1[:,:,:], i4, i4)')
+@cuda.jit('void(u1[:,:,:], u1[:,:,:], i4, i4)', fma=False)
 def unsharp_mask(src, dst, width, height):
     """
     Sharpen the destination image using the Unsharp Mask technique.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions