diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu index 8cd76843f3..fddb9c7e3c 100644 --- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu @@ -21,8 +21,10 @@ __global__ void GB_cuda_colscale_kernel GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ; #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER ) + GB_Cp_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ; const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ; #if ( GB_A_IS_HYPER ) + GB_Cj_TYPE *__restrict__ Ch = (int64_t *) C->h ; const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ; #endif #endif @@ -33,12 +35,13 @@ __global__ void GB_cuda_colscale_kernel GB_A_NHELD (anz) ; + int nthreads = blockDim.x * gridDim.x ; + int tid = blockIdx.x * blockDim.x + threadIdx.x ; + #if (GB_A_IS_BITMAP || GB_A_IS_FULL) const int64_t avlen = A->vlen ; // bitmap/full case - int nthreads_in_entire_grid = blockDim.x * gridDim.x ; - int tid = blockIdx.x * blockDim.x + threadIdx.x ; - for (int64_t p = tid ; p < anz ; p += nthreads_in_entire_grid) + for (int64_t p = tid ; p < anz ; p += nthreads) { if (!GBb_A (Ab, p)) continue ; // the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen @@ -54,6 +57,18 @@ __global__ void GB_cuda_colscale_kernel #else const int64_t anvec = A->nvec ; + // Copy A->p, A->h to C->p, C->h here instead of using + // GB_dup_worker on the CPU, so A->p, A->h stay on the GPU + // if they were already there + for (int64_t kA = tid ; kA < anvec ; kA += nthreads) + { + Cp [kA] = Ap [kA] ; + #if ( GB_A_IS_HYPER ) + Ch [kA] = Ah [kA] ; + #endif + } + Cp [anvec] = Ap [anvec] ; + // sparse/hypersparse case (cuda_ek_slice only works for sparse/hypersparse) for (int64_t pfirst = blockIdx.x << log2_chunk_size ; pfirst < anz ; diff --git a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu index 16e92b736d..c9daa66c1e 100644 --- a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu @@ -18,6 +18,7 @@ __global__ void GB_cuda_rowscale_kernel #define B_iso GB_B_ISO #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER ) + GB_Ci_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ; const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ; #endif @@ -31,14 +32,18 @@ __global__ void GB_cuda_rowscale_kernel const int64_t bvlen = B->vlen ; #endif - int ntasks = gridDim.x * blockDim.x; + int nthreads = gridDim.x * blockDim.x; int tid = blockIdx.x * blockDim.x + threadIdx.x; - for (int64_t p = tid ; p < bnz ; p += ntasks) + for (int64_t p = tid ; p < bnz ; p += nthreads) { if (!GBb_B (Bb, p)) { continue ; } int64_t i = GBi_B (Bi, p, bvlen) ; // get row index of B(i,j) + #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER ) + // Copy B->i to C->i here instead of using GB_dup_worker + Ci [p] = i ; + #endif GB_DECLAREA (dii) ; GB_GETA (dii, Dx, i, D_iso) ; // dii = D(i,i) GB_DECLAREB (bij) ; diff --git a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu index 43b3aa9794..2873a81273 100644 --- a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu +++ b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu @@ -21,10 +21,14 @@ __global__ void GB_cuda_select_bitmap_kernel { int8_t *Cb_out = C->b ; - #if ( GB_DEPENDS_ON_X ) + #if ( GB_DEPENDS_ON_X || !GB_ISO_SELECT ) const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ; #endif + #if ( !GB_ISO_SELECT ) + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ; + #endif + #if ( GB_A_IS_BITMAP ) const int8_t *__restrict__ Ab = A->b ; #endif @@ -41,6 +45,10 @@ __global__ void GB_cuda_select_bitmap_kernel int nthreads = blockDim.x * gridDim.x ; for (int64_t p = tid ; p < anz ; p += nthreads) { + #if ( !GB_ISO_SELECT ) + Cx [p] = Ax [p] ; + #endif + Cb_out [p] = 0 ; if (!GBb_A (Ab, p)) { continue; } diff --git a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu index c303e9a38f..a60f61a39d 100644 --- a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu +++ b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu @@ -330,7 +330,7 @@ GB_JIT_CUDA_KERNEL_SELECT_SPARSE_PROTO (GB_jit_kernel) GB_cuda_select_sparse_phase2 <<>> (Map, A, Ak_keep, (GB_Ci_TYPE *) C->i, (GB_C_TYPE *) C->x) ; - CUDA_OK (cudaGetLastError ( )) ; + // CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; //-------------------------------------------------------------------------- diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c index afa6e741b4..dcc2ac62bc 100644 --- a/Source/mxm/GB_colscale.c +++ b/Source/mxm/GB_colscale.c @@ -92,21 +92,17 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D GB_void cscalar [GB_VLA(zsize)] ; bool C_iso = GB_AxB_iso (cscalar, A, D, A->vdim, semiring, flipxy, true) ; - //-------------------------------------------------------------------------- - // copy the pattern of A into C - //-------------------------------------------------------------------------- - - // allocate C->x but do not initialize it - GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; info = GrB_NO_VALUE ; - ASSERT (C->type == ztype) ; //-------------------------------------------------------------------------- // C = A*D, column scale, compute numerical values //-------------------------------------------------------------------------- if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode)) - { + { + // Copy the pattern of A into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // apply a positional operator: convert C=A*D to C=op(A) @@ -157,7 +153,10 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D } else if (C_iso) - { + { + // Copy the pattern of A into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // via the iso kernel @@ -179,6 +178,18 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D // determine if the values are accessed //---------------------------------------------------------------------- + // Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the + // GPU + // FIXME: Add flags to GB_dup_worker for which arrays to copy + int64_t *tmp_Ap = A->p ; + int64_t *tmp_Ah = A->h ; + A->p = NULL ; + A->h = NULL ; + GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; + A->p = tmp_Ap ; + A->h = tmp_Ah ; + ASSERT (C->type == ztype) ; + ASSERT (fmult != NULL) ; bool op_is_first = (opcode == GB_FIRST_binop_code) ; bool op_is_second = (opcode == GB_SECOND_binop_code) ; @@ -217,6 +228,38 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D } #endif + // We are using the CPU. Finish the dup from A -> C. + if (info == GrB_NO_VALUE) + { + // copy A->p, A->h into C->p, C->h + size_t A_psize = A->p_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + size_t A_isize = A->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + + int64_t anvec = A->nvec ; + int64_t cnvec = C->nvec ; + ASSERT (cnvec == anvec) ; + + int nthreads_max = GB_Context_nthreads_max ( ) ; + + if (A->p != NULL) + { + size_t C_psize = C->p_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + + ASSERT (C_psize == A_psize) ; + GB_memcpy (C->p, A->p, (anvec+1) * A_psize, nthreads_max) ; + } + if (A->h != NULL) + { + size_t C_isize = C->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + + ASSERT (C_isize == A_isize) ; + GB_memcpy (C->h, A->h, anvec * A_isize, nthreads_max) ; + } + } //---------------------------------------------------------------------- // determine the number of threads to use //---------------------------------------------------------------------- diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c index 150c061253..a9f3c4ab39 100644 --- a/Source/mxm/GB_rowscale.c +++ b/Source/mxm/GB_rowscale.c @@ -81,14 +81,7 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D GB_void cscalar [GB_VLA(zsize)] ; bool C_iso = GB_AxB_iso (cscalar, D, B, D->vdim, semiring, flipxy, true) ; - //-------------------------------------------------------------------------- - // copy the pattern of B into C - //-------------------------------------------------------------------------- - - // allocate C->x but do not initialize it - GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; info = GrB_NO_VALUE ; - ASSERT (C->type == ztype) ; //-------------------------------------------------------------------------- // C = D*B, row scale, compute numerical values @@ -96,6 +89,9 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode)) { + // Copy the pattern of B into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // apply a positional operator: convert C=D*B to C=op(B) @@ -147,6 +143,9 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D } else if (C_iso) { + // Copy the pattern of B into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // via the iso kernel @@ -169,6 +168,15 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D // determine if the values are accessed //---------------------------------------------------------------------- + // Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU + // FIXME: Add flags to GB_dup_worker for which arrays to copy + int64_t *tmp_Bi = B->i ; + B->i = NULL ; + GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; + B->i = tmp_Bi ; + ASSERT (C->type == ztype) ; + + ASSERT (fmult != NULL) ; bool op_is_first = (opcode == GB_FIRST_binop_code) ; bool op_is_second = (opcode == GB_SECOND_binop_code) ; @@ -207,6 +215,27 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D } #endif + // We are using the CPU. Finish the dup from B -> C + if (info == GrB_NO_VALUE) + { + // Copy in B->i + size_t B_isize = B->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + size_t C_isize = C->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + + int64_t bnz = GB_nnz_held (B) ; + int64_t cnz = GB_nnz_held (C) ; + ASSERT (cnz == bnz) ; + + int nthreads_max = GB_Context_nthreads_max ( ) ; + + if (B->i != NULL) + { + ASSERT (C_isize == B_isize) ; + GB_memcpy (C->i, B->i, bnz * B_isize, nthreads_max) ; + } + } //---------------------------------------------------------------------- // determine the number of threads to use //---------------------------------------------------------------------- diff --git a/Source/select/GB_select_bitmap.c b/Source/select/GB_select_bitmap.c index 0f38cff73f..40a854ce80 100644 --- a/Source/select/GB_select_bitmap.c +++ b/Source/select/GB_select_bitmap.c @@ -83,12 +83,6 @@ GrB_Info GB_select_bitmap // Cx [0] = Ax [0] or (A->type) thunk GB_select_iso (C->x, opcode, athunk, A->x, asize) ; } - else - { - // Cx [0:anz-1] = Ax [0:anz-1] - // Fixme for CUDA: do this on the GPU if appropriate - GB_memcpy (C->x, A->x, anz * asize, nthreads) ; - } //-------------------------------------------------------------------------- // bitmap selector kernel @@ -105,6 +99,11 @@ GrB_Info GB_select_bitmap if (info == GrB_NO_VALUE) { + if (!C_iso) { + // Cx [0:anz-1] = Ax [0:anz-1] + GB_memcpy (C->x, A->x, anz * asize, nthreads) ; + } + if (GB_IS_INDEXUNARYOP_CODE_POSITIONAL (opcode)) {