Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions CUDA/template/GB_jit_kernel_cuda_colscale.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ __global__ void GB_cuda_colscale_kernel
GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;

#if ( GB_A_IS_SPARSE || GB_A_IS_HYPER )
GB_Cp_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ;
const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ;
#if ( GB_A_IS_HYPER )
GB_Cj_TYPE *__restrict__ Ch = (int64_t *) C->h ;
const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ;
#endif
#endif
Expand All @@ -33,12 +35,13 @@ __global__ void GB_cuda_colscale_kernel

GB_A_NHELD (anz) ;

int nthreads = blockDim.x * gridDim.x ;
int tid = blockIdx.x * blockDim.x + threadIdx.x ;

#if (GB_A_IS_BITMAP || GB_A_IS_FULL)
const int64_t avlen = A->vlen ;
// bitmap/full case
int nthreads_in_entire_grid = blockDim.x * gridDim.x ;
int tid = blockIdx.x * blockDim.x + threadIdx.x ;
for (int64_t p = tid ; p < anz ; p += nthreads_in_entire_grid)
for (int64_t p = tid ; p < anz ; p += nthreads)
{
if (!GBb_A (Ab, p)) continue ;
// the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen
Expand All @@ -54,6 +57,18 @@ __global__ void GB_cuda_colscale_kernel

#else
const int64_t anvec = A->nvec ;
// Copy A->p, A->h to C->p, C->h here instead of using
// GB_dup_worker on the CPU, so A->p, A->h stay on the GPU
// if they were already there
for (int64_t kA = tid ; kA < anvec ; kA += nthreads)
{
Cp [kA] = Ap [kA] ;
#if ( GB_A_IS_HYPER )
Ch [kA] = Ah [kA] ;
#endif
}
Cp [anvec] = Ap [anvec] ;

// sparse/hypersparse case (cuda_ek_slice only works for sparse/hypersparse)
for (int64_t pfirst = blockIdx.x << log2_chunk_size ;
pfirst < anz ;
Expand Down
9 changes: 7 additions & 2 deletions CUDA/template/GB_jit_kernel_cuda_rowscale.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ __global__ void GB_cuda_rowscale_kernel
#define B_iso GB_B_ISO

#if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
GB_Ci_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ;
const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ;
#endif

Expand All @@ -31,14 +32,18 @@ __global__ void GB_cuda_rowscale_kernel
const int64_t bvlen = B->vlen ;
#endif

int ntasks = gridDim.x * blockDim.x;
int nthreads = gridDim.x * blockDim.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;

for (int64_t p = tid ; p < bnz ; p += ntasks)
for (int64_t p = tid ; p < bnz ; p += nthreads)
{
if (!GBb_B (Bb, p)) { continue ; }

int64_t i = GBi_B (Bi, p, bvlen) ; // get row index of B(i,j)
#if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
// Copy B->i to C->i here instead of using GB_dup_worker
Ci [p] = i ;
#endif
GB_DECLAREA (dii) ;
GB_GETA (dii, Dx, i, D_iso) ; // dii = D(i,i)
GB_DECLAREB (bij) ;
Expand Down
10 changes: 9 additions & 1 deletion CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@ __global__ void GB_cuda_select_bitmap_kernel
{
int8_t *Cb_out = C->b ;

#if ( GB_DEPENDS_ON_X )
#if ( GB_DEPENDS_ON_X || !GB_ISO_SELECT )
const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ;
#endif

#if ( !GB_ISO_SELECT )
GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;
#endif

#if ( GB_A_IS_BITMAP )
const int8_t *__restrict__ Ab = A->b ;
#endif
Expand All @@ -41,6 +45,10 @@ __global__ void GB_cuda_select_bitmap_kernel
int nthreads = blockDim.x * gridDim.x ;
for (int64_t p = tid ; p < anz ; p += nthreads)
{
#if ( !GB_ISO_SELECT )
Cx [p] = Ax [p] ;
#endif

Cb_out [p] = 0 ;
if (!GBb_A (Ab, p)) { continue; }

Expand Down
2 changes: 1 addition & 1 deletion CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ GB_JIT_CUDA_KERNEL_SELECT_SPARSE_PROTO (GB_jit_kernel)

GB_cuda_select_sparse_phase2 <<<grid, block, 0, stream>>>
(Map, A, Ak_keep, (GB_Ci_TYPE *) C->i, (GB_C_TYPE *) C->x) ;
CUDA_OK (cudaGetLastError ( )) ;
// CUDA_OK (cudaGetLastError ( )) ;
Comment thread
VidithM marked this conversation as resolved.
CUDA_OK (cudaStreamSynchronize (stream)) ;

//--------------------------------------------------------------------------
Expand Down
61 changes: 52 additions & 9 deletions Source/mxm/GB_colscale.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,21 +92,17 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D
GB_void cscalar [GB_VLA(zsize)] ;
bool C_iso = GB_AxB_iso (cscalar, A, D, A->vdim, semiring, flipxy, true) ;

//--------------------------------------------------------------------------
// copy the pattern of A into C
//--------------------------------------------------------------------------

// allocate C->x but do not initialize it
GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
info = GrB_NO_VALUE ;
ASSERT (C->type == ztype) ;

//--------------------------------------------------------------------------
// C = A*D, column scale, compute numerical values
//--------------------------------------------------------------------------

if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode))
{
{
// Copy the pattern of A into C. Allocates, but does not initialize C->x.
GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
ASSERT (C->type == ztype) ;

//----------------------------------------------------------------------
// apply a positional operator: convert C=A*D to C=op(A)
Expand Down Expand Up @@ -157,7 +153,10 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D

}
else if (C_iso)
{
{
// Copy the pattern of A into C. Allocates, but does not initialize C->x.
GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
ASSERT (C->type == ztype) ;

//----------------------------------------------------------------------
// via the iso kernel
Expand All @@ -179,6 +178,18 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D
// determine if the values are accessed
//----------------------------------------------------------------------

// Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the
// GPU
// FIXME: Add flags to GB_dup_worker for which arrays to copy
int64_t *tmp_Ap = A->p ;
int64_t *tmp_Ah = A->h ;
A->p = NULL ;
A->h = NULL ;
GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hadn't considered this option for using GB_dup_worker but I see from my code that it can work. However, it might be cleaner to revise GB_dup_worker, and pass in flags that disable the copy of specific parts of the matrix, like "bool Ap_dup" or something, for each component. I can pull in this PR for now, but a "// FIXME: revise this to use flags for each component" can be added here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. It looks like this wouldn't be a huge change, I can make a follow-up PR for this.

A->p = tmp_Ap ;
A->h = tmp_Ah ;
ASSERT (C->type == ztype) ;

ASSERT (fmult != NULL) ;
bool op_is_first = (opcode == GB_FIRST_binop_code) ;
bool op_is_second = (opcode == GB_SECOND_binop_code) ;
Expand Down Expand Up @@ -217,6 +228,38 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D
}
#endif

// We are using the CPU. Finish the dup from A -> C.
if (info == GrB_NO_VALUE)
{
// copy A->p, A->h into C->p, C->h
size_t A_psize = A->p_is_32 ?
sizeof (uint32_t) : sizeof (uint64_t) ;
size_t A_isize = A->i_is_32 ?
sizeof (uint32_t) : sizeof (uint64_t) ;

int64_t anvec = A->nvec ;
int64_t cnvec = C->nvec ;
ASSERT (cnvec == anvec) ;

int nthreads_max = GB_Context_nthreads_max ( ) ;

if (A->p != NULL)
{
size_t C_psize = C->p_is_32 ?
sizeof (uint32_t) : sizeof (uint64_t) ;

ASSERT (C_psize == A_psize) ;
GB_memcpy (C->p, A->p, (anvec+1) * A_psize, nthreads_max) ;
}
if (A->h != NULL)
{
size_t C_isize = C->i_is_32 ?
sizeof (uint32_t) : sizeof (uint64_t) ;

ASSERT (C_isize == A_isize) ;
GB_memcpy (C->h, A->h, anvec * A_isize, nthreads_max) ;
}
}
//----------------------------------------------------------------------
// determine the number of threads to use
//----------------------------------------------------------------------
Expand Down
43 changes: 36 additions & 7 deletions Source/mxm/GB_rowscale.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,17 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D
GB_void cscalar [GB_VLA(zsize)] ;
bool C_iso = GB_AxB_iso (cscalar, D, B, D->vdim, semiring, flipxy, true) ;

//--------------------------------------------------------------------------
// copy the pattern of B into C
//--------------------------------------------------------------------------

// allocate C->x but do not initialize it
GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
info = GrB_NO_VALUE ;
ASSERT (C->type == ztype) ;

//--------------------------------------------------------------------------
// C = D*B, row scale, compute numerical values
//--------------------------------------------------------------------------

if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode))
{
// Copy the pattern of B into C. Allocates, but does not initialize C->x.
GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
ASSERT (C->type == ztype) ;

//----------------------------------------------------------------------
// apply a positional operator: convert C=D*B to C=op(B)
Expand Down Expand Up @@ -147,6 +143,9 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D
}
else if (C_iso)
{
// Copy the pattern of B into C. Allocates, but does not initialize C->x.
GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
ASSERT (C->type == ztype) ;

//----------------------------------------------------------------------
// via the iso kernel
Expand All @@ -169,6 +168,15 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D
// determine if the values are accessed
//----------------------------------------------------------------------

// Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU
// FIXME: Add flags to GB_dup_worker for which arrays to copy
int64_t *tmp_Bi = B->i ;
B->i = NULL ;
GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
B->i = tmp_Bi ;
ASSERT (C->type == ztype) ;


ASSERT (fmult != NULL) ;
bool op_is_first = (opcode == GB_FIRST_binop_code) ;
bool op_is_second = (opcode == GB_SECOND_binop_code) ;
Expand Down Expand Up @@ -207,6 +215,27 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D
}
#endif

// We are using the CPU. Finish the dup from B -> C
if (info == GrB_NO_VALUE)
{
// Copy in B->i
size_t B_isize = B->i_is_32 ?
sizeof (uint32_t) : sizeof (uint64_t) ;
size_t C_isize = C->i_is_32 ?
sizeof (uint32_t) : sizeof (uint64_t) ;

int64_t bnz = GB_nnz_held (B) ;
int64_t cnz = GB_nnz_held (C) ;
ASSERT (cnz == bnz) ;

int nthreads_max = GB_Context_nthreads_max ( ) ;

if (B->i != NULL)
{
ASSERT (C_isize == B_isize) ;
GB_memcpy (C->i, B->i, bnz * B_isize, nthreads_max) ;
}
}
//----------------------------------------------------------------------
// determine the number of threads to use
//----------------------------------------------------------------------
Expand Down
11 changes: 5 additions & 6 deletions Source/select/GB_select_bitmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,6 @@ GrB_Info GB_select_bitmap
// Cx [0] = Ax [0] or (A->type) thunk
GB_select_iso (C->x, opcode, athunk, A->x, asize) ;
}
else
{
// Cx [0:anz-1] = Ax [0:anz-1]
// Fixme for CUDA: do this on the GPU if appropriate
GB_memcpy (C->x, A->x, anz * asize, nthreads) ;
}

//--------------------------------------------------------------------------
// bitmap selector kernel
Expand All @@ -105,6 +99,11 @@ GrB_Info GB_select_bitmap

if (info == GrB_NO_VALUE)
{
if (!C_iso) {
// Cx [0:anz-1] = Ax [0:anz-1]
GB_memcpy (C->x, A->x, anz * asize, nthreads) ;
}

if (GB_IS_INDEXUNARYOP_CODE_POSITIONAL (opcode))
{

Expand Down