From 4a03d25717d427b668323cba4da95f7d6b29ea8d Mon Sep 17 00:00:00 2001 From: Vidith Madhu Date: Fri, 7 Mar 2025 00:06:38 -0600 Subject: [PATCH 1/3] Make row/colscale, select_bitmap more memory-friendly for CUDA --- CUDA/template/GB_jit_kernel_cuda_colscale.cu | 29 ++++++++--- CUDA/template/GB_jit_kernel_cuda_rowscale.cu | 9 +++- .../GB_jit_kernel_cuda_select_bitmap.cu | 10 +++- .../GB_jit_kernel_cuda_select_sparse.cu | 2 +- Source/mxm/GB_colscale.c | 48 +++++++++++++++---- Source/mxm/GB_rowscale.c | 34 ++++++++++--- Source/select/GB_select_bitmap.c | 11 ++--- 7 files changed, 110 insertions(+), 33 deletions(-) diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu index 45a319375e..4f7bc5323b 100644 --- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu @@ -19,8 +19,10 @@ __global__ void GB_cuda_colscale_kernel GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ; #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER ) + GB_Ap_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ; const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ; #if ( GB_A_IS_HYPER ) + GB_Aj_TYPE *__restrict__ Ch = (int64_t *) C->h ; const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ; #endif #endif @@ -31,16 +33,17 @@ __global__ void GB_cuda_colscale_kernel GB_A_NHELD (anz) ; + int nthreads = blockDim.x * gridDim.x ; + int tid = blockIdx.x * blockDim.x + threadIdx.x ; + #if (GB_A_IS_BITMAP || GB_A_IS_FULL) const int64_t avlen = A->vlen ; // bitmap/full case - int nthreads_in_entire_grid = blockDim.x * gridDim.x ; - int tid = blockIdx.x * blockDim.x + threadIdx.x ; - for (int64_t p = tid ; p < anz ; p += nthreads_in_entire_grid) + for (int64_t p = tid ; p < anz ; p += nthreads) { if (!GBb_A (Ab, p)) continue ; // the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen - int64_t col_idx = p / avlen ; + GB_Aj_TYPE col_idx = p / avlen ; // int64_t row_idx = p % avlen ; GB_DECLAREB (djj) ; GB_GETB (djj, Dx, col_idx, ) ; @@ -51,7 +54,19 @@ __global__ void GB_cuda_colscale_kernel } #else - const int64_t anvec = A->nvec ; + const GB_Aj_TYPE anvec = A->nvec ; + // Copy A->p, A->h to C->p, C->h here instead of using + // GB_dup_worker on the CPU, so A->p, A->h stay on the GPU + // if they were already there + for (GB_Aj_TYPE kA = tid ; kA < anvec ; kA += nthreads) + { + Cp [kA] = Ap [kA] ; + #if ( GB_A_IS_HYPER ) + Ch [kA] = Ah [kA] ; + #endif + } + Cp [anvec] = Ap [anvec] ; + // sparse/hypersparse case (cuda_ek_slice only works for sparse/hypersparse) for (int64_t pfirst = blockIdx.x << log2_chunk_size ; pfirst < anz ; @@ -65,8 +80,8 @@ __global__ void GB_cuda_colscale_kernel for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ; pdelta += blockDim.x) { int64_t p_final ; - int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ; - int64_t j = GBh_A (Ah, k) ; + GB_Aj_TYPE k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ; + GB_Aj_TYPE j = GBh_A (Ah, k) ; GB_DECLAREB (djj) ; GB_GETB (djj, Dx, j, ) ; diff --git a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu index adf6d864f3..8dbaaa4831 100644 --- a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu @@ -16,6 +16,7 @@ __global__ void GB_cuda_rowscale_kernel #define B_iso GB_B_ISO #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER ) + GB_Bi_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ; const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ; #endif @@ -29,14 +30,18 @@ __global__ void GB_cuda_rowscale_kernel const int64_t bvlen = B->vlen ; #endif - int ntasks = gridDim.x * blockDim.x; + int nthreads = gridDim.x * blockDim.x; int tid = blockIdx.x * blockDim.x + threadIdx.x; - for (int64_t p = tid ; p < bnz ; p += ntasks) + for (int64_t p = tid ; p < bnz ; p += nthreads) { if (!GBb_B (Bb, p)) { continue ; } int64_t i = GBi_B (Bi, p, bvlen) ; // get row index of B(i,j) + #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER ) + // Copy B->i to C->i here instead of using GB_dup_worker + Ci [p] = i ; + #endif GB_DECLAREA (dii) ; GB_GETA (dii, Dx, i, D_iso) ; // dii = D(i,i) GB_DECLAREB (bij) ; diff --git a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu index 6b87a5687d..7089731f10 100644 --- a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu +++ b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu @@ -19,10 +19,14 @@ __global__ void GB_cuda_select_bitmap_kernel { int8_t *Cb_out = C->b ; - #if ( GB_DEPENDS_ON_X ) + #if ( GB_DEPENDS_ON_X || !GB_ISO_SELECT ) const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ; #endif + #if ( !GB_ISO_SELECT ) + GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ; + #endif + #if ( GB_A_IS_BITMAP ) const int8_t *__restrict__ Ab = A->b ; #endif @@ -39,6 +43,10 @@ __global__ void GB_cuda_select_bitmap_kernel int nthreads = blockDim.x * gridDim.x ; for (int64_t p = tid ; p < anz ; p += nthreads) { + #if ( !GB_ISO_SELECT ) + Cx [p] = Ax [p] ; + #endif + Cb_out [p] = 0 ; if (!GBb_A (Ab, p)) { continue; } diff --git a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu index 384d92a693..a0c19c7ea9 100644 --- a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu +++ b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu @@ -317,7 +317,7 @@ GB_JIT_CUDA_KERNEL_SELECT_SPARSE_PROTO (GB_jit_kernel) GB_cuda_select_sparse_phase2 <<>> (Map, A, Ak_keep, (GB_Ci_TYPE *) C->i, (GB_C_TYPE *) C->x) ; - CUDA_OK (cudaGetLastError ( )) ; + // CUDA_OK (cudaGetLastError ( )) ; CUDA_OK (cudaStreamSynchronize (stream)) ; //-------------------------------------------------------------------------- diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c index afa6e741b4..b957ae2f10 100644 --- a/Source/mxm/GB_colscale.c +++ b/Source/mxm/GB_colscale.c @@ -92,21 +92,17 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D GB_void cscalar [GB_VLA(zsize)] ; bool C_iso = GB_AxB_iso (cscalar, A, D, A->vdim, semiring, flipxy, true) ; - //-------------------------------------------------------------------------- - // copy the pattern of A into C - //-------------------------------------------------------------------------- - - // allocate C->x but do not initialize it - GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; info = GrB_NO_VALUE ; - ASSERT (C->type == ztype) ; //-------------------------------------------------------------------------- // C = A*D, column scale, compute numerical values //-------------------------------------------------------------------------- if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode)) - { + { + // Copy the pattern of A into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // apply a positional operator: convert C=A*D to C=op(A) @@ -157,7 +153,10 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D } else if (C_iso) - { + { + // Copy the pattern of A into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // via the iso kernel @@ -179,6 +178,17 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D // determine if the values are accessed //---------------------------------------------------------------------- + // Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the + // GPU + int64_t *tmp_Ap = A->p ; + int64_t *tmp_Ah = A->h ; + A->p = NULL ; + A->h = NULL ; + GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ; + A->p = tmp_Ap ; + A->h = tmp_Ah ; + ASSERT (C->type == ztype) ; + ASSERT (fmult != NULL) ; bool op_is_first = (opcode == GB_FIRST_binop_code) ; bool op_is_second = (opcode == GB_SECOND_binop_code) ; @@ -217,6 +227,26 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D } #endif + // We are using the CPU. Finish the dup from A -> C. + if (info == GrB_NO_VALUE) + { + // copy A->p, A->h into C->p, C->h + size_t psize = A->p_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + size_t isize = A->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + int64_t anvec = A->nvec ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + + if (A->p != NULL) + { + GB_memcpy (C->p, A->p, (anvec+1) * psize, nthreads_max) ; + } + if (A->h != NULL) + { + GB_memcpy (C->h, A->h, anvec * isize, nthreads_max) ; + } + } //---------------------------------------------------------------------- // determine the number of threads to use //---------------------------------------------------------------------- diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c index 150c061253..e03443ed86 100644 --- a/Source/mxm/GB_rowscale.c +++ b/Source/mxm/GB_rowscale.c @@ -81,14 +81,7 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D GB_void cscalar [GB_VLA(zsize)] ; bool C_iso = GB_AxB_iso (cscalar, D, B, D->vdim, semiring, flipxy, true) ; - //-------------------------------------------------------------------------- - // copy the pattern of B into C - //-------------------------------------------------------------------------- - - // allocate C->x but do not initialize it - GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; info = GrB_NO_VALUE ; - ASSERT (C->type == ztype) ; //-------------------------------------------------------------------------- // C = D*B, row scale, compute numerical values @@ -96,6 +89,9 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode)) { + // Copy the pattern of B into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // apply a positional operator: convert C=D*B to C=op(B) @@ -147,6 +143,9 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D } else if (C_iso) { + // Copy the pattern of B into C. Allocates, but does not initialize C->x. + GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; + ASSERT (C->type == ztype) ; //---------------------------------------------------------------------- // via the iso kernel @@ -169,6 +168,14 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D // determine if the values are accessed //---------------------------------------------------------------------- + // Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU + int64_t *tmp_Bi = B->i ; + B->i = NULL ; + GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; + B->i = tmp_Bi ; + ASSERT (C->type == ztype) ; + + ASSERT (fmult != NULL) ; bool op_is_first = (opcode == GB_FIRST_binop_code) ; bool op_is_second = (opcode == GB_SECOND_binop_code) ; @@ -207,6 +214,19 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D } #endif + // We are using the CPU. Finish the dup from B -> C + if (info == GrB_NO_VALUE) + { + // Copy in B->i + int64_t bnz = GB_nnz_held (B) ; + size_t isize = B->i_is_32 ? sizeof (uint32_t) : sizeof (uint64_t) ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + + if (B->i != NULL) + { + GB_memcpy (C->i, B->i, bnz * isize, nthreads_max) ; + } + } //---------------------------------------------------------------------- // determine the number of threads to use //---------------------------------------------------------------------- diff --git a/Source/select/GB_select_bitmap.c b/Source/select/GB_select_bitmap.c index 0f38cff73f..40a854ce80 100644 --- a/Source/select/GB_select_bitmap.c +++ b/Source/select/GB_select_bitmap.c @@ -83,12 +83,6 @@ GrB_Info GB_select_bitmap // Cx [0] = Ax [0] or (A->type) thunk GB_select_iso (C->x, opcode, athunk, A->x, asize) ; } - else - { - // Cx [0:anz-1] = Ax [0:anz-1] - // Fixme for CUDA: do this on the GPU if appropriate - GB_memcpy (C->x, A->x, anz * asize, nthreads) ; - } //-------------------------------------------------------------------------- // bitmap selector kernel @@ -105,6 +99,11 @@ GrB_Info GB_select_bitmap if (info == GrB_NO_VALUE) { + if (!C_iso) { + // Cx [0:anz-1] = Ax [0:anz-1] + GB_memcpy (C->x, A->x, anz * asize, nthreads) ; + } + if (GB_IS_INDEXUNARYOP_CODE_POSITIONAL (opcode)) { From b96f496df0a49a9f205ff1a4b1291f4241cd7a66 Mon Sep 17 00:00:00 2001 From: Vidith Madhu Date: Fri, 7 Mar 2025 12:58:40 -0600 Subject: [PATCH 2/3] small fixes --- CUDA/template/GB_jit_kernel_cuda_colscale.cu | 10 +++++----- Source/mxm/GB_colscale.c | 1 + Source/mxm/GB_rowscale.c | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu index 4f7bc5323b..37b4566549 100644 --- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu @@ -43,7 +43,7 @@ __global__ void GB_cuda_colscale_kernel { if (!GBb_A (Ab, p)) continue ; // the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen - GB_Aj_TYPE col_idx = p / avlen ; + int64_t col_idx = p / avlen ; // int64_t row_idx = p % avlen ; GB_DECLAREB (djj) ; GB_GETB (djj, Dx, col_idx, ) ; @@ -54,11 +54,11 @@ __global__ void GB_cuda_colscale_kernel } #else - const GB_Aj_TYPE anvec = A->nvec ; + const int64_t anvec = A->nvec ; // Copy A->p, A->h to C->p, C->h here instead of using // GB_dup_worker on the CPU, so A->p, A->h stay on the GPU // if they were already there - for (GB_Aj_TYPE kA = tid ; kA < anvec ; kA += nthreads) + for (int64_t kA = tid ; kA < anvec ; kA += nthreads) { Cp [kA] = Ap [kA] ; #if ( GB_A_IS_HYPER ) @@ -80,8 +80,8 @@ __global__ void GB_cuda_colscale_kernel for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ; pdelta += blockDim.x) { int64_t p_final ; - GB_Aj_TYPE k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ; - GB_Aj_TYPE j = GBh_A (Ah, k) ; + int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ; + int64_t j = GBh_A (Ah, k) ; GB_DECLAREB (djj) ; GB_GETB (djj, Dx, j, ) ; diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c index b957ae2f10..621cc9168c 100644 --- a/Source/mxm/GB_colscale.c +++ b/Source/mxm/GB_colscale.c @@ -180,6 +180,7 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D // Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the // GPU + // FIXME: Add flags to GB_dup_worker for which arrays to copy int64_t *tmp_Ap = A->p ; int64_t *tmp_Ah = A->h ; A->p = NULL ; diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c index e03443ed86..7d008e72bc 100644 --- a/Source/mxm/GB_rowscale.c +++ b/Source/mxm/GB_rowscale.c @@ -169,6 +169,7 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D //---------------------------------------------------------------------- // Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU + // FIXME: Add flags to GB_dup_worker for which arrays to copy int64_t *tmp_Bi = B->i ; B->i = NULL ; GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ; From 290429b9b912342f0fc097589a23f73a7fd168fc Mon Sep 17 00:00:00 2001 From: Vidith Madhu Date: Mon, 10 Mar 2025 13:56:37 -0500 Subject: [PATCH 3/3] Fix types, add asserts --- CUDA/template/GB_jit_kernel_cuda_colscale.cu | 4 ++-- CUDA/template/GB_jit_kernel_cuda_rowscale.cu | 2 +- Source/mxm/GB_colscale.c | 20 ++++++++++++++++---- Source/mxm/GB_rowscale.c | 14 +++++++++++--- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu index 104f78c453..fddb9c7e3c 100644 --- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu @@ -21,10 +21,10 @@ __global__ void GB_cuda_colscale_kernel GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ; #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER ) - GB_Ap_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ; + GB_Cp_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ; const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ; #if ( GB_A_IS_HYPER ) - GB_Aj_TYPE *__restrict__ Ch = (int64_t *) C->h ; + GB_Cj_TYPE *__restrict__ Ch = (int64_t *) C->h ; const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ; #endif #endif diff --git a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu index 8519a3d79e..c9daa66c1e 100644 --- a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu +++ b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu @@ -18,7 +18,7 @@ __global__ void GB_cuda_rowscale_kernel #define B_iso GB_B_ISO #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER ) - GB_Bi_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ; + GB_Ci_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ; const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ; #endif diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c index 621cc9168c..dcc2ac62bc 100644 --- a/Source/mxm/GB_colscale.c +++ b/Source/mxm/GB_colscale.c @@ -232,20 +232,32 @@ GrB_Info GB_colscale // C = A*D, column scale with diagonal D if (info == GrB_NO_VALUE) { // copy A->p, A->h into C->p, C->h - size_t psize = A->p_is_32 ? + size_t A_psize = A->p_is_32 ? sizeof (uint32_t) : sizeof (uint64_t) ; - size_t isize = A->i_is_32 ? + size_t A_isize = A->i_is_32 ? sizeof (uint32_t) : sizeof (uint64_t) ; + int64_t anvec = A->nvec ; + int64_t cnvec = C->nvec ; + ASSERT (cnvec == anvec) ; + int nthreads_max = GB_Context_nthreads_max ( ) ; if (A->p != NULL) { - GB_memcpy (C->p, A->p, (anvec+1) * psize, nthreads_max) ; + size_t C_psize = C->p_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + + ASSERT (C_psize == A_psize) ; + GB_memcpy (C->p, A->p, (anvec+1) * A_psize, nthreads_max) ; } if (A->h != NULL) { - GB_memcpy (C->h, A->h, anvec * isize, nthreads_max) ; + size_t C_isize = C->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + + ASSERT (C_isize == A_isize) ; + GB_memcpy (C->h, A->h, anvec * A_isize, nthreads_max) ; } } //---------------------------------------------------------------------- diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c index 7d008e72bc..a9f3c4ab39 100644 --- a/Source/mxm/GB_rowscale.c +++ b/Source/mxm/GB_rowscale.c @@ -219,13 +219,21 @@ GrB_Info GB_rowscale // C = D*B, row scale with diagonal D if (info == GrB_NO_VALUE) { // Copy in B->i + size_t B_isize = B->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + size_t C_isize = C->i_is_32 ? + sizeof (uint32_t) : sizeof (uint64_t) ; + int64_t bnz = GB_nnz_held (B) ; - size_t isize = B->i_is_32 ? sizeof (uint32_t) : sizeof (uint64_t) ; + int64_t cnz = GB_nnz_held (C) ; + ASSERT (cnz == bnz) ; + int nthreads_max = GB_Context_nthreads_max ( ) ; if (B->i != NULL) - { - GB_memcpy (C->i, B->i, bnz * isize, nthreads_max) ; + { + ASSERT (C_isize == B_isize) ; + GB_memcpy (C->i, B->i, bnz * B_isize, nthreads_max) ; } } //----------------------------------------------------------------------