From 4a03d25717d427b668323cba4da95f7d6b29ea8d Mon Sep 17 00:00:00 2001
From: Vidith Madhu <vidhmadh1892@gmail.com>
Date: Fri, 7 Mar 2025 00:06:38 -0600
Subject: [PATCH 1/3] Make row/colscale, select_bitmap more memory-friendly for
 CUDA

---
 CUDA/template/GB_jit_kernel_cuda_colscale.cu  | 29 ++++++++---
 CUDA/template/GB_jit_kernel_cuda_rowscale.cu  |  9 +++-
 .../GB_jit_kernel_cuda_select_bitmap.cu       | 10 +++-
 .../GB_jit_kernel_cuda_select_sparse.cu       |  2 +-
 Source/mxm/GB_colscale.c                      | 48 +++++++++++++++----
 Source/mxm/GB_rowscale.c                      | 34 ++++++++++---
 Source/select/GB_select_bitmap.c              | 11 ++---
 7 files changed, 110 insertions(+), 33 deletions(-)

diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
index 45a319375e..4f7bc5323b 100644
--- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
@@ -19,8 +19,10 @@ __global__ void GB_cuda_colscale_kernel
     GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;
 
     #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER )
+    GB_Ap_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ;
     const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ;
         #if ( GB_A_IS_HYPER )
+        GB_Aj_TYPE *__restrict__ Ch = (int64_t *) C->h ;
         const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ;
         #endif
     #endif
@@ -31,16 +33,17 @@ __global__ void GB_cuda_colscale_kernel
 
     GB_A_NHELD (anz) ;
 
+    int nthreads = blockDim.x * gridDim.x ;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x ;
+
     #if (GB_A_IS_BITMAP || GB_A_IS_FULL)
         const int64_t avlen = A->vlen ;
         // bitmap/full case
-        int nthreads_in_entire_grid = blockDim.x * gridDim.x ;
-        int tid = blockIdx.x * blockDim.x + threadIdx.x ;
-        for (int64_t p = tid ; p < anz ; p += nthreads_in_entire_grid)
+        for (int64_t p = tid ; p < anz ; p += nthreads)
         {
             if (!GBb_A (Ab, p)) continue ;
             // the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen
-            int64_t col_idx = p / avlen ;
+            GB_Aj_TYPE col_idx = p / avlen ;
     //      int64_t row_idx = p % avlen ;
             GB_DECLAREB (djj) ;
             GB_GETB (djj, Dx, col_idx, ) ;
@@ -51,7 +54,19 @@ __global__ void GB_cuda_colscale_kernel
         }
 
     #else
-        const int64_t anvec = A->nvec ;
+        const GB_Aj_TYPE anvec = A->nvec ;
+        // Copy A->p, A->h to C->p, C->h here instead of using
+        // GB_dup_worker on the CPU, so A->p, A->h stay on the GPU
+        // if they were already there
+        for (GB_Aj_TYPE kA = tid ; kA < anvec ; kA += nthreads)
+        {
+            Cp [kA] = Ap [kA] ;
+            #if ( GB_A_IS_HYPER )
+            Ch [kA] = Ah [kA] ;
+            #endif
+        }
+        Cp [anvec] = Ap [anvec] ;
+
         // sparse/hypersparse case (cuda_ek_slice only works for sparse/hypersparse)
         for (int64_t pfirst = blockIdx.x << log2_chunk_size ;
                     pfirst < anz ;
@@ -65,8 +80,8 @@ __global__ void GB_cuda_colscale_kernel
                 for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ; pdelta += blockDim.x)
                 {
                     int64_t p_final ;
-                    int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
-                    int64_t j = GBh_A (Ah, k) ;
+                    GB_Aj_TYPE k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
+                    GB_Aj_TYPE j = GBh_A (Ah, k) ;
 
                     GB_DECLAREB (djj) ;
                     GB_GETB (djj, Dx, j, ) ;
diff --git a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
index adf6d864f3..8dbaaa4831 100644
--- a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
@@ -16,6 +16,7 @@ __global__ void GB_cuda_rowscale_kernel
     #define B_iso GB_B_ISO
 
     #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
+    GB_Bi_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ;
     const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ;
     #endif
 
@@ -29,14 +30,18 @@ __global__ void GB_cuda_rowscale_kernel
     const int64_t bvlen = B->vlen ;
     #endif
 
-    int ntasks = gridDim.x * blockDim.x;
+    int nthreads = gridDim.x * blockDim.x;
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    for (int64_t p = tid ; p < bnz ; p += ntasks)
+    for (int64_t p = tid ; p < bnz ; p += nthreads)
     {
         if (!GBb_B (Bb, p)) { continue ; }
 
         int64_t i = GBi_B (Bi, p, bvlen) ;      // get row index of B(i,j)
+        #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
+        // Copy B->i to C->i here instead of using GB_dup_worker
+        Ci [p] = i ;
+        #endif
         GB_DECLAREA (dii) ;
         GB_GETA (dii, Dx, i, D_iso) ;           // dii = D(i,i)
         GB_DECLAREB (bij) ;
diff --git a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
index 6b87a5687d..7089731f10 100644
--- a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
@@ -19,10 +19,14 @@ __global__ void GB_cuda_select_bitmap_kernel
 {
     int8_t *Cb_out = C->b ;
 
-    #if ( GB_DEPENDS_ON_X )
+    #if ( GB_DEPENDS_ON_X || !GB_ISO_SELECT )
     const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ;
     #endif
 
+    #if ( !GB_ISO_SELECT )
+    GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;
+    #endif
+
     #if ( GB_A_IS_BITMAP )
     const int8_t *__restrict__ Ab = A->b ;
     #endif
@@ -39,6 +43,10 @@ __global__ void GB_cuda_select_bitmap_kernel
     int nthreads = blockDim.x * gridDim.x ;
     for (int64_t p = tid ; p < anz ; p += nthreads)
     {
+        #if ( !GB_ISO_SELECT )
+        Cx [p] = Ax [p] ;
+        #endif
+
         Cb_out [p] = 0 ;
         if (!GBb_A (Ab, p)) { continue; }
 
diff --git a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
index 384d92a693..a0c19c7ea9 100644
--- a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
@@ -317,7 +317,7 @@ GB_JIT_CUDA_KERNEL_SELECT_SPARSE_PROTO (GB_jit_kernel)
     
     GB_cuda_select_sparse_phase2 <<<grid, block, 0, stream>>>
         (Map, A, Ak_keep, (GB_Ci_TYPE *) C->i, (GB_C_TYPE *) C->x) ;
-    CUDA_OK (cudaGetLastError ( )) ;
+    // CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
 
     //--------------------------------------------------------------------------
diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c
index afa6e741b4..b957ae2f10 100644
--- a/Source/mxm/GB_colscale.c
+++ b/Source/mxm/GB_colscale.c
@@ -92,21 +92,17 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
     GB_void cscalar [GB_VLA(zsize)] ;
     bool C_iso = GB_AxB_iso (cscalar, A, D, A->vdim, semiring, flipxy, true) ;
 
-    //--------------------------------------------------------------------------
-    // copy the pattern of A into C
-    //--------------------------------------------------------------------------
-
-    // allocate C->x but do not initialize it
-    GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
     info = GrB_NO_VALUE ;
-    ASSERT (C->type == ztype) ;
 
     //--------------------------------------------------------------------------
     // C = A*D, column scale, compute numerical values
     //--------------------------------------------------------------------------
 
     if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode))
-    { 
+    {
+        // Copy the pattern of A into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // apply a positional operator: convert C=A*D to C=op(A)
@@ -157,7 +153,10 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
 
     }
     else if (C_iso)
-    { 
+    {
+        // Copy the pattern of A into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // via the iso kernel
@@ -179,6 +178,17 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
         // determine if the values are accessed
         //----------------------------------------------------------------------
 
+        // Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the
+        // GPU
+        int64_t *tmp_Ap = A->p ;
+        int64_t *tmp_Ah = A->h ;
+        A->p = NULL ;
+        A->h = NULL ;
+        GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
+        A->p = tmp_Ap ;
+        A->h = tmp_Ah ;
+        ASSERT (C->type == ztype) ;
+
         ASSERT (fmult != NULL) ;
         bool op_is_first  = (opcode == GB_FIRST_binop_code) ;
         bool op_is_second = (opcode == GB_SECOND_binop_code) ;
@@ -217,6 +227,26 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
         }
         #endif
 
+        // We are using the CPU. Finish the dup from A -> C.
+        if (info == GrB_NO_VALUE)
+        {
+            // copy A->p, A->h into C->p, C->h
+            size_t psize = A->p_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+            size_t isize = A->i_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+            int64_t anvec = A->nvec ;
+            int nthreads_max = GB_Context_nthreads_max ( ) ;
+
+            if (A->p != NULL)
+            {
+                GB_memcpy (C->p, A->p, (anvec+1) * psize, nthreads_max) ;
+            }
+            if (A->h != NULL)
+            {
+                GB_memcpy (C->h, A->h, anvec * isize, nthreads_max) ;
+            }
+        }
         //----------------------------------------------------------------------
         // determine the number of threads to use
         //----------------------------------------------------------------------
diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c
index 150c061253..e03443ed86 100644
--- a/Source/mxm/GB_rowscale.c
+++ b/Source/mxm/GB_rowscale.c
@@ -81,14 +81,7 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
     GB_void cscalar [GB_VLA(zsize)] ;
     bool C_iso = GB_AxB_iso (cscalar, D, B, D->vdim, semiring, flipxy, true) ;
 
-    //--------------------------------------------------------------------------
-    // copy the pattern of B into C
-    //--------------------------------------------------------------------------
-
-    // allocate C->x but do not initialize it
-    GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
     info = GrB_NO_VALUE ;
-    ASSERT (C->type == ztype) ;
 
     //--------------------------------------------------------------------------
     // C = D*B, row scale, compute numerical values
@@ -96,6 +89,9 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
 
     if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode))
     {
+        // Copy the pattern of B into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // apply a positional operator: convert C=D*B to C=op(B)
@@ -147,6 +143,9 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
     }
     else if (C_iso)
     { 
+        // Copy the pattern of B into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // via the iso kernel
@@ -169,6 +168,14 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
         // determine if the values are accessed
         //----------------------------------------------------------------------
 
+        // Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU
+        int64_t *tmp_Bi = B->i ;
+        B->i = NULL ;
+        GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
+        B->i = tmp_Bi ;
+        ASSERT (C->type == ztype) ;
+
+
         ASSERT (fmult != NULL) ;
         bool op_is_first  = (opcode == GB_FIRST_binop_code) ;
         bool op_is_second = (opcode == GB_SECOND_binop_code) ;
@@ -207,6 +214,19 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
         }
         #endif
 
+        // We are using the CPU. Finish the dup from B -> C
+        if (info == GrB_NO_VALUE)
+        {
+            // Copy in B->i
+            int64_t bnz = GB_nnz_held (B) ;
+            size_t isize = B->i_is_32 ? sizeof (uint32_t) : sizeof (uint64_t) ;
+            int nthreads_max = GB_Context_nthreads_max ( ) ;
+
+            if (B->i != NULL)
+            { 
+                GB_memcpy (C->i, B->i, bnz * isize, nthreads_max) ;
+            }
+        }
         //----------------------------------------------------------------------
         // determine the number of threads to use
         //----------------------------------------------------------------------
diff --git a/Source/select/GB_select_bitmap.c b/Source/select/GB_select_bitmap.c
index 0f38cff73f..40a854ce80 100644
--- a/Source/select/GB_select_bitmap.c
+++ b/Source/select/GB_select_bitmap.c
@@ -83,12 +83,6 @@ GrB_Info GB_select_bitmap
         // Cx [0] = Ax [0] or (A->type) thunk
         GB_select_iso (C->x, opcode, athunk, A->x, asize) ;
     }
-    else
-    { 
-        // Cx [0:anz-1] = Ax [0:anz-1]
-        // Fixme for CUDA: do this on the GPU if appropriate
-        GB_memcpy (C->x, A->x, anz * asize, nthreads) ;
-    }
 
     //--------------------------------------------------------------------------
     // bitmap selector kernel
@@ -105,6 +99,11 @@ GrB_Info GB_select_bitmap
 
     if (info == GrB_NO_VALUE)
     {
+        if (!C_iso) {
+            // Cx [0:anz-1] = Ax [0:anz-1]
+            GB_memcpy (C->x, A->x, anz * asize, nthreads) ;
+        }
+        
         if (GB_IS_INDEXUNARYOP_CODE_POSITIONAL (opcode))
         { 
 

From b96f496df0a49a9f205ff1a4b1291f4241cd7a66 Mon Sep 17 00:00:00 2001
From: Vidith Madhu <vidhmadh1892@gmail.com>
Date: Fri, 7 Mar 2025 12:58:40 -0600
Subject: [PATCH 2/3] small fixes

---
 CUDA/template/GB_jit_kernel_cuda_colscale.cu | 10 +++++-----
 Source/mxm/GB_colscale.c                     |  1 +
 Source/mxm/GB_rowscale.c                     |  1 +
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
index 4f7bc5323b..37b4566549 100644
--- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
@@ -43,7 +43,7 @@ __global__ void GB_cuda_colscale_kernel
         {
             if (!GBb_A (Ab, p)) continue ;
             // the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen
-            GB_Aj_TYPE col_idx = p / avlen ;
+            int64_t col_idx = p / avlen ;
     //      int64_t row_idx = p % avlen ;
             GB_DECLAREB (djj) ;
             GB_GETB (djj, Dx, col_idx, ) ;
@@ -54,11 +54,11 @@ __global__ void GB_cuda_colscale_kernel
         }
 
     #else
-        const GB_Aj_TYPE anvec = A->nvec ;
+        const int64_t anvec = A->nvec ;
         // Copy A->p, A->h to C->p, C->h here instead of using
         // GB_dup_worker on the CPU, so A->p, A->h stay on the GPU
         // if they were already there
-        for (GB_Aj_TYPE kA = tid ; kA < anvec ; kA += nthreads)
+        for (int64_t kA = tid ; kA < anvec ; kA += nthreads)
         {
             Cp [kA] = Ap [kA] ;
             #if ( GB_A_IS_HYPER )
@@ -80,8 +80,8 @@ __global__ void GB_cuda_colscale_kernel
                 for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ; pdelta += blockDim.x)
                 {
                     int64_t p_final ;
-                    GB_Aj_TYPE k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
-                    GB_Aj_TYPE j = GBh_A (Ah, k) ;
+                    int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
+                    int64_t j = GBh_A (Ah, k) ;
 
                     GB_DECLAREB (djj) ;
                     GB_GETB (djj, Dx, j, ) ;
diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c
index b957ae2f10..621cc9168c 100644
--- a/Source/mxm/GB_colscale.c
+++ b/Source/mxm/GB_colscale.c
@@ -180,6 +180,7 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
 
         // Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the
         // GPU
+        // FIXME: Add flags to GB_dup_worker for which arrays to copy
         int64_t *tmp_Ap = A->p ;
         int64_t *tmp_Ah = A->h ;
         A->p = NULL ;
diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c
index e03443ed86..7d008e72bc 100644
--- a/Source/mxm/GB_rowscale.c
+++ b/Source/mxm/GB_rowscale.c
@@ -169,6 +169,7 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
         //----------------------------------------------------------------------
 
         // Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU
+        // FIXME: Add flags to GB_dup_worker for which arrays to copy
         int64_t *tmp_Bi = B->i ;
         B->i = NULL ;
         GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;

From 290429b9b912342f0fc097589a23f73a7fd168fc Mon Sep 17 00:00:00 2001
From: Vidith Madhu <vidhmadh1892@gmail.com>
Date: Mon, 10 Mar 2025 13:56:37 -0500
Subject: [PATCH 3/3] Fix types, add asserts

---
 CUDA/template/GB_jit_kernel_cuda_colscale.cu |  4 ++--
 CUDA/template/GB_jit_kernel_cuda_rowscale.cu |  2 +-
 Source/mxm/GB_colscale.c                     | 20 ++++++++++++++++----
 Source/mxm/GB_rowscale.c                     | 14 +++++++++++---
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
index 104f78c453..fddb9c7e3c 100644
--- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
@@ -21,10 +21,10 @@ __global__ void GB_cuda_colscale_kernel
     GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;
 
     #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER )
-    GB_Ap_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ;
+    GB_Cp_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ;
     const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ;
         #if ( GB_A_IS_HYPER )
-        GB_Aj_TYPE *__restrict__ Ch = (int64_t *) C->h ;
+        GB_Cj_TYPE *__restrict__ Ch = (int64_t *) C->h ;
         const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ;
         #endif
     #endif
diff --git a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
index 8519a3d79e..c9daa66c1e 100644
--- a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
@@ -18,7 +18,7 @@ __global__ void GB_cuda_rowscale_kernel
     #define B_iso GB_B_ISO
 
     #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
-    GB_Bi_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ;
+    GB_Ci_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ;
     const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ;
     #endif
 
diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c
index 621cc9168c..dcc2ac62bc 100644
--- a/Source/mxm/GB_colscale.c
+++ b/Source/mxm/GB_colscale.c
@@ -232,20 +232,32 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
         if (info == GrB_NO_VALUE)
         {
             // copy A->p, A->h into C->p, C->h
-            size_t psize = A->p_is_32 ?
+            size_t A_psize = A->p_is_32 ?
                 sizeof (uint32_t) : sizeof (uint64_t) ;
-            size_t isize = A->i_is_32 ?
+            size_t A_isize = A->i_is_32 ?
                 sizeof (uint32_t) : sizeof (uint64_t) ;
+
             int64_t anvec = A->nvec ;
+            int64_t cnvec = C->nvec ;
+            ASSERT (cnvec == anvec) ;
+
             int nthreads_max = GB_Context_nthreads_max ( ) ;
 
             if (A->p != NULL)
             {
-                GB_memcpy (C->p, A->p, (anvec+1) * psize, nthreads_max) ;
+                size_t C_psize = C->p_is_32 ?
+                    sizeof (uint32_t) : sizeof (uint64_t) ;
+                
+                ASSERT (C_psize == A_psize) ;    
+                GB_memcpy (C->p, A->p, (anvec+1) * A_psize, nthreads_max) ;
             }
             if (A->h != NULL)
             {
-                GB_memcpy (C->h, A->h, anvec * isize, nthreads_max) ;
+                size_t C_isize = C->i_is_32 ?
+                    sizeof (uint32_t) : sizeof (uint64_t) ;
+
+                ASSERT (C_isize == A_isize) ;
+                GB_memcpy (C->h, A->h, anvec * A_isize, nthreads_max) ;
             }
         }
         //----------------------------------------------------------------------
diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c
index 7d008e72bc..a9f3c4ab39 100644
--- a/Source/mxm/GB_rowscale.c
+++ b/Source/mxm/GB_rowscale.c
@@ -219,13 +219,21 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
         if (info == GrB_NO_VALUE)
         {
             // Copy in B->i
+            size_t B_isize = B->i_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+            size_t C_isize = C->i_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+
             int64_t bnz = GB_nnz_held (B) ;
-            size_t isize = B->i_is_32 ? sizeof (uint32_t) : sizeof (uint64_t) ;
+            int64_t cnz = GB_nnz_held (C) ;
+            ASSERT (cnz == bnz) ;
+            
             int nthreads_max = GB_Context_nthreads_max ( ) ;
 
             if (B->i != NULL)
-            { 
-                GB_memcpy (C->i, B->i, bnz * isize, nthreads_max) ;
+            {
+                ASSERT (C_isize == B_isize) ;
+                GB_memcpy (C->i, B->i, bnz * B_isize, nthreads_max) ;
             }
         }
         //----------------------------------------------------------------------