diff --git a/CUDA/template/GB_jit_kernel_cuda_colscale.cu b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
index 8cd76843f3..fddb9c7e3c 100644
--- a/CUDA/template/GB_jit_kernel_cuda_colscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_colscale.cu
@@ -21,8 +21,10 @@ __global__ void GB_cuda_colscale_kernel
     GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;
 
     #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER )
+    GB_Cp_TYPE *__restrict__ Cp = (GB_Ap_TYPE *) C->p ;
     const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ;
         #if ( GB_A_IS_HYPER )
+        GB_Cj_TYPE *__restrict__ Ch = (int64_t *) C->h ;
         const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ;
         #endif
     #endif
@@ -33,12 +35,13 @@ __global__ void GB_cuda_colscale_kernel
 
     GB_A_NHELD (anz) ;
 
+    int nthreads = blockDim.x * gridDim.x ;
+    int tid = blockIdx.x * blockDim.x + threadIdx.x ;
+
     #if (GB_A_IS_BITMAP || GB_A_IS_FULL)
         const int64_t avlen = A->vlen ;
         // bitmap/full case
-        int nthreads_in_entire_grid = blockDim.x * gridDim.x ;
-        int tid = blockIdx.x * blockDim.x + threadIdx.x ;
-        for (int64_t p = tid ; p < anz ; p += nthreads_in_entire_grid)
+        for (int64_t p = tid ; p < anz ; p += nthreads)
         {
             if (!GBb_A (Ab, p)) continue ;
             // the pth entry in A is A(i,j) where i = p%avlen and j = p/avlen
@@ -54,6 +57,18 @@ __global__ void GB_cuda_colscale_kernel
 
     #else
         const int64_t anvec = A->nvec ;
+        // Copy A->p, A->h to C->p, C->h here instead of using
+        // GB_dup_worker on the CPU, so A->p, A->h stay on the GPU
+        // if they were already there
+        for (int64_t kA = tid ; kA < anvec ; kA += nthreads)
+        {
+            Cp [kA] = Ap [kA] ;
+            #if ( GB_A_IS_HYPER )
+            Ch [kA] = Ah [kA] ;
+            #endif
+        }
+        Cp [anvec] = Ap [anvec] ;
+
         // sparse/hypersparse case (cuda_ek_slice only works for sparse/hypersparse)
         for (int64_t pfirst = blockIdx.x << log2_chunk_size ;
                     pfirst < anz ;
diff --git a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
index 16e92b736d..c9daa66c1e 100644
--- a/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_rowscale.cu
@@ -18,6 +18,7 @@ __global__ void GB_cuda_rowscale_kernel
     #define B_iso GB_B_ISO
 
     #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
+    GB_Ci_TYPE *__restrict__ Ci = (GB_Bi_TYPE *) C->i ;
     const GB_Bi_TYPE *__restrict__ Bi = (GB_Bi_TYPE *) B->i ;
     #endif
 
@@ -31,14 +32,18 @@ __global__ void GB_cuda_rowscale_kernel
     const int64_t bvlen = B->vlen ;
     #endif
 
-    int ntasks = gridDim.x * blockDim.x;
+    int nthreads = gridDim.x * blockDim.x;
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    for (int64_t p = tid ; p < bnz ; p += ntasks)
+    for (int64_t p = tid ; p < bnz ; p += nthreads)
     {
         if (!GBb_B (Bb, p)) { continue ; }
 
         int64_t i = GBi_B (Bi, p, bvlen) ;      // get row index of B(i,j)
+        #if ( GB_B_IS_SPARSE || GB_B_IS_HYPER )
+        // Copy B->i to C->i here instead of using GB_dup_worker
+        Ci [p] = i ;
+        #endif
         GB_DECLAREA (dii) ;
         GB_GETA (dii, Dx, i, D_iso) ;           // dii = D(i,i)
         GB_DECLAREB (bij) ;
diff --git a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
index 43b3aa9794..2873a81273 100644
--- a/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_select_bitmap.cu
@@ -21,10 +21,14 @@ __global__ void GB_cuda_select_bitmap_kernel
 {
     int8_t *Cb_out = C->b ;
 
-    #if ( GB_DEPENDS_ON_X )
+    #if ( GB_DEPENDS_ON_X || !GB_ISO_SELECT )
     const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ;
     #endif
 
+    #if ( !GB_ISO_SELECT )
+    GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) C->x ;
+    #endif
+
     #if ( GB_A_IS_BITMAP )
     const int8_t *__restrict__ Ab = A->b ;
     #endif
@@ -41,6 +45,10 @@ __global__ void GB_cuda_select_bitmap_kernel
     int nthreads = blockDim.x * gridDim.x ;
     for (int64_t p = tid ; p < anz ; p += nthreads)
     {
+        #if ( !GB_ISO_SELECT )
+        Cx [p] = Ax [p] ;
+        #endif
+
         Cb_out [p] = 0 ;
         if (!GBb_A (Ab, p)) { continue; }
 
diff --git a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
index c303e9a38f..a60f61a39d 100644
--- a/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
+++ b/CUDA/template/GB_jit_kernel_cuda_select_sparse.cu
@@ -330,7 +330,7 @@ GB_JIT_CUDA_KERNEL_SELECT_SPARSE_PROTO (GB_jit_kernel)
     
     GB_cuda_select_sparse_phase2 <<<grid, block, 0, stream>>>
         (Map, A, Ak_keep, (GB_Ci_TYPE *) C->i, (GB_C_TYPE *) C->x) ;
-    CUDA_OK (cudaGetLastError ( )) ;
+    // CUDA_OK (cudaGetLastError ( )) ;
     CUDA_OK (cudaStreamSynchronize (stream)) ;
 
     //--------------------------------------------------------------------------
diff --git a/Source/mxm/GB_colscale.c b/Source/mxm/GB_colscale.c
index afa6e741b4..dcc2ac62bc 100644
--- a/Source/mxm/GB_colscale.c
+++ b/Source/mxm/GB_colscale.c
@@ -92,21 +92,17 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
     GB_void cscalar [GB_VLA(zsize)] ;
     bool C_iso = GB_AxB_iso (cscalar, A, D, A->vdim, semiring, flipxy, true) ;
 
-    //--------------------------------------------------------------------------
-    // copy the pattern of A into C
-    //--------------------------------------------------------------------------
-
-    // allocate C->x but do not initialize it
-    GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
     info = GrB_NO_VALUE ;
-    ASSERT (C->type == ztype) ;
 
     //--------------------------------------------------------------------------
     // C = A*D, column scale, compute numerical values
     //--------------------------------------------------------------------------
 
     if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode))
-    { 
+    {
+        // Copy the pattern of A into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // apply a positional operator: convert C=A*D to C=op(A)
@@ -157,7 +153,10 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
 
     }
     else if (C_iso)
-    { 
+    {
+        // Copy the pattern of A into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // via the iso kernel
@@ -179,6 +178,18 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
         // determine if the values are accessed
         //----------------------------------------------------------------------
 
+        // Do not dup A->p, A->h into C yet; if we use CUDA, we'll do it on the
+        // GPU
+        // FIXME: Add flags to GB_dup_worker for which arrays to copy
+        int64_t *tmp_Ap = A->p ;
+        int64_t *tmp_Ah = A->h ;
+        A->p = NULL ;
+        A->h = NULL ;
+        GB_OK (GB_dup_worker (&C, C_iso, A, false, ztype)) ;
+        A->p = tmp_Ap ;
+        A->h = tmp_Ah ;
+        ASSERT (C->type == ztype) ;
+
         ASSERT (fmult != NULL) ;
         bool op_is_first  = (opcode == GB_FIRST_binop_code) ;
         bool op_is_second = (opcode == GB_SECOND_binop_code) ;
@@ -217,6 +228,38 @@ GrB_Info GB_colscale                // C = A*D, column scale with diagonal D
         }
         #endif
 
+        // We are using the CPU. Finish the dup from A -> C.
+        if (info == GrB_NO_VALUE)
+        {
+            // copy A->p, A->h into C->p, C->h
+            size_t A_psize = A->p_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+            size_t A_isize = A->i_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+
+            int64_t anvec = A->nvec ;
+            int64_t cnvec = C->nvec ;
+            ASSERT (cnvec == anvec) ;
+
+            int nthreads_max = GB_Context_nthreads_max ( ) ;
+
+            if (A->p != NULL)
+            {
+                size_t C_psize = C->p_is_32 ?
+                    sizeof (uint32_t) : sizeof (uint64_t) ;
+                
+                ASSERT (C_psize == A_psize) ;    
+                GB_memcpy (C->p, A->p, (anvec+1) * A_psize, nthreads_max) ;
+            }
+            if (A->h != NULL)
+            {
+                size_t C_isize = C->i_is_32 ?
+                    sizeof (uint32_t) : sizeof (uint64_t) ;
+
+                ASSERT (C_isize == A_isize) ;
+                GB_memcpy (C->h, A->h, anvec * A_isize, nthreads_max) ;
+            }
+        }
         //----------------------------------------------------------------------
         // determine the number of threads to use
         //----------------------------------------------------------------------
diff --git a/Source/mxm/GB_rowscale.c b/Source/mxm/GB_rowscale.c
index 150c061253..a9f3c4ab39 100644
--- a/Source/mxm/GB_rowscale.c
+++ b/Source/mxm/GB_rowscale.c
@@ -81,14 +81,7 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
     GB_void cscalar [GB_VLA(zsize)] ;
     bool C_iso = GB_AxB_iso (cscalar, D, B, D->vdim, semiring, flipxy, true) ;
 
-    //--------------------------------------------------------------------------
-    // copy the pattern of B into C
-    //--------------------------------------------------------------------------
-
-    // allocate C->x but do not initialize it
-    GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
     info = GrB_NO_VALUE ;
-    ASSERT (C->type == ztype) ;
 
     //--------------------------------------------------------------------------
     // C = D*B, row scale, compute numerical values
@@ -96,6 +89,9 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
 
     if (GB_IS_BUILTIN_BINOP_CODE_POSITIONAL (opcode))
     {
+        // Copy the pattern of B into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // apply a positional operator: convert C=D*B to C=op(B)
@@ -147,6 +143,9 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
     }
     else if (C_iso)
     { 
+        // Copy the pattern of B into C. Allocates, but does not initialize C->x.
+        GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
+        ASSERT (C->type == ztype) ;
 
         //----------------------------------------------------------------------
         // via the iso kernel
@@ -169,6 +168,15 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
         // determine if the values are accessed
         //----------------------------------------------------------------------
 
+        // Do not dup B->i into C yet; if we use CUDA, we'll do it on the GPU
+        // FIXME: Add flags to GB_dup_worker for which arrays to copy
+        int64_t *tmp_Bi = B->i ;
+        B->i = NULL ;
+        GB_OK (GB_dup_worker (&C, C_iso, B, false, ztype)) ;
+        B->i = tmp_Bi ;
+        ASSERT (C->type == ztype) ;
+
+
         ASSERT (fmult != NULL) ;
         bool op_is_first  = (opcode == GB_FIRST_binop_code) ;
         bool op_is_second = (opcode == GB_SECOND_binop_code) ;
@@ -207,6 +215,27 @@ GrB_Info GB_rowscale                // C = D*B, row scale with diagonal D
         }
         #endif
 
+        // We are using the CPU. Finish the dup from B -> C
+        if (info == GrB_NO_VALUE)
+        {
+            // Copy in B->i
+            size_t B_isize = B->i_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+            size_t C_isize = C->i_is_32 ?
+                sizeof (uint32_t) : sizeof (uint64_t) ;
+
+            int64_t bnz = GB_nnz_held (B) ;
+            int64_t cnz = GB_nnz_held (C) ;
+            ASSERT (cnz == bnz) ;
+            
+            int nthreads_max = GB_Context_nthreads_max ( ) ;
+
+            if (B->i != NULL)
+            {
+                ASSERT (C_isize == B_isize) ;
+                GB_memcpy (C->i, B->i, bnz * B_isize, nthreads_max) ;
+            }
+        }
         //----------------------------------------------------------------------
         // determine the number of threads to use
         //----------------------------------------------------------------------
diff --git a/Source/select/GB_select_bitmap.c b/Source/select/GB_select_bitmap.c
index 0f38cff73f..40a854ce80 100644
--- a/Source/select/GB_select_bitmap.c
+++ b/Source/select/GB_select_bitmap.c
@@ -83,12 +83,6 @@ GrB_Info GB_select_bitmap
         // Cx [0] = Ax [0] or (A->type) thunk
         GB_select_iso (C->x, opcode, athunk, A->x, asize) ;
     }
-    else
-    { 
-        // Cx [0:anz-1] = Ax [0:anz-1]
-        // Fixme for CUDA: do this on the GPU if appropriate
-        GB_memcpy (C->x, A->x, anz * asize, nthreads) ;
-    }
 
     //--------------------------------------------------------------------------
     // bitmap selector kernel
@@ -105,6 +99,11 @@ GrB_Info GB_select_bitmap
 
     if (info == GrB_NO_VALUE)
     {
+        if (!C_iso) {
+            // Cx [0:anz-1] = Ax [0:anz-1]
+            GB_memcpy (C->x, A->x, anz * asize, nthreads) ;
+        }
+        
         if (GB_IS_INDEXUNARYOP_CODE_POSITIONAL (opcode))
         {