@@ -277,18 +277,18 @@ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_t
277277//------------------------------------------------------------------------------
278278// Set host array to value
279279//------------------------------------------------------------------------------
280- static int CeedHostSetValue_Cuda (CeedScalar * h_array , CeedInt length , CeedScalar val ) {
281- for (int i = 0 ; i < length ; i ++ ) h_array [i ] = val ;
280+ static int CeedHostSetValue_Cuda (CeedScalar * h_array , CeedSize length , CeedScalar val ) {
281+ for (CeedSize i = 0 ; i < length ; i ++ ) h_array [i ] = val ;
282282 return CEED_ERROR_SUCCESS ;
283283}
284284
285285//------------------------------------------------------------------------------
286286// Set device array to value (impl in .cu file)
287287//------------------------------------------------------------------------------
288- int CeedDeviceSetValue_Cuda (CeedScalar * d_array , CeedInt length , CeedScalar val );
288+ int CeedDeviceSetValue_Cuda (CeedScalar * d_array , CeedSize length , CeedScalar val );
289289
290290//------------------------------------------------------------------------------
291- // Set a vector to a value,
291+ // Set a vector to a value
292292//------------------------------------------------------------------------------
293293static int CeedVectorSetValue_Cuda (CeedVector vec , CeedScalar val ) {
294294 Ceed ceed ;
@@ -449,36 +449,129 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
449449 cublasHandle_t handle ;
450450 CeedCallBackend (CeedGetCublasHandle_Cuda (ceed , & handle ));
451451
452+ #if CUDA_VERSION < 12000
453+ // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
454+ // we need to check if the vector is too long to handle with int32,
455+ // and if so, divide it into subsections for repeated cuBLAS calls
456+ CeedSize num_calls = length / INT_MAX ;
457+ if (length % INT_MAX > 0 ) num_calls += 1 ;
458+ #endif
459+
452460 // Compute norm
453461 const CeedScalar * d_array ;
454462 CeedCallBackend (CeedVectorGetArrayRead (vec , CEED_MEM_DEVICE , & d_array ));
455463 switch (type ) {
456464 case CEED_NORM_1 : {
465+ * norm = 0.0 ;
457466 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
458- CeedCallCublas (ceed , cublasSasum (handle , length , (float * )d_array , 1 , (float * )norm ));
467+ #if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers
468+ CeedCallCublas (ceed , cublasSasum_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
469+ #else
470+ float sub_norm = 0.0 ;
471+ float * d_array_start ;
472+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
473+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
474+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
475+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
476+ CeedCallCublas (ceed , cublasSasum (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
477+ * norm += sub_norm ;
478+ }
479+ #endif
459480 } else {
460- CeedCallCublas (ceed , cublasDasum (handle , length , (double * )d_array , 1 , (double * )norm ));
481+ #if CUDA_VERSION >= 12000
482+ CeedCallCublas (ceed , cublasDasum_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
483+ #else
484+ double sub_norm = 0.0 ;
485+ double * d_array_start ;
486+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
487+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
488+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
489+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
490+ CeedCallCublas (ceed , cublasDasum (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
491+ * norm += sub_norm ;
492+ }
493+ #endif
461494 }
462495 break ;
463496 }
464497 case CEED_NORM_2 : {
465498 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
466- CeedCallCublas (ceed , cublasSnrm2 (handle , length , (float * )d_array , 1 , (float * )norm ));
499+ #if CUDA_VERSION >= 12000
500+ CeedCallCublas (ceed , cublasSnrm2_64 (handle , (int64_t )length , (float * )d_array , 1 , (float * )norm ));
501+ #else
502+ float sub_norm = 0.0 , norm_sum = 0.0 ;
503+ float * d_array_start ;
504+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
505+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
506+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
507+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
508+ CeedCallCublas (ceed , cublasSnrm2 (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & sub_norm ));
509+ norm_sum += sub_norm * sub_norm ;
510+ }
511+ * norm = sqrt (norm_sum );
512+ #endif
467513 } else {
468- CeedCallCublas (ceed , cublasDnrm2 (handle , length , (double * )d_array , 1 , (double * )norm ));
514+ #if CUDA_VERSION >= 12000
515+ CeedCallCublas (ceed , cublasDnrm2_64 (handle , (int64_t )length , (double * )d_array , 1 , (double * )norm ));
516+ #else
517+ double sub_norm = 0.0 , norm_sum = 0.0 ;
518+ double * d_array_start ;
519+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
520+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
521+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
522+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
523+ CeedCallCublas (ceed , cublasDnrm2 (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & sub_norm ));
524+ norm_sum += sub_norm * sub_norm ;
525+ }
526+ * norm = sqrt (norm_sum );
527+ #endif
469528 }
470529 break ;
471530 }
472531 case CEED_NORM_MAX : {
473- CeedInt indx ;
474532 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32 ) {
475- CeedCallCublas (ceed , cublasIsamax (handle , length , (float * )d_array , 1 , & indx ));
533+ #if CUDA_VERSION >= 12000
534+ int64_t indx ;
535+ CeedCallCublas (ceed , cublasIsamax_64 (handle , (int64_t )length , (float * )d_array , 1 , & indx ));
536+ CeedScalar normNoAbs ;
537+ CeedCallCuda (ceed , cudaMemcpy (& normNoAbs , impl -> d_array + indx - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
538+ * norm = fabs (normNoAbs );
539+ #else
540+ CeedInt indx ;
541+ float sub_max = 0.0 , current_max = 0.0 ;
542+ float * d_array_start ;
543+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
544+ d_array_start = (float * )d_array + (CeedSize )(i )* INT_MAX ;
545+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
546+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
547+ CeedCallCublas (ceed , cublasIsamax (handle , (CeedInt )sub_length , (float * )d_array_start , 1 , & indx ));
548+ CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + indx - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
549+ if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
550+ }
551+ * norm = current_max ;
552+ #endif
476553 } else {
477- CeedCallCublas (ceed , cublasIdamax (handle , length , (double * )d_array , 1 , & indx ));
554+ #if CUDA_VERSION >= 12000
555+ int64_t indx ;
556+ CeedCallCublas (ceed , cublasIdamax_64 (handle , (int64_t )length , (double * )d_array , 1 , & indx ));
557+ CeedScalar normNoAbs ;
558+ CeedCallCuda (ceed , cudaMemcpy (& normNoAbs , impl -> d_array + indx - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
559+ * norm = fabs (normNoAbs );
560+ #else
561+ CeedInt indx ;
562+ double sub_max = 0.0 , current_max = 0.0 ;
563+ double * d_array_start ;
564+ for (CeedInt i = 0 ; i < num_calls ; i ++ ) {
565+ d_array_start = (double * )d_array + (CeedSize )(i )* INT_MAX ;
566+ CeedSize remaining_length = length - (CeedSize )(i )* INT_MAX ;
567+ CeedInt sub_length = (i == num_calls - 1 ) ? (CeedInt )(remaining_length ) : INT_MAX ;
568+ CeedCallCublas (ceed , cublasIdamax (handle , (CeedInt )sub_length , (double * )d_array_start , 1 , & indx ));
569+ CeedCallCuda (ceed , cudaMemcpy (& sub_max , d_array_start + indx - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
570+ if (fabs (sub_max ) > current_max ) current_max = fabs (sub_max );
571+ }
572+ * norm = current_max ;
573+ #endif
478574 }
479- CeedScalar normNoAbs ;
480- CeedCallCuda (ceed , cudaMemcpy (& normNoAbs , impl -> d_array + indx - 1 , sizeof (CeedScalar ), cudaMemcpyDeviceToHost ));
481- * norm = fabs (normNoAbs );
482575 break ;
483576 }
484577 }
@@ -490,8 +583,8 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no
490583//------------------------------------------------------------------------------
491584// Take reciprocal of a vector on host
492585//------------------------------------------------------------------------------
493- static int CeedHostReciprocal_Cuda (CeedScalar * h_array , CeedInt length ) {
494- for (int i = 0 ; i < length ; i ++ ) {
586+ static int CeedHostReciprocal_Cuda (CeedScalar * h_array , CeedSize length ) {
587+ for (CeedSize i = 0 ; i < length ; i ++ ) {
495588 if (fabs (h_array [i ]) > CEED_EPSILON ) h_array [i ] = 1. / h_array [i ];
496589 }
497590 return CEED_ERROR_SUCCESS ;
@@ -500,7 +593,7 @@ static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedInt length) {
500593//------------------------------------------------------------------------------
501594// Take reciprocal of a vector on device (impl in .cu file)
502595//------------------------------------------------------------------------------
503- int CeedDeviceReciprocal_Cuda (CeedScalar * d_array , CeedInt length );
596+ int CeedDeviceReciprocal_Cuda (CeedScalar * d_array , CeedSize length );
504597
505598//------------------------------------------------------------------------------
506599// Take reciprocal of a vector
@@ -523,15 +616,15 @@ static int CeedVectorReciprocal_Cuda(CeedVector vec) {
523616//------------------------------------------------------------------------------
524617// Compute x = alpha x on the host
525618//------------------------------------------------------------------------------
526- static int CeedHostScale_Cuda (CeedScalar * x_array , CeedScalar alpha , CeedInt length ) {
527- for (int i = 0 ; i < length ; i ++ ) x_array [i ] *= alpha ;
619+ static int CeedHostScale_Cuda (CeedScalar * x_array , CeedScalar alpha , CeedSize length ) {
620+ for (CeedSize i = 0 ; i < length ; i ++ ) x_array [i ] *= alpha ;
528621 return CEED_ERROR_SUCCESS ;
529622}
530623
531624//------------------------------------------------------------------------------
532625// Compute x = alpha x on device (impl in .cu file)
533626//------------------------------------------------------------------------------
534- int CeedDeviceScale_Cuda (CeedScalar * x_array , CeedScalar alpha , CeedInt length );
627+ int CeedDeviceScale_Cuda (CeedScalar * x_array , CeedScalar alpha , CeedSize length );
535628
536629//------------------------------------------------------------------------------
537630// Compute x = alpha x
@@ -554,15 +647,15 @@ static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
554647//------------------------------------------------------------------------------
555648// Compute y = alpha x + y on the host
556649//------------------------------------------------------------------------------
557- static int CeedHostAXPY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar * x_array , CeedInt length ) {
558- for (int i = 0 ; i < length ; i ++ ) y_array [i ] += alpha * x_array [i ];
650+ static int CeedHostAXPY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar * x_array , CeedSize length ) {
651+ for (CeedSize i = 0 ; i < length ; i ++ ) y_array [i ] += alpha * x_array [i ];
559652 return CEED_ERROR_SUCCESS ;
560653}
561654
562655//------------------------------------------------------------------------------
563656// Compute y = alpha x + y on device (impl in .cu file)
564657//------------------------------------------------------------------------------
565- int CeedDeviceAXPY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar * x_array , CeedInt length );
658+ int CeedDeviceAXPY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar * x_array , CeedSize length );
566659
567660//------------------------------------------------------------------------------
568661// Compute y = alpha x + y
@@ -592,15 +685,15 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
592685//------------------------------------------------------------------------------
593686// Compute y = alpha x + beta y on the host
594687//------------------------------------------------------------------------------
595- static int CeedHostAXPBY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar beta , CeedScalar * x_array , CeedInt length ) {
596- for (int i = 0 ; i < length ; i ++ ) y_array [i ] += alpha * x_array [i ] + beta * y_array [i ];
688+ static int CeedHostAXPBY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar beta , CeedScalar * x_array , CeedSize length ) {
689+ for (CeedSize i = 0 ; i < length ; i ++ ) y_array [i ] += alpha * x_array [i ] + beta * y_array [i ];
597690 return CEED_ERROR_SUCCESS ;
598691}
599692
600693//------------------------------------------------------------------------------
601694// Compute y = alpha x + beta y on device (impl in .cu file)
602695//------------------------------------------------------------------------------
603- int CeedDeviceAXPBY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar beta , CeedScalar * x_array , CeedInt length );
696+ int CeedDeviceAXPBY_Cuda (CeedScalar * y_array , CeedScalar alpha , CeedScalar beta , CeedScalar * x_array , CeedSize length );
604697
605698//------------------------------------------------------------------------------
606699// Compute y = alpha x + beta y
@@ -630,15 +723,15 @@ static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta,
630723//------------------------------------------------------------------------------
631724// Compute the pointwise multiplication w = x .* y on the host
632725//------------------------------------------------------------------------------
633- static int CeedHostPointwiseMult_Cuda (CeedScalar * w_array , CeedScalar * x_array , CeedScalar * y_array , CeedInt length ) {
634- for (int i = 0 ; i < length ; i ++ ) w_array [i ] = x_array [i ] * y_array [i ];
726+ static int CeedHostPointwiseMult_Cuda (CeedScalar * w_array , CeedScalar * x_array , CeedScalar * y_array , CeedSize length ) {
727+ for (CeedSize i = 0 ; i < length ; i ++ ) w_array [i ] = x_array [i ] * y_array [i ];
635728 return CEED_ERROR_SUCCESS ;
636729}
637730
638731//------------------------------------------------------------------------------
639732// Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
640733//------------------------------------------------------------------------------
641- int CeedDevicePointwiseMult_Cuda (CeedScalar * w_array , CeedScalar * x_array , CeedScalar * y_array , CeedInt length );
734+ int CeedDevicePointwiseMult_Cuda (CeedScalar * w_array , CeedScalar * x_array , CeedScalar * y_array , CeedSize length );
642735
643736//------------------------------------------------------------------------------
644737// Compute the pointwise multiplication w = x .* y
0 commit comments