diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index fc90b93228..3b780d295a 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -22,9 +22,20 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { Ceed ceed; CeedOperator_Hip_gen *impl; + bool is_composite; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorIsComposite(op, &is_composite)); + if (is_composite) { + CeedInt num_suboperators; + + CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); + for (CeedInt i = 0; i < num_suboperators; i++) { + if (impl->streams[i]) CeedCallHip(ceed, hipStreamDestroy(impl->streams[i])); + impl->streams[i] = NULL; + } + } if (impl->module) CeedCallHip(ceed, hipModuleUnload(impl->module)); if (impl->points.num_per_elem) CeedCallHip(ceed, hipFree((void **)impl->points.num_per_elem)); CeedCallBackend(CeedFree(&impl)); @@ -239,28 +250,35 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C } static int CeedOperatorApplyAddComposite_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool is_run_good[CEED_COMPOSITE_MAX] = {false}; - CeedInt num_suboperators; - const CeedScalar *input_arr = NULL; - CeedScalar *output_arr = NULL; - Ceed ceed; - CeedOperator *sub_operators; + bool is_run_good[CEED_COMPOSITE_MAX] = {true}; + CeedInt num_suboperators; + const CeedScalar *input_arr = NULL; + CeedScalar *output_arr; + Ceed ceed; + CeedOperator_Hip_gen *impl; + CeedOperator *sub_operators; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCall(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); - CeedCall(CeedCompositeOperatorGetSubList(op, &sub_operators)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedCompositeOperatorGetNumSub(op, &num_suboperators)); + CeedCallBackend(CeedCompositeOperatorGetSubList(op, &sub_operators)); if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr)); if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr)); for (CeedInt i = 0; i < num_suboperators; i++) { CeedInt num_elem = 0; - CeedCall(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); + CeedCallBackend(CeedOperatorGetNumElements(sub_operators[i], &num_elem)); if (num_elem > 0) { - hipStream_t stream = NULL; + if (!impl->streams[i]) CeedCallHip(ceed, hipStreamCreate(&impl->streams[i])); + CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], impl->streams[i], input_arr, output_arr, &is_run_good[i], request)); + } else { + is_run_good[i] = true; + } + } - CeedCallHip(ceed, hipStreamCreate(&stream)); - CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(sub_operators[i], stream, input_arr, output_arr, &is_run_good[i], request)); - CeedCallHip(ceed, hipStreamDestroy(stream)); + for (CeedInt i = 0; i < num_suboperators; i++) { + if (impl->streams[i]) { + if (is_run_good[i]) CeedCallHip(ceed, hipStreamSynchronize(impl->streams[i])); } } if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr)); diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h index e3e5c18975..4335302471 100644 --- a/backends/hip-gen/ceed-hip-gen.h +++ b/backends/hip-gen/ceed-hip-gen.h @@ -17,6 +17,7 @@ typedef struct { CeedInt Q, Q_1d; CeedInt max_P_1d; CeedInt thread_1d; + hipStream_t streams[CEED_COMPOSITE_MAX]; hipModule_t module; hipFunction_t op; FieldsInt_Hip indices; diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c index 2c1748033e..77f63b2fe4 100644 --- a/backends/hip-ref/ceed-hip-ref-vector.c +++ b/backends/hip-ref/ceed-hip-ref-vector.c @@ -305,19 +305,21 @@ static int CeedVectorCopyStrided_Hip(CeedVector vec, CeedSize start, CeedSize st // Set value for synced device/host array if (impl->d_array) { CeedScalar *copy_array; + Ceed ceed; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, ©_array)); #if (HIP_VERSION >= 60000000) hipblasHandle_t handle; - Ceed ceed; - - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + hipStream_t stream; CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle)); + CeedCallHipblas(ceed, hipblasGetStream(handle, &stream)); #if defined(CEED_SCALAR_IS_FP32) CeedCallHipblas(ceed, hipblasScopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step)); #else /* CEED_SCALAR */ CeedCallHipblas(ceed, hipblasDcopy_64(handle, (int64_t)(stop - start), impl->d_array + start, (int64_t)step, copy_array + start, (int64_t)step)); #endif /* CEED_SCALAR */ + CeedCallHip(ceed, hipStreamSynchronize(stream)); #else /* HIP_VERSION */ CeedCallBackend(CeedDeviceCopyStrided_Hip(impl->d_array, start, stop, step, copy_array)); #endif /* HIP_VERSION */ @@ -557,6 +559,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor const CeedScalar *d_array; CeedVector_Hip *impl; hipblasHandle_t handle; + hipStream_t stream; Ceed_Hip *hip_data; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); @@ -564,7 +567,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle)); - + CeedCallHipblas(ceed, hipblasGetStream(handle, &stream)); #if (HIP_VERSION < 60000000) // With ROCm 6, we can use the 64-bit integer interface. Prior to that, // we need to check if the vector is too long to handle with int32, @@ -581,6 +584,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor #if defined(CEED_SCALAR_IS_FP32) #if (HIP_VERSION >= 60000000) // We have ROCm 6, and can use 64-bit integers CeedCallHipblas(ceed, hipblasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); #else /* HIP_VERSION */ float sub_norm = 0.0; float *d_array_start; @@ -591,12 +595,14 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); *norm += sub_norm; } #endif /* HIP_VERSION */ #else /* CEED_SCALAR */ #if (HIP_VERSION >= 60000000) CeedCallHipblas(ceed, hipblasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); #else /* HIP_VERSION */ double sub_norm = 0.0; double *d_array_start; @@ -607,6 +613,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); *norm += sub_norm; } #endif /* HIP_VERSION */ @@ -617,6 +624,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor #if defined(CEED_SCALAR_IS_FP32) #if (HIP_VERSION >= 60000000) CeedCallHipblas(ceed, hipblasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); #else /* HIP_VERSION */ float sub_norm = 0.0, norm_sum = 0.0; float *d_array_start; @@ -627,6 +635,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); norm_sum += sub_norm * sub_norm; } *norm = sqrt(norm_sum); @@ -634,6 +643,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor #else /* CEED_SCALAR */ #if (HIP_VERSION >= 60000000) CeedCallHipblas(ceed, hipblasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); #else /* HIP_VERSION */ double sub_norm = 0.0, norm_sum = 0.0; double *d_array_start; @@ -644,6 +654,7 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); norm_sum += sub_norm * sub_norm; } *norm = sqrt(norm_sum); @@ -658,7 +669,8 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedScalar norm_no_abs; CeedCallHipblas(ceed, hipblasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index)); - CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); *norm = fabs(norm_no_abs); #else /* HIP_VERSION */ CeedInt index; @@ -672,10 +684,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); if (hip_data->has_unified_addressing) { - CeedCallHip(ceed, hipDeviceSynchronize()); + CeedCallHip(ceed, hipStreamSynchronize(stream)); sub_max = fabs(d_array[index - 1]); } else { - CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); } if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } @@ -688,10 +701,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedCallHipblas(ceed, hipblasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index)); if (hip_data->has_unified_addressing) { - CeedCallHip(ceed, hipDeviceSynchronize()); + CeedCallHip(ceed, hipStreamSynchronize(stream)); norm_no_abs = fabs(d_array[index - 1]); } else { - CeedCallHip(ceed, hipMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + CeedCallHip(ceed, hipMemcpyAsync(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); } *norm = fabs(norm_no_abs); #else /* HIP_VERSION */ @@ -706,10 +720,11 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); if (hip_data->has_unified_addressing) { - CeedCallHip(ceed, hipDeviceSynchronize()); + CeedCallHip(ceed, hipStreamSynchronize(stream)); sub_max = fabs(d_array[index - 1]); } else { - CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + CeedCallHip(ceed, hipMemcpyAsync(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost, stream)); + CeedCallHip(ceed, hipStreamSynchronize(stream)); } if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } @@ -780,13 +795,16 @@ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) { if (impl->d_array) { #if (HIP_VERSION >= 60000000) hipblasHandle_t handle; + hipStream_t stream; CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle)); + CeedCallHipblas(CeedVectorReturnCeed(x), hipblasGetStream(handle, &stream)); #if defined(CEED_SCALAR_IS_FP32) CeedCallHipblas(CeedVectorReturnCeed(x), hipblasSscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1)); #else /* CEED_SCALAR */ CeedCallHipblas(CeedVectorReturnCeed(x), hipblasDscal_64(handle, (int64_t)length, &alpha, impl->d_array, 1)); #endif /* CEED_SCALAR */ + CeedCallHip(CeedVectorReturnCeed(x), hipStreamSynchronize(stream)); #else /* HIP_VERSION */ CeedCallBackend(CeedDeviceScale_Hip(impl->d_array, alpha, length)); #endif /* HIP_VERSION */ @@ -827,13 +845,16 @@ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); #if (HIP_VERSION >= 60000000) hipblasHandle_t handle; + hipStream_t stream; - CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(y), &handle)); + CeedCallBackend(CeedGetHipblasHandle_Hip(CeedVectorReturnCeed(x), &handle)); + CeedCallHipblas(CeedVectorReturnCeed(y), hipblasGetStream(handle, &stream)); #if defined(CEED_SCALAR_IS_FP32) CeedCallHipblas(CeedVectorReturnCeed(y), hipblasSaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1)); #else /* CEED_SCALAR */ CeedCallHipblas(CeedVectorReturnCeed(y), hipblasDaxpy_64(handle, (int64_t)length, &alpha, x_impl->d_array, 1, y_impl->d_array, 1)); #endif /* CEED_SCALAR */ + CeedCallHip(CeedVectorReturnCeed(y), hipStreamSynchronize(stream)); #else /* HIP_VERSION */ CeedCallBackend(CeedDeviceAXPY_Hip(y_impl->d_array, alpha, x_impl->d_array, length)); #endif /* HIP_VERSION */ diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c index b15686b8dc..2587e7fba3 100644 --- a/backends/hip-ref/ceed-hip-ref.c +++ b/backends/hip-ref/ceed-hip-ref.c @@ -29,7 +29,10 @@ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) { Ceed_Hip *data; CeedCallBackend(CeedGetData(ceed, &data)); - if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle)); + if (!data->hipblas_handle) { + CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle)); + CeedCallHipblas(ceed, hipblasSetPointerMode(data->hipblas_handle, HIPBLAS_POINTER_MODE_HOST)); + } *handle = data->hipblas_handle; return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 410d13af2e..ae1591995f 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -489,6 +489,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -644,6 +645,7 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { CeedCallHip(ceed, hipFree(data->d_collo_grad_1d)); CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); CeedCallBackend(CeedFree(&data)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } @@ -737,6 +739,7 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > hip_data->device_prop.sharedMemPerBlock) { CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); + CeedCallBackend(CeedDestroy(&ceed)); return CEED_ERROR_SUCCESS; } } diff --git a/interface/ceed.c b/interface/ceed.c index 203d2e5790..2b8bfd1c1e 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -827,6 +827,15 @@ int CeedReference(Ceed ceed) { @ref Developer **/ int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) { + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedGetWorkVectorMemoryUsage(delegate, usage_mb)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } *usage_mb = 0.0; if (ceed->work_vectors) { for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) { @@ -852,6 +861,15 @@ int CeedGetWorkVectorMemoryUsage(Ceed ceed, CeedScalar *usage_mb) { @ref Backend **/ int CeedClearWorkVectors(Ceed ceed, CeedSize min_len) { + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedClearWorkVectors(delegate, min_len)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } if (!ceed->work_vectors) return CEED_ERROR_SUCCESS; for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) { if (ceed->work_vectors->is_in_use[i]) continue; @@ -890,6 +908,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) { CeedInt i = 0; CeedScalar usage_mb; + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedGetWorkVector(delegate, len, vec)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } + if (!ceed->work_vectors) CeedCall(CeedWorkVectorsCreate(ceed)); // Search for big enough work vector @@ -936,6 +964,16 @@ int CeedGetWorkVector(Ceed ceed, CeedSize len, CeedVector *vec) { @ref Backend **/ int CeedRestoreWorkVector(Ceed ceed, CeedVector *vec) { + if (!ceed->VectorCreate) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Vector")); + CeedCheck(delegate, ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement VectorCreate"); + CeedCall(CeedRestoreWorkVector(delegate, vec)); + CeedCall(CeedDestroy(&delegate)); + return CEED_ERROR_SUCCESS; + } + for (CeedInt i = 0; i < ceed->work_vectors->num_vecs; i++) { if (*vec == ceed->work_vectors->vecs[i]) { CeedCheck(ceed->work_vectors->is_in_use[i], ceed, CEED_ERROR_ACCESS, "Work vector %" CeedSize_FMT " was not checked out but is being returned");